"""
Functions for data profiling and auto-generation of Deirokay Validation
Documents.
"""
import warnings
from typing import List, Optional
from deirokay.backend import detect_backend
from deirokay.enums import Backend
from deirokay.exceptions import UnsupportedBackend
from .__version__ import __version__
from ._typing import (DeirokayDataSource, DeirokayStatement,
DeirokayValidationDocument, DeirokayValidationItem)
from .fs import fs_factory
from .statements import STATEMENTS_MAP
def _generate_statements(df_scope: DeirokayDataSource,
                         backend: Backend) -> List[DeirokayStatement]:
    """Profile a data scope with every statement type in STATEMENTS_MAP.

    Each statement class is attached to `backend` and asked to profile
    `df_scope`. Statement types raising `UnsupportedBackend` or
    `NotImplementedError` are skipped silently; any other exception is
    reported as a `RuntimeWarning` and that statement type is skipped
    as well.

    Parameters
    ----------
    df_scope : DeirokayDataSource
        The data restricted to the scope being profiled.
    backend : Backend
        Backend passed to each statement class' `attach_backend`.

    Returns
    -------
    List[DeirokayStatement]
        Results of the successful `profile` calls, in the iteration
        order of `STATEMENTS_MAP`.
    """
    profiled: List[DeirokayStatement] = []
    for statement_type in STATEMENTS_MAP.values():
        try:
            bound_type = statement_type.attach_backend(backend)
            profiled.append(bound_type.profile(df_scope))
        except (UnsupportedBackend, NotImplementedError):
            # Deliberately ignored: move on to the next statement type.
            continue
        except Exception as error:
            # Best effort: report the failure but keep profiling the
            # remaining statement types.
            scope_columns = list(df_scope.columns)
            message = (
                f'Unexpected error when profiling scope {scope_columns}'
                f' using {statement_type.__name__} statement: {error}\n\n'
                'Please, consider reporting this issue to the '
                'developers.'
            )
            warnings.warn(message, RuntimeWarning)
    return profiled
def _generate_items(df: DeirokayDataSource,
                    backend: Backend) -> List[DeirokayValidationItem]:
    """Build validation items for the whole table and for each column.

    The first scope is the list of all columns; it is followed by one
    scope per individual column. A scope is only turned into an item
    when profiling it produced at least one statement.

    Parameters
    ----------
    df : DeirokayDataSource
        The template data to profile.
    backend : Backend
        Backend forwarded to `_generate_statements`.

    Returns
    -------
    List[DeirokayValidationItem]
        Items with keys `'scope'` and `'statements'`, one per scope
        that yielded statements.
    """
    all_columns = list(df.columns)
    # Whole-table scope (a list) first, then every column on its own.
    scopes = [all_columns, *all_columns]

    validation_items: List[DeirokayValidationItem] = []
    for scope in scopes:
        if isinstance(scope, list):
            scoped_data = df[scope]
        else:
            # Wrap the single column name in a list before indexing.
            scoped_data = df[[scope]]

        statements = _generate_statements(scoped_data, backend)
        if not statements:
            # Nothing could be profiled for this scope: leave it out.
            continue
        validation_items.append({'scope': scope, 'statements': statements})
    return validation_items
def profile(df: DeirokayDataSource,
            document_name: str,
            save_to: Optional[str] = None) -> DeirokayValidationDocument:
    """Generate a validation document from a given template DataFrame
    using profiling methods for builtin Deirokay statements.

    By default, statement objects are generated for the entire template
    DataFrame (the entire set of columns), and then for each of its
    columns individually.

    This function should be used only as a draft for a validation
    document or as a means to quickly launch a first version with
    minimum efforts.
    The user is encouraged to correct and supplement the generated
    document to better meet their expectations.

    Parameters
    ----------
    df : DataFrame
        The DataFrame to use as template, ideally parsed with Deirokay
        `data_reader`.
    document_name : str
        The validation document name.
    save_to : Optional[str], optional
        Path (local or S3) where to save the validation document to.
        The file format is inferred from its extension.
        If None, no document will be saved. By default None.

    Returns
    -------
    dict
        The auto-generated validation document as Python `dict`.
    """
    # The backend is detected from the data source itself and drives
    # which statement implementations are used for profiling.
    backend = detect_backend(df)

    validation_document = {
        'name': document_name,
        'description': f'Auto generated using Deirokay {__version__}',
        'items': _generate_items(df, backend)
    }  # type: DeirokayValidationDocument

    # Any falsy `save_to` (None or an empty string) skips persistence.
    if save_to:
        fs_factory(save_to).write_dict(validation_document, indent=2)

    return validation_document