"""
Set of functions related to Deirokay validation.
"""
import json
import warnings
from copy import deepcopy
from datetime import datetime
from os.path import splitext
from typing import Optional, Union
from jinja2 import BaseLoader
from jinja2 import StrictUndefined as strict
from jinja2.nativetypes import NativeEnvironment
from deirokay._utils import check_columns_in_df_columns, render_dict
from deirokay.backend import detect_backend
from deirokay.enums import SeverityLevel
from deirokay.exceptions import ValidationError
from deirokay.fs import FileSystem, LocalFileSystem, fs_factory
from deirokay.history_template import get_series
from deirokay.statements.loader import statement_factory
from ._typing import (DeirokayDataSource, DeirokayValidationDocument,
DeirokayValidationResultDocument)
def validate(
    df: DeirokayDataSource, *,
    against: Union[str, DeirokayValidationDocument],
    save_to: Optional[str] = None,
    save_format: Optional[str] = None,
    current_date: Optional[datetime] = None,
    raise_exception: bool = True,
    exception_level: SeverityLevel = SeverityLevel.CRITICAL,
    template: Optional[dict] = None
) -> DeirokayValidationResultDocument:
    """Validate a Deirokay DataFrame against a well-defined Validation
    Document.

    Parameters
    ----------
    df : DeirokayDataSource
        Data to validate, preferentially parsed by Deirokay
        `data_reader`. Its backend is detected with `detect_backend`.
    against : Union[str, dict]
        A `dict`-like Validation Document or a local/S3 path to a
        validation file in either YAML or JSON format.
        A `dict` is deep-copied before use, so the caller's object is
        not modified.
    save_to : str, optional
        Path to folder where the validation result document will be
        saved to. A subfolder named after the validation document
        will be created in this path, and the results will be saved
        inside following the `<current_date>.<save_format>` pattern.
        `<current_date>` is formatted as `%Y%m%dT%H%M%S` and
        `<save_format>` can be either `yaml` or `json`.
        If None, no validation log will be saved. By default None.
    save_format : str, optional
        Format in which to save the validation document
        (`yaml` or `json`).
        Only valid when `save_to` is not None.
        If None and `against` is a Python dictionary, defaults to YAML.
        If None and `against` is a valid path to a YAML or JSON file,
        it will keep this format.
        By default None
    current_date : datetime, optional
        Python `datetime.datetime` to use in log file name when saving
        validation result.
        Only valid when `save_to` is not None.
        If None, defaults to the current UTC date and time and raises
        a warning.
        By default None
    raise_exception : bool, optional
        Whether or not to raise a ValidationError exception whenever a
        statement whose level is greater or equal to `exception_level`
        fails.
        By default True
    exception_level : SeverityLevel, optional
        Minimum statement severity to raise exception for when
        statement validation fails.
        Only valid when `raise_exception` is True.
        By default SeverityLevel.CRITICAL.
    template : dict, optional
        Map of custom templates to be replaced in validation document
        before evaluation of statements. For mapped values, if
        callable, the returned value is used instead.
        A `series` entry is always provided by this function.

    Returns
    -------
    DeirokayValidationResultDocument
        Validation Result Document dict: the validation document with
        a `report` entry added to every statement.

    Raises
    ------
    ValueError
        `save_to` is neither an existing local directory nor an S3
        path; or the resolved `save_format` is not one of `json`,
        `yaml` or `yml`; or the `series` template is used while
        `save_to` is not set.
    ValidationError
        Validation failed for at least one statement whose severity is
        greater or equal to `exception_level`.
    """
    backend = detect_backend(df)

    # Always bind `save_to_fs`: the `series` helper below closes over it,
    # and an unassigned closure variable would otherwise surface as an
    # obscure NameError only at template-rendering time.
    save_to_fs: Optional[FileSystem] = None
    if save_to:
        save_to_fs = fs_factory(save_to)
        if isinstance(save_to_fs, LocalFileSystem) and not save_to_fs.isdir():
            raise ValueError('The `save_to` parameter must be an existing'
                             ' directory or an S3 path.')

    if isinstance(against, str):
        # Keep the format of the source file unless told otherwise.
        save_format = save_format or splitext(against)[1].lstrip('.')
        validation_document = fs_factory(against).read_dict()
    else:
        save_format = save_format or 'yaml'
        validation_document = deepcopy(against)

    # Explicit check instead of `assert`, which is removed under `-O`.
    if save_format.lower() not in ('json', 'yaml', 'yml'):
        raise ValueError(f'Not a valid format {save_format}')

    def _series(x, y):
        """Forward both arguments to `get_series`, reading from the
        `save_to` location."""
        if save_to_fs is None:
            raise ValueError(
                'The `series` template needs the `save_to` parameter to'
                ' be set in order to read previous validation results.'
            )
        return get_series(x, y, read_from=save_to_fs)

    # Render templates
    template = dict(series=_series, **(template or {}))
    # The return value is not used: `render_dict` is relied upon to
    # update `validation_document` in place.
    render_dict(NativeEnvironment(loader=BaseLoader(), undefined=strict),
                dict_=validation_document,
                template=template)

    for item in validation_document['items']:
        scope = item['scope']
        scope = [scope] if not isinstance(scope, list) else scope
        check_columns_in_df_columns(scope, df.columns)
        df_scope = df[scope]

        for stmt in item.get('statements'):
            report = statement_factory(stmt, backend)(df_scope)
            # Only a literal `True` counts as success.
            if report['result'] is True:
                report['result'] = 'pass'
            else:
                report['result'] = 'fail'
            stmt['report'] = report

    # Save before raising so that failed validations are logged too.
    if save_to:
        _save_validation_document(validation_document, save_to_fs,
                                  save_format, current_date)

    if raise_exception:
        raise_validation(validation_document, exception_level)

    return validation_document
def raise_validation(validation_result_document: dict,
                     exception_level: SeverityLevel) -> None:
    """Check a validation result `dict` and raise a `ValidationError`
    exception whenever a statement whose severity level is greater or
    equal to `exception_level` fails.

    Every failed statement at or above the threshold is printed before
    the exception is raised.

    Parameters
    ----------
    validation_result_document : dict
        Validation Result Document generated by a Deirokay validation.
        Each statement must carry a `report` entry with a `result`
        key; statements without a `severity` key are treated as
        `SeverityLevel.CRITICAL`.
    exception_level : SeverityLevel
        Minimum severity level to raise exception for.

    Raises
    ------
    ValidationError
        Validation failed for at least one statement whose severity is
        greater or equal to `exception_level`. It is built from the
        highest severity found among those statements and a message.
    """
    highest_level = None

    for item in validation_result_document['items']:
        scope = item['scope']
        for stmt in item['statements']:
            if stmt['report']['result'] != 'fail':
                continue

            severity = stmt.get('severity', SeverityLevel.CRITICAL)
            if severity >= exception_level:
                if highest_level is None or severity > highest_level:
                    highest_level = severity
                print(f'Statement failed for scope {scope}:')
                # `default=str`: this dump is diagnostic output only, so a
                # value that JSON cannot encode must not abort the report
                # and hide the ValidationError raised below.
                print(json.dumps(stmt, indent=4, default=str))

    if highest_level is not None:
        print(f'Severity level threshold was {exception_level}.')
        raise ValidationError(
            highest_level,
            f'Validation failed with severity level {highest_level}.'
        )
def _save_validation_document(document: dict,
                              save_to: FileSystem,
                              save_format: Optional[str] = None,
                              current_date: Optional[datetime] = None) -> None:
    """Write a validation result document under `save_to`.

    The file is written to
    `<save_to>/<document['name']>/<timestamp>.<save_format>`, where
    `<timestamp>` is `current_date` formatted as `%Y%m%dT%H%M%S`.

    Parameters
    ----------
    document : dict
        Validation result document. Its `name` entry names the
        destination subfolder.
    save_to : FileSystem
        Base folder in which the document subfolder is located.
    save_format : str, optional
        File extension used for the saved file. It is interpolated as
        is, so callers are expected to pass an actual format.
    current_date : datetime, optional
        Date used for the file name. If None, the current UTC date and
        time is used and a warning is issued.
    """
    # Imported locally because the module only imports `datetime` itself.
    from datetime import timezone

    if current_date is None:
        warnings.warn(
            'Document is being saved using the current date returned by the'
            ' `datetime.utcnow()` method. Instead, prefer to explicitly pass a'
            ' `current_date` argument to `validate`.', Warning
        )
        # `datetime.utcnow()` is deprecated since Python 3.12. An aware UTC
        # "now" yields the same timestamp, as the format has no offset.
        current_date = datetime.now(timezone.utc)

    timestamp = current_date.strftime('%Y%m%dT%H%M%S')

    document_name = document['name']
    folder_path = save_to / document_name
    # Only local folders are created explicitly; other file systems are
    # written to directly.
    if isinstance(folder_path, LocalFileSystem):
        folder_path.mkdir(parents=True, exist_ok=True)

    file_path = folder_path / f'{timestamp}.{save_format}'
    print(f'Saving validation document to "{file_path!s}".')
    file_path.write_dict(document, indent=2)