Source code for deirokay.statements.builtin.not_null

"""
Statement to check the number of not-null rows in a scope.
"""
import dask.dataframe  # lazy module
import pandas  # lazy module

from deirokay._typing import DeirokayStatement
from deirokay.enums import Backend

from ..multibackend import profile, report
from .base_statement import BaseStatement


class NotNull(BaseStatement):
    """
    Check if the rows of a scoped DataFrame are not null, possibly
    setting boundaries for the minimum and maximum percentage of
    not-null rows.

    The available options are:

    * `at_least_%`: The minimum percentage of not-null rows.
      Default: 100.0.
    * `at_most_%`: The maximum percentage of not-null rows.
      Default: 100.0.
    * `multicolumn_logic`: The logic to use when checking for not-null
      values in multicolumn scopes (either 'any' or 'all').
      Default: 'any'.

    Be careful when using multicolumn scopes: the `any` logic
    considers a row as null only if all columns are null. The `all`
    logic considers a row as null when any of its columns is null.

    A scope with zero rows contains no null rows, so it is reported as
    0% null and 100% not-null.

    Examples
    --------
    * You want to ensure that less than 1% of the values in a column
      `foo` are null. You can declare the following validation item:

    .. code-block:: json

        {
            "scope": "foo",
            "statements": [
                {
                    "name": "not_null",
                    "at_least_%": 99.0
                }
            ]
        }

    You noticed that you imposed an unrealistic value for
    `at_least_%`, and maybe less than 10% should be a reasonable
    percentage of null values. Still, you don't want to lose track of
    that ideal <= 1% check, since you intend to improve your data
    quality in the near future.
    You may take advantage of `severity` to set different exception
    levels for different values of `at_least_%`:

    .. code-block:: json

        {
            "scope": "foo",
            "statements": [
                {
                    "name": "not_null",
                    "at_least_%": 99.0,
                    "severity": 3
                },
                {
                    "name": "not_null",
                    "at_least_%": 90.0,
                    "severity": 5
                }
            ]
        }

    This way, values between 90% and 99% will only raise a warning,
    while values below 90% will raise a validation exception
    (by default).

    * You don't tolerate any null values in a list of columns:

    .. code-block:: json

        {
            "scope": ["foo", "bar", "baz", "qux"],
            "statements": [
                {
                    "name": "not_null",
                    "multicolumn_logic": "all"
                }
            ]
        }

    """

    name = 'not_null'
    expected_parameters = ['at_least_%', 'at_most_%', 'multicolumn_logic']
    supported_backends = [Backend.PANDAS, Backend.DASK]

    def __init__(self, *args, **kwargs) -> None:
        """
        Read the statement options and validate `multicolumn_logic`.

        Raises
        ------
        AssertionError
            If `multicolumn_logic` is neither 'any' nor 'all'.
        """
        super().__init__(*args, **kwargs)

        self.at_least_perc = self.options.get('at_least_%', 100.0)
        self.at_most_perc = self.options.get('at_most_%', 100.0)
        self.multicolumn_logic = self.options.get('multicolumn_logic', 'any')

        # Raised explicitly (instead of using `assert`) so the check is
        # not stripped when Python runs with -O. The exception type is
        # kept as AssertionError for backward compatibility.
        if self.multicolumn_logic not in ('any', 'all'):
            raise AssertionError(
                "`multicolumn_logic` must be either 'any' or 'all', "
                'got {!r}.'.format(self.multicolumn_logic)
            )

    @staticmethod
    def _count_true(mask) -> int:
        """Return the number of True entries in a boolean Series."""
        total = mask.sum()
        # Dask reductions are lazy: materialize the value before
        # converting it. Pandas already returns a concrete number.
        if hasattr(total, 'compute'):
            total = total.compute()
        return int(total)

    def _report_common(self, df):
        """
        Count null and not-null rows of `df` and their percentages.

        Returns a dict with the keys `null_rows`, `null_rows_%`,
        `not_null_rows` and `not_null_rows_%`.
        """
        total_rows = len(df)
        if total_rows == 0:
            # No rows means no null rows; also avoids dividing by zero.
            return {
                'null_rows': 0,
                'null_rows_%': 0.0,
                'not_null_rows': 0,
                'not_null_rows_%': 100.0,
            }

        if self.multicolumn_logic == 'all':
            # A row is not-null only when none of its columns is null.
            not_nulls = ~df.isnull().any(axis=1)
        else:
            # A row is null only when all of its columns are null.
            not_nulls = ~df.isnull().all(axis=1)

        # A single reduction is enough: every row is either null or not.
        not_null_rows = self._count_true(not_nulls)
        null_rows = total_rows - not_null_rows
        return {
            'null_rows': null_rows,
            'null_rows_%': 100.0*null_rows/total_rows,
            'not_null_rows': not_null_rows,
            'not_null_rows_%': 100.0*not_null_rows/total_rows,
        }

    @report(Backend.PANDAS)
    def _report_pandas(self, df: 'pandas.DataFrame') -> dict:
        """Report implementation for the Pandas backend."""
        return self._report_common(df)

    @report(Backend.DASK)
    def _report_dask(self, df: 'dask.dataframe.DataFrame') -> dict:
        """Report implementation for the Dask backend."""
        return self._report_common(df)

    # docstr-coverage:inherited
    def result(self, report: dict) -> bool:
        # Valid when the not-null percentage lies within the inclusive
        # range [at_least_%, at_most_%].
        not_null_perc = report['not_null_rows_%']
        return bool(
            self.at_least_perc <= not_null_perc <= self.at_most_perc
        )

    @staticmethod
    def _profile_common(df):
        """
        Build a `not_null` statement from the data in `df`.

        `at_least_%` is set to the observed percentage of not-null
        rows, and omitted when it equals the default of 100.0.

        Raises
        ------
        NotImplementedError
            If `df` has no rows or if all of its rows are null.
        """
        statement = {
            'type': 'not_null'
        }  # type: DeirokayStatement

        total_rows = len(df)
        if total_rows == 0:
            # A percentage is undefined without rows.
            raise NotImplementedError(
                'Statement is useless when there are no rows.'
            )

        not_nulls = ~df.isnull().all(axis=1)
        not_null_rows = NotNull._count_true(not_nulls)
        at_least_perc = float(100.0*not_null_rows/total_rows)

        if at_least_perc == 0.0:
            raise NotImplementedError(
                'Statement is useless when all rows are null.'
            )
        if at_least_perc != 100.0:
            statement['at_least_%'] = at_least_perc

        return statement

    @profile(Backend.PANDAS)
    @staticmethod
    def _profile_pandas(df: 'pandas.DataFrame') -> DeirokayStatement:
        """Profile implementation for the Pandas backend."""
        return NotNull._profile_common(df)

    @profile(Backend.DASK)
    @staticmethod
    def _profile_dask(df: 'dask.dataframe.DataFrame') -> DeirokayStatement:
        """Profile implementation for the Dask backend."""
        return NotNull._profile_common(df)