# Source code for deirokay.statements.builtin.row_count

"""
Statement to check the number of rows in a scope.
"""
from typing import List

import dask.dataframe  # lazy module
import pandas  # lazy module

from deirokay._typing import DeirokayStatement
from deirokay.enums import Backend

from ..multibackend import profile, report
from .base_statement import BaseStatement


class RowCount(BaseStatement):
    """Check if the number of rows (or the number of distinct rows)
    in a scope is between a minimum and maximum value.

    The available options are:

    * `min`: The minimum number of rows. If None, no minimum is
      enforced. Default: None.
    * `max`: The maximum number of rows. If None, no maximum is
      enforced. Default: None.
    * `distinct`: If True, check the number of distinct rows instead
      of the total number of rows. Default: False.

    Providing no `min` or `max` parameters, the statement will act
    only as a logger for its statistics.

    When counting the total number of rows (`distinct=False`), this
    statement may be applied to any scope of your DataFrame, since
    every column would have the same number of rows.
    By convention, you should apply it to a scope containing
    all the columns of your DataFrame.

    To count the number of (not-)null rows, you should use the
    `not_null` statement instead. To count the number of unique rows,
    use the `unique` statement.

    Examples
    --------
    * After some historical analysis of your data, you found that the
      number of rows is always greater than or equal to 42.
      You can declare the following validation item to represent
      this rule:

    .. code-block:: json

        {
            "scope": ["foo", "bar"],
            "statements": [
                {
                    "name": "row_count",
                    "min": 42
                }
            ]
        }

    * You have a table of daily transactions from all branches of a
      company. Not all branches have transactions for every day, and
      new branches may be added at any time.
      You want to ensure that the number of branches that appears in
      your data does not vary sharply downwards (below 5% of its 7-day
      historical average), which could be a sign of failure to receive
      transactions from some branches.
      You can declare the following validation item (in YAML format)
      to check this rule:

    .. code-block:: yaml

        scope: branch_name
        statements:
          - name: row_count
            distinct: True
            min: >
              {{ 0.95 * (
                   series("transactions", 7).branch_name.row_count.distinct_rows.mean()  # noqa E501
                   | default(19, true) | float
                 )
              }}

    There are many things going on here:

    * In YAML, the ">" operator is used to collapse a multi-line
      string into a single line. In JSON you would have to put
      everything in the same line;
    * The "{{}}" braces are used to indicate that the following
      expression is a Jinja2 template.
    * The `series` function is a built-in Deirokay method used to get
      the 7-day historical validations. Further down, the `mean`
      function is used to compute the 7-day average of the
      `distinct_rows` metric returned by the `row_count` statement
      in the `branch_name` scope.
    * The "|" operator inside the Jinja2 template is used to apply a
      function to the result of the previous expression, such that in
      the end we obtain a float.
    * The `default` function is used to set a default value if the
      previous expression is `None`.
    * The `float` function is used to convert the result of the
      previous expression, which is a `numpy.float64`, to a float,
      which can be properly serialized in JSON or YAML format when
      the validation logs are generated.

    For this example to work, you will need to declare in your
    `deirokay.validate` call the `save_to` parameter, so that the
    validation logs can be saved and later used to provide historical
    analysis.

    .. code-block:: python

        from deirokay import validate

        validate(df, against=assertions, save_to='logs')

    """

    name = 'row_count'
    expected_parameters = ['min', 'max', 'distinct']
    supported_backends: List[Backend] = [Backend.PANDAS, Backend.DASK]

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

        # Cache the user-supplied options; missing bounds stay `None`,
        # which `result` treats as "no limit on that side".
        opts = self.options
        self.min = opts.get('min')
        self.max = opts.get('max')
        self.distinct = opts.get('distinct', False)

    def _report_common(self, df):
        """Return both the total and the distinct row counts of `df`.

        Both metrics are always reported, regardless of the
        `distinct` option.
        """
        return {
            'rows': len(df),
            'distinct_rows': len(df.drop_duplicates()),
        }

    @report(Backend.PANDAS)
    def _report_pandas(self, df: 'pandas.DataFrame') -> dict:
        """Report for the pandas backend (see `_report_common`)."""
        return self._report_common(df)

    @report(Backend.DASK)
    def _report_dask(self, df: 'dask.dataframe.DataFrame') -> dict:
        """Report for the Dask backend (see `_report_common`)."""
        return self._report_common(df)

    # docstr-coverage:inherited
    def result(self, report: dict) -> bool:
        # Pick the metric selected by the `distinct` option.
        metric = 'distinct_rows' if self.distinct else 'rows'
        count = report[metric]

        # Each bound is only enforced when it was provided. The checks
        # are phrased as "not within bound" so that any comparison
        # that does not hold makes the statement fail.
        if self.min is not None and not count >= self.min:
            return False
        if self.max is not None and not count <= self.max:
            return False
        return True

    @staticmethod
    def _profile_common(df):
        """Generate a `row_count` statement matching the current `df`.

        For scopes with more than one column the total number of rows
        is pinned (`min == max`); otherwise the number of distinct
        rows is pinned instead.
        """
        statement: DeirokayStatement
        has_many_columns = len(df.columns) > 1

        if has_many_columns:
            total = len(df)
            statement = {'type': 'row_count', 'min': total, 'max': total}
            return statement

        unique_total = len(df.drop_duplicates())
        statement = {
            'type': 'row_count',
            'distinct': True,
            'min': unique_total,
            'max': unique_total,
        }
        return statement

    @profile(Backend.PANDAS)
    @staticmethod
    def _profile_pandas(df: 'pandas.DataFrame') -> DeirokayStatement:
        """Profile for the pandas backend (see `_profile_common`)."""
        return RowCount._profile_common(df)

    @profile(Backend.DASK)
    @staticmethod
    def _profile_dask(df: 'dask.dataframe.DataFrame') -> DeirokayStatement:
        """Profile for the Dask backend (see `_profile_common`)."""
        return RowCount._profile_common(df)