# Source code for deirokay.statements.builtin.row_count

"""
Statement to check the number of rows in a scope.
"""
from typing import List

import dask.dataframe  # lazy module
import pandas  # lazy module

from deirokay._typing import DeirokayStatement
from deirokay.enums import Backend

from ..multibackend import profile, report
from .base_statement import BaseStatement


class RowCount(BaseStatement):
    """Check if the number of rows (or the number of distinct rows)
    in a scope is between a minimum and maximum value.

    The available options are:

    * `min`: The minimum number of rows. If None, no minimum is
      enforced. Default: None.
    * `max`: The maximum number of rows. If None, no maximum is
      enforced. Default: None.
    * `distinct`: If True, check the number of distinct rows instead of
      the total number of rows. Default: False.

    Providing no `min` or `max` parameters, the statement will act only
    as a logger for its statistics.

    When counting the total number of rows (`distinct=False`), this
    statement may be applied to any scope of your DataFrame, since
    every column would have the same number of rows.
    By convention, you should apply it to a scope containing all the
    columns of your DataFrame.

    To count the number of (not-)null rows, you should use the `not_null`
    statement instead.
    To count the number of unique rows, use the `unique` statement.

    Examples
    --------
    * After some historical analysis of your data, you found that the
      number of rows is always greater than or equal to 42.
      You can declare the following validation item to represent this
      rule:

    .. code-block:: json

        {
            "scope": ["foo", "bar"],
            "statements": [
                {
                    "type": "row_count",
                    "min": 42
                }
            ]
        }

    * You have a table of daily transactions from all branches of a
      company. Not all branches have transactions for every day, and
      new branches may be added at any time. You want to ensure that
      the number of branches that appears in your data does not vary
      sharply downwards (more than 5% below its 7-day historical
      average), which could be a sign of failure to receive
      transactions from some branches.
      You can declare the following validation item (in YAML format) to
      check this rule:

    .. code-block:: yaml

        scope: branch_name
        statements:
        - type: row_count
          distinct: True
          min: >
            {{ 0.95 * (
              series("transactions", 7).branch_name.row_count.distinct_rows.mean()  # noqa E501
              | default(19, true)
              | float
            ) }}

    There are many things going on here:

    * In YAML, the ">" operator is used to collapse a multi-line string
      into a single line. In JSON you would have to put everything in
      the same line;
    * The "{{}}" braces are used to indicate that the enclosed
      expression is a Jinja2 template.
    * The `series` function is a built-in Deirokay method
      used to get the 7-day historical validations.
      Further down, the `mean` function is used to
      compute the 7-day average of the `distinct_rows` metric returned
      by the `row_count` statement in the `branch_name` scope.
    * The "|" operator inside the Jinja2 template is used to apply
      a function to the result of the previous expression, such that in
      the end we obtain a float.
    * The `default` function is used to set a default value if the
      previous expression is undefined or, because its second argument
      is `true`, evaluates to false (e.g. `None`).
    * The `float` function is used to convert the result of the
      previous expression, which is a `numpy.float64`, to a float,
      which can be properly serialized in JSON or YAML format when the
      validation logs are generated.

    For this example to work, you will need to declare in your
    `deirokay.validate` call the `save_to` parameter, so that the
    validation logs can be saved and later used to provide historical
    analysis.

    .. code-block:: python

        from deirokay import validate

        validate(df, against=assertions, save_to='logs')

    """

    name = 'row_count'
    expected_parameters = ['min', 'max', 'distinct']
    supported_backends: List[Backend] = [Backend.PANDAS, Backend.DASK]

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

        # A bound left as None is simply not enforced by `result`.
        self.min = self.options.get('min', None)
        self.max = self.options.get('max', None)
        self.distinct = self.options.get('distinct', False)

    def _report_common(self, df):
        """Collect both row metrics for the scoped DataFrame.

        Both counts are always reported, regardless of the `distinct`
        option, so either metric is available in the generated report.

        Parameters
        ----------
        df
            The DataFrame restricted to the statement scope. Only
            `len()` and `drop_duplicates()` are used on it.

        Returns
        -------
        dict
            `rows` (total row count) and `distinct_rows` (row count
            after dropping duplicated rows).
        """
        row_count = len(df)
        distinct_count = len(df.drop_duplicates())

        return {
            'rows': row_count,
            'distinct_rows': distinct_count,
        }

    @report(Backend.PANDAS)
    def _report_pandas(self, df: 'pandas.DataFrame') -> dict:
        """Report row metrics for the Pandas backend."""
        return self._report_common(df)

    @report(Backend.DASK)
    def _report_dask(self, df: 'dask.dataframe.DataFrame') -> dict:
        """Report row metrics for the Dask backend."""
        return self._report_common(df)

    # docstr-coverage:inherited
    def result(self, report: dict) -> bool:
        # Pick the metric selected by the `distinct` option.
        if self.distinct:
            count = report['distinct_rows']
        else:
            count = report['rows']

        # Each bound is only enforced when it was provided. The
        # `not count >= ...` form is kept on purpose: a bound that
        # cannot be compared as satisfied makes the statement fail.
        if self.min is not None:
            if not count >= self.min:
                return False
        if self.max is not None:
            if not count <= self.max:
                return False
        return True

    @staticmethod
    def _profile_common(df):
        """Generate a `row_count` statement matching the given data.

        For a scope with more than one column, the statement pins the
        total number of rows. Otherwise it pins the number of distinct
        rows (`distinct: True`). In both cases `min` and `max` are set
        to the observed count.

        Parameters
        ----------
        df
            The DataFrame restricted to the scope being profiled.

        Returns
        -------
        DeirokayStatement
            The statement dict, keyed by `type`.
        """
        statement: DeirokayStatement

        if len(df.columns) > 1:
            count = len(df)
            statement = {
                'type': 'row_count',
                'min': count,
                'max': count
            }
        else:
            count = len(df.drop_duplicates())
            statement = {
                'type': 'row_count',
                'distinct': True,
                'min': count,
                'max': count
            }
        return statement

    @profile(Backend.PANDAS)
    @staticmethod
    def _profile_pandas(df: 'pandas.DataFrame') -> DeirokayStatement:
        """Profile a Pandas DataFrame into a `row_count` statement."""
        return RowCount._profile_common(df)

    @profile(Backend.DASK)
    @staticmethod
    def _profile_dask(df: 'dask.dataframe.DataFrame') -> DeirokayStatement:
        """Profile a Dask DataFrame into a `row_count` statement."""
        return RowCount._profile_common(df)