Source code for deirokay.parser.treaters.builtin.boolean_treater

"""
Classes and functions to treat column data types according to
Deirokay data types.
"""

from typing import Iterable, List, Optional, Union

import dask.dataframe  # lazy module
import numpy  # lazy module
import pandas  # lazy module

from deirokay._typing import DeirokaySerializedSeries
from deirokay.enums import Backend, DTypes

from ..multibackend import serialize, treat
from .validator import Validator


[docs]class BooleanTreater(Validator): """Treater for boolean-like variables Parameters ---------- truthies : List[str], optional List of strings to be considered as True, by default ['true', 'True'] falsies : List[str], optional List of strings to be considered as False, by default ['false', 'False'] ignore_case : bool, optional Ignore case when evaluating list of values, by default False default_value : Optional[bool], optional Value to replace null values (None, pandas.NA or numpy,NaN). By default None. """ supported_backends = [Backend.PANDAS, Backend.DASK] supported_dtype = DTypes.BOOLEAN supported_primitives = [bool] def __init__(self, truthies: List[str] = ['true', 'True'], falsies: List[str] = ['false', 'False'], ignore_case: bool = False, default_value: Optional[bool] = None, **kwargs): super().__init__(**kwargs) assert default_value in (True, False, None) self.ignore_case = ignore_case self.default_value = default_value if self.ignore_case: self.truthies = set(truthy.lower() for truthy in truthies) self.falsies = set(falsy.lower() for falsy in falsies) else: self.truthies = set(truthies) self.falsies = set(falsies) if self.truthies & self.falsies: raise ValueError('Truthies and Falsies sets should be' ' disjoint sets.') def _evaluate(self, value: Union[bool, str, None]) -> Union[bool, None]: if value is True: return True if value is False: return False if value is None or value is numpy.nan or value is pandas.NA: return self.default_value _value = value.lower() if self.ignore_case else value if _value in self.truthies: return True if _value in self.falsies: return False raise ValueError(f'Unexpected boolean value: "{value}"' f' ({value.__class__})\n' f'Expected values: {self.truthies | self.falsies}') @treat(Backend.PANDAS) def _treat_pandas(self, series: Iterable) -> 'pandas.Series': series = super()._treat_pandas(series) series = series.apply(self._evaluate).astype('boolean') # Validate again super()._treat_pandas(series) return series @treat(Backend.DASK) def _treat_dask( self, series: Iterable ) -> 'dask.dataframe.Series': series = super()._treat_dask(series) series = series.apply( self._evaluate, meta=(series.name, 'object') ).astype('boolean') # Validate again super()._treat_dask(series) return series @staticmethod def _serialize_common(series): def _convert(item): if item is pandas.NA: return None return bool(item) return { 'values': [_convert(item) for item in series], 'parser': { 'dtype': BooleanTreater.supported_dtype.value } } @serialize(Backend.PANDAS) @staticmethod def _serialize_pandas( series: 'pandas.Series' ) -> DeirokaySerializedSeries: return BooleanTreater._serialize_common(series) @serialize(Backend.DASK) @staticmethod def _serialize_dask( series: 'dask.dataframe.Series' ) -> DeirokaySerializedSeries: return BooleanTreater._serialize_common(series)