Source code for deirokay.parser.reader.pandas_reader

from posixpath import splitext
from typing import Any, Dict, List, Union

import pandas  # lazy module

from deirokay.fs import fs_factory


def _select_requested_columns(result: Any, columns: List[str]) -> Any:
    """Narrow and reorder a reader's result to `columns` when that is safe.

    Some readers accept the column filter but do not fully honour it:
    `pandas.read_csv` ignores the element order of `usecols`, and
    `pandas.read_sql` ignores `columns` when given a query. This helper
    makes those results match what the DataFrame-input branch of `read`
    returns.

    Parameters
    ----------
    result : Any
        Whatever the pandas reader returned.
    columns : List[str]
        Requested columns, in the requested order.

    Returns
    -------
    Any
        `result[columns].copy()` when `result` is a DataFrame containing
        every requested column in a different layout; otherwise `result`
        unchanged.
    """
    # Non-DataFrame results (e.g. the iterator returned when `chunksize`
    # is passed) cannot be indexed by a column list: hand them back as-is.
    if not isinstance(result, pandas.DataFrame):
        return result
    # Already in the requested layout: avoid a needless copy.
    if list(result.columns) == list(columns):
        return result
    # Only select when every requested column is present. A column may be
    # legitimately absent (e.g. consumed by `index_col`); in that case the
    # reader's result is kept untouched instead of raising KeyError.
    if all(column in result.columns for column in columns):
        return result[columns].copy()
    return result


def read(data: Union[str, 'pandas.DataFrame'],
         columns: List[str],
         sql: bool = False,
         **kwargs) -> 'pandas.DataFrame':
    """Infer the file type by its extension and call the proper
    `pandas` method to parse it.

    Parameters
    ----------
    data : Union[str, pandas.DataFrame]
        Path to file or SQL query, or DataFrame object.
    columns : List[str]
        List of columns to be parsed.
    sql : bool, optional
        Whether or not `data` should be interpreted as a path to a file
        or a SQL query.
    **kwargs : dict
        Arguments to be passed to `pandas` methods when reading.

    Returns
    -------
    pandas.DataFrame
        The pandas DataFrame, restricted to `columns` (in that order)
        whenever all of them are present in what the reader returned.

    Raises
    ------
    TypeError
        If `data` is neither a DataFrame nor a string, or if the file
        extension has no matching `pandas.read_*` function.
    """
    default_kwargs: Dict[str, Any] = {}

    if isinstance(data, pandas.DataFrame):
        return data[columns].copy()
    if not isinstance(data, str):
        raise TypeError(f'Unexpected type for `data` ({data.__class__})')

    if sql:
        read_ = pandas.read_sql
    else:
        file_extension = splitext(data)[1].lstrip('.').lower()

        if file_extension == 'sql':
            # A `.sql` path holds the query itself: load its content and
            # start over in SQL mode.
            query = fs_factory(data).read()
            return read(query, columns, sql=True, **kwargs)

        # pandas has a single `read_excel` for both Excel extensions.
        if file_extension in ('xls', 'xlsx'):
            file_extension = 'excel'

        if file_extension == 'csv':
            # CSV defaults; caller-supplied kwargs override them below.
            default_kwargs.update({
                'dtype': str,
                'skipinitialspace': True,
            })

        read_ = getattr(pandas, f'read_{file_extension}', None)
        if read_ is None:
            raise TypeError(f'File type "{file_extension}" not supported')

    default_kwargs.update(kwargs)

    # pandas readers do not agree on the name of the column-filter
    # argument: try `columns` first, then `usecols`.
    for keyword in ('columns', 'usecols'):
        try:
            result = read_(data, **{keyword: columns}, **default_kwargs)
        except TypeError as e:
            # Only a complaint about the keyword itself means "try the
            # next spelling"; any other TypeError is a real failure.
            if keyword not in str(e):
                raise
        else:
            return _select_requested_columns(result, columns)

    # give up, read everything, filter columns later
    return read_(data, **default_kwargs)[columns]