Module datasae.converter
Converter library.
A class called Config
that represents a configuration object for reading
data source configurations from a JSON or YAML file.
Sub-modules
datasae.converter.gsheet
-
Google Spreadsheet library.
datasae.converter.local
-
local library.
datasae.converter.s3
-
s3 library.
datasae.converter.sql
-
sql library.
Classes
class CaseInsensitiveEnum (value, names=None, *, module=None, qualname=None, type=None, start=1)
-
A case-insensitive enumeration class.
A case-insensitive enumeration class that allows for case-insensitive comparison of enum values and provides a case-insensitive lookup of enum members.
Expand source code
class CaseInsensitiveEnum(str, Enum): """ A case-insensitive enumeration class. A case-insensitive enumeration class that allows for case-insensitive comparison of enum values and provides a case-insensitive lookup of enum members. """ def __eq__(self, __value: str) -> bool: """ __eq__ methods. Overrides the __eq__ method to perform case-insensitive comparison of enum values. Args: __value (str): The value to compare with the enum value. Returns: bool: True if the values are equal (case-insensitive), False otherwise. """ return super().__eq__(__value.lower() if __value else __value) @classmethod def _missing_(cls, value: str) -> CaseInsensitiveEnum: """ _missing_ method. Overrides the _missing_ method to perform case-insensitive lookup of enum members. Args: value (str): The value to lookup in the enum members. Returns: CaseInsensitiveEnum: The enum member with the matching value (case- insensitive). """ value = value.lower() if value else value for member in cls: if member.value.lower() == value: return member
Ancestors
- builtins.str
- enum.Enum
Subclasses
class Config (file_path: str)
-
A class that represents a configuration object.
Args
file_path
:str
- The source path of the .json or .yaml file.
Example Usage: config = Config("data.json") data_source = config("source1") print(data_source.connection)
Attributes
__file
:str
- The source path of the file.
__file_type
:str
- The type of the file.
Methods
call(name): Returns a data source configuration from a file.
Expand source code
class Config: """ A class that represents a configuration object. Args: file_path (str): The source path of the .json or .yaml file. Example Usage: config = Config("data.json") data_source = config("source1") print(data_source.connection) Attributes: __file (str): The source path of the file. __file_type (str): The type of the file. Methods: __call__(name): Returns a data source configuration from a file. """ file_path: str @staticmethod def config(file_path: str) -> dict: """ Config.config static method. Reads a file and returns its contents as a dictionary. Args: file_path (str): Source path of your .json or .yaml file. Returns: dict: The contents of the file as a dictionary. """ file: Path = Path(file_path) file_type: FileType = FileType(file.suffix) data: dict = {} with open(file) as file_obj: if file_type is FileType.JSON: data = json.loads(file_obj.read()) elif file_type in (FileType.YAML, FileType.YML): data = yaml.safe_load(file_obj) return data def __call__(self, name: str) -> DataSource: """ Return data source configuration from file. Args: name (str): Name of data source. Returns: DataSource: An instance class of data source containing configuration properties. """ data_source: dict = { 'name': name, 'file_path': self.file_path, **{ key: value for key, value in Config.config( self.file_path ).get(name, {}).items() if key != 'checker' } } data_source_type: str = data_source.pop('type') if data_source_type.lower() == 'local': try: from .local import Local except ModuleNotFoundError: # pragma: no cover logging.error( 'Please run this on your terminal:' ) # pragma: no cover logging.error( "pip install 'DataSae[converter]'" ) # pragma: no cover raise # pragma: no cover data_source_type = Local elif data_source_type.lower() == 'gsheet': try: from .gsheet import GSheet except ModuleNotFoundError: # pragma: no cover logging.error( 'Please run this on your terminal:' ) # pragma: no cover logging.error( "pip install 'DataSae[converter,gsheet]'" ) # pragma: no cover raise # pragma: no cover data_source_type = GSheet elif data_source_type.lower() == 's3': try: from .s3 import S3 except ModuleNotFoundError: # pragma: no cover logging.error( 'Please run this on your terminal:' ) # pragma: no cover logging.error( "pip install 'DataSae[converter,s3]'" ) # pragma: no cover raise # pragma: no cover data_source_type = S3 elif data_source_type.lower() == 'sql': try: from .sql import Sql except ModuleNotFoundError: # pragma: no cover logging.error( 'Please run this on your terminal:' ) # pragma: no cover logging.error( "pip install 'DataSae[converter,sql]'" ) # pragma: no cover raise # pragma: no cover data_source_type = Sql else: try: # Dynamic instantiation from string name of a class in # dynamically imported module? # https://stackoverflow.com/questions/4821104/dynamic-instantiation-from-string-name-of-a-class-in-dynamically-imported-module data_source_type = locate(data_source_type) except ModuleNotFoundError: # pragma: no cover logging.error( 'Please run this on your terminal:' ) # pragma: no cover logging.error( "pip install 'DataSae[converter,gsheet,s3,sql]'" ) # pragma: no cover raise # pragma: no cover return data_source_type(**data_source) @property def checker(self) -> dict[str, list[dict]]: """ Checker is instance's attribute. Creates all of checker result based on the configuration provided in the checker section of the data source's configuration file. """ return { name: self(name).checker for name in self.config(self.file_path).keys() }
Class variables
var file_path : str
Static methods
def config(file_path: str) ‑> dict
-
Config.config static method.
Reads a file and returns its contents as a dictionary.
Args
file_path
:str
- Source path of your .json or .yaml file.
Returns
dict
- The contents of the file as a dictionary.
Instance variables
prop checker : dict[str, list[dict]]
-
Checker is instance's attribute.
Creates all of checker result based on the configuration provided in the checker section of the data source's configuration file.
Expand source code
@property def checker(self) -> dict[str, list[dict]]: """ Checker is instance's attribute. Creates all of checker result based on the configuration provided in the checker section of the data source's configuration file. """ return { name: self(name).checker for name in self.config(self.file_path).keys() }
class DataSource (name: str, file_path: str)
-
DataSource class.
A class that converts data of different file types into a Pandas DataFrame.
Expand source code
class DataSource: """ DataSource class. A class that converts data of different file types into a Pandas DataFrame. """ name: str file_path: str @property def checker(self) -> list[dict]: """ Checker is instance's attribute. Creates a list of checker result based on the configuration provided in the checker section of the data source's configuration file. """ checker_list: list[dict] = Config.config( self.file_path )[self.name].get('checker', []) for checker in checker_list: data: pd.DataFrame = self(**{ key: value for key, value in checker.items() if key != 'column' }) for column_name, data_type_list in checker['column'].items(): for data_type, rules in data_type_list.items(): try: check_data: Any = { 'boolean': Boolean, 'float': Float, 'integer': Integer, 'string': String, 'timestamp': Timestamp }.get( data_type.lower(), locate(data_type) )(data) except ModuleNotFoundError: # pragma: no cover logging.error( 'Please run this on your terminal:' ) # pragma: no cover logging.error( "pip install 'DataSae[converter]'" ) # pragma: no cover raise # pragma: no cover for method_name, params in rules.items(): method = getattr(check_data, method_name) rules[method_name] = dict( params=params, result=method(**params, column=column_name) if isinstance(params, dict) else method( *( params if isinstance(params, list) else ([params] if params else []) ), column=column_name ) ) return checker_list @property def connection(self) -> dict: """ Return connection to data source. Returns: dict: Key-value parameters for connection to datasource. """ return { key: value for key, value in self.__dict__.items() if key not in DataSource.__annotations__.keys() } def __call__( self, file_type: FileType, data: bytes | str, *args, **kwargs ) -> pd.DataFrame | bytes: """ __call__ method. Converter from various file type into Pandas DataFrame. Args: file_type (FileType): _description_ data (bytes | str): Data's bytes or sql query needed convert to dataframe. Returns: DataFrame | bytes: Pandas DataFrame or bytes if file type not support. """ if file_type in list(FileType): func: Callable = None if file_type is FileType.CSV: func = pd.read_csv elif file_type is FileType.JSON: func = pd.read_json elif file_type is FileType.PARQUET: func = pd.read_parquet elif file_type is FileType.SQL: func = pd.read_sql_query elif file_type is FileType.XLSX: func = pd.read_excel if func: with warnings.catch_warnings(record=True): warnings.simplefilter('always') if file_type is FileType.SQL: data = func(data, self.connection, *args, **kwargs) else: data = func( StringIO(data.decode()) if file_type in (FileType.CSV, FileType.JSON) else BytesIO(data), *args, **kwargs ) return data
Subclasses
Class variables
var file_path : str
var name : str
Instance variables
prop checker : list[dict]
-
Checker is instance's attribute.
Creates a list of checker result based on the configuration provided in the checker section of the data source's configuration file.
Expand source code
@property def checker(self) -> list[dict]: """ Checker is instance's attribute. Creates a list of checker result based on the configuration provided in the checker section of the data source's configuration file. """ checker_list: list[dict] = Config.config( self.file_path )[self.name].get('checker', []) for checker in checker_list: data: pd.DataFrame = self(**{ key: value for key, value in checker.items() if key != 'column' }) for column_name, data_type_list in checker['column'].items(): for data_type, rules in data_type_list.items(): try: check_data: Any = { 'boolean': Boolean, 'float': Float, 'integer': Integer, 'string': String, 'timestamp': Timestamp }.get( data_type.lower(), locate(data_type) )(data) except ModuleNotFoundError: # pragma: no cover logging.error( 'Please run this on your terminal:' ) # pragma: no cover logging.error( "pip install 'DataSae[converter]'" ) # pragma: no cover raise # pragma: no cover for method_name, params in rules.items(): method = getattr(check_data, method_name) rules[method_name] = dict( params=params, result=method(**params, column=column_name) if isinstance(params, dict) else method( *( params if isinstance(params, list) else ([params] if params else []) ), column=column_name ) ) return checker_list
prop connection : dict
-
Return connection to data source.
Returns
dict
- Key-value parameters for connection to datasource.
Expand source code
@property def connection(self) -> dict: """ Return connection to data source. Returns: dict: Key-value parameters for connection to datasource. """ return { key: value for key, value in self.__dict__.items() if key not in DataSource.__annotations__.keys() }
class FileType (value, names=None, *, module=None, qualname=None, type=None, start=1)
-
FileType enumeration.
Represents different types of file formats with case-insensitive comparison and lookup of enum values.
Expand source code
class FileType(CaseInsensitiveEnum): """ FileType enumeration. Represents different types of file formats with case-insensitive comparison and lookup of enum values. """ CSV = '.csv' JSON = '.json' PARQUET = '.parquet' SQL = '.sql' YAML = '.yaml' YML = '.yml' XLSX = '.xlsx'
Ancestors
- CaseInsensitiveEnum
- builtins.str
- enum.Enum
Class variables
var CSV
var JSON
var PARQUET
var SQL
var XLSX
var YAML
var YML