Source code for schemaflow.pipe

import schemaflow.types
import schemaflow.ops
import schemaflow.exceptions as _exceptions


def _check_schema_keys(schema: dict, expected: dict, error_location: list, raise_: bool=False):
    schema_keys = set(schema.keys())
    required_keys = set(expected.keys())

    exceptions = []
    if len(required_keys - schema_keys):
        exception = _exceptions.WrongSchema(required_keys, schema_keys, error_location)
        if raise_:
            raise exception
        exceptions.append(exception)
    return exceptions


def _check_schema_types(schema: dict, expected: dict, error_location: str, raise_: bool=False):
    exceptions = []
    for key, expected_type in expected.items():
        if key in schema:
            try:
                new_exceptions = expected_type.check_schema(schema[key], raise_)
            except _exceptions.SchemaFlowError as e:
                e.locations.append(error_location % key)
                raise e
            for exception in new_exceptions:
                exception.locations.append(error_location % key)
            exceptions += new_exceptions
    return exceptions


[docs]class Pipe: """ A Pipe represents a stateful data transformation. Data in this context consists of a Python dictionary whose each value is a type with some representation of data, either in-memory (e.g. ``float``, ``pandas.DataFrame``) of remote (e.g. ``pyspark.sql.DataFrame``, ``sqlalchemy``). A :class:`Pipe` is defined by: - a method :meth:`transform` that: - uses the keys :attr:`transform_modifies` from ``data`` - uses the :attr:`state` - modifies the keys in :attr:`transform_modifies` in ``data`` - a method :meth:`fit` that: - uses (training) keys :attr:`fit_requires` from ``data`` - uses (passed) :attr:`fit_parameters` - modifies the keys :attr:`fitted_parameters` in :attr:`state` - a set of :attr:`requirements` (a set of package names, e.g. ``{'pandas'}``) of the transformation All :attr:`transform_modifies` and :attr:`fit_requires` have a :class:`~schemaflow.types.Type` that can be used to check that the Pipe's input is consistent, with - :meth:`check_fit` - :meth:`check_transform` The existence of the requirements can be checked using - :meth:`check_requirements` The rational is that you can run ``check_*`` with access only to the data's schema. This is specially important when the schemaflow is an expensive operation. """ #: set of packages required by the Pipe. requirements = set() #: the data schema required in :meth:`~fit`; #: a dictionary ``str``: :class:`~schemaflow.types.Type`. fit_requires = {} #: the data schema required in :meth:`~transform`; #: a dictionary ``str``: :class:`~schemaflow.type.Type`. transform_requires = {} #: parameters' schema passed to :meth:`~fit` fit_parameters = {} #: schema of the parameters assigned in :meth:`~fit` fitted_parameters = {} #: type and key of :meth:`~transform` transform_modifies = {} def __init__(self): self.state = {} #: A dictionary with the states of the Pipe. Use [] operator to access and modify it. def __setitem__(self, key, value): self.state.__setitem__(key, value) def __getitem__(self, key): if key not in self.state: raise _exceptions.NotFittedError(self, key) return self.state.__getitem__(key) @property def check_requirements(self): """ Checks for requirements. :return: a list of exceptions with missing requirements """ exceptions = [] for requirement in self.requirements: if not schemaflow.types._requirement_fulfilled(requirement): exceptions.append(_exceptions.MissingRequirement(self.__class__, requirement)) all_types = list(self.fit_requires.values()) + list(self.transform_requires.values()) + \ list(self.fit_parameters.values()) + list(self.fitted_parameters.values()) for value_type in all_types: if isinstance(value_type, schemaflow.types.Type): for requirement in value_type.requirements: if not schemaflow.types._requirement_fulfilled(requirement): exceptions.append(_exceptions.MissingRequirement(value_type.__class__, requirement)) return exceptions def check_transform_modifies(self, input_schema: dict, output_schema: dict): expected_schema = self._transform_schema(input_schema.copy()) expected_schema = {key: schemaflow.types._get_type(value) for key, value in expected_schema.items()} exceptions = _check_schema_keys(output_schema, expected_schema, ['in modified data from transform']) exceptions += _check_schema_types(output_schema, expected_schema, 'in result \'%s\' of modified data from transform') return exceptions
[docs] def check_fit(self, data: dict, parameters: dict=None, raise_: bool=False): """ Checks that a given data has a valid schema to be used in :meth:`fit`. :param data: a dictionary with either ``(str, Type)`` or ``(str, instance)`` :param parameters: a dictionary with either ``(str, Type)`` or ``(str, instance)`` :param raise_: whether it should raise the first found exception or list them all (default: list them) :return: a list of (subclasses of) :class:`~pipeline.exceptions.PipelineError` with all failed checks. """ if parameters is None: parameters = {} exceptions = _check_schema_keys(data, self.fit_requires, ['in fit'], raise_) expected_schema = {key: schemaflow.types._get_type(value) for key, value in self.fit_requires.items()} exceptions += _check_schema_types(data, expected_schema, 'in argument \'%s\' of fit', raise_) # check that parameters are correct parameters_keys = set(parameters.keys()) expected_parameters = set(self.fit_parameters.keys()) if parameters_keys != expected_parameters: exception = _exceptions.WrongParameter( expected_parameters, parameters_keys, ['in fit']) if raise_: raise exception exceptions.append(exception) expected_schema = {key: schemaflow.types._get_type(value) for key, value in self.fit_parameters.items()} exceptions += _check_schema_types(parameters, expected_schema, 'in parameter \'%s\' of fit', raise_) return exceptions
[docs] def check_transform(self, data: dict, raise_: bool=False): """ Checks that a given data has a valid schema to be used in :meth:`transform`. :param data: a dictionary with either ``(str, Type)`` or ``(str, instance)`` :param raise_: whether it should raise the first found exception or list them all (default: list them) :return: a list of (subclasses of) :class:`schemaflow.exceptions.SchemaFlowError` with all missing arguments. """ exceptions = _check_schema_keys(data, self.transform_requires, ['in transform'], raise_) expected_schema = {key: schemaflow.types._get_type(value) for key, value in self.transform_requires.items()} exceptions += _check_schema_types(data, expected_schema, 'in argument \'%s\' of transform', raise_) return exceptions
def _transform_schema(self, schema: dict): for key, value in self.transform_modifies.items(): if not isinstance(value, list): value = [value] for transformation in value: if isinstance(transformation, schemaflow.ops.Operation): schema = transformation.transform(key, schema) else: schema[key] = transformation return schema
[docs] def transform_schema(self, schema: dict): """ Transforms the ``schema`` into a new schema based on :attr:`~transform_modifies`. :param schema: a dictionary of pairs ``str`` :class:`~schemaflow.types.Type`. :return: the new schema. """ self.check_transform(schema, True) return self._transform_schema(schema)
[docs] def fit(self, data: dict, parameters: dict=None): """ Modifies the instance's :attr:`state`. :param data: a dictionary of pairs ``(str, object)``. :param parameters: a dictionary of pairs ``(str, object)``. :return: ``None`` """
[docs] def transform(self, data: dict): """ Modifies the data keys identified in :attr:`transform_modifies`. :param data: a dictionary of pairs ``(str, object)``. :return: the modified data """ return data