Source code for schemaflow.types

import datetime
import importlib.util

import schemaflow.exceptions as _exceptions


def _requirement_fulfilled(requirement: str):
    """
    Returns whether a requirement is fulfilled.

    :return: bool
    """
    return importlib.util.find_spec(requirement) is not None


def _all_subclasses(cls):
    return set(cls.__subclasses__()).union(
        [s for c in cls.__subclasses__() for s in _all_subclasses(c)])


def infer_schema(data: dict):
    subclasses = list(subclass for subclass in _all_subclasses(Type)
                      if subclass.requirements_fulfilled and not subclass.__name__.startswith('_'))

    schema = {}
    for key, value in data.items():
        for subclass in subclasses:
            if isinstance(value, subclass.base_type()):
                schema[key] = subclass.infer(value)
                break
        else:
            schema[key] = type(value)
    return schema


def _get_type(instance_type):
    if not isinstance(instance_type, Type):
        instance_type = _LiteralType(instance_type)
    return instance_type


[docs]class Type:
    """
    The base type of all types. Used to declare new types to be used in :class:`schemaflow.pipe.Pipe`.

    The class attribute :attr:`requirements` (a set of strings) is used to define if using this type has
    package requirements (e.g. `numpy`).
    """
    requirements = {}  #: set of packages required for this type to be usable.

[docs]    @classmethod
    def base_type(cls):
        """
        A class property that returns the underlying type of this Type.
        :return:
        """
        raise NotImplementedError

    @classmethod
    def infer(cls, instance):
        raise NotImplementedError

[docs]    @classmethod
    def requirements_fulfilled(cls):
        """
        Returns whether this Type has its requirements fulfilled.

        :return: bool
        """
        return all(_requirement_fulfilled(requirement) for requirement in cls.requirements)

    def _check_as_instance(self, instance: object, raise_: bool):
        raise NotImplementedError

    def _check_as_type(self, instance, raise_: bool):
        if type(instance) != type(self):
            exception = _exceptions.WrongType(instance, self)
            if raise_:
                raise exception
            return [exception]
        return []

[docs]    def check_schema(self, instance: object, raise_: bool=False):
        """
        Checks that the instance has the correct type and schema (composite types).

        :param instance: a datum in either its representation form or on its schema form.
        :param raise_:
        :return: a list of exceptions
        """
        if isinstance(instance, Type):
            return self._check_as_type(instance, raise_)
        else:
            return self._check_as_instance(instance, raise_)

    def __eq__(self, other):
        return isinstance(other, self.__class__) and self.__dict__ == other.__dict__


class _LiteralType(Type):
    """
    A :class:`Type` that wraps literal types (e.g. ``float``). Used internally only.
    """
    def __init__(self, base_type):
        assert not isinstance(base_type, Type)
        assert not isinstance(base_type, (list, tuple))
        self._base_type = base_type

    def __repr__(self):
        return 'L(%s)' % self._base_type

    @property
    def base_type(self):
        return self._base_type

    def _check_as_instance(self, instance: object, raise_: bool):
        if not isinstance(instance, self._base_type):
            exception = _exceptions.WrongType(self._base_type, type(instance))
            if raise_:
                raise exception
            return [exception]
        return []


class _DataFrame(Type):
    """
    Abstract schemaflow representation of a DataFrame. See subclasses for Pandas and PySpark.
    """
    def __init__(self, schema: dict):
        """
        :param schema: dictionary of `(column_name, type)`.
        """
        self.schema = schema.copy()
        for column, base_type in self.schema.items():
            if not isinstance(base_type, Type):
                self.schema[column] = _LiteralType(base_type)

    def __getitem__(self, key):
        return self.schema[key]

    def __setitem__(self, key, value):
        self.schema[key] = value

    def __delitem__(self, key):
        del self.schema[key]

    @classmethod
    def infer(cls, instance):
        assert isinstance(instance, cls.base_type())
        return cls(schema=cls._get_schema(instance))

    @staticmethod
    def _get_schema(instance):
        """
        Return the DataFrame's schema from an instance
        """
        raise NotImplementedError

    def _check_schema(self, schema, raise_: bool):
        exceptions = []
        for column in self.schema:
            if column not in schema:
                exception = _exceptions.WrongSchema(column, set(schema.keys()))
                if raise_:
                    raise exception
                exceptions.append(exception)
            else:
                column_type = schema[column].base_type
                expected_type = self.schema[column].base_type
                if expected_type != column_type:
                    exception = _exceptions.WrongType(
                        expected_type, column_type, 'column \'%s\'' % column)
                    if raise_:
                        raise exception
                    exceptions.append(exception)
        return exceptions

    def _check_as_type(self, instance, raise_: bool):
        exceptions = super()._check_as_type(instance, raise_)
        if not exceptions:
            exceptions += self._check_schema(instance.schema, raise_)
        return exceptions

    def _check_as_instance(self, instance: object, raise_: bool):
        if not isinstance(instance, self.base_type()):
            exception = _exceptions.WrongType(self.base_type(), type(instance))
            if raise_:
                raise exception
            return [exception]
        return self._check_schema(self._get_schema(instance), raise_)

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__, self.schema)


[docs]class PySparkDataFrame(_DataFrame):
    """
    Representation of a pyspark.sql.DataFrame. Requires ``pyspark``.
    """
    requirements = {'pyspark'}

[docs]    @classmethod
    def base_type(cls):
        import pyspark.sql
        return pyspark.sql.DataFrame

    @staticmethod
    def _get_schema(instance):
        import pyspark.sql.types
        import numpy
        mapping = {
            pyspark.sql.types.LongType: _LiteralType(int),
            pyspark.sql.types.DoubleType: _LiteralType(float),
            pyspark.sql.types.BooleanType: _LiteralType(bool),
            pyspark.sql.types.StringType: _LiteralType(numpy.dtype('O')),
            pyspark.sql.types.DateType: _LiteralType(datetime.date),
            pyspark.sql.types.TimestampType: _LiteralType(datetime.datetime),
        }
        return dict((f.name, mapping[type(f.dataType)]) for f in instance.schema.fields)


[docs]class PandasDataFrame(_DataFrame):
    """
    Representation of a pandas.DataFrame. Requires ``pandas``.
    """
    requirements = {'pandas'}

[docs]    @classmethod
    def base_type(cls):
        import pandas
        return pandas.DataFrame

    @staticmethod
    def _get_schema(instance):
        return dict((column, _get_type(column_dtype)) for column, column_dtype in instance.dtypes.items())


class _Container(Type):

    @classmethod
    def base_type(cls):
        return list

    def __init__(self, items_type):
        if not isinstance(items_type, Type):
            items_type = _LiteralType(items_type)
        self._items_type = items_type

    @classmethod
    def infer(cls, instance):
        assert isinstance(instance, cls.base_type())

        inferred_items_type = object
        if len(instance):
            inferred_items_type = type(instance[0])
            if any(type(item) != inferred_items_type for item in instance):
                inferred_items_type = object

        return cls(inferred_items_type)

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__, self._items_type.base_type.__name__)

    def _check_as_type(self, instance, raise_: bool):
        exceptions = super()._check_as_type(instance, raise_)
        if not exceptions and self._items_type.base_type != instance._items_type.base_type:
            exception = _exceptions.WrongType(self, instance)
            if raise_:
                raise exception
            exceptions += [exception]
        return exceptions

    def _check_as_instance(self, instance: object, raise_: bool):
        if not isinstance(instance, self.base_type()):
            exception = _exceptions.WrongType(self.base_type(), type(instance))
            if raise_:
                raise exception
            return [exception]

        exceptions = []
        if not exceptions:
            for i, item in enumerate(instance):
                exceptions += self._items_type.check_schema(item, raise_)
        return exceptions


[docs]class List(_Container):
    pass


[docs]class Tuple(_Container):
[docs]    @classmethod
    def base_type(cls):
        return tuple


[docs]class Array(_Container):
    """
    Representation of a numpy.array. Requires ``numpy``.
    """
    requirements = {'numpy'}

    def __init__(self, items_type: type, shape=None):
        import numpy
        assert isinstance(items_type, numpy.dtype) or issubclass(items_type, (numpy.generic, float, int, bool))
        if items_type == float:
            items_type = numpy.float64
        super().__init__(_LiteralType(items_type))
        assert isinstance(shape, (type(None), tuple))
        self.shape = shape

    def __repr__(self):
        return '%s(%s, %s)' % (self.__class__.__name__, self._items_type.base_type, self.shape)

    @classmethod
    def infer(cls, instance):
        assert isinstance(instance, cls.base_type())

        return cls(instance.dtype, instance.shape)

    def _check_as_type(self, instance, raise_: bool):
        exceptions = super()._check_as_type(instance, raise_)
        if not exceptions:
            if not self._is_valid_shape(instance.shape):
                exception = _exceptions.WrongShape(self.shape, instance.shape)
                if raise_:
                    raise exception
                exceptions.append(exception)
        return exceptions

    def _check_as_instance(self, instance: object, raise_: bool):
        if not isinstance(instance, self.base_type()):
            exception = _exceptions.WrongType(self.base_type(), type(instance))
            if raise_:
                raise exception
            return [exception]

        exceptions = []
        assert hasattr(instance, 'dtype')
        if instance.dtype == self._items_type.base_type and hasattr(instance, 'shape'):
            if not self._is_valid_shape(instance.shape):
                exception = _exceptions.WrongShape(self.shape, instance.shape)
                if raise_:
                    raise exception
                exceptions.append(exception)
        else:
            exception = _exceptions.WrongType(self._items_type.base_type, instance.dtype)
            if raise_:
                raise exception
            exceptions.append(exception)
        return exceptions

    def _is_valid_shape(self, shape):
        if self.shape is not None:
            if len(shape) != len(self.shape):
                return False
            for required_d, d in zip(self.shape, shape):
                if required_d is not None and d != required_d:
                    return False
        return True

[docs]    @classmethod
    def base_type(cls):
        import numpy
        return numpy.ndarray