Condition Objects

You can find an introduction to Conditions in our Getting Started section.

An Introduction to Conditions :fontawesome-solid-leanpub:

Fittable conditions for pdpipe.

Classes

`UnfittedConditionError`

Bases: Exception

Raised when a transform is attempted with an unfitted condition.

Source code in pdpipe/cond.py

class UnfittedConditionError(Exception):
    """
    Raised when a transform is attempted with an unfitted condition.

    `Condition.transform` raises this error when a fittable condition is
    asked for its result before any result was stored by `fit` or
    `fit_transform`.
    """

`Condition`

Bases: object

A fittable condition that returns a boolean value from a dataframe.

Parameters:

Name	Type	Description	Default
`func`	`callable`	A callable that, given an input pandas.DataFrame object, returns a boolean value.	required
`fittable`	`bool, default False`	If set to True, this condition becomes fittable, and `func` is not called on calls of `transform()` of a fitted object. If set to False, the default, `func` is called on every call to transform. False by default.	`None`
`error_message`	`str, default None`	A string that describes the error when the condition fails.	`None`

Examples:

>>> import numpy as np; import pdpipe as pdp;
>>> cond = pdp.cond.Condition(lambda X: 'a' in X.columns)
>>> cond
<pdpipe.Condition: By function>
>>> col_drop = pdp.ColDrop(['lbl'], prec=cond)

Source code in pdpipe/cond.py

class Condition(object):
    """
    A boolean check over dataframes that can optionally be fitted.

    Parameters
    ----------
    func : callable
        A callable that, given an input pandas.DataFrame object, returns a
        boolean value.
    fittable : bool, default False
        When True, the condition is fittable: the result computed on fit is
        stored, and `transform()` of a fitted object returns it without
        calling `func`. When False, the default, `func` is called on every
        call to `transform()`.
    error_message : str, default None
        A string that describes the error when the condition fails.

    Examples
    --------
    >>> import numpy as np; import pdpipe as pdp;
    >>> cond = pdp.cond.Condition(lambda X: 'a' in X.columns)
    >>> cond
    <pdpipe.Condition: By function>
    >>> col_drop = pdp.ColDrop(['lbl'], prec=cond)
    """

    # --- picklable callables backing the boolean operators ---
    # (callable classes are used because inner-scope functions aren't
    # pickle-able)

    class _AndCondition(object):
        def __init__(self, first, second):
            self.first = first
            self.second = second

        def __call__(self, X):
            return self.first(X) and self.second(X)

    class _XorCondition(object):
        def __init__(self, first, second):
            self.first = first
            self.second = second

        def __call__(self, X):
            return self.first(X) != self.second(X)

    class _OrCondition(object):
        def __init__(self, first, second):
            self.first = first
            self.second = second

        def __call__(self, X):
            return self.first(X) or self.second(X)

    class _NotCondition(object):
        def __init__(self, first):
            self.first = first

        def __call__(self, X):
            return not self.first(X)

    def __init__(self, func, fittable=None, error_message=None):
        self._func = func
        self._fittable = fittable
        # the message attribute is only defined when a message was provided
        if error_message is None:
            return
        self._error_message = error_message

    def __call__(self, X):
        """
        Evaluate this condition on an input dataframe.

        The result of `transform` is used when it is available; if the
        condition turns out to be unfitted, it is fitted on the given
        dataframe and the freshly computed result is returned instead.

        Parameters
        ----------
        X : pandas.DataFrame
            The input dataframe on which the condition is checked.

        Returns
        -------
        bool
            Either True or False.
        """
        try:
            outcome = self.transform(X)
        except UnfittedConditionError:
            outcome = self.fit_transform(X)
        return outcome

    def fit_transform(self, X):
        """
        Compute the condition on a dataframe, store the result and return it.

        Parameters
        ----------
        X : pandas.DataFrame
            The input dataframe on which the condition is checked.

        Returns
        -------
        bool
            Either True or False.
        """
        outcome = self._func(X)
        self._result = outcome
        return outcome

    def fit(self, X):
        """
        Fit this condition on an input dataframe without returning a result.

        Parameters
        ----------
        X : pandas.DataFrame
            The input dataframe on which the condition is checked.
        """
        # all the work is delegated to fit_transform; its result is dropped
        _ = self.fit_transform(X)

    def transform(self, X):
        """
        Return the result of this condition.

        A fittable condition returns the result stored when it was fitted,
        and raises an exception if it has not been fitted yet. A condition
        that is not fittable simply evaluates its function on the input.

        Parameters
        ----------
        X : pandas.DataFrame
            The input dataframe on which the condition is checked.

        Returns
        -------
        bool
            Either True or False.

        Raises
        ------
        UnfittedConditionError
            If the condition is fittable but no result has been stored.
        """
        if self._fittable:
            try:
                return self._result
            except AttributeError:
                # no stored result means the condition was never fitted
                raise UnfittedConditionError
        return self._func(X)

    def __repr__(self):
        doc = self._func.__doc__
        suffix = f" - {doc}" if doc else ""  # pragma: no cover
        return f"<pdpipe.Condition: By function{suffix}>"

    # --- overriding boolean operators ---

    def _binary_op(self, other, combiner, word):
        """Combine the functions of two conditions into a new Condition.

        Returns NotImplemented when an AttributeError is raised while
        combining, e.g. because `other` has no `_func` attribute.
        """
        try:
            joint = combiner(self._func, other._func)
            left = self._func.__doc__ or "Anonymous condition 1"
            right = other._func.__doc__ or "Anonymous condition 2"
            joint.__doc__ = f"{left} {word} {right}"
            return Condition(func=joint)
        except AttributeError:
            return NotImplemented

    def __and__(self, other):
        return self._binary_op(other, Condition._AndCondition, "AND")

    def __xor__(self, other):
        return self._binary_op(other, Condition._XorCondition, "XOR")

    def __or__(self, other):
        return self._binary_op(other, Condition._OrCondition, "OR")

    def __invert__(self):
        negated = Condition._NotCondition(self._func)
        inner_doc = self._func.__doc__ or "Anonymous condition"
        negated.__doc__ = f"NOT {inner_doc}"
        return Condition(func=negated)

Functions

`fit_transform(X)`

Fit this condition and return the result.

Parameters:

Name	Type	Description	Default
`X`	`pandas.DataFrame`	The input dataframe on which the condition is checked.	required

Returns:

Type	Description
`bool`	Either True or False.

Source code in pdpipe/cond.py

def fit_transform(self, X):
    """
    Compute the condition on a dataframe, store the result and return it.

    Parameters
    ----------
    X : pandas.DataFrame
        The input dataframe on which the condition is checked.

    Returns
    -------
    bool
        Either True or False.
    """
    outcome = self._func(X)
    # keep the outcome so that a later transform() can reuse it
    self._result = outcome
    return outcome

`fit(X)`

Fit this condition on the input dataframe.

Parameters:

Name	Type	Description	Default
`X`	`pandas.DataFrame`	The input dataframe on which the condition is checked.	required

Source code in pdpipe/cond.py

def fit(self, X):
    """
    Fit this condition on an input dataframe without returning a result.

    Parameters
    ----------
    X : pandas.DataFrame
        The input dataframe on which the condition is checked.
    """
    # all the work is delegated to fit_transform; its result is dropped
    _ = self.fit_transform(X)

`transform(X)`

Return the result of this condition.

If this Condition is fittable, it will return the result that was determined when fitted, if it's fitted, and throw an exception if it is not.

Parameters:

Name	Type	Description	Default
`X`	`pandas.DataFrame`	The input dataframe on which the condition is checked.	required

Returns:

Type	Description
`bool`	Either True or False.

Source code in pdpipe/cond.py

def transform(self, X):
    """
    Return the result of this condition.

    A fittable condition returns the result stored when it was fitted, and
    raises an exception if it has not been fitted yet. A condition that is
    not fittable simply evaluates its function on the input.

    Parameters
    ----------
    X : pandas.DataFrame
        The input dataframe on which the condition is checked.

    Returns
    -------
    bool
        Either True or False.

    Raises
    ------
    UnfittedConditionError
        If the condition is fittable but no result has been stored.
    """
    if self._fittable:
        try:
            return self._result
        except AttributeError:
            # no stored result means the condition was never fitted
            raise UnfittedConditionError
    return self._func(X)

`PerColumnCondition`

Bases: Condition

Check whether the columns of input dataframes satisfy a condition set.

Parameters:

Name	Type	Description	Default
`conditions`	`callable or list-like`	The condition, or set of conditions, that columns of input dataframes must satisfy. Conditions are callables that accept a `pandas.Series` object and return a `bool` value.	required
`conditions_reduce`	`str, default 'all'`	How condition satisfaction results are reduced per-column, in case of multiple conditions. 'all' requires a column to satisfy all conditions, while 'any' requires at least one condition to be satisfied.	`None`
`columns_reduce`	`str, default 'all'`	How condition satisfaction results are reduced among multiple columns. 'all' requires all columns of input dataframes to satisfy the given condition (in the case of multiple conditions, behaviour is determined by the `conditions_reduce` parameter), while 'any' requires at least one column to satisfy it.	`None`
`**kwargs`		Additionally accepts all keyword arguments of the constructor of Condition. See the documentation of Condition for details.	`{}`

Examples:

>>> import pandas as pd; import pdpipe as pdp; import numpy as np;
>>> X = pd.DataFrame(
...    [[8,'a',5],[5,'b',7]], [1,2], ['num', 'chr', 'nur'])
>>> cond = pdp.cond.PerColumnCondition(
...     conditions=lambda x: x.dtype == np.int64,
... )
>>> cond
<pdpipe.Condition: Dataframes with all columns satisfying all conditions: anonymous condition>
>>> cond(X)
False
>>> cond = pdp.cond.PerColumnCondition(
...     conditions=lambda x: x.dtype == np.int64,
...     columns_reduce='any',
... )
>>> cond(X)
True
>>> cond = pdp.cond.PerColumnCondition(
...     conditions=[
...         lambda x: x.dtype == np.int64,
...         lambda x: x.dtype == object,
...     ],
... )
>>> cond(X)
False
>>> cond = pdp.cond.PerColumnCondition(
...     conditions=[
...         lambda x: x.dtype == np.int64,
...         lambda x: x.dtype == object,
...     ],
...     conditions_reduce='any',
... )
>>> cond(X)
True

Source code in pdpipe/cond.py

class PerColumnCondition(Condition):
    """
    Check whether the columns of input dataframes satisfy a condition set.

    Parameters
    ----------
    conditions : callable or list-like
        The condition, or set of conditions, that columns of input dataframes
        must satisfy. Conditions are callables that accept a `pandas.Series`
        object and return a `bool` value. Any iterable of callables is
        accepted; it is copied into a list on construction.
    conditions_reduce : str, default 'all'
        How condition satisfaction results are reduced per-column, in case of
        multiple conditions. 'all' requires a column to satisfy all conditions,
        while 'any' requires at least one condition to be satisfied.
    columns_reduce : str, default 'all'
        How condition satisfaction results are reduced among multiple columns.
        'all' requires all columns of input dataframes to satisfy the given
        condition (in the case of multiple conditions, behaviour is determined
        by the `conditions_reduce` parameter), while 'any' requires at least
        one column to satisfy it.
    **kwargs
        Additionally accepts all keyword arguments of the constructor of
        Condition. See the documentation of Condition for details.

    Raises
    ------
    ValueError
        If `conditions_reduce` or `columns_reduce` is neither 'all' nor 'any'.

    Examples
    --------
    >>> import pandas as pd; import pdpipe as pdp; import numpy as np;
    >>> X = pd.DataFrame(
    ...    [[8,'a',5],[5,'b',7]], [1,2], ['num', 'chr', 'nur'])
    >>> cond = pdp.cond.PerColumnCondition(
    ...     conditions=lambda x: x.dtype == np.int64,
    ... )
    >>> cond
    <pdpipe.Condition: Dataframes with all columns satisfying all \
conditions: anonymous condition>
    >>> cond(X)
    False
    >>> cond = pdp.cond.PerColumnCondition(
    ...     conditions=lambda x: x.dtype == np.int64,
    ...     columns_reduce='any',
    ... )
    >>> cond(X)
    True
    >>> cond = pdp.cond.PerColumnCondition(
    ...     conditions=[
    ...         lambda x: x.dtype == np.int64,
    ...         lambda x: x.dtype == object,
    ...     ],
    ... )
    >>> cond(X)
    False
    >>> cond = pdp.cond.PerColumnCondition(
    ...     conditions=[
    ...         lambda x: x.dtype == np.int64,
    ...         lambda x: x.dtype == object,
    ...     ],
    ...     conditions_reduce='any',
    ... )
    >>> cond(X)
    True
    """

    # a callable class rather than a closure, so that instances can be
    # referenced by name when pickling
    class _ConditionFunction(object):
        def __init__(self, conditions, cond_reduce, col_reduce):
            self.conditions = conditions
            self.cond_reduce = cond_reduce
            self.col_reduce = col_reduce

        def __call__(self, X):
            # inner reduction: all conditions on one column;
            # outer reduction: the per-column results over all columns
            return self.col_reduce(
                [
                    self.cond_reduce([cond(X[lbl]) for cond in self.conditions])
                    for lbl in X.columns
                ]
            )

    def __init__(
        self, conditions, conditions_reduce=None, columns_reduce=None, **kwargs
    ):
        # handling default args and input types
        if not hasattr(conditions, "__iter__"):
            conditions = [conditions]
        # The conditions are iterated once below to build the description
        # string and then again, per column, on every call. Copying them into
        # a list keeps a one-shot iterable (e.g. a generator) from being
        # exhausted before the condition is ever checked.
        conditions = list(conditions)
        if conditions_reduce is None:
            conditions_reduce = "all"
        if columns_reduce is None:
            columns_reduce = "all"
        # building class attributes
        self._conditions = conditions
        self._cond_reduce_str = conditions_reduce
        self._col_reduce_str = columns_reduce
        self._conditions_str = ", ".join(
            [c.__doc__ or "anonymous condition" for c in conditions]
        )
        if conditions_reduce == "all":
            self._cond_reduce = all
        elif conditions_reduce == "any":
            self._cond_reduce = any
        else:
            raise ValueError(
                (
                    "The only valid arguments to the `conditions_reduce` parameter"
                    " of PerColumnCondition are 'all' and 'any'!"
                )
            )
        if columns_reduce == "all":
            self._col_reduce = all
        elif columns_reduce == "any":
            self._col_reduce = any
        else:
            raise ValueError(
                (
                    "The only valid arguments to the `columns_reduce` parameter"
                    " of PerColumnCondition are 'all' and 'any'!"
                )
            )
        # building resulting function
        _func = PerColumnCondition._ConditionFunction(
            conditions=self._conditions,
            cond_reduce=self._cond_reduce,
            col_reduce=self._col_reduce,
        )
        doc_str = "Dataframes with {} columns satisfying {} conditions: {}"
        self._func_doc = doc_str.format(
            self._col_reduce_str, self._cond_reduce_str, self._conditions_str
        )
        _func.__doc__ = self._func_doc
        # the generated function always replaces any `func` given in kwargs
        kwargs["func"] = _func
        super().__init__(**kwargs)

    def __repr__(self):
        return f"<pdpipe.Condition: {self._func_doc}>"

`HasAllColumns`

Bases: Condition

Check whether input dataframes contain a list of columns.

Parameters:

Name	Type	Description	Default
`labels`	`single label or list-like`	Column labels to check for.	required
`**kwargs`		Additionally accepts all keyword arguments of the constructor of Condition. See the documentation of Condition for details.	`{}`

Examples:

>>> import pandas as pd; import pdpipe as pdp;
>>> X = pd.DataFrame(
...    [[8,'a',5],[5,'b',7]], [1,2], ['num', 'chr', 'nur'])
>>> cond = pdp.cond.HasAllColumns('num')
>>> cond
<pdpipe.Condition: Has all columns in num>
>>> cond(X)
True
>>> cond = pdp.cond.HasAllColumns(['num', 'chr'])
>>> cond(X)
True
>>> cond = pdp.cond.HasAllColumns(['num', 'gar'])
>>> cond(X)
False

Source code in pdpipe/cond.py

class HasAllColumns(Condition):
    """
    Check whether input dataframes contain a list of columns.

    Parameters
    ----------
    labels : single label or list-like
        Column labels to check for.
    **kwargs
        Additionally accepts all keyword arguments of the constructor of
        Condition. See the documentation of Condition for details.

    Examples
    --------
    >>> import pandas as pd; import pdpipe as pdp;
    >>> X = pd.DataFrame(
    ...    [[8,'a',5],[5,'b',7]], [1,2], ['num', 'chr', 'nur'])
    >>> cond = pdp.cond.HasAllColumns('num')
    >>> cond
    <pdpipe.Condition: Has all columns in num>
    >>> cond(X)
    True
    >>> cond = pdp.cond.HasAllColumns(['num', 'chr'])
    >>> cond(X)
    True
    >>> cond = pdp.cond.HasAllColumns(['num', 'gar'])
    >>> cond(X)
    False
    """

    # a callable class is used instead of a function defined inside
    # __init__, because inner-scope functions aren't pickle-able
    class _AllColumnsFunc(object):
        def __init__(self, labels):
            self.labels = labels

        def __call__(self, X):
            return all([lbl in X.columns for lbl in self.labels])

    def __init__(self, labels, **kwargs):
        # a lone label (including a string, which is itself iterable) is
        # wrapped in a list
        if isinstance(labels, str) or not hasattr(labels, "__iter__"):
            labels = [labels]
        self._labels = labels
        self._labels_str = _list_str(self._labels)
        _func = HasAllColumns._AllColumnsFunc(self._labels)
        _func.__doc__ = f"Dataframes with columns {self._labels_str}"
        # default error message; a caller-supplied one in kwargs overrides it
        super_kwargs = {
            "error_message": (
                f"Not all required columns {self._labels_str}"
                " present in the input dataframe."
            )
        }
        super_kwargs.update(**kwargs)
        # the generated function always replaces any `func` given in kwargs
        super_kwargs["func"] = _func
        super().__init__(**super_kwargs)

    def __repr__(self):
        return f"<pdpipe.Condition: Has all columns in {self._labels_str}>"

`ColumnsFromList`

Bases: PerColumnCondition

Check whether input dataframes contain columns from a list.

Parameters:

Name	Type	Description	Default
`labels`	`single label or list-like`	Column labels to check for.	required
`columns_reduce`	`str, default 'all'`	How condition satisfaction results are reduced among multiple columns. 'all' requires all columns of input dataframes to satisfy the given condition, while 'any' requires at least one column to satisfy it.	`None`
`**kwargs`		Additionally accepts all keyword arguments of the constructor of Condition. See the documentation of Condition for details.	`{}`

Examples:

>>> import pandas as pd; import pdpipe as pdp;
>>> X = pd.DataFrame(
...    [[8,'a',5],[5,'b',7]], [1,2], ['num', 'chr', 'nur'])
>>> cond = pdp.cond.ColumnsFromList('num')
>>> cond
<pdpipe.Condition: Dataframes with all columns satisfying all conditions: Series with labels in num>
>>> cond(X)
False
>>> cond = pdp.cond.ColumnsFromList(['num', 'chr', 'nur'])
>>> cond(X)
True
>>> cond = pdp.cond.ColumnsFromList(
...     ['num', 'gar'], columns_reduce='any')
>>> cond(X)
True

Source code in pdpipe/cond.py

class ColumnsFromList(PerColumnCondition):
    """
    Check whether the columns of input dataframes come from a given list.

    Parameters
    ----------
    labels : single label or list-like
        Column labels to check for.
    columns_reduce : str, default 'all'
        How condition satisfaction results are reduced among multiple columns.
        'all' requires all columns of input dataframes to satisfy the given
        condition, while 'any' requires at least one column to satisfy it.
    **kwargs
        Additionally accepts all keyword arguments of the constructor of
        Condition. See the documentation of Condition for details.

    Examples
    --------
    >>> import pandas as pd; import pdpipe as pdp;
    >>> X = pd.DataFrame(
    ...    [[8,'a',5],[5,'b',7]], [1,2], ['num', 'chr', 'nur'])
    >>> cond = pdp.cond.ColumnsFromList('num')
    >>> cond
    <pdpipe.Condition: Dataframes with all columns satisfying all \
conditions: Series with labels in num>
    >>> cond(X)
    False
    >>> cond = pdp.cond.ColumnsFromList(['num', 'chr', 'nur'])
    >>> cond(X)
    True
    >>> cond = pdp.cond.ColumnsFromList(
    ...     ['num', 'gar'], columns_reduce='any')
    >>> cond(X)
    True
    """

    # per-series check: is the name of the series one of the given labels?
    class _SeriesLblCondition(object):
        def __init__(self, labels):
            self.labels = labels

        def __call__(self, series):
            return series.name in self.labels

    def __init__(self, labels, columns_reduce=None, **kwargs):
        # a lone label (including a string) is wrapped in a list
        is_single = isinstance(labels, str) or not hasattr(labels, "__iter__")
        self._labels = [labels] if is_single else labels
        self._labels_str = _list_str(self._labels)
        membership = ColumnsFromList._SeriesLblCondition(self._labels)
        membership.__doc__ = f"Series with labels in {self._labels_str}"
        # later keys win, so `conditions` and `columns_reduce` set here take
        # precedence over anything found in kwargs
        init_kwargs = {
            **kwargs,
            "conditions": [membership],
            "columns_reduce": columns_reduce,
        }
        super().__init__(**init_kwargs)

`HasNoColumn`

Bases: Condition

Check whether input dataframes contain no column from a list.

Parameters:

Name	Type	Description	Default
`labels`	`single label or list-like`	Column labels to check for.	required
`**kwargs`		Additionally accepts all keyword arguments of the constructor of Condition. See the documentation of Condition for details.	`{}`

Examples:

>>> import pandas as pd; import pdpipe as pdp;
>>> X = pd.DataFrame(
...    [[8,'a',5],[5,'b',7]], [1,2], ['num', 'chr', 'nur'])
>>> cond = pdp.cond.HasNoColumn('num')
>>> cond
<pdpipe.Condition: Has no column in num>
>>> cond(X)
False
>>> cond = pdp.cond.HasNoColumn(['num', 'gar'])
>>> cond(X)
False
>>> cond = pdp.cond.HasNoColumn(['ph', 'gar'])
>>> cond(X)
True

Source code in pdpipe/cond.py

class HasNoColumn(Condition):
    """
    Check whether input dataframes contain no column from a list.

    Parameters
    ----------
    labels : single label or list-like
        Column labels to check for.
    **kwargs
        Additionally accepts all keyword arguments of the constructor of
        Condition. See the documentation of Condition for details.

    Examples
    --------
    >>> import pandas as pd; import pdpipe as pdp;
    >>> X = pd.DataFrame(
    ...    [[8,'a',5],[5,'b',7]], [1,2], ['num', 'chr', 'nur'])
    >>> cond = pdp.cond.HasNoColumn('num')
    >>> cond
    <pdpipe.Condition: Has no column in num>
    >>> cond(X)
    False
    >>> cond = pdp.cond.HasNoColumn(['num', 'gar'])
    >>> cond(X)
    False
    >>> cond = pdp.cond.HasNoColumn(['ph', 'gar'])
    >>> cond(X)
    True
    """

    # dataframe check: none of the given labels is among the columns
    class _NoColumnsFunc(object):
        def __init__(self, labels):
            self.labels = labels

        def __call__(self, X):
            present = [lbl in X.columns for lbl in self.labels]
            return not any(present)

    def __init__(self, labels, **kwargs):
        # a lone label (including a string) is wrapped in a list
        is_single = isinstance(labels, str) or not hasattr(labels, "__iter__")
        self._labels = [labels] if is_single else labels
        self._labels_str = _list_str(self._labels)
        checker = HasNoColumn._NoColumnsFunc(self._labels)
        checker.__doc__ = f"Dataframes with no column from {self._labels_str}"
        default_message = (
            f"One or more of the prohibited columns {self._labels_str}"
            " present in the input dataframe."
        )
        # later keys win: kwargs may override the default message, while the
        # generated function always replaces any `func` given in kwargs
        super().__init__(
            **{"error_message": default_message, **kwargs, "func": checker}
        )

    def __repr__(self):
        return f"<pdpipe.Condition: Has no column in {self._labels_str}>"

`HasAtMostMissingValues`

Bases: Condition

Check if a dataframe has no more than X missing values across all columns.

Parameters:

Name	Type	Description	Default
`n_missing`	`int or float`	If int, then interpreted as the maximal allowed number of missing values in input dataframes. If float, interpreted as the maximal allowed ratio of missing values in input dataframes.	required
`**kwargs`		Additionally accepts all keyword arguments of the constructor of Condition. See the documentation of Condition for details.	`{}`

Examples:

>>> import pandas as pd; import pdpipe as pdp;
>>> X = pd.DataFrame(
...    [[None,'a',5],[5,None,7]], [1,2], ['num', 'chr', 'nur'])
>>> cond = pdp.cond.HasAtMostMissingValues(1)
>>> cond
<pdpipe.Condition: Has at most 1 missing values>
>>> cond(X)
False
>>> cond = pdp.cond.HasAtMostMissingValues(2)
>>> cond(X)
True
>>> cond = pdp.cond.HasAtMostMissingValues(0.4)
>>> cond(X)
True
>>> cond = pdp.cond.HasAtMostMissingValues(0.2)
>>> cond(X)
False

Source code in pdpipe/cond.py

class HasAtMostMissingValues(Condition):
    """
    Check if a dataframe has no more than X missing values across all columns.

    Parameters
    ----------
    n_missing : int or float
        If int, then interpreted as the maximal allowed number of missing
        values in input dataframes. If float, interpreted as the maximal
        allowed ratio of missing values in input dataframes. The ratio of an
        empty dataframe (one with no cells) is taken to be 0.
    **kwargs
        Additionally accepts all keyword arguments of the constructor of
        Condition. See the documentation of Condition for details.

    Raises
    ------
    ValueError
        If `n_missing` is neither an int nor a float.

    Examples
    --------
    >>> import pandas as pd; import pdpipe as pdp;
    >>> X = pd.DataFrame(
    ...    [[None,'a',5],[5,None,7]], [1,2], ['num', 'chr', 'nur'])
    >>> cond = pdp.cond.HasAtMostMissingValues(1)
    >>> cond
    <pdpipe.Condition: Has at most 1 missing values>
    >>> cond(X)
    False
    >>> cond = pdp.cond.HasAtMostMissingValues(2)
    >>> cond(X)
    True
    >>> cond = pdp.cond.HasAtMostMissingValues(0.4)
    >>> cond(X)
    True
    >>> cond = pdp.cond.HasAtMostMissingValues(0.2)
    >>> cond(X)
    False
    """

    # absolute variant: compares the count of missing cells to the threshold
    class _IntMissingValuesFunc(object):
        def __init__(self, n_missing):
            self.n_missing = n_missing

        def __call__(self, X):
            nmiss = X.isna().sum().sum()
            return nmiss <= self.n_missing

    # relative variant: compares the share of missing cells to the threshold
    class _FloatMissingValuesFunc(object):
        def __init__(self, n_missing):
            self.n_missing = n_missing

        def __call__(self, X):
            # A dataframe with no cells has no missing values; treat its
            # ratio as 0 instead of dividing by a zero size. This matches
            # the absolute variant, which compares a count of 0 on such input.
            if X.size == 0:
                return 0 <= self.n_missing
            nmiss = X.isna().sum().sum()
            return (nmiss / X.size) <= self.n_missing

    def __init__(self, n_missing, **kwargs):
        self._n_missing = n_missing
        # the type of the threshold selects between count and ratio semantics
        if isinstance(n_missing, int):
            _func = HasAtMostMissingValues._IntMissingValuesFunc(n_missing)
        elif isinstance(n_missing, float):
            _func = HasAtMostMissingValues._FloatMissingValuesFunc(n_missing)
        else:
            raise ValueError("n_missing should be of type int or float!")
        _func.__doc__ = f"Dataframes with at most {self._n_missing} missing values"
        # default error message; a caller-supplied one in kwargs overrides it
        super_kwargs = {
            "error_message": (
                "Input dataframe cannot have more than"
                f" {self._n_missing} missing values."
            )
        }
        super_kwargs.update(**kwargs)
        # the generated function always replaces any `func` given in kwargs
        super_kwargs["func"] = _func
        super().__init__(**super_kwargs)

    def __repr__(self):
        return f"<pdpipe.Condition: Has at most {self._n_missing} missing values>"

`HasNoMissingValues`

Bases: HasAtMostMissingValues

Check whether input dataframes have no missing values.

Parameters:

Name	Type	Description	Default
`**kwargs`		Accepts all keyword arguments of the constructor of Condition. See the documentation of Condition for details.	`{}`

Examples:

>>> import pandas as pd; import pdpipe as pdp;
>>> X = pd.DataFrame(
...    [[None,'a',5],[5,'b',7]], [1,2], ['num', 'chr', 'nur'])
>>> cond = pdp.cond.HasNoMissingValues()
>>> cond
<pdpipe.Condition: Has no missing values>
>>> cond(X)
False

Source code in pdpipe/cond.py

class HasNoMissingValues(HasAtMostMissingValues):
    """
    Check whether input dataframes have no missing values.

    Parameters
    ----------
    **kwargs
        Accepts all keyword arguments of the constructor of Condition. See the
        documentation of Condition for details.

    Examples
    --------
    >>> import pandas as pd; import pdpipe as pdp;
    >>> X = pd.DataFrame(
    ...    [[None,'a',5],[5,'b',7]], [1,2], ['num', 'chr', 'nur'])
    >>> cond = pdp.cond.HasNoMissingValues()
    >>> cond
    <pdpipe.Condition: Has no missing values>
    >>> cond(X)
    False
    """

    def __init__(self, **kwargs):
        # later keys win: kwargs may override the default message, while the
        # threshold is always forced to zero missing values
        init_kwargs = {
            "error_message": "Input dataframe cannot contain missing values.",
            **kwargs,
            "n_missing": 0,
        }
        super().__init__(**init_kwargs)

    def __repr__(self):
        return "<pdpipe.Condition: Has no missing values>"

`AlwaysTrue`

Bases: Condition

A condition letting all dataframes through, always returning True.

Parameters:

Name	Type	Description	Default
`**kwargs`		Accepts all keyword arguments of the constructor of Condition. See the documentation of Condition for details.	`{}`

Examples:

>>> import pandas as pd; import pdpipe as pdp;
>>> X = pd.DataFrame(
...    [[8,'a',5],[5,'b',7]], [1,2], ['num', 'chr', 'nur'])
>>> cond = pdp.cond.AlwaysTrue()
>>> cond
<pdpipe.Condition: AlwaysTrue>
>>> cond(X)
True

Source code in pdpipe/cond.py

class AlwaysTrue(Condition):
    """
    A condition that lets every dataframe through by always returning True.

    Parameters
    ----------
    **kwargs
        Accepts all keyword arguments of the constructor of Condition. See the
        documentation of Condition for details.

    Examples
    --------
    >>> import pandas as pd; import pdpipe as pdp;
    >>> X = pd.DataFrame(
    ...    [[8,'a',5],[5,'b',7]], [1,2], ['num', 'chr', 'nur'])
    >>> cond = pdp.cond.AlwaysTrue()
    >>> cond
    <pdpipe.Condition: AlwaysTrue>
    >>> cond(X)
    True
    """

    def __init__(self, **kwargs):
        # `func` comes last so it replaces any `func` given in kwargs
        super().__init__(**{**kwargs, "func": _AlwaysTrue})

    def __repr__(self):
        return "<pdpipe.Condition: AlwaysTrue>"

`HasAtMostNQualifyingColumns`

Bases: Condition

Check whether a dataframe has at most N columns satisfying a qualifier.

Parameters:

Name	Type	Description	Default
`n`	`int`	The maximal number of columns that should satisfy the qualifier.	required
`qualifier`	`callable`	A function that takes a pandas.DataFrame and returns the labels of the subset of qualifying columns. See the pdp.cq module.	required
`**kwargs`		Additionally accepts all keyword arguments of the constructor of Condition. See the documentation of Condition for details.	`{}`

Examples:

>>> import pandas as pd; import pdpipe as pdp;
>>> X = pd.DataFrame(
...    [[8,'a',5],[5,'b',7]], [1,2], ['num', 'chr', 'nur'])
>>> cond = pdp.cond.HasAtMostNQualifyingColumns(
...     n=2, qualifier=pdp.cq.StartsWith('n'))
>>> cond
<pdpipe.Condition: Has at most 2 columns qualifying <ColumnQualifier: Columns starting with n>>
>>> cond(X)
True
>>> cond = pdp.cond.HasAtMostNQualifyingColumns(
...     n=1, qualifier=pdp.cq.StartsWith('n'))
>>> cond(X)
False

Source code in pdpipe/cond.py

class HasAtMostNQualifyingColumns(Condition):
    """
    Check whether a dataframe has at most N columns satisfying a qualifier.

    The condition holds when the number of column labels returned by the
    qualifier for the input dataframe is less than or equal to `n`.

    Parameters
    ----------
    n : int
        The maximal number of columns that should satisfy the qualifier.
    qualifier : callable
        A function that takes a pandas.DataFrame and returns the labels of the
        subset of qualifying columns. See the pdp.cq module.
    **kwargs
        Additionally accepts all keyword arguments of the constructor of
        Condition. See the documentation of Condition for details. An
        `error_message` given here replaces the default one; a `func` given
        here is ignored.

    Examples
    --------
    >>> import pandas as pd; import pdpipe as pdp;
    >>> X = pd.DataFrame(
    ...    [[8,'a',5],[5,'b',7]], [1,2], ['num', 'chr', 'nur'])
    >>> cond = pdp.cond.HasAtMostNQualifyingColumns(
    ...     n=2, qualifier=pdp.cq.StartsWith('n'))
    >>> cond
    <pdpipe.Condition: Has at most 2 columns qualifying <ColumnQualifier: Columns starting with n>>
    >>> cond(X)
    True
    >>> cond = pdp.cond.HasAtMostNQualifyingColumns(
    ...     n=1, qualifier=pdp.cq.StartsWith('n'))
    >>> cond(X)
    False
    """  # noqa: E501

    class _AtMostNQualifyingCallable:
        # Callable object holding the threshold and the qualifier; it is the
        # `func` handed to Condition.
        # NOTE(review): presumably a class rather than a closure so that the
        # condition can be pickled - confirm.
        def __init__(self, n, qualifier):
            self._n = n
            self._qualifier = qualifier

        def __call__(self, X):
            qualifying = self._qualifier(X)
            return len(qualifying) <= self._n

    def __init__(self, n: int, qualifier: callable, **kwargs):
        checker = HasAtMostNQualifyingColumns._AtMostNQualifyingCallable(
            n, qualifier
        )
        checker.__doc__ = (
            f"Dataframes with at most {n} columns qualifying {qualifier}"
        )
        self._rpr = (
            f"<pdpipe.Condition: Has at most {n} columns qualifying {qualifier}>"
        )
        default_error = (
            f"More than {n} columns qualify {qualifier} in the input dataframe!"
        )
        # Precedence, lowest to highest: default message, caller kwargs, func.
        super().__init__(
            **{"error_message": default_error, **kwargs, "func": checker}
        )

    def __repr__(self):
        return self._rpr

`HasAtLeastNQualifyingColumns`

Bases: Condition

Check if a dataframe has at least N columns satisfying a qualifier.

Parameters:

Name	Type	Description	Default
`n`	`int`	The minimal number of columns that should satisfy the qualifier.	required
`qualifier`	`callable`	A function that takes a pandas.DataFrame and returns the labels of the subset of qualifying columns. See the pdp.cq module.	required
`**kwargs`		Additionally accepts all keyword arguments of the constructor of Condition. See the documentation of Condition for details.	`{}`

Examples:

>>> import pandas as pd; import pdpipe as pdp;
>>> X = pd.DataFrame(
...    [[8,'a',5],[5,'b',7]], [1,2], ['num', 'chr', 'nur'])
>>> cond = pdp.cond.HasAtLeastNQualifyingColumns(
...     n=2, qualifier=pdp.cq.StartsWith('n'))
>>> cond
<pdpipe.Condition: Has at least 2 columns qualifying <ColumnQualifier: Columns starting with n>>
>>> cond(X)
True
>>> cond = pdp.cond.HasAtLeastNQualifyingColumns(
...     n=3, qualifier=pdp.cq.StartsWith('n'))
>>> cond(X)
False

Source code in pdpipe/cond.py

class HasAtLeastNQualifyingColumns(Condition):
    """
    Check if a dataframe has at least N columns satisfying a qualifier.

    The condition holds when the number of column labels returned by the
    qualifier for the input dataframe is greater than or equal to `n`.

    Parameters
    ----------
    n : int
        The minimal number of columns that should satisfy the qualifier.
    qualifier : callable
        A function that takes a pandas.DataFrame and returns the labels of the
        subset of qualifying columns. See the pdp.cq module.
    **kwargs
        Additionally accepts all keyword arguments of the constructor of
        Condition. See the documentation of Condition for details. An
        `error_message` given here replaces the default one; a `func` given
        here is ignored.

    Examples
    --------
    >>> import pandas as pd; import pdpipe as pdp;
    >>> X = pd.DataFrame(
    ...    [[8,'a',5],[5,'b',7]], [1,2], ['num', 'chr', 'nur'])
    >>> cond = pdp.cond.HasAtLeastNQualifyingColumns(
    ...     n=2, qualifier=pdp.cq.StartsWith('n'))
    >>> cond
    <pdpipe.Condition: Has at least 2 columns qualifying <ColumnQualifier: Columns starting with n>>
    >>> cond(X)
    True
    >>> cond = pdp.cond.HasAtLeastNQualifyingColumns(
    ...     n=3, qualifier=pdp.cq.StartsWith('n'))
    >>> cond(X)
    False
    """  # noqa: E501

    class _AtLeastNQualifyingCallable:
        # Callable object holding the threshold and the qualifier; it is the
        # `func` handed to Condition.
        # NOTE(review): presumably a class rather than a closure so that the
        # condition can be pickled - confirm.
        def __init__(self, n, qualifier):
            self._n = n
            self._qualifier = qualifier

        def __call__(self, X):
            qualifying = self._qualifier(X)
            return len(qualifying) >= self._n

    def __init__(self, n: int, qualifier: callable, **kwargs):
        checker = HasAtLeastNQualifyingColumns._AtLeastNQualifyingCallable(
            n, qualifier
        )
        checker.__doc__ = (
            f"Dataframes with at least {n} columns qualifying {qualifier}"
        )
        self._rpr = (
            f"<pdpipe.Condition: Has at least {n} columns qualifying {qualifier}>"
        )
        default_error = (
            f"Less than {n} columns qualify {qualifier} in the input dataframe!"
        )
        # Precedence, lowest to highest: default message, caller kwargs, func.
        super().__init__(
            **{"error_message": default_error, **kwargs, "func": checker}
        )

    def __repr__(self):
        return self._rpr

`HasNoQualifyingColumns`

Bases: HasAtMostNQualifyingColumns

Check whether a dataframe has no columns satisfying a qualifier.

Parameters:

Name	Type	Description	Default
`qualifier`	`callable`	A function that takes a pandas.DataFrame and returns the labels of the subset of qualifying columns. See the pdp.cq module.	required
`**kwargs`		Additionally accepts all keyword arguments of the constructor of Condition. See the documentation of Condition for details.	`{}`

Examples:

>>> import pandas as pd; import pdpipe as pdp;
>>> X = pd.DataFrame(
...    [[8,'a',5],[5,'b',7]], [1,2], ['num', 'chr', 'nur'])
>>> cond = pdp.cond.HasNoQualifyingColumns(
...     qualifier=pdp.cq.StartsWith('n'))
>>> cond
<pdpipe.Condition: Has no columns qualifying <ColumnQualifier: Columns starting with n>>
>>> cond(X)
False

Source code in pdpipe/cond.py

class HasNoQualifyingColumns(HasAtMostNQualifyingColumns):
    """
    Check whether a dataframe has no columns satisfying a qualifier.

    Implemented as HasAtMostNQualifyingColumns with `n` fixed to 0, with its
    own default error message and representation.

    Parameters
    ----------
    qualifier : callable
        A function that takes a pandas.DataFrame and returns the labels of the
        subset of qualifying columns. See the pdp.cq module.
    **kwargs
        Additionally accepts all keyword arguments of the constructor of
        Condition. See the documentation of Condition for details. An
        `error_message` given here replaces the default one.

    Examples
    --------
    >>> import pandas as pd; import pdpipe as pdp;
    >>> X = pd.DataFrame(
    ...    [[8,'a',5],[5,'b',7]], [1,2], ['num', 'chr', 'nur'])
    >>> cond = pdp.cond.HasNoQualifyingColumns(
    ...     qualifier=pdp.cq.StartsWith('n'))
    >>> cond
    <pdpipe.Condition: Has no columns qualifying <ColumnQualifier: Columns starting with n>>
    >>> cond(X)
    False
    """  # noqa: E501

    def __init__(self, qualifier: callable, **kwargs):
        default_error = (
            f"Found columns qualifing {qualifier} in the input dataframe!"
        )
        # Caller-supplied kwargs take precedence over the default message.
        forwarded = {"error_message": default_error, **kwargs}
        super().__init__(n=0, qualifier=qualifier, **forwarded)
        # Replace the parent's "Has at most 0 columns ..." representation.
        self._rpr = f"<pdpipe.Condition: Has no columns qualifying {qualifier}>"

Last update: 2022-01-19