Skip to content

Fly Handles

On the fly, relative pipeline stage creation.

Use drop_rows_where and keep_rows_where as handles to the future dataframe, using the [] indexing syntax to select a (single) column to apply the logic by, and regular binary operators such as >, >=, ==, !=, etc. to express the condition by which rows will be kept or dropped.

For example:

>>> import pandas as pd; import pdpipe as pdp;
>>> df = pd.DataFrame([[1,4],[4,5],[5,11]], [1,2,3], ['a','b'])
>>> df
   a   b
1  1   4
2  4   5
3  5  11
>>> pipeline = pdp.PdPipeline([
...     pdp.drop_rows_where['a'] > 4,
... ])
>>> pipeline(df)
   a  b
1  1  4
2  4  5

The resulting stages can be naturally combined using logical binary operators: & for AND, | for OR and ^ for XOR. They can also be inverted with the ~ operator.

For example:

>>> import pandas as pd; import pdpipe as pdp;
>>> df = pd.DataFrame([[1,4],[4,5],[5,11]], [1,2,3], ['a','b'])
>>> pipeline = pdp.PdPipeline([
...     ~ (pdp.drop_rows_where['a'] > 4),
... ])
>>> pipeline(df)
   a   b
3  5  11
>>> pipeline = pdp.PdPipeline([
...     (pdp.drop_rows_where['a'] > 3) & (pdp.drop_rows_where['b'] < 10),
... ])
>>> pipeline(df)
   a   b
1  1   4
3  5  11

Attributes

drop_rows_where = _DropRowsByColValHandle() module-attribute

keep_rows_where = _KeepRowsByColValHandle() module-attribute

Classes

KeepRowsByQualifier

Bases: PdPipelineStage

A pipeline stage that keeps rows by a row qualifier.

All rows which the qualifier qualifies (i.e. rows for which the boolean series it returns holds True in the corresponding entries) will be kept, while all other rows will be dropped from input dataframes.

Parameters:

Name Type Description Default
qualifier RowQualifier

An object that returns a boolean series from input dataframes. See more in pdpipe.rq.

required
**kwargs object

All PdPipelineStage constructor parameters are supported.

{}

Examples:

>>> import pandas as pd; import pdpipe as pdp;
>>> df = pd.DataFrame([[1,4],[4,5],[5,11]], [1,2,3], ['a','b'])
>>> q = pdp.rq.ColValGt('a', 3)
>>> pdp.fly.KeepRowsByQualifier(q).apply(df)
   a   b
2  4   5
3  5  11
Source code in pdpipe/fly.py
class KeepRowsByQualifier(PdPipelineStage):
    """
    A pipeline stage that keeps rows by a row qualifier.

    All rows which the qualifier qualifies (i.e. rows for which the boolean
    series it returns holds True in the corresponding entries) will be kept,
    while all other rows will be dropped from input dataframes.

    Stages of this type can be combined with the ``&``, ``|`` and ``^``
    operators and inverted with ``~``. The resulting stage is built from the
    combined qualifier only, so keyword arguments given to the operand stages
    (e.g. a custom ``desc``) are not carried over to it.

    Parameters
    ----------
    qualifier : RowQualifier
        An object that returns a boolean series from input dataframes. See more
        in `pdpipe.rq`.
    **kwargs : object
        All PdPipelineStage constructor parameters are supported.

    Examples
    --------
    >>> import pandas as pd; import pdpipe as pdp;
    >>> df = pd.DataFrame([[1,4],[4,5],[5,11]], [1,2,3], ['a','b'])
    >>> q = pdp.rq.ColValGt('a', 3)
    >>> pdp.fly.KeepRowsByQualifier(q).apply(df)
       a   b
    2  4   5
    3  5  11
    """

    def __init__(self, qualifier, **kwargs):
        self._keeprowsby_rq = qualifier
        super_kwargs = {
            # This stage *keeps* the qualifying rows; the description used to
            # say "Drop rows ...", which was copied from DropRowsByQualifier.
            "desc": f"Keep rows by qualifier {qualifier}",
        }
        # Caller-supplied keyword arguments override the defaults above.
        super_kwargs.update(**kwargs)
        super().__init__(**super_kwargs)

    def _prec(self, X: pandas.DataFrame) -> bool:
        """Return True unconditionally; no precondition is checked here."""
        return True

    def _transform(self, X, verbose=None):
        """Return the rows of X for which the qualifier yields True.

        When ``verbose`` is truthy, the number of dropped rows is printed.
        """
        before_count = len(X)
        bool_ix = self._keeprowsby_rq(X)
        inter_X = X[bool_ix]
        if verbose:
            print(f"{before_count - len(inter_X)} rows dropped.")
        return inter_X

    def __and__(self, other):
        """Return a stage keeping rows qualified by both operands.

        Returns NotImplemented when an AttributeError is raised, e.g. because
        ``other`` has no ``_keeprowsby_rq`` attribute.
        """
        try:
            and_rq = self._keeprowsby_rq & other._keeprowsby_rq
            return type(self)(qualifier=and_rq)
        except AttributeError:
            return NotImplemented

    def __or__(self, other):
        """Return a stage keeping rows qualified by either operand.

        Returns NotImplemented when an AttributeError is raised, e.g. because
        ``other`` has no ``_keeprowsby_rq`` attribute.
        """
        try:
            or_rq = self._keeprowsby_rq | other._keeprowsby_rq
            return type(self)(qualifier=or_rq)
        except AttributeError:
            return NotImplemented

    def __xor__(self, other):
        """Return a stage whose qualifier is the XOR of both operands'.

        Returns NotImplemented when an AttributeError is raised, e.g. because
        ``other`` has no ``_keeprowsby_rq`` attribute.
        """
        try:
            xor_rq = self._keeprowsby_rq ^ other._keeprowsby_rq
            return type(self)(qualifier=xor_rq)
        except AttributeError:
            return NotImplemented

    def __invert__(self):
        """Return a stage of the same type using the inverted qualifier."""
        not_rq = ~self._keeprowsby_rq
        return type(self)(qualifier=not_rq)

DropRowsByQualifier

Bases: PdPipelineStage

A pipeline stage that drops rows by a row qualifier.

All rows which the qualifier qualifies (i.e. rows for which the boolean series it returns holds True in the corresponding entries) will be dropped, while all other rows of input dataframes will be kept.

Parameters:

Name Type Description Default
qualifier RowQualifier

An object that returns a boolean series from input dataframes. See more in pdpipe.rq.

required
**kwargs object

All PdPipelineStage constructor parameters are supported.

{}

Examples:

>>> import pandas as pd; import pdpipe as pdp;
>>> df = pd.DataFrame([[1,4],[4,5],[5,11]], [1,2,3], ['a','b'])
>>> q = pdp.rq.ColValLt('a', 3)
>>> pdp.fly.DropRowsByQualifier(q).apply(df)
   a   b
2  4   5
3  5  11
Source code in pdpipe/fly.py
class DropRowsByQualifier(PdPipelineStage):
    """
    A pipeline stage that drops the rows selected by a row qualifier.

    The qualifier is called on each input dataframe and its boolean result is
    inverted: rows marked True by the qualifier are dropped, and the remaining
    rows are returned.

    Stages of this type can be combined with the ``&``, ``|`` and ``^``
    operators and inverted with ``~``; the result is a new stage of the same
    type constructed from the combined qualifier alone.

    Parameters
    ----------
    qualifier : RowQualifier
        An object that returns a boolean series from input dataframes. See more
        in `pdpipe.rq`.
    **kwargs : object
        All PdPipelineStage constructor parameters are supported.

    Examples
    --------
    >>> import pandas as pd; import pdpipe as pdp;
    >>> df = pd.DataFrame([[1,4],[4,5],[5,11]], [1,2,3], ['a','b'])
    >>> q = pdp.rq.ColValLt('a', 3)
    >>> pdp.fly.DropRowsByQualifier(q).apply(df)
       a   b
    2  4   5
    3  5  11
    """

    def __init__(self, qualifier, **kwargs):
        self._droprowsby_rq = qualifier
        # Default description first, so caller-supplied kwargs override it.
        stage_args = {"desc": f"Drop rows by qualifier {qualifier}", **kwargs}
        super().__init__(**stage_args)

    def _prec(self, X: pandas.DataFrame) -> bool:
        """Return True unconditionally; no precondition is checked here."""
        return True

    def _transform(self, X, verbose=None):
        """Return the rows of X for which the qualifier does not yield True.

        When ``verbose`` is truthy, the number of dropped rows is printed.
        """
        n_rows_in = len(X)
        # Invert the qualifier's result: the qualified rows are the ones to drop.
        keep_mask = ~self._droprowsby_rq(X)
        remaining = X[keep_mask]
        if verbose:
            n_dropped = n_rows_in - len(remaining)
            print(f"{n_dropped} rows dropped.")
        return remaining

    def __and__(self, other):
        """Return a stage dropping rows qualified by both operands.

        Returns NotImplemented when an AttributeError is raised, e.g. because
        ``other`` has no ``_droprowsby_rq`` attribute.
        """
        try:
            return type(self)(
                qualifier=self._droprowsby_rq & other._droprowsby_rq,
            )
        except AttributeError:
            return NotImplemented

    def __or__(self, other):
        """Return a stage dropping rows qualified by either operand.

        Returns NotImplemented when an AttributeError is raised, e.g. because
        ``other`` has no ``_droprowsby_rq`` attribute.
        """
        try:
            return type(self)(
                qualifier=self._droprowsby_rq | other._droprowsby_rq,
            )
        except AttributeError:
            return NotImplemented

    def __xor__(self, other):
        """Return a stage whose qualifier is the XOR of both operands'.

        Returns NotImplemented when an AttributeError is raised, e.g. because
        ``other`` has no ``_droprowsby_rq`` attribute.
        """
        try:
            return type(self)(
                qualifier=self._droprowsby_rq ^ other._droprowsby_rq,
            )
        except AttributeError:
            return NotImplemented

    def __invert__(self):
        """Return a stage of the same type using the inverted qualifier."""
        return type(self)(qualifier=~self._droprowsby_rq)

Last update: 2022-01-21