Skip to content

Commit

Permalink
[ENH] generic select function (#1187)
Browse files Browse the repository at this point in the history
* fix blank note output

* return whatever the user passes, even if they are duplicates

* add DropLabel for dropping columns

* generic select with tests

* update test_select.py

* update changelog

* update docs in select

* add version notifications

* update docs for DropLabel class

* update admonition

* ensure booleans are converted into arrays
  • Loading branch information
samukweku authored Nov 8, 2022
1 parent fe9fa5a commit fa4ad4a
Show file tree
Hide file tree
Showing 9 changed files with 276 additions and 22 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
- [ENH] Fix error when `sort_by_appearance=True` is combined with `dropna=True`. Issue #1168 @samukweku
- [ENH] Add explicit default parameter to `case_when` function. Issue #1159 @samukweku
- [BUG] pandas 1.5.x `_MergeOperation` doesn't have `copy` keyword anymore. Issue #1174 @Zeroto521
- [ENH] `select_rows` function added for flexible row selection. Add support for MultiIndex selection via dictionary. Issue #1124 @samukweku
- [ENH] `select_rows` function added for flexible row selection. Generic `select` function added as well. Add support for MultiIndex selection via dictionary. Issue #1124 @samukweku
- [TST] Compatibility with macOS and Windows, to fix `FailedHealthCheck`. Issue #1181 @Zeroto521
- [INF] Merge two docs CIs (`docs-preview.yml` and `docs.yml`) to one. And add `documentation` pytest mark. PR #1183 @Zeroto521
- [INF] Merge `codecov.yml` (only works for the dev branch pushing event) into `tests.yml` (only works for PR event). PR #1185 @Zeroto521
Expand Down
2 changes: 1 addition & 1 deletion janitor/functions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,4 +75,4 @@
from .transform_columns import transform_column, transform_columns
from .truncate_datetime import truncate_datetime_dataframe
from .update_where import update_where
from .utils import patterns, unionize_dataframe_categories
from .utils import patterns, unionize_dataframe_categories, DropLabel
5 changes: 5 additions & 0 deletions janitor/functions/case_when.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,11 @@ def case_when(
else:
default
```
!!! abstract "Version Changed"
- 0.24.0
- Added `default` parameter.
:param df: A pandas DataFrame.
:param args: Variable argument of conditions and expected values.
Expand Down
8 changes: 7 additions & 1 deletion janitor/functions/conditional_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,12 @@ def conditional_join(
3 4 3 5
4 4 3 6
!!! abstract "Version Changed"
- 0.24.0
- Added `df_columns`, `right_columns`, `keep` and `use_numba` parameters.
:param df: A pandas DataFrame.
:param right: Named Series or DataFrame to join to.
Expand Down Expand Up @@ -145,7 +151,7 @@ def conditional_join(
:param use_numba: Use numba, if installed, to accelerate the computation.
Applicable only to strictly non-equi joins. Default is `False`.
:returns: A pandas DataFrame of the two merged Pandas objects.
"""
""" # noqa: E501

return _conditional_join_compute(
df,
Expand Down
16 changes: 15 additions & 1 deletion janitor/functions/pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,13 @@ def pivot_longer(
7 Austin Texas Watermelon 99 None NaN
8 Hoover Alabama Watermelon 43 None NaN
!!! abstract "Version Changed"
- 0.24.0
- Added `dropna` parameter.
:param df: A pandas DataFrame.
:param index: Name(s) of columns to use as identifier variables.
Should be either a single column name, or a list/tuple of
Expand Down Expand Up @@ -1259,6 +1266,13 @@ def pivot_wider(
0 5.5 20 25 30 37
1 6.1 22 18 19 29
!!! abstract "Version Changed"
- 0.24.0
- Added `reset_index`, `names_expand` and `index_expand` parameters.
:param df: A pandas DataFrame.
:param index: Name(s) of columns to use as identifier variables.
It should be either a single column name, or a list of column names.
Expand Down Expand Up @@ -1293,7 +1307,7 @@ def pivot_wider(
Applies only if `index` is a categorical column. Default is `False`.
:returns: A pandas DataFrame that has been unpivoted from long to wide
form.
"""
""" # noqa: E501

df = df.copy()

Expand Down
86 changes: 81 additions & 5 deletions janitor/functions/select.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas_flavor as pf
import pandas as pd
from janitor.utils import deprecated_alias
from janitor.functions.utils import _select
from janitor.functions.utils import _select, DropLabel # noqa: F401


@pf.register_dataframe_method
Expand All @@ -24,7 +24,8 @@ def select_columns(
Optional ability to invert selection of columns available as well.
!!! Note
!!!note
The preferred option when selecting columns or rows in a Pandas DataFrame
is with `.loc` or `.iloc` methods, as they are generally performant.
`select_columns` is primarily for convenience.
Expand Down Expand Up @@ -57,7 +58,7 @@ def select_columns(
:returns: A pandas DataFrame with the specified columns selected.
""" # noqa: E501

return _select(df, args, invert, axis="columns")
return _select(df, args=args, invert=invert, axis="columns")


@pf.register_dataframe_method
Expand All @@ -79,11 +80,17 @@ def select_rows(
Optional ability to invert selection of rows available as well.
!!! Note
!!! info "New in version 0.24.0"
!!!note
The preferred option when selecting columns or rows in a Pandas DataFrame
is with `.loc` or `.iloc` methods, as they are generally performant.
`select_rows` is primarily for convenience.
Example:
>>> import pandas as pd
Expand Down Expand Up @@ -113,5 +120,74 @@ def select_rows(
provided.
:returns: A pandas DataFrame with the specified rows selected.
""" # noqa: E501
return _select(df, args=args, invert=invert, axis="index")


@pf.register_dataframe_method
def select(df: pd.DataFrame, *, rows=None, columns=None) -> pd.DataFrame:
    """
    Method-chainable selection of rows and columns.

    It accepts a string, shell-like glob strings `(*string*)`,
    regex, slice, array-like object, or a list of the previous options.

    Selection on a MultiIndex on a level, or multiple levels,
    is possible with a dictionary.

    This method does not mutate the original DataFrame.

    Selection can be inverted with the `DropLabel` class.

    !!! info "New in version 0.24.0"

    !!!note

        The preferred option when selecting columns or rows in a Pandas DataFrame
        is with `.loc` or `.iloc` methods, as they are generally performant.
        `select` is primarily for convenience.

    Example:

        >>> import pandas as pd
        >>> import janitor
        >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
        ...      index=['cobra', 'viper', 'sidewinder'],
        ...      columns=['max_speed', 'shield'])
        >>> df
                    max_speed  shield
        cobra               1       2
        viper               4       5
        sidewinder          7       8
        >>> df.select(rows='cobra', columns='shield')
               shield
        cobra       2

    Labels can be dropped with the `DropLabel` class:

        >>> df.select(rows=DropLabel('cobra'))
                    max_speed  shield
        viper               4       5
        sidewinder          7       8

    :param df: A pandas DataFrame.
    :param rows: Valid inputs include: an exact label to look for,
        a shell-style glob string (e.g. `*_thing_*`),
        a regular expression,
        a callable,
        or a list of all the aforementioned.
        A sequence of booleans is also acceptable.
        A dictionary can be used for selection on a MultiIndex on different levels.
        If `None` (the default), all rows are kept.
    :param columns: Valid inputs include: an exact label to look for,
        a shell-style glob string (e.g. `*_thing_*`),
        a regular expression,
        a callable,
        or a list of all the aforementioned.
        A sequence of booleans is also acceptable.
        A dictionary can be used for selection on a MultiIndex on different levels.
        If `None` (the default), all columns are kept.
    :returns: A pandas DataFrame with the specified rows and/or columns selected.
    """  # noqa: E501

    # `args` is only meaningful for the single-axis helpers
    # (`select_rows` / `select_columns`); with axis="both" the
    # selection is driven entirely by `rows` and `columns`.
    return _select(df, args=None, rows=rows, columns=columns, axis="both")
87 changes: 77 additions & 10 deletions janitor/functions/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@
Pattern,
Union,
Callable,
Any,
)
from pandas.core.dtypes.generic import ABCPandasArray, ABCExtensionArray
from pandas.core.common import is_bool_indexer

from dataclasses import dataclass

import pandas as pd
from janitor.utils import check, _expand_grid
Expand Down Expand Up @@ -269,6 +270,23 @@ def _select_callable(arg, func: Callable, axis=None):
return bools


@dataclass
class DropLabel:
    """
    Marker used inside the `select` syntax to exclude labels
    instead of keeping them.

    `label` may be anything that `select`, `select_rows`
    and `select_columns` accept as a selector.
    When the wrapped selector is resolved, the integer positions
    that do *not* match `label` are what gets returned.

    !!! info "New in version 0.24.0"

    :param label: Label(s) to be dropped from the index.
    :returns: A dataclass.
    """

    # The selector whose matches should be removed from the result.
    label: Any


@singledispatch
def _select_index(arg, df, axis):
"""
Expand All @@ -284,6 +302,27 @@ def _select_index(arg, df, axis):
raise KeyError(f"No match was returned for {arg}") from exc


@_select_index.register(DropLabel)  # noqa: F811
def _column_sel_dispatch(cols, df, axis):  # noqa: F811
    """
    Dispatch for `DropLabel`: resolve the wrapped label(s)
    against the requested axis, then invert the match.

    Returns an array of integer positions that were *not* selected.
    """
    # Resolve the wrapped selector first so any lookup error
    # surfaces before further work is done.
    matched = _select_index(cols.label, df, axis)
    positions = np.arange(getattr(df, axis).size)

    if isinstance(matched, slice):
        # materialise the slice as explicit positions
        matched = positions[matched]
    elif isinstance(matched, int):
        matched = [matched]
    elif is_list_like(matched):
        matched = np.asanyarray(matched)
        # a boolean mask can be inverted directly
        if is_bool_dtype(matched):
            return positions[~matched]

    # everything on the axis except the matched positions
    return np.setdiff1d(positions, matched)


@_select_index.register(str) # noqa: F811
def _index_dispatch(arg, df, axis): # noqa: F811
"""
Expand Down Expand Up @@ -437,7 +476,7 @@ def _index_dispatch(arg, df, axis): # noqa: F811
f"{arg} is a boolean dtype and has wrong length: "
f"{len(arg)} instead of {len(index)}"
)
return arg
return np.asanyarray(arg)
try:

if isinstance(arg, pd.Series):
Expand Down Expand Up @@ -486,17 +525,27 @@ def _index_dispatch(arg, df, axis): # noqa: F811

return arg

# treat multiple DropLabel instances as a single unit
checks = (isinstance(entry, DropLabel) for entry in arg)
if sum(checks) > 1:
drop_labels = (entry for entry in arg if isinstance(entry, DropLabel))
drop_labels = [entry.label for entry in drop_labels]
drop_labels = DropLabel(drop_labels)
arg = [entry for entry in arg if not isinstance(entry, DropLabel)]
arg.append(drop_labels)

indices = [_select_index(entry, df, axis) for entry in arg]

# single entry does not need to be combined
# or materialized if possible;
# this offers more performance
if len(indices) == 1:
if isinstance(indices[0], int):
if is_scalar(indices[0]):
return indices
if is_list_like(indices[0]):
return np.asanyarray(indices[0])
return indices[0]
indices = indices[0]
if is_list_like(indices):
indices = np.asanyarray(indices)
return indices
contents = []
for arr in indices:
if is_list_like(arr):
Expand All @@ -508,19 +557,37 @@ def _index_dispatch(arg, df, axis): # noqa: F811
elif isinstance(arr, int):
arr = [arr]
contents.append(arr)
contents = np.concatenate(contents)
# remove possible duplicates
return pd.unique(contents)
return np.concatenate(contents)


def _select(
df: pd.DataFrame, args: tuple, invert: bool, axis: str
df: pd.DataFrame,
args: tuple,
invert: bool = False,
axis: str = "index",
rows=None,
columns=None,
) -> pd.DataFrame:
"""
Index DataFrame on the index or columns.
Returns a DataFrame.
"""
assert axis in {"both", "index", "columns"}
if axis == "both":
if rows is None:
rows = slice(None)
else:
if not is_list_like(rows):
rows = [rows]
rows = _select_index(rows, df, axis="index")
if columns is None:
columns = slice(None)
else:
if not is_list_like(columns):
columns = [columns]
columns = _select_index(columns, df, axis="columns")
return df.iloc[rows, columns]
indices = _select_index(list(args), df, axis)
if invert:
rev = np.ones(getattr(df, axis).size, dtype=np.bool8)
Expand Down
Loading

0 comments on commit fa4ad4a

Please sign in to comment.