From ef031c118c514dfd0ee195686cae751da0e216e2 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 22 Sep 2017 11:41:22 -0400 Subject: [PATCH] DEPR: deprecate .select() in favor of .loc(axis=)[] closes #12401 --- doc/source/whatsnew/v0.21.0.txt | 25 +++++ pandas/core/common.py | 8 ++ pandas/core/generic.py | 30 +++-- pandas/core/indexing.py | 105 +++++++++++++----- pandas/tests/frame/test_alter_axes.py | 5 +- .../tests/frame/test_axis_select_reindex.py | 34 +++++- pandas/tests/frame/test_mutate_columns.py | 1 + pandas/tests/groupby/test_groupby.py | 3 +- pandas/tests/series/test_indexing.py | 18 +-- pandas/tests/test_multilevel.py | 3 +- 10 files changed, 179 insertions(+), 53 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index e47926d95d2fa..400519788d38b 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -667,6 +667,31 @@ Deprecations - passing ``categories`` or ``ordered`` kwargs to :func:`Series.astype` is deprecated, in favor of passing a :ref:`CategoricalDtype ` (:issue:`17636`) - Passing a non-existant column in ``.to_excel(..., columns=)`` is deprecated and will raise a ``KeyError`` in the future (:issue:`17295`) +.. _whatsnew_0210.deprecations.select: + +Series.select and DataFrame.select +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :meth:`Series.select` and :meth:`DataFrame.select` methods are deprecated in favor of using ``df.loc[labels.map(crit)]`` (:issue:`12401`) + +.. ipython:: python + + df = DataFrame({'A': [1, 2, 3]}, index=['foo', 'bar', 'baz']) + +.. code-block:: ipython + + In [3]: df.select(lambda x: x in ['bar', 'baz']) + FutureWarning: 'select' is deprecated and will be removed in a future release. You can use .loc[labels.map(crit)] as a replacement + Out[3]: + A + bar 2 + baz 3 + +.. ipython:: python + + df.loc[df.index.map(lambda x: x in ['bar', 'baz'])] + + .. 
_whatsnew_0210.deprecations.argmin_min: Series.argmax and Series.argmin diff --git a/pandas/core/common.py b/pandas/core/common.py index 2686ad370e1ed..e0dc420bc53f8 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -445,9 +445,17 @@ def _apply_if_callable(maybe_callable, obj, **kwargs): """ Evaluate possibly callable input using obj and kwargs if it is callable, otherwise return as it is + + Parameters + ---------- + maybe_callable : possibly a callable + obj : NDFrame + **kwargs """ + if callable(maybe_callable): return maybe_callable(obj, **kwargs) + return maybe_callable diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5dd770b2600a0..bc8f68eb763d2 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2339,6 +2339,8 @@ def select(self, crit, axis=0): """ Return data corresponding to axis labels matching criteria + DEPRECATED: use df.loc[df.index.map(crit)] to select via labels + Parameters ---------- crit : function @@ -2349,6 +2351,11 @@ def select(self, crit, axis=0): ------- selection : type of caller """ + warnings.warn("'select' is deprecated and will be removed in a " + "future release. 
You can use " + ".loc[labels.map(crit)] as a replacement", + FutureWarning, stacklevel=2) + axis = self._get_axis_number(axis) axis_name = self._get_axis_name(axis) axis_values = self._get_axis(axis) @@ -3101,7 +3108,7 @@ def filter(self, items=None, like=None, regex=None, axis=None): See Also -------- - pandas.DataFrame.select + pandas.DataFrame.loc Notes ----- @@ -3120,20 +3127,23 @@ def filter(self, items=None, like=None, regex=None, axis=None): if axis is None: axis = self._info_axis_name - axis_name = self._get_axis_name(axis) - axis_values = self._get_axis(axis_name) + labels = self._get_axis(axis) if items is not None: - return self.reindex(**{axis_name: - [r for r in items if r in axis_values]}) + name = self._get_axis_name(axis) + return self.reindex( + **{name: [r for r in items if r in labels]}) elif like: - matchf = lambda x: (like in x if isinstance(x, string_types) else - like in str(x)) - return self.select(matchf, axis=axis_name) + def f(x): + if not isinstance(x, string_types): + x = str(x) + return like in x + values = labels.map(f) + return self.loc(axis=axis)[values] elif regex: matcher = re.compile(regex) - return self.select(lambda x: matcher.search(str(x)) is not None, - axis=axis_name) + values = labels.map(lambda x: matcher.search(str(x)) is not None) + return self.loc(axis=axis)[values] else: raise TypeError('Must pass either `items`, `like`, or `regex`') diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index e977e84702982..199aa9cfca506 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -99,6 +99,8 @@ def __call__(self, axis=None): # we need to return a copy of ourselves new_self = self.__class__(self.obj, self.name) + if axis is not None: + axis = self.obj._get_axis_number(axis) new_self.axis = axis return new_self @@ -107,7 +109,8 @@ def __iter__(self): def __getitem__(self, key): if type(key) is tuple: - key = tuple(com._apply_if_callable(x, self.obj) for x in key) + key = 
tuple(com._apply_if_callable(x, self.obj) + for x in key) try: values = self.obj.get_value(*key) if is_scalar(values): @@ -117,10 +120,16 @@ def __getitem__(self, key): return self._getitem_tuple(key) else: + # we by definition only have the 0th axis + axis = self.axis or 0 + key = com._apply_if_callable(key, self.obj) - return self._getitem_axis(key, axis=0) + return self._getitem_axis(key, axis=axis) + + def _get_label(self, label, axis=None): + if axis is None: + axis = self.axis or 0 - def _get_label(self, label, axis=0): if self.ndim == 1: # for perf reasons we want to try _xs first # as its basically direct indexing @@ -135,10 +144,14 @@ def _get_label(self, label, axis=0): return self.obj._xs(label, axis=axis) - def _get_loc(self, key, axis=0): + def _get_loc(self, key, axis=None): + if axis is None: + axis = self.axis return self.obj._ixs(key, axis=axis) - def _slice(self, obj, axis=0, kind=None): + def _slice(self, obj, axis=None, kind=None): + if axis is None: + axis = self.axis return self.obj._slice(obj, axis=axis, kind=kind) def _get_setitem_indexer(self, key): @@ -173,7 +186,8 @@ def _get_setitem_indexer(self, key): def __setitem__(self, key, value): if isinstance(key, tuple): - key = tuple(com._apply_if_callable(x, self.obj) for x in key) + key = tuple(com._apply_if_callable(x, self.obj) + for x in key) else: key = com._apply_if_callable(key, self.obj) indexer = self._get_setitem_indexer(key) @@ -192,10 +206,12 @@ def _has_valid_tuple(self, key): "[{types}] types" .format(types=self._valid_types)) - def _should_validate_iterable(self, axis=0): + def _should_validate_iterable(self, axis=None): """ return a boolean whether this axes needs validation for a passed iterable """ + if axis is None: + axis = self.axis or 0 ax = self.obj._get_axis(axis) if isinstance(ax, MultiIndex): return False @@ -233,6 +249,8 @@ def _convert_range(self, key, is_setter=False): def _convert_scalar_indexer(self, key, axis): # if we are accessing via lowered dim, use the last 
dim + if axis is None: + axis = 0 ax = self.obj._get_axis(min(axis, self.ndim - 1)) # a scalar return ax._convert_scalar_indexer(key, kind=self.name) @@ -895,7 +913,9 @@ def _multi_take(self, tup): except(KeyError, IndexingError): raise self._exception - def _convert_for_reindex(self, key, axis=0): + def _convert_for_reindex(self, key, axis=None): + if axis is None: + axis = self.axis or 0 labels = self.obj._get_axis(axis) if is_bool_indexer(key): @@ -925,7 +945,7 @@ def _handle_lowerdim_multi_index_axis0(self, tup): try: # fast path for series or for tup devoid of slices - return self._get_label(tup, axis=0) + return self._get_label(tup, axis=self.axis) except TypeError: # slices are unhashable pass @@ -1015,7 +1035,7 @@ def _getitem_nested_tuple(self, tup): # this is a series with a multi-index specified a tuple of # selectors - return self._getitem_axis(tup, axis=0) + return self._getitem_axis(tup, axis=self.axis) # handle the multi-axis by taking sections and reducing # this is iterative @@ -1049,7 +1069,10 @@ def _getitem_nested_tuple(self, tup): return obj - def _getitem_axis(self, key, axis=0): + def _getitem_axis(self, key, axis=None): + + if axis is None: + axis = self.axis or 0 if self._should_validate_iterable(axis): self._has_valid_type(key, axis) @@ -1084,7 +1107,10 @@ def _getitem_axis(self, key, axis=0): return self._get_label(key, axis=axis) - def _getitem_iterable(self, key, axis=0): + def _getitem_iterable(self, key, axis=None): + if axis is None: + axis = self.axis or 0 + if self._should_validate_iterable(axis): self._has_valid_type(key, axis) @@ -1138,7 +1164,7 @@ def _getitem_iterable(self, key, axis=0): return result - def _convert_to_indexer(self, obj, axis=0, is_setter=False): + def _convert_to_indexer(self, obj, axis=None, is_setter=False): """ Convert indexing key into something we can use to do actual fancy indexing on an ndarray @@ -1153,6 +1179,9 @@ def _convert_to_indexer(self, obj, axis=0, is_setter=False): raise AmbiguousIndexError 
with integer labels? - No, prefer label-based indexing """ + if axis is None: + axis = self.axis or 0 + labels = self.obj._get_axis(axis) if isinstance(obj, slice): @@ -1255,9 +1284,12 @@ def _tuplify(self, loc): tup[0] = loc return tuple(tup) - def _get_slice_axis(self, slice_obj, axis=0): + def _get_slice_axis(self, slice_obj, axis=None): obj = self.obj + if axis is None: + axis = self.axis or 0 + if not need_slice(slice_obj): return obj.copy(deep=False) indexer = self._convert_slice_indexer(slice_obj, axis) @@ -1325,7 +1357,8 @@ class _LocationIndexer(_NDFrameIndexer): def __getitem__(self, key): if type(key) is tuple: - key = tuple(com._apply_if_callable(x, self.obj) for x in key) + key = tuple(com._apply_if_callable(x, self.obj) + for x in key) try: if self._is_scalar_access(key): return self._getitem_scalar(key) @@ -1333,8 +1366,11 @@ def __getitem__(self, key): pass return self._getitem_tuple(key) else: - key = com._apply_if_callable(key, self.obj) - return self._getitem_axis(key, axis=0) + # we by definition only have the 0th axis + axis = self.axis or 0 + + maybe_callable = com._apply_if_callable(key, self.obj) + return self._getitem_axis(maybe_callable, axis=axis) def _is_scalar_access(self, key): raise NotImplementedError() @@ -1342,10 +1378,12 @@ def _is_scalar_access(self, key): def _getitem_scalar(self, key): raise NotImplementedError() - def _getitem_axis(self, key, axis=0): + def _getitem_axis(self, key, axis=None): raise NotImplementedError() - def _getbool_axis(self, key, axis=0): + def _getbool_axis(self, key, axis=None): + if axis is None: + axis = self.axis or 0 labels = self.obj._get_axis(axis) key = check_bool_indexer(labels, key) inds, = key.nonzero() @@ -1354,8 +1392,11 @@ def _getbool_axis(self, key, axis=0): except Exception as detail: raise self._exception(detail) - def _get_slice_axis(self, slice_obj, axis=0): + def _get_slice_axis(self, slice_obj, axis=None): """ this is pretty simple as we just have to deal with labels """ + if axis 
is None: + axis = self.axis or 0 + obj = self.obj if not need_slice(slice_obj): return obj.copy(deep=False) @@ -1528,7 +1569,10 @@ def _get_partial_string_timestamp_match_key(self, key, labels): return key - def _getitem_axis(self, key, axis=0): + def _getitem_axis(self, key, axis=None): + if axis is None: + axis = self.axis or 0 + labels = self.obj._get_axis(axis) key = self._get_partial_string_timestamp_match_key(key, labels) @@ -1717,7 +1761,9 @@ def _getitem_tuple(self, tup): return retval - def _get_slice_axis(self, slice_obj, axis=0): + def _get_slice_axis(self, slice_obj, axis=None): + if axis is None: + axis = self.axis or 0 obj = self.obj if not need_slice(slice_obj): @@ -1729,7 +1775,7 @@ def _get_slice_axis(self, slice_obj, axis=0): else: return self.obj._take(slice_obj, axis=axis, convert=False) - def _get_list_axis(self, key, axis=0): + def _get_list_axis(self, key, axis=None): """ Return Series values by list or array of integers @@ -1742,13 +1788,17 @@ def _get_list_axis(self, key, axis=0): ------- Series object """ + if axis is None: + axis = self.axis or 0 try: return self.obj._take(key, axis=axis, convert=False) except IndexError: # re-raise with different error message raise IndexError("positional indexers are out-of-bounds") - def _getitem_axis(self, key, axis=0): + def _getitem_axis(self, key, axis=None): + if axis is None: + axis = self.axis or 0 if isinstance(key, slice): self._has_valid_type(key, axis) @@ -1781,8 +1831,10 @@ def _getitem_axis(self, key, axis=0): return self._get_loc(key, axis=axis) - def _convert_to_indexer(self, obj, axis=0, is_setter=False): + def _convert_to_indexer(self, obj, axis=None, is_setter=False): """ much simpler as we only have to deal with our valid types """ + if axis is None: + axis = self.axis or 0 # make need to convert a float key if isinstance(obj, slice): @@ -1818,7 +1870,8 @@ def __getitem__(self, key): def __setitem__(self, key, value): if isinstance(key, tuple): - key = tuple(com._apply_if_callable(x, 
self.obj) for x in key) + key = tuple(com._apply_if_callable(x, self.obj) + for x in key) else: # scalar callable may return tuple key = com._apply_if_callable(key, self.obj) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 8bcc19e6d8ba4..27906838abb2d 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -143,10 +143,11 @@ def test_set_index_nonuniq(self): def test_set_index_bug(self): # GH1590 df = DataFrame({'val': [0, 1, 2], 'key': ['a', 'b', 'c']}) - df2 = df.select(lambda indx: indx >= 1) - rs = df2.set_index('key') xp = DataFrame({'val': [1, 2]}, Index(['b', 'c'], name='key')) + + df2 = df.loc[df.index.map(lambda indx: indx >= 1)] + rs = df2.set_index('key') assert_frame_equal(rs, xp) def test_set_index_pass_arrays(self): diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 219c1df301c4b..f9a4275d14f55 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -796,16 +796,38 @@ def test_filter_corner(self): assert_frame_equal(result, empty) def test_select(self): + + # deprecated: gh-12401 f = lambda x: x.weekday() == 2 - result = self.tsframe.select(f, axis=0) - expected = self.tsframe.reindex( - index=self.tsframe.index[[f(x) for x in self.tsframe.index]]) - assert_frame_equal(result, expected) + index = self.tsframe.index[[f(x) for x in self.tsframe.index]] + expected_weekdays = self.tsframe.reindex(index=index) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = self.tsframe.select(f, axis=0) + assert_frame_equal(result, expected_weekdays) + + result = self.frame.select(lambda x: x in ('B', 'D'), axis=1) + expected = self.frame.reindex(columns=['B', 'D']) + assert_frame_equal(result, expected, check_names=False) + + # replacement + f = lambda x: x.weekday == 2 + result = 
self.tsframe.loc(axis=0)[f(self.tsframe.index)] + assert_frame_equal(result, expected_weekdays) - result = self.frame.select(lambda x: x in ('B', 'D'), axis=1) + crit = lambda x: x in ['B', 'D'] + result = self.frame.loc(axis=1)[(self.frame.columns.map(crit))] expected = self.frame.reindex(columns=['B', 'D']) + assert_frame_equal(result, expected, check_names=False) + + # doc example + df = DataFrame({'A': [1, 2, 3]}, index=['foo', 'bar', 'baz']) - # TODO should reindex check_names? + crit = lambda x: x in ['bar', 'baz'] + with tm.assert_produces_warning(FutureWarning): + expected = df.select(crit) + result = df.loc[df.index.map(crit)] assert_frame_equal(result, expected, check_names=False) def test_take(self): diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 0043475702f94..26e2b801f6460 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -83,6 +83,7 @@ def test_assign_order(self): def test_assign_bad(self): df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + # non-keyword argument with pytest.raises(TypeError): df.assign(lambda x: x.A) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 47bf837fa62d9..657de9b589dc9 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3103,7 +3103,8 @@ def agg_before(hour, func, fix=False): """ def _func(data): - d = data.select(lambda x: x.hour < 11).dropna() + d = data.loc[data.index.map( + lambda x: x.hour < 11)].dropna() if fix: data[data.index[0]] if len(d) == 0: diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index 09ba0e197438d..93e7b81163b54 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -2225,14 +2225,18 @@ def test_rename(self): assert result.name == expected.name def test_select(self): - n = len(self.ts) - result = self.ts.select(lambda x: 
x >= self.ts.index[n // 2]) - expected = self.ts.reindex(self.ts.index[n // 2:]) - assert_series_equal(result, expected) - result = self.ts.select(lambda x: x.weekday() == 2) - expected = self.ts[self.ts.index.weekday == 2] - assert_series_equal(result, expected) + # deprecated: gh-12401 + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + n = len(self.ts) + result = self.ts.select(lambda x: x >= self.ts.index[n // 2]) + expected = self.ts.reindex(self.ts.index[n // 2:]) + assert_series_equal(result, expected) + + result = self.ts.select(lambda x: x.weekday() == 2) + expected = self.ts[self.ts.index.weekday == 2] + assert_series_equal(result, expected) def test_cast_on_putmask(self): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 050335988ca41..94577db15f01a 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1239,7 +1239,8 @@ def test_groupby_level_no_obs(self): 'f2', 's1'), ('f2', 's2'), ('f3', 's1'), ('f3', 's2')]) df = DataFrame( [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], columns=midx) - df1 = df.select(lambda u: u[0] in ['f2', 'f3'], axis=1) + df1 = df.loc(axis=1)[df.columns.map( + lambda u: u[0] in ['f2', 'f3'])] grouped = df1.groupby(axis=1, level=0) result = grouped.sum()