diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 1094e96bd0d201..34ea5ad6ac07f1 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -415,6 +415,30 @@ These now coerce to ``object`` dtype. - Inconsistent behavior in ``.where()`` with datetimelikes which would raise rather than coerce to ``object`` (:issue:`16402`) - Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`) +.. _whatsnew_0210.api_breaking.select: + +Select method is deprecated +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :meth:`Series.select` and :meth:`DataFrame.select` methods are deprecated in favor of using ``.loc[]`` (:issue:`12401`) + +.. ipython:: python + + df = DataFrame({'A': [1, 2, 3]}, index=['foo', 'bar', 'baz']) + +.. code-block:: ipython + + In [3]: df.select(lambda x: x in ['bar', 'baz']) + FutureWarning: select is deprecated and will be removed in a future release. You can use .loc[crit] as a replacement + Out[3]: + A + bar 2 + baz 3 + +.. ipython:: python + + df.loc[lambda x: x in ['bar', 'baz']] + .. _whatsnew_0210.api.na_changes: NA naming Changes diff --git a/pandas/core/common.py b/pandas/core/common.py index 515a4010961205..d0cbc69108f959 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -441,13 +441,58 @@ def _get_callable_name(obj): return None -def _apply_if_callable(maybe_callable, obj, **kwargs): +def _apply_if_callable(maybe_callable, obj, axis=None, **kwargs): """ Evaluate possibly callable input using obj and kwargs if it is callable, otherwise return as it is + + Parameters + ---------- + maybe_callable : possibly a callable + obj : NDFrame + axis : int, optional + **kwargs """ + if callable(maybe_callable): - return maybe_callable(obj, **kwargs) + + # we are allowing a user callable, which can return + # a result based on the object itself, e.g. 
a scalar / list + # of labels, or a boolean result from evaluating the labels + # on the specified axis + + def try_on_axis(): + labels = obj._get_axis(axis or 0) + return labels.map(maybe_callable, **kwargs).values + + # act on object + try: + result = maybe_callable(obj, **kwargs) + + # if we have asked for a specific axis + # then we must be 1d + if axis is not None: + if getattr(result, 'ndim', 1) != 1: + raise ValueError + + return result + except KeyError as e: + # this is potentially a legitimate error + # if we cannot work on the labels + # we want to preserve the original KeyError + try: + return try_on_axis() + except: # no pragma + raise e + except: # no pragma + pass + + # act on the axis + try: + return try_on_axis() + except AttributeError: + pass + return maybe_callable diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4f6fd0828693e1..b4871a15346563 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2281,6 +2281,8 @@ def select(self, crit, axis=0): """ Return data corresponding to axis labels matching criteria + DEPRECATED: use .loc(axis=)[crit] to select via labels + Parameters ---------- crit : function @@ -2291,6 +2293,11 @@ def select(self, crit, axis=0): ------- selection : type of caller """ + warnings.warn("select is deprecated and will be removed in a " + "future release. 
You can use " + ".loc[crit] as a replacement", + FutureWarning, stacklevel=2) + axis = self._get_axis_number(axis) axis_name = self._get_axis_name(axis) axis_values = self._get_axis(axis) @@ -3043,7 +3050,7 @@ def filter(self, items=None, like=None, regex=None, axis=None): See Also -------- - pandas.DataFrame.select + pandas.DataFrame.loc Notes ----- @@ -3062,20 +3069,23 @@ def filter(self, items=None, like=None, regex=None, axis=None): if axis is None: axis = self._info_axis_name - axis_name = self._get_axis_name(axis) - axis_values = self._get_axis(axis_name) + labels = self._get_axis(axis) if items is not None: - return self.reindex(**{axis_name: - [r for r in items if r in axis_values]}) + name = self._get_axis_name(axis) + return self.reindex( + **{name: [r for r in items if r in labels]}) elif like: - matchf = lambda x: (like in x if isinstance(x, string_types) else - like in str(x)) - return self.select(matchf, axis=axis_name) + def f(x): + if not isinstance(x, string_types): + x = str(x) + return like in x + values = labels.map(f) + return self.loc(axis=axis)[values.values] elif regex: matcher = re.compile(regex) - return self.select(lambda x: matcher.search(str(x)) is not None, - axis=axis_name) + values = labels.map(lambda x: matcher.search(str(x)) is not None) + return self.loc(axis=axis)[values.values] else: raise TypeError('Must pass either `items`, `like`, or `regex`') @@ -5698,7 +5708,7 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, inplace = validate_bool_kwarg(inplace, 'inplace') # align the cond to same shape as myself - cond = com._apply_if_callable(cond, self) + cond = com._apply_if_callable(cond, self, axis=axis) if isinstance(cond, NDFrame): cond, _ = cond.align(self, join='right', broadcast_axis=1) else: @@ -5939,7 +5949,7 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, try_cast=False, raise_on_error=True): - 
other = com._apply_if_callable(other, self) + other = com._apply_if_callable(other, self, axis=axis) return self._where(cond, other, inplace, axis, level, try_cast, raise_on_error) @@ -5950,7 +5960,7 @@ def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None, try_cast=False, raise_on_error=True): inplace = validate_bool_kwarg(inplace, 'inplace') - cond = com._apply_if_callable(cond, self) + cond = com._apply_if_callable(cond, self, axis=axis) return self.where(~cond, other=other, inplace=inplace, axis=axis, level=level, try_cast=try_cast, diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index b7a51afcedabfe..9b2e0861063807 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -99,6 +99,8 @@ def __call__(self, axis=None): # we need to return a copy of ourselves new_self = self.__class__(self.obj, self.name) + if axis is not None: + axis = self.obj._get_axis_number(axis) new_self.axis = axis return new_self @@ -107,7 +109,8 @@ def __iter__(self): def __getitem__(self, key): if type(key) is tuple: - key = tuple(com._apply_if_callable(x, self.obj) for x in key) + key = tuple(com._apply_if_callable(x, self.obj, i) + for i, x in enumerate(key)) try: values = self.obj.get_value(*key) if is_scalar(values): @@ -117,10 +120,16 @@ def __getitem__(self, key): return self._getitem_tuple(key) else: - key = com._apply_if_callable(key, self.obj) - return self._getitem_axis(key, axis=0) + # we by definition only have the 0th axis + axis = self.axis or 0 + + key = com._apply_if_callable(key, self.obj, axis) + return self._getitem_axis(key, axis=axis) + + def _get_label(self, label, axis=None): + if axis is None: + axis = self.axis or 0 - def _get_label(self, label, axis=0): if self.ndim == 1: # for perf reasons we want to try _xs first # as its basically direct indexing @@ -135,10 +144,14 @@ def _get_label(self, label, axis=0): return self.obj._xs(label, axis=axis) - def _get_loc(self, key, axis=0): + def _get_loc(self, key, axis=None): 
+ if axis is None: + axis = self.axis return self.obj._ixs(key, axis=axis) - def _slice(self, obj, axis=0, kind=None): + def _slice(self, obj, axis=None, kind=None): + if axis is None: + axis = self.axis return self.obj._slice(obj, axis=axis, kind=kind) def _get_setitem_indexer(self, key): @@ -173,9 +186,11 @@ def _get_setitem_indexer(self, key): def __setitem__(self, key, value): if isinstance(key, tuple): - key = tuple(com._apply_if_callable(x, self.obj) for x in key) + key = tuple(com._apply_if_callable(x, self.obj, i) + for i, x in enumerate(key)) else: - key = com._apply_if_callable(key, self.obj) + axis = self.axis or 0 + key = com._apply_if_callable(key, self.obj, axis) indexer = self._get_setitem_indexer(key) self._setitem_with_indexer(indexer, value) @@ -192,10 +207,12 @@ def _has_valid_tuple(self, key): "[{types}] types" .format(types=self._valid_types)) - def _should_validate_iterable(self, axis=0): + def _should_validate_iterable(self, axis=None): """ return a boolean whether this axes needs validation for a passed iterable """ + if axis is None: + axis = self.axis or 0 ax = self.obj._get_axis(axis) if isinstance(ax, MultiIndex): return False @@ -233,6 +250,8 @@ def _convert_range(self, key, is_setter=False): def _convert_scalar_indexer(self, key, axis): # if we are accessing via lowered dim, use the last dim + if axis is None: + axis = 0 ax = self.obj._get_axis(min(axis, self.ndim - 1)) # a scalar return ax._convert_scalar_indexer(key, kind=self.name) @@ -895,7 +914,9 @@ def _multi_take(self, tup): except(KeyError, IndexingError): raise self._exception - def _convert_for_reindex(self, key, axis=0): + def _convert_for_reindex(self, key, axis=None): + if axis is None: + axis = self.axis or 0 labels = self.obj._get_axis(axis) if is_bool_indexer(key): @@ -925,7 +946,7 @@ def _handle_lowerdim_multi_index_axis0(self, tup): try: # fast path for series or for tup devoid of slices - return self._get_label(tup, axis=0) + return self._get_label(tup, 
axis=self.axis) except TypeError: # slices are unhashable pass @@ -1015,7 +1036,7 @@ def _getitem_nested_tuple(self, tup): # this is a series with a multi-index specified a tuple of # selectors - return self._getitem_axis(tup, axis=0) + return self._getitem_axis(tup, axis=self.axis) # handle the multi-axis by taking sections and reducing # this is iterative @@ -1049,7 +1070,10 @@ def _getitem_nested_tuple(self, tup): return obj - def _getitem_axis(self, key, axis=0): + def _getitem_axis(self, key, axis=None): + + if axis is None: + axis = self.axis or 0 if self._should_validate_iterable(axis): self._has_valid_type(key, axis) @@ -1084,7 +1108,10 @@ def _getitem_axis(self, key, axis=0): return self._get_label(key, axis=axis) - def _getitem_iterable(self, key, axis=0): + def _getitem_iterable(self, key, axis=None): + if axis is None: + axis = self.axis or 0 + if self._should_validate_iterable(axis): self._has_valid_type(key, axis) @@ -1138,7 +1165,7 @@ def _getitem_iterable(self, key, axis=0): return result - def _convert_to_indexer(self, obj, axis=0, is_setter=False): + def _convert_to_indexer(self, obj, axis=None, is_setter=False): """ Convert indexing key into something we can use to do actual fancy indexing on an ndarray @@ -1153,6 +1180,9 @@ def _convert_to_indexer(self, obj, axis=0, is_setter=False): raise AmbiguousIndexError with integer labels? 
- No, prefer label-based indexing """ + if axis is None: + axis = self.axis or 0 + labels = self.obj._get_axis(axis) if isinstance(obj, slice): @@ -1255,9 +1285,12 @@ def _tuplify(self, loc): tup[0] = loc return tuple(tup) - def _get_slice_axis(self, slice_obj, axis=0): + def _get_slice_axis(self, slice_obj, axis=None): obj = self.obj + if axis is None: + axis = self.axis or 0 + if not need_slice(slice_obj): return obj.copy(deep=False) indexer = self._convert_slice_indexer(slice_obj, axis) @@ -1325,7 +1358,8 @@ class _LocationIndexer(_NDFrameIndexer): def __getitem__(self, key): if type(key) is tuple: - key = tuple(com._apply_if_callable(x, self.obj) for x in key) + key = tuple(com._apply_if_callable(x, self.obj, i) + for i, x in enumerate(key)) try: if self._is_scalar_access(key): return self._getitem_scalar(key) @@ -1333,8 +1367,11 @@ def __getitem__(self, key): pass return self._getitem_tuple(key) else: - key = com._apply_if_callable(key, self.obj) - return self._getitem_axis(key, axis=0) + # we by definition only have the 0th axis + axis = self.axis or 0 + + maybe_callable = com._apply_if_callable(key, self.obj, axis) + return self._getitem_axis(maybe_callable, axis=axis) def _is_scalar_access(self, key): raise NotImplementedError() @@ -1342,10 +1379,12 @@ def _is_scalar_access(self, key): def _getitem_scalar(self, key): raise NotImplementedError() - def _getitem_axis(self, key, axis=0): + def _getitem_axis(self, key, axis=None): raise NotImplementedError() - def _getbool_axis(self, key, axis=0): + def _getbool_axis(self, key, axis=None): + if axis is None: + axis = self.axis or 0 labels = self.obj._get_axis(axis) key = check_bool_indexer(labels, key) inds, = key.nonzero() @@ -1354,8 +1393,11 @@ def _getbool_axis(self, key, axis=0): except Exception as detail: raise self._exception(detail) - def _get_slice_axis(self, slice_obj, axis=0): + def _get_slice_axis(self, slice_obj, axis=None): """ this is pretty simple as we just have to deal with labels """ + if axis 
is None: + axis = self.axis or 0 + obj = self.obj if not need_slice(slice_obj): return obj.copy(deep=False) @@ -1508,7 +1550,10 @@ def _get_partial_string_timestamp_match_key(self, key, labels): return key - def _getitem_axis(self, key, axis=0): + def _getitem_axis(self, key, axis=None): + if axis is None: + axis = self.axis or 0 + labels = self.obj._get_axis(axis) key = self._get_partial_string_timestamp_match_key(key, labels) @@ -1697,7 +1742,9 @@ def _getitem_tuple(self, tup): return retval - def _get_slice_axis(self, slice_obj, axis=0): + def _get_slice_axis(self, slice_obj, axis=None): + if axis is None: + axis = self.axis or 0 obj = self.obj if not need_slice(slice_obj): @@ -1709,7 +1756,7 @@ def _get_slice_axis(self, slice_obj, axis=0): else: return self.obj.take(slice_obj, axis=axis, convert=False) - def _get_list_axis(self, key, axis=0): + def _get_list_axis(self, key, axis=None): """ Return Series values by list or array of integers @@ -1722,13 +1769,17 @@ def _get_list_axis(self, key, axis=0): ------- Series object """ + if axis is None: + axis = self.axis or 0 try: return self.obj.take(key, axis=axis, convert=False) except IndexError: # re-raise with different error message raise IndexError("positional indexers are out-of-bounds") - def _getitem_axis(self, key, axis=0): + def _getitem_axis(self, key, axis=None): + if axis is None: + axis = self.axis or 0 if isinstance(key, slice): self._has_valid_type(key, axis) @@ -1761,8 +1812,10 @@ def _getitem_axis(self, key, axis=0): return self._get_loc(key, axis=axis) - def _convert_to_indexer(self, obj, axis=0, is_setter=False): + def _convert_to_indexer(self, obj, axis=None, is_setter=False): """ much simpler as we only have to deal with our valid types """ + if axis is None: + axis = self.axis or 0 # make need to convert a float key if isinstance(obj, slice): @@ -1798,10 +1851,12 @@ def __getitem__(self, key): def __setitem__(self, key, value): if isinstance(key, tuple): - key = tuple(com._apply_if_callable(x, 
self.obj) for x in key) + key = tuple(com._apply_if_callable(x, self.obj, i) + for i, x in enumerate(key)) else: # scalar callable may return tuple - key = com._apply_if_callable(key, self.obj) + axis = self.axis or 0 + key = com._apply_if_callable(key, self.obj, axis) if not isinstance(key, tuple): key = self._tuplify(key) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 8bcc19e6d8ba41..5195a1b8042cf3 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -143,10 +143,15 @@ def test_set_index_nonuniq(self): def test_set_index_bug(self): # GH1590 df = DataFrame({'val': [0, 1, 2], 'key': ['a', 'b', 'c']}) - df2 = df.select(lambda indx: indx >= 1) - rs = df2.set_index('key') xp = DataFrame({'val': [1, 2]}, Index(['b', 'c'], name='key')) + + df2 = df.loc(axis=0)[lambda indx: indx >= 1] + rs = df2.set_index('key') + assert_frame_equal(rs, xp) + + df2 = df.loc[lambda indx: indx >= 1] + rs = df2.set_index('key') assert_frame_equal(rs, xp) def test_set_index_pass_arrays(self): diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index fb9b8c2ed7affe..2d7663ad3ae5cb 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -796,16 +796,43 @@ def test_filter_corner(self): assert_frame_equal(result, empty) def test_select(self): + + # deprecated: gh-12401 f = lambda x: x.weekday() == 2 - result = self.tsframe.select(f, axis=0) - expected = self.tsframe.reindex( - index=self.tsframe.index[[f(x) for x in self.tsframe.index]]) - assert_frame_equal(result, expected) + index = self.tsframe.index[[f(x) for x in self.tsframe.index]] + expected_weekdays = self.tsframe.reindex(index=index) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = self.tsframe.select(f, axis=0) + assert_frame_equal(result, expected_weekdays) + + result = 
self.frame.select(lambda x: x in ('B', 'D'), axis=1) + expected = self.frame.reindex(columns=['B', 'D']) + assert_frame_equal(result, expected, check_names=False) + + # replacement + result = self.tsframe.loc(axis=0)[f] + assert_frame_equal(result, expected_weekdays) - result = self.frame.select(lambda x: x in ('B', 'D'), axis=1) + result = self.tsframe.loc[f] + assert_frame_equal(result, expected_weekdays) + + result = self.frame.loc(axis=1)[lambda x: x in ('B', 'D')] expected = self.frame.reindex(columns=['B', 'D']) + assert_frame_equal(result, expected, check_names=False) + + # doc example + df = DataFrame({'A': [1, 2, 3]}, index=['foo', 'bar', 'baz']) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + + expected = df.select(lambda x: x in ['bar', 'baz']) + result = df.loc[lambda x: x in ['bar', 'baz']] + assert_frame_equal(result, expected, check_names=False) - # TODO should reindex check_names? + result = df.loc(axis=0)[lambda x: x in ['bar', 'baz']] assert_frame_equal(result, expected, check_names=False) def test_take(self): diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 0043475702f94b..26e2b801f64607 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -83,6 +83,7 @@ def test_assign_order(self): def test_assign_bad(self): df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + # non-keyword argument with pytest.raises(TypeError): df.assign(lambda x: x.A) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 47bf837fa62d95..b0cdb6df8e5ebd 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3103,7 +3103,7 @@ def agg_before(hour, func, fix=False): """ def _func(data): - d = data.select(lambda x: x.hour < 11).dropna() + d = data.loc(axis=0)[lambda x: x.hour < 11].dropna() if fix: data[data.index[0]] if len(d) == 0: diff --git 
a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index 83d6a09d38f415..c1b276a446ee94 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -2199,14 +2199,18 @@ def test_rename(self): assert result.name == expected.name def test_select(self): - n = len(self.ts) - result = self.ts.select(lambda x: x >= self.ts.index[n // 2]) - expected = self.ts.reindex(self.ts.index[n // 2:]) - assert_series_equal(result, expected) - result = self.ts.select(lambda x: x.weekday() == 2) - expected = self.ts[self.ts.index.weekday == 2] - assert_series_equal(result, expected) + # deprecated: gh-12401 + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + n = len(self.ts) + result = self.ts.select(lambda x: x >= self.ts.index[n // 2]) + expected = self.ts.reindex(self.ts.index[n // 2:]) + assert_series_equal(result, expected) + + result = self.ts.select(lambda x: x.weekday() == 2) + expected = self.ts[self.ts.index.weekday == 2] + assert_series_equal(result, expected) def test_cast_on_putmask(self): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 050335988ca417..6fa50d2a312167 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1239,7 +1239,7 @@ def test_groupby_level_no_obs(self): 'f2', 's1'), ('f2', 's2'), ('f3', 's1'), ('f3', 's2')]) df = DataFrame( [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], columns=midx) - df1 = df.select(lambda u: u[0] in ['f2', 'f3'], axis=1) + df1 = df.loc(axis=1)[lambda u: u[0] in ['f2', 'f3']] grouped = df1.groupby(axis=1, level=0) result = grouped.sum()