-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: ArrayManager.quantile #40189
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH: ArrayManager.quantile #40189
Changes from all commits
003a42d
8dda19b
6743b62
149084d
1748b65
bcd464c
79c2249
4957afe
856b3da
45e776d
33883b1
1c7fe15
9e8405e
1daa18e
926e1c0
34a41d8
0a40767
b0167d2
3d80803
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,50 @@ | ||
from __future__ import annotations | ||
|
||
from typing import TYPE_CHECKING | ||
|
||
import numpy as np | ||
|
||
from pandas._libs import lib | ||
from pandas._typing import ArrayLike | ||
|
||
from pandas.core.dtypes.common import is_list_like | ||
from pandas.core.dtypes.common import ( | ||
is_list_like, | ||
is_sparse, | ||
) | ||
from pandas.core.dtypes.missing import ( | ||
isna, | ||
na_value_for_dtype, | ||
) | ||
|
||
from pandas.core.nanops import nanpercentile | ||
|
||
if TYPE_CHECKING: | ||
from pandas.core.arrays import ExtensionArray | ||
|
||
|
||
def quantile_compat(values: ArrayLike, qs, interpolation: str, axis: int) -> ArrayLike:
    """
    Compute quantiles of `values`, dispatching on the container type.

    ndarrays are handled directly via quantile_with_mask; anything else is
    handed to quantile_ea_compat.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
    qs : a scalar or list of the quantiles to be computed
    interpolation : str
    axis : int

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    if not isinstance(values, np.ndarray):
        # Not an ndarray: defer to the ExtensionArray compatibility layer.
        return quantile_ea_compat(values, qs, interpolation, axis)

    # ndarray path: derive the NA mask and the dtype's NA value here, then
    # let the masked implementation do the actual work.
    na_mask = isna(values)
    na_fill = na_value_for_dtype(values.dtype, compat=False)
    return quantile_with_mask(values, na_mask, na_fill, qs, interpolation, axis)
|
||
|
||
def quantile_with_mask( | ||
values: np.ndarray, | ||
|
@@ -75,3 +114,50 @@ def quantile_with_mask( | |
result = lib.item_from_zerodim(result) | ||
|
||
return result | ||
|
||
|
||
def quantile_ea_compat( | ||
values: ExtensionArray, qs, interpolation: str, axis: int | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. this is all super gross. i guess moving it here and then going to clean in future? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. depends on a few things: this can become somewhat less gross with 2D EAs (or 1D ndarrays in an ArrayManager world). There's also the fact that `_values_for_factorize` used here doesn't work for IntegerArray/FloatingArray. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. can this be better if we expose a .quantile() on EAs? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. that may eventually be the way to go. will see if that's what it takes to get the IntegerArray/FloatingArray working again |
||
) -> ExtensionArray: | ||
""" | ||
ExtensionArray compatibility layer for quantile_with_mask. | ||
|
||
We pretend that an ExtensionArray with shape (N,) is actually (1, N,) | ||
for compatibility with non-EA code. | ||
|
||
Parameters | ||
---------- | ||
values : ExtensionArray | ||
qs : a scalar or list of the quantiles to be computed | ||
interpolation : str | ||
axis : int | ||
|
||
Returns | ||
------- | ||
ExtensionArray | ||
""" | ||
# TODO(EA2D): make-believe not needed with 2D EAs | ||
orig = values | ||
|
||
# asarray needed for Sparse, see GH#24600 | ||
mask = np.asarray(values.isna()) | ||
mask = np.atleast_2d(mask) | ||
|
||
values, fill_value = values._values_for_factorize() | ||
values = np.atleast_2d(values) | ||
|
||
result = quantile_with_mask(values, mask, fill_value, qs, interpolation, axis) | ||
|
||
if not is_sparse(orig.dtype): | ||
# shape[0] should be 1 as long as EAs are 1D | ||
|
||
if result.ndim == 1: | ||
# i.e. qs was originally a scalar | ||
assert result.shape == (1,), result.shape | ||
result = type(orig)._from_factorized(result, orig) | ||
|
||
else: | ||
assert result.shape == (1, len(qs)), result.shape | ||
result = type(orig)._from_factorized(result[0], orig) | ||
|
||
return result |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
from __future__ import annotations | ||
|
||
from typing import ( | ||
TYPE_CHECKING, | ||
Any, | ||
Callable, | ||
List, | ||
|
@@ -56,6 +57,7 @@ | |
) | ||
|
||
import pandas.core.algorithms as algos | ||
from pandas.core.array_algos.quantile import quantile_compat | ||
from pandas.core.array_algos.take import take_nd | ||
from pandas.core.arrays import ( | ||
DatetimeArray, | ||
|
@@ -82,6 +84,10 @@ | |
) | ||
from pandas.core.internals.blocks import make_block | ||
|
||
if TYPE_CHECKING: | ||
from pandas import Float64Index | ||
|
||
|
||
T = TypeVar("T", bound="ArrayManager") | ||
|
||
|
||
|
@@ -448,7 +454,28 @@ def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: | |
|
||
return type(self)(result_arrays, self._axes) | ||
|
||
# TODO quantile | ||
def quantile( | ||
self, | ||
*, | ||
qs: Float64Index, | ||
axis: int = 0, | ||
transposed: bool = False, | ||
interpolation="linear", | ||
) -> ArrayManager: | ||
|
||
arrs = [ | ||
x if not isinstance(x, np.ndarray) else np.atleast_2d(x) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can the reshaping be moved to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this one (potentially) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think ArrayManager-specific logic should stay in ArrayManager wherever possible |
||
for x in self.arrays | ||
] | ||
assert axis == 1 | ||
new_arrs = [quantile_compat(x, qs, interpolation, axis=axis) for x in arrs] | ||
for i, arr in enumerate(new_arrs): | ||
if arr.ndim == 2: | ||
assert arr.shape[0] == 1, arr.shape | ||
new_arrs[i] = arr[0] | ||
|
||
axes = [qs, self._axes[1]] | ||
return type(self)(new_arrs, axes) | ||
|
||
def isna(self, func) -> ArrayManager: | ||
return self.apply("apply", func=func) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -521,7 +521,6 @@ def quantile( | |
*, | ||
qs: Float64Index, | ||
axis: int = 0, | ||
transposed: bool = False, | ||
interpolation="linear", | ||
) -> BlockManager: | ||
""" | ||
|
@@ -534,8 +533,6 @@ def quantile( | |
axis: reduction axis, default 0 | ||
consolidate: bool, default True. Join together blocks having same | ||
dtype | ||
transposed: bool, default False | ||
we are holding transposed data | ||
interpolation : type of interpolation, default 'linear' | ||
qs : list of the quantiles to be computed | ||
|
||
|
@@ -557,13 +554,6 @@ def quantile( | |
for blk in self.blocks | ||
] | ||
|
||
if transposed: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||
new_axes = new_axes[::-1] | ||
blocks = [ | ||
b.make_block(b.values.T, placement=np.arange(b.shape[1])) | ||
for b in blocks | ||
] | ||
|
||
return type(self)(blocks, new_axes) | ||
|
||
def isna(self, func) -> BlockManager: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,6 @@ | ||
import numpy as np | ||
import pytest | ||
|
||
import pandas.util._test_decorators as td | ||
|
||
import pandas as pd | ||
from pandas import ( | ||
DataFrame, | ||
|
@@ -11,8 +9,6 @@ | |
) | ||
import pandas._testing as tm | ||
|
||
pytestmark = td.skip_array_manager_not_yet_implemented | ||
|
||
|
||
class TestDataFrameQuantile: | ||
@pytest.mark.parametrize( | ||
|
@@ -526,12 +522,13 @@ def test_quantile_empty_no_columns(self): | |
expected.columns.name = "captain tightpants" | ||
tm.assert_frame_equal(result, expected) | ||
|
||
def test_quantile_item_cache(self): | ||
def test_quantile_item_cache(self, using_array_manager): | ||
# previous behavior incorrectly retained an invalid _item_cache entry | ||
df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can't comment on the exact line (3 lines below), but could also do
because I think the rest of the test is still valid |
||
df["D"] = df["A"] * 2 | ||
ser = df["A"] | ||
assert len(df._mgr.blocks) == 2 | ||
if not using_array_manager: | ||
assert len(df._mgr.blocks) == 2 | ||
|
||
df.quantile(numeric_only=False) | ||
ser.values[0] = 99 | ||
|
@@ -610,12 +607,18 @@ def test_quantile_ea_with_na(self, index, frame_or_series): | |
expected = frame_or_series(expected) | ||
tm.assert_equal(result, expected) | ||
|
||
# TODO: filtering can be removed after GH#39763 is fixed | ||
@pytest.mark.filterwarnings("ignore:Using .astype to convert:FutureWarning") | ||
def test_quantile_ea_all_na(self, index, frame_or_series): | ||
|
||
obj = frame_or_series(index).copy() | ||
|
||
obj.iloc[:] = index._na_value | ||
|
||
# TODO(ArrayManager): this casting should be unnecessary after GH#39763 is fixed | ||
obj[:] = obj.astype(index.dtype) | ||
assert np.all(obj.dtypes == index.dtype) | ||
|
||
# result should be invariant to shuffling | ||
indexer = np.arange(len(index), dtype=np.intp) | ||
np.random.shuffle(indexer) | ||
|
Uh oh!
There was an error while loading. Please reload this page.