Skip to content

TST (string dtype): fix groupby xfails with using_infer_string + update error message #59430

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits on Nov 8, 2024
Merged
Previous commit
more cleanups now sum is implemented
  • Loading branch information
jorisvandenbossche committed Nov 4, 2024
commit baa1dd95ef140deb9ec87d7fdf523c46ac241ace
3 changes: 0 additions & 3 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -752,9 +752,6 @@ class String:
]

def setup(self, dtype, method):
if dtype == "string[python]" and method == "sum":
raise NotImplementedError # skip benchmark

cols = list("abcdefghjkl")
self.df = DataFrame(
np.random.randint(0, 100, size=(10_000, len(cols))),
Expand Down
1 change: 0 additions & 1 deletion pandas/tests/generic/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ def test_metadata_propagation_indiv_groupby(self):
"D": np.random.default_rng(2).standard_normal(8),
}
)
df = df.astype({"A": object, "B": object})
result = df.groupby("A").sum()
tm.assert_metadata_equivalent(df, result)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -1020,7 +1020,7 @@ def test_groupby_as_index_agg(df):

result2 = grouped.agg({"C": "mean", "D": "sum"})
expected2 = grouped.mean(numeric_only=True)
expected2["D"] = grouped.sum(numeric_only=True)["D"]
expected2["D"] = grouped.sum()["D"]
tm.assert_frame_equal(result2, expected2)

grouped = df.groupby("A", as_index=True)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/methods/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ def test_groupby_quantile_nullable_array(values, q):

@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
@pytest.mark.parametrize("numeric_only", [True, False])
def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only, using_infer_string):
def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]})
if numeric_only:
result = df.groupby("a").quantile(q, numeric_only=numeric_only)
Expand Down
9 changes: 2 additions & 7 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -575,7 +575,6 @@ def test_ops_not_as_index(reduction_func):


def test_as_index_series_return_frame(df):
df = df.astype({"A": object, "B": object})
grouped = df.groupby("A", as_index=False)
grouped2 = df.groupby(["A", "B"], as_index=False)

Expand Down Expand Up @@ -979,7 +978,6 @@ def test_groupby_with_hier_columns():


def test_grouping_ndarray(df):
df = df.astype({"A": object, "B": object})
grouped = df.groupby(df["A"].values)
grouped2 = df.groupby(df["A"].rename(None))

Expand Down Expand Up @@ -1477,13 +1475,10 @@ def f(group):

def test_no_dummy_key_names(df):
# see gh-1291
df = df.astype({"A": object, "B": object})
gb = df.groupby(df["A"].values)
gb2 = df.groupby([df["A"].values, df["B"].values])
result = gb.sum()
result = df.groupby(df["A"].values).sum()
assert result.index.name is None

result2 = gb2.sum()
result2 = df.groupby([df["A"].values, df["B"].values]).sum()
assert result2.index.names == (None, None)


Expand Down
10 changes: 8 additions & 2 deletions pandas/tests/groupby/test_numeric_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ def df(self):
"group": [1, 1, 2],
"int": [1, 2, 3],
"float": [4.0, 5.0, 6.0],
"string": Series(["a", "b", "c"], dtype=object),
"string": Series(["a", "b", "c"], dtype="str"),
"object": Series(["a", "b", "c"], dtype=object),
"category_string": Series(list("abc")).astype("category"),
"category_int": [7, 8, 9],
"datetime": date_range("20130101", periods=3),
Expand All @@ -40,6 +41,7 @@ def df(self):
"int",
"float",
"string",
"object",
"category_string",
"category_int",
"datetime",
Expand Down Expand Up @@ -112,6 +114,7 @@ def test_first_last(self, df, method):
"int",
"float",
"string",
"object",
"category_string",
"category_int",
"datetime",
Expand Down Expand Up @@ -159,7 +162,9 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):

# object dtypes for transformations are not implemented in Cython and
# have no Python fallback
exception = NotImplementedError if method.startswith("cum") else TypeError
exception = (
(NotImplementedError, TypeError) if method.startswith("cum") else TypeError
)

if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"):
# The methods default to numeric_only=False and raise TypeError
Expand All @@ -170,6 +175,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
re.escape(f"agg function failed [how->{method},dtype->object]"),
# cumsum/cummin/cummax/cumprod
"function is not implemented for this dtype",
f"dtype 'str' does not support operation '{method}'",
]
)
with pytest.raises(exception, match=msg):
Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/groupby/test_raises.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,10 +194,7 @@ def test_groupby_raises_string(
"quantile",
]:
msg = f"dtype 'str' does not support operation '{groupby_func}'"
if groupby_func == "sum":
# The object-dtype allows this, StringArray variants do not.
klass = TypeError
elif groupby_func in ["sem", "std", "skew"]:
if groupby_func in ["sem", "std", "skew"]:
# The object-dtype raises ValueError when trying to convert to numeric.
klass = TypeError
elif groupby_func == "pct_change" and df["d"].dtype.storage == "pyarrow":
Expand Down
1 change: 0 additions & 1 deletion pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,6 @@ def test_pivot_table_nocols(self):
df = DataFrame(
{"rows": ["a", "b", "c"], "cols": ["x", "y", "z"], "values": [1, 2, 3]}
)
df = df.astype({"rows": object, "cols": object})
rs = df.pivot_table(columns="cols", aggfunc="sum")
xp = df.pivot_table(index="cols", aggfunc="sum").T
tm.assert_frame_equal(rs, xp)
Expand Down
Loading