More cleanups now that sum is implemented

pandas-dev · jorisvandenbossche · Nov 8, 2024 · Aug 6, 2024 · Aug 13, 2024 · Aug 14, 2024
commit baa1dd95ef140deb9ec87d7fdf523c46ac241ace
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -752,9 +752,6 @@ class String:
     ]
 
     def setup(self, dtype, method):
-        if dtype == "string[python]" and method == "sum":
-            raise NotImplementedError  # skip benchmark
-
         cols = list("abcdefghjkl")
         self.df = DataFrame(
             np.random.randint(0, 100, size=(10_000, len(cols))),

diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py
@@ -61,7 +61,6 @@ def test_metadata_propagation_indiv_groupby(self):
                 "D": np.random.default_rng(2).standard_normal(8),
             }
         )
-        df = df.astype({"A": object, "B": object})
         result = df.groupby("A").sum()
         tm.assert_metadata_equivalent(df, result)
 

diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -1020,7 +1020,7 @@ def test_groupby_as_index_agg(df):
 
     result2 = grouped.agg({"C": "mean", "D": "sum"})
     expected2 = grouped.mean(numeric_only=True)
-    expected2["D"] = grouped.sum(numeric_only=True)["D"]
+    expected2["D"] = grouped.sum()["D"]
     tm.assert_frame_equal(result2, expected2)
 
     grouped = df.groupby("A", as_index=True)

diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py
@@ -244,7 +244,7 @@ def test_groupby_quantile_nullable_array(values, q):
 
 @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
 @pytest.mark.parametrize("numeric_only", [True, False])
-def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only, using_infer_string):
+def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
     df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]})
     if numeric_only:
         result = df.groupby("a").quantile(q, numeric_only=numeric_only)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -575,7 +575,6 @@ def test_ops_not_as_index(reduction_func):
 
 
 def test_as_index_series_return_frame(df):
-    df = df.astype({"A": object, "B": object})
     grouped = df.groupby("A", as_index=False)
     grouped2 = df.groupby(["A", "B"], as_index=False)
 
@@ -979,7 +978,6 @@ def test_groupby_with_hier_columns():
 
 
 def test_grouping_ndarray(df):
-    df = df.astype({"A": object, "B": object})
     grouped = df.groupby(df["A"].values)
     grouped2 = df.groupby(df["A"].rename(None))
 
@@ -1477,13 +1475,10 @@ def f(group):
 
 def test_no_dummy_key_names(df):
     # see gh-1291
-    df = df.astype({"A": object, "B": object})
-    gb = df.groupby(df["A"].values)
-    gb2 = df.groupby([df["A"].values, df["B"].values])
-    result = gb.sum()
+    result = df.groupby(df["A"].values).sum()
     assert result.index.name is None
 
-    result2 = gb2.sum()
+    result2 = df.groupby([df["A"].values, df["B"].values]).sum()
     assert result2.index.names == (None, None)
 
 

diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py
@@ -28,7 +28,8 @@ def df(self):
                 "group": [1, 1, 2],
                 "int": [1, 2, 3],
                 "float": [4.0, 5.0, 6.0],
-                "string": Series(["a", "b", "c"], dtype=object),
+                "string": Series(["a", "b", "c"], dtype="str"),
+                "object": Series(["a", "b", "c"], dtype=object),
                 "category_string": Series(list("abc")).astype("category"),
                 "category_int": [7, 8, 9],
                 "datetime": date_range("20130101", periods=3),
@@ -40,6 +41,7 @@ def df(self):
                 "int",
                 "float",
                 "string",
+                "object",
                 "category_string",
                 "category_int",
                 "datetime",
@@ -112,6 +114,7 @@ def test_first_last(self, df, method):
                 "int",
                 "float",
                 "string",
+                "object",
                 "category_string",
                 "category_int",
                 "datetime",
@@ -159,7 +162,9 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
 
         # object dtypes for transformations are not implemented in Cython and
         # have no Python fallback
-        exception = NotImplementedError if method.startswith("cum") else TypeError
+        exception = (
+            (NotImplementedError, TypeError) if method.startswith("cum") else TypeError
+        )
 
         if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"):
             # The methods default to numeric_only=False and raise TypeError
@@ -170,6 +175,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
                     re.escape(f"agg function failed [how->{method},dtype->object]"),
                     # cumsum/cummin/cummax/cumprod
                     "function is not implemented for this dtype",
+                    f"dtype 'str' does not support operation '{method}'",
                 ]
             )
             with pytest.raises(exception, match=msg):

diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py
@@ -194,10 +194,7 @@ def test_groupby_raises_string(
             "quantile",
         ]:
             msg = f"dtype 'str' does not support operation '{groupby_func}'"
-            if groupby_func == "sum":
-                # The object-dtype allows this, StringArray variants do not.
-                klass = TypeError
-            elif groupby_func in ["sem", "std", "skew"]:
+            if groupby_func in ["sem", "std", "skew"]:
                 # The object-dtype raises ValueError when trying to convert to numeric.
                 klass = TypeError
         elif groupby_func == "pct_change" and df["d"].dtype.storage == "pyarrow":

diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
@@ -139,7 +139,6 @@ def test_pivot_table_nocols(self):
         df = DataFrame(
             {"rows": ["a", "b", "c"], "cols": ["x", "y", "z"], "values": [1, 2, 3]}
         )
-        df = df.astype({"rows": object, "cols": object})
         rs = df.pivot_table(columns="cols", aggfunc="sum")
         xp = df.pivot_table(index="cols", aggfunc="sum").T
         tm.assert_frame_equal(rs, xp)