-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: Groupby ops on empty objects loses index, columns, dtypes #39940
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
84a0347
17c2396
bb30001
f124da1
2fc70ff
d3e52aa
af55c7d
4ec2eca
782caba
bd51562
c4e1c0d
d00d5bc
379e12a
8266af6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,6 +10,7 @@ | |
|
||
import pandas as pd | ||
from pandas import ( | ||
Categorical, | ||
DataFrame, | ||
Grouper, | ||
Index, | ||
|
@@ -18,6 +19,7 @@ | |
Timestamp, | ||
date_range, | ||
read_csv, | ||
to_datetime, | ||
) | ||
import pandas._testing as tm | ||
from pandas.core.base import SpecificationError | ||
|
@@ -1716,15 +1718,48 @@ def test_pivot_table_values_key_error(): | |
) | ||
|
||
|
||
def test_empty_dataframe_groupby(): | ||
# GH8093 | ||
df = DataFrame(columns=["A", "B", "C"]) | ||
|
||
result = df.groupby("A").sum() | ||
expected = DataFrame(columns=["B", "C"], dtype=np.float64) | ||
expected.index.name = "A" | ||
|
||
tm.assert_frame_equal(result, expected) | ||
@pytest.mark.parametrize("columns", ["C", ["C"]]) | ||
@pytest.mark.parametrize("keys", [["A"], ["A", "B"]]) | ||
@pytest.mark.parametrize( | ||
"values", | ||
[ | ||
[True], | ||
[0], | ||
[0.0], | ||
["a"], | ||
[Categorical([0])], | ||
[to_datetime(0)], | ||
[date_range(0, 1, 1, tz="US/Eastern")], | ||
[pd.array([0], dtype="Int64")], | ||
], | ||
) | ||
@pytest.mark.parametrize("method", ["attr", "agg", "apply"]) | ||
@pytest.mark.parametrize( | ||
"op", ["idxmax", "idxmin", "mad", "min", "max", "sum", "prod", "skew"] | ||
) | ||
def test_empty_groupby(columns, keys, values, method, op): | ||
# GH8093 & GH26411 | ||
|
||
override_dtype = None | ||
if isinstance(values[0], bool) and op in ("prod", "sum") and method != "apply": | ||
# sum/product of bools is an integer | ||
override_dtype = "int64" | ||
|
||
df = DataFrame([3 * values], columns=list("ABC")) | ||
df = df.iloc[:0] | ||
|
||
gb = df.groupby(keys)[columns] | ||
if method == "attr": | ||
result = getattr(gb, op)() | ||
else: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OT: it might be worthwhile to split up this file, as it is getting kind of long. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Makes sense - TBH I've never fully understood what tests were meant to be in here. I've always thought of it as tests of the *GroupBy attributes themselves, rather than the computation methods (e.g. sum, apply, etc). If that's the case, then maybe just move any tests that rely on calling computation methods out? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Right, test_groupby.py is basically tests that we correctly construct a groupby object, and the other tests are about actually executing it. Over the years these have slowly been separated out. I think it's time to rename this and be clear about it. |
||
result = getattr(gb, method)(op) | ||
|
||
expected = df.set_index(keys)[columns] | ||
if override_dtype is not None: | ||
expected = expected.astype(override_dtype) | ||
if len(keys) == 1: | ||
expected.index.name = keys[0] | ||
tm.assert_equal(result, expected) | ||
|
||
|
||
def test_tuple_as_grouping(): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add some other datatypes to make sure they are preserved (categorical, datetime, datetime w/tz, Int)? If some still don't work, just xfail them and create an issue.