Skip to content

BUG: Groupby ops on empty objects loses index, columns, dtypes #39940

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Feb 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,7 @@ Groupby/resample/rolling
- Bug in :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` where 1 would be returned instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:`39591`)
- Bug in :meth:`.GroupBy.mean`, :meth:`.GroupBy.median` and :meth:`DataFrame.pivot_table` not propagating metadata (:issue:`28283`)
- Bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not calculating window bounds correctly when window is an offset and dates are in descending order (:issue:`40002`)
- Bug in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` on an empty ``Series`` or ``DataFrame`` would lose index, columns, and/or data types when directly using the methods ``idxmax``, ``idxmin``, ``mad``, ``min``, ``max``, ``sum``, ``prod``, and ``skew`` or using them through ``apply``, ``aggregate``, or ``resample`` (:issue:`26411`)
-

Reshaping
Expand All @@ -454,6 +455,7 @@ Reshaping
- Bug in :meth:`DataFrame.sort_values` not reshaping index correctly after sorting on columns, when ``ignore_index=True`` (:issue:`39464`)
- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`)
- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``datetime64`` and ``timedelta64`` dtypes (:issue:`39574`)
- Bug in :meth:`DataFrame.pivot_table` returning a ``MultiIndex`` for a single value when operating on an empty ``DataFrame`` (:issue:`13483`)

Sparse
^^^^^^
Expand Down
21 changes: 17 additions & 4 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,13 +450,19 @@ def _wrap_transformed_output(
return result

def _wrap_applied_output(
self, keys: Index, values: Optional[List[Any]], not_indexed_same: bool = False
self,
data: Series,
keys: Index,
values: Optional[List[Any]],
not_indexed_same: bool = False,
) -> FrameOrSeriesUnion:
"""
Wrap the output of SeriesGroupBy.apply into the expected result.

Parameters
----------
data : Series
Input data for groupby operation.
keys : Index
Keys of groups that Series was grouped by.
values : Optional[List[Any]]
Expand All @@ -471,7 +477,10 @@ def _wrap_applied_output(
if len(keys) == 0:
# GH #6265
return self.obj._constructor(
[], name=self._selection_name, index=keys, dtype=np.float64
[],
name=self._selection_name,
index=self.grouper.result_index,
dtype=data.dtype,
)
assert values is not None

Expand Down Expand Up @@ -1229,9 +1238,13 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame:

return self.obj._constructor(result, columns=result_columns)

def _wrap_applied_output(self, keys, values, not_indexed_same=False):
def _wrap_applied_output(self, data, keys, values, not_indexed_same=False):
if len(keys) == 0:
return self.obj._constructor(index=keys)
result = self.obj._constructor(
index=self.grouper.result_index, columns=data.columns
)
result = result.astype(data.dtypes.to_dict(), copy=False)
return result

# GH12824
first_not_none = next(com.not_none(*values), None)
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -981,7 +981,7 @@ def _python_apply_general(
keys, values, mutated = self.grouper.apply(f, data, self.axis)

return self._wrap_applied_output(
keys, values, not_indexed_same=mutated or self.mutated
data, keys, values, not_indexed_same=mutated or self.mutated
)

def _iterate_slices(self) -> Iterable[Series]:
Expand Down Expand Up @@ -1058,7 +1058,7 @@ def _wrap_aggregated_output(
def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]):
raise AbstractMethodError(self)

def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False):
def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = False):
raise AbstractMethodError(self)

@final
Expand Down
19 changes: 3 additions & 16 deletions pandas/core/reshape/pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,14 +236,8 @@ def __internal_pivot_table(
)

# discard the top level
if (
values_passed
and not values_multi
and not table.empty
and (table.columns.nlevels > 1)
):
table = table[values[0]]

if values_passed and not values_multi and table.columns.nlevels > 1:
table = table.droplevel(0, axis=1)
if len(index) == 0 and len(columns) > 0:
table = table.T

Expand Down Expand Up @@ -650,7 +644,6 @@ def crosstab(
**dict(zip(unique_colnames, columns)),
}
df = DataFrame(data, index=common_idx)
original_df_cols = df.columns

if values is None:
df["__dummy__"] = 0
Expand All @@ -660,7 +653,7 @@ def crosstab(
kwargs = {"aggfunc": aggfunc}

table = df.pivot_table(
["__dummy__"],
"__dummy__",
index=unique_rownames,
columns=unique_colnames,
margins=margins,
Expand All @@ -669,12 +662,6 @@ def crosstab(
**kwargs,
)

# GH18321, after pivoting, an extra top level of column index of `__dummy__` is
# created, and this extra level should not be included in the further steps
if not table.empty:
cols_diff = df.columns.difference(original_df_cols)[0]
table = table[cols_diff]

# Post-process
if normalize is not False:
table = _normalize(
Expand Down
10 changes: 6 additions & 4 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,11 +147,13 @@ def test_agg_apply_corner(ts, tsframe):
# DataFrame
grouped = tsframe.groupby(tsframe["A"] * np.nan)
exp_df = DataFrame(
columns=tsframe.columns, dtype=float, index=Index([], dtype=np.float64)
columns=tsframe.columns,
dtype=float,
index=Index([], name="A", dtype=np.float64),
)
tm.assert_frame_equal(grouped.sum(), exp_df, check_names=False)
tm.assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False)
tm.assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], check_names=False)
tm.assert_frame_equal(grouped.sum(), exp_df)
tm.assert_frame_equal(grouped.agg(np.sum), exp_df)
tm.assert_frame_equal(grouped.apply(np.sum), exp_df)


def test_agg_grouping_is_list_tuple(ts):
Expand Down
53 changes: 44 additions & 9 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import pandas as pd
from pandas import (
Categorical,
DataFrame,
Grouper,
Index,
Expand All @@ -18,6 +19,7 @@
Timestamp,
date_range,
read_csv,
to_datetime,
)
import pandas._testing as tm
from pandas.core.base import SpecificationError
Expand Down Expand Up @@ -1716,15 +1718,48 @@ def test_pivot_table_values_key_error():
)


def test_empty_dataframe_groupby():
# GH8093
df = DataFrame(columns=["A", "B", "C"])

result = df.groupby("A").sum()
expected = DataFrame(columns=["B", "C"], dtype=np.float64)
expected.index.name = "A"

tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("columns", ["C", ["C"]])
@pytest.mark.parametrize("keys", [["A"], ["A", "B"]])
@pytest.mark.parametrize(
"values",
[
[True],
[0],
[0.0],
["a"],
[Categorical([0])],
[to_datetime(0)],
[date_range(0, 1, 1, tz="US/Eastern")],
[pd.array([0], dtype="Int64")],
],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add some other datatypes to make sure they are preserved (categorical, datetime, datetime w/tz, Int). If some still don't work, just xfail them and create an issue.

)
@pytest.mark.parametrize("method", ["attr", "agg", "apply"])
@pytest.mark.parametrize(
"op", ["idxmax", "idxmin", "mad", "min", "max", "sum", "prod", "skew"]
)
def test_empty_groupby(columns, keys, values, method, op):
# GH8093 & GH26411

override_dtype = None
if isinstance(values[0], bool) and op in ("prod", "sum") and method != "apply":
# sum/product of bools is an integer
override_dtype = "int64"

df = DataFrame([3 * values], columns=list("ABC"))
df = df.iloc[:0]

gb = df.groupby(keys)[columns]
if method == "attr":
result = getattr(gb, op)()
else:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OT: might be worthwhile to split up this file as it's getting kind of long.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense - TBH I've never fully understood what tests were meant to be in here. I've always thought of it as tests of the *GroupBy attributes themselves, rather than the computation methods (e.g. sum, apply, etc). If that's the case, then maybe just move any tests that rely on calling computation methods out?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

right, test_groupby.py basically tests that we correctly construct a groupby object, and other tests are about actually executing it. Over the years these have slowly been separated out. I think it's time to rename this and be clear about it.

result = getattr(gb, method)(op)

expected = df.set_index(keys)[columns]
if override_dtype is not None:
expected = expected.astype(override_dtype)
if len(keys) == 1:
expected.index.name = keys[0]
tm.assert_equal(result, expected)


def test_tuple_as_grouping():
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/resample/test_resampler_grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from pandas import (
DataFrame,
Series,
TimedeltaIndex,
Timestamp,
)
import pandas._testing as tm
Expand Down Expand Up @@ -398,6 +399,18 @@ def test_resample_groupby_agg():
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("keys", [["a"], ["a", "b"]])
def test_empty(keys):
# GH 26411
df = pd.DataFrame([], columns=["a", "b"], index=TimedeltaIndex([]))
result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean()
expected = DataFrame(columns=["a", "b"]).set_index(keys, drop=False)
if len(keys) == 1:
expected.index.name = keys[0]

tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("consolidate", [True, False])
def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
# https://github.com/pandas-dev/pandas/issues/39329
Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/reshape/test_crosstab.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,10 @@ def test_crosstab_no_overlap(self):
s2 = Series([4, 5, 6], index=[4, 5, 6])

actual = crosstab(s1, s2)
expected = DataFrame()
expected = DataFrame(
index=Index([], dtype="int64", name="row_0"),
columns=Index([], dtype="int64", name="col_0"),
)

tm.assert_frame_equal(actual, expected)

Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -2040,7 +2040,7 @@ def test_pivot_table_aggfunc_scalar_dropna(self, dropna):
tm.assert_frame_equal(result, expected)

def test_pivot_table_empty_aggfunc(self):
# GH 9186
# GH 9186 & GH 13483
df = DataFrame(
{
"A": [2, 2, 3, 3, 2],
Expand All @@ -2050,7 +2050,8 @@ def test_pivot_table_empty_aggfunc(self):
}
)
result = df.pivot_table(index="A", columns="D", values="id", aggfunc=np.size)
expected = DataFrame()
expected = DataFrame(index=Index([], dtype="int64", name="A"))
expected.columns.name = "D"
tm.assert_frame_equal(result, expected)

def test_pivot_table_no_column_raises(self):
Expand Down