diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index e659bf7adbf1a..b6c13c287d5f9 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -421,6 +421,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ +- Bug in :func:`crosstab` when ``dropna=False`` would not keep ``np.nan`` in the result (:issue:`10772`) - Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` on non-unique columns would return incorrect type when dist-like argument passed in (:issue:`51099`) - Bug in :meth:`DataFrame.stack` losing extension dtypes when columns is a :class:`MultiIndex` and frame contains mixed dtypes (:issue:`45740`) - Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 1c81698eafd18..a030182e774fb 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -164,7 +164,7 @@ def __internal_pivot_table( pass values = list(values) - grouped = data.groupby(keys, observed=observed, sort=sort) + grouped = data.groupby(keys, observed=observed, sort=sort, dropna=dropna) agged = grouped.agg(aggfunc) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index d894a8fe5391d..1bcc86c4908a2 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -286,24 +286,29 @@ def test_margin_dropna4(self): # GH 12642 # _add_margins raises KeyError: Level None not found # when margins=True and dropna=False + # GH: 10772: Keep np.nan in result with dropna=False df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) actual = crosstab(df.a, df.b, margins=True, dropna=False) - expected = DataFrame([[1, 0, 1], [1, 3, 4], [2, 4, 6]]) - expected.index = Index([1.0, 2.0, "All"], name="a") + expected = DataFrame([[1, 0, 1.0], [1, 3, 4.0], [0, 1, np.nan], [2, 4, 6.0]]) + expected.index = Index([1.0, 2.0, np.nan, "All"], name="a") expected.columns = Index([3, 4, "All"], name="b") tm.assert_frame_equal(actual, expected) def test_margin_dropna5(self): + # GH: 10772: Keep np.nan in result with dropna=False df = DataFrame( {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]} ) actual = crosstab(df.a, df.b, margins=True, dropna=False) - expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 4, 6]]) - expected.index = Index([1.0, 2.0, "All"], name="a") - expected.columns = Index([3.0, 4.0, "All"], name="b") + expected = DataFrame( + [[1, 0, 0, 1.0], [0, 1, 0, 1.0], [0, 3, 1, np.nan], [1, 4, 0, 6.0]] + ) + expected.index = Index([1.0, 2.0, np.nan, "All"], name="a") + expected.columns = Index([3.0, 4.0, np.nan, "All"], name="b") tm.assert_frame_equal(actual, expected) def test_margin_dropna6(self): + # GH: 10772: Keep np.nan in result with dropna=False a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object) b = np.array(["one", "one", "two", "one", "two", np.nan, "two"], dtype=object) c = np.array( @@ -315,13 +320,14 @@ def test_margin_dropna6(self): ) m = MultiIndex.from_arrays( [ - ["one", "one", "two", "two", "All"], - ["dull", "shiny", "dull", "shiny", ""], + ["one", "one", "two", "two", np.nan, np.nan, "All"], + ["dull", "shiny", "dull", "shiny", "dull", "shiny", ""], ], names=["b", "c"], ) expected = DataFrame( - [[1, 0, 1, 0, 2], [2, 0, 1, 1, 5], [3, 0, 2, 1, 7]], columns=m + [[1, 0, 1, 0, 0, 0, 2], [2, 0, 1, 1, 0, 1, 5], [3, 0, 2, 1, 0, 0, 7]], + columns=m, ) expected.index = Index(["bar", "foo", "All"], name="a") tm.assert_frame_equal(actual, expected) @@ -330,11 +336,23 @@ def test_margin_dropna6(self): [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=False ) m = MultiIndex.from_arrays( - [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]], + [ + ["bar", "bar", "bar", "foo", "foo", "foo", "All"], + ["one", "two", np.nan, "one", "two", np.nan, ""], + ], names=["a", "b"], ) expected = DataFrame( - [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 2, 7]], index=m + [ + [1, 0, 1.0], + [1, 0, 1.0], + [0, 0, np.nan], + [2, 0, 2.0], + [1, 1, 2.0], + [0, 1, np.nan], + [5, 2, 7.0], + ], + index=m, ) expected.columns = Index(["dull", "shiny", "All"], name="c") tm.assert_frame_equal(actual, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index efd5ec2f0868f..b40f0f7a45263 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -248,11 +248,18 @@ def test_pivot_with_non_observable_dropna(self, dropna): ) result = df.pivot_table(index="A", values="B", dropna=dropna) + if dropna: + values = [2.0, 3.0] + codes = [0, 1] + else: + # GH: 10772 + values = [2.0, 3.0, 0.0] + codes = [0, 1, -1] expected = DataFrame( - {"B": [2.0, 3.0]}, + {"B": values}, index=Index( Categorical.from_codes( - [0, 1], categories=["low", "high"], ordered=True + codes, categories=["low", "high"], ordered=dropna ), name="A", ),