Skip to content

Commit

Permalink
API: Always return DataFrame from get_dummies (pandas-dev#24284)
Browse files Browse the repository at this point in the history
  • Loading branch information
TomAugspurger authored and jreback committed Dec 15, 2018
1 parent e3b6683 commit 8e1a1a3
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 51 deletions.
45 changes: 45 additions & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -707,6 +707,51 @@ Finally, a ``Series.sparse`` accessor was added to provide sparse-specific metho
s = pd.Series([0, 0, 1, 1, 1], dtype='Sparse[int]')
s.sparse.density
.. _whatsnew_0240.api_breaking.get_dummies:

:meth:`get_dummies` always returns a DataFrame
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Previously, when ``sparse=True`` was passed to :func:`get_dummies`, the return value could be either
a :class:`DataFrame` or a :class:`SparseDataFrame`, depending on whether all or just a subset
of the columns were dummy-encoded. Now, a :class:`DataFrame` is always returned (:issue:`24284`).

*Previous Behavior*

The first :func:`get_dummies` call returned a :class:`DataFrame` because the column ``A``
was not dummy-encoded. When just ``["B", "C"]`` were passed to ``get_dummies``,
all the columns were dummy-encoded, and a :class:`SparseDataFrame` was returned.

.. code-block:: ipython
In [2]: df = pd.DataFrame({"A": [1, 2], "B": ['a', 'b'], "C": ['a', 'a']})
In [3]: type(pd.get_dummies(df, sparse=True))
Out[3]: pandas.core.frame.DataFrame
In [4]: type(pd.get_dummies(df[['B', 'C']], sparse=True))
Out[4]: pandas.core.sparse.frame.SparseDataFrame
.. ipython:: python
:suppress:
df = pd.DataFrame({"A": [1, 2], "B": ['a', 'b'], "C": ['a', 'a']})
*New Behavior*

Now, the return type is consistently a :class:`DataFrame`.

.. ipython:: python
type(pd.get_dummies(df, sparse=True))
type(pd.get_dummies(df[['B', 'C']], sparse=True))
.. note::

There's no difference in memory usage between a :class:`SparseDataFrame`
and a :class:`DataFrame` with sparse values. The memory usage will
be the same as in the previous version of pandas.

.. _whatsnew_0240.api_breaking.frame_to_dict_index_orient:

Raise ValueError in ``DataFrame.to_dict(orient='index')``
Expand Down
26 changes: 10 additions & 16 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
from pandas.core.sorting import (
compress_group_index, decons_obs_group_ids, get_compressed_ids,
get_group_index)
from pandas.core.sparse.api import SparseDataFrame, SparseSeries


class _Unstacker(object):
Expand Down Expand Up @@ -706,9 +705,8 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
If `columns` is None then all the columns with
`object` or `category` dtype will be converted.
sparse : bool, default False
Whether the dummy columns should be sparse or not. Returns
SparseDataFrame if `data` is a Series or if all columns are included.
Otherwise returns a DataFrame with some SparseBlocks.
Whether the dummy-encoded columns should be backed by
a :class:`SparseArray` (True) or a regular NumPy array (False).
drop_first : bool, default False
Whether to get k-1 dummies out of k categorical levels by removing the
first level.
Expand All @@ -722,7 +720,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
Returns
-------
dummies : DataFrame or SparseDataFrame
dummies : DataFrame
See Also
--------
Expand Down Expand Up @@ -865,19 +863,16 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
if is_object_dtype(dtype):
raise ValueError("dtype=object is not a valid dtype for get_dummies")

def get_empty_Frame(data, sparse):
def get_empty_frame(data):
if isinstance(data, Series):
index = data.index
else:
index = np.arange(len(data))
if not sparse:
return DataFrame(index=index)
else:
return SparseDataFrame(index=index, default_fill_value=0)
return DataFrame(index=index)

# if all NaN
if not dummy_na and len(levels) == 0:
return get_empty_Frame(data, sparse)
return get_empty_frame(data)

codes = codes.copy()
if dummy_na:
Expand All @@ -886,7 +881,7 @@ def get_empty_Frame(data, sparse):

# if dummy_na, we just fake a nan level. drop_first will drop it again
if drop_first and len(levels) == 1:
return get_empty_Frame(data, sparse)
return get_empty_frame(data)

number_of_cols = len(levels)

Expand Down Expand Up @@ -933,11 +928,10 @@ def _make_col_name(prefix, prefix_sep, level):
sarr = SparseArray(np.ones(len(ixs), dtype=dtype),
sparse_index=IntIndex(N, ixs), fill_value=0,
dtype=dtype)
sparse_series[col] = SparseSeries(data=sarr, index=index)
sparse_series[col] = Series(data=sarr, index=index)

out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols,
default_fill_value=0,
dtype=dtype)
out = DataFrame(sparse_series, index=index, columns=dummy_cols,
dtype=dtype)
return out

else:
Expand Down
65 changes: 30 additions & 35 deletions pandas/tests/reshape/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from collections import OrderedDict

from pandas import DataFrame, Series
from pandas.core.dtypes.common import is_integer_dtype
from pandas.core.sparse.api import SparseDtype, SparseArray
import pandas as pd

Expand Down Expand Up @@ -54,23 +55,16 @@ def test_basic(self, sparse, dtype):
'b': [0, 1, 0],
'c': [0, 0, 1]},
dtype=self.effective_dtype(dtype))
result = get_dummies(s_list, sparse=sparse, dtype=dtype)
if sparse:
tm.assert_sp_frame_equal(result,
expected.to_sparse(kind='integer',
fill_value=0))
else:
assert_frame_equal(result, expected)
expected = expected.apply(pd.SparseArray, fill_value=0.0)
result = get_dummies(s_list, sparse=sparse, dtype=dtype)
assert_frame_equal(result, expected)

result = get_dummies(s_series, sparse=sparse, dtype=dtype)
if sparse:
expected = expected.to_sparse(kind='integer', fill_value=0)
assert_frame_equal(result, expected)

expected.index = list('ABC')
result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
if sparse:
expected.to_sparse(kind='integer', fill_value=0)
assert_frame_equal(result, expected)

def test_basic_types(self, sparse, dtype):
Expand All @@ -86,23 +80,27 @@ def test_basic_types(self, sparse, dtype):
'c': [0, 0, 1]},
dtype=self.effective_dtype(dtype),
columns=list('abc'))
if not sparse:
compare = tm.assert_frame_equal
else:
expected = expected.to_sparse(fill_value=0, kind='integer')
compare = tm.assert_sp_frame_equal

if sparse:
if is_integer_dtype(dtype):
fill_value = 0
elif dtype == bool:
fill_value = False
else:
fill_value = 0.0

expected = expected.apply(SparseArray, fill_value=fill_value)
result = get_dummies(s_list, sparse=sparse, dtype=dtype)
compare(result, expected)
tm.assert_frame_equal(result, expected)

result = get_dummies(s_series, sparse=sparse, dtype=dtype)
compare(result, expected)
tm.assert_frame_equal(result, expected)

result = get_dummies(s_df, columns=s_df.columns,
sparse=sparse, dtype=dtype)
if sparse:
dtype_name = 'Sparse[{}, 0]'.format(
self.effective_dtype(dtype).name
dtype_name = 'Sparse[{}, {}]'.format(
self.effective_dtype(dtype).name,
fill_value
)
else:
dtype_name = self.effective_dtype(dtype).name
Expand Down Expand Up @@ -137,14 +135,13 @@ def test_just_na(self, sparse):
assert res_series_index.index.tolist() == ['A']

def test_include_na(self, sparse, dtype):
if sparse:
pytest.xfail(reason='nan in index is problematic (GH 16894)')

s = ['a', 'b', np.nan]
res = get_dummies(s, sparse=sparse, dtype=dtype)
exp = DataFrame({'a': [1, 0, 0],
'b': [0, 1, 0]},
dtype=self.effective_dtype(dtype))
if sparse:
exp = exp.apply(pd.SparseArray, fill_value=0.0)
assert_frame_equal(res, exp)

# Sparse dataframes do not allow nan labelled columns, see #GH8822
Expand All @@ -156,6 +153,8 @@ def test_include_na(self, sparse, dtype):
exp_na = exp_na.reindex(['a', 'b', nan], axis=1)
# hack (NaN handling in assert_index_equal)
exp_na.columns = res_na.columns
if sparse:
exp_na = exp_na.apply(pd.SparseArray, fill_value=0.0)
assert_frame_equal(res_na, exp_na)

res_just_na = get_dummies([nan], dummy_na=True,
Expand All @@ -175,10 +174,8 @@ def test_unicode(self, sparse):
u('letter_%s') % eacute: [0, 1, 1]},
dtype=np.uint8)
if sparse:
tm.assert_sp_frame_equal(res, exp.to_sparse(fill_value=0,
kind='integer'))
else:
assert_frame_equal(res, exp)
exp = exp.apply(pd.SparseArray, fill_value=0)
assert_frame_equal(res, exp)

def test_dataframe_dummies_all_obj(self, df, sparse):
df = df[['A', 'B']]
Expand All @@ -189,16 +186,14 @@ def test_dataframe_dummies_all_obj(self, df, sparse):
'B_c': [0, 0, 1]},
dtype=np.uint8)
if sparse:
expected = pd.SparseDataFrame({
expected = pd.DataFrame({
"A_a": pd.SparseArray([1, 0, 1], dtype='uint8'),
"A_b": pd.SparseArray([0, 1, 0], dtype='uint8'),
"B_b": pd.SparseArray([1, 1, 0], dtype='uint8'),
"B_c": pd.SparseArray([0, 0, 1], dtype='uint8'),
})

tm.assert_sp_frame_equal(result, expected)
else:
assert_frame_equal(result, expected)
assert_frame_equal(result, expected)

def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
result = get_dummies(df, sparse=sparse, dtype=dtype)
Expand Down Expand Up @@ -402,7 +397,7 @@ def test_basic_drop_first(self, sparse):

result = get_dummies(s_list, drop_first=True, sparse=sparse)
if sparse:
expected = expected.to_sparse(fill_value=0, kind='integer')
expected = expected.apply(pd.SparseArray, fill_value=0)
assert_frame_equal(result, expected)

result = get_dummies(s_series, drop_first=True, sparse=sparse)
Expand Down Expand Up @@ -436,7 +431,7 @@ def test_basic_drop_first_NA(self, sparse):
res = get_dummies(s_NA, drop_first=True, sparse=sparse)
exp = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8)
if sparse:
exp = exp.to_sparse(fill_value=0, kind='integer')
exp = exp.apply(pd.SparseArray, fill_value=0)

assert_frame_equal(res, exp)

Expand All @@ -447,7 +442,7 @@ def test_basic_drop_first_NA(self, sparse):
nan: [0, 0, 1]},
dtype=np.uint8).reindex(['b', nan], axis=1)
if sparse:
exp_na = exp_na.to_sparse(fill_value=0, kind='integer')
exp_na = exp_na.apply(pd.SparseArray, fill_value=0)
assert_frame_equal(res_na, exp_na)

res_just_na = get_dummies([nan], dummy_na=True, drop_first=True,
Expand All @@ -462,7 +457,7 @@ def test_dataframe_dummies_drop_first(self, df, sparse):
'B_c': [0, 0, 1]},
dtype=np.uint8)
if sparse:
expected = expected.to_sparse(fill_value=0, kind='integer')
expected = expected.apply(pd.SparseArray, fill_value=0)
assert_frame_equal(result, expected)

def test_dataframe_dummies_drop_first_with_categorical(
Expand Down

0 comments on commit 8e1a1a3

Please sign in to comment.