diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index efcc04d688334..3d4bb8ec57794 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -46,9 +46,14 @@ The categorical data type is useful in the following cases: See also the :ref:`API docs on categoricals`. +.. _categorical.objectcreation: + Object Creation --------------- +Series Creation +~~~~~~~~~~~~~~~ + Categorical ``Series`` or columns in a ``DataFrame`` can be created in several ways: By specifying ``dtype="category"`` when constructing a ``Series``: @@ -77,7 +82,7 @@ discrete bins. See the :ref:`example on tiling ` in the docs df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels) df.head(10) -By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to a `DataFrame`. +By passing a :class:`pandas.Categorical` object to a ``Series`` or assigning it to a ``DataFrame``. .. ipython:: python @@ -89,6 +94,55 @@ By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to df["B"] = raw_cat df +Categorical data has a specific ``category`` :ref:`dtype `: + +.. ipython:: python + + df.dtypes + +DataFrame Creation +~~~~~~~~~~~~~~~~~~ + +Similar to the previous section where a single column was converted to categorical, all columns in a +``DataFrame`` can be batch converted to categorical either during or after construction. + +This can be done during construction by specifying ``dtype="category"`` in the ``DataFrame`` constructor: + +.. ipython:: python + + df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}, dtype="category") + df.dtypes + +Note that the categories present in each column differ; the conversion is done column by column, so +only labels present in a given column are categories: + +.. ipython:: python + + df['A'] + df['B'] + + +.. versionadded:: 0.23.0 + +Analogously, all columns in an existing ``DataFrame`` can be batch converted using :meth:`DataFrame.astype`: + +.. 
ipython:: python + + df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}) + df_cat = df.astype('category') + df_cat.dtypes + +This conversion is likewise done column by column: + +.. ipython:: python + + df_cat['A'] + df_cat['B'] + + +Controlling Behavior +~~~~~~~~~~~~~~~~~~~~ + In the examples above where we passed ``dtype='category'``, we used the default behavior: @@ -108,21 +162,36 @@ of :class:`~pandas.api.types.CategoricalDtype`. s_cat = s.astype(cat_type) s_cat -Categorical data has a specific ``category`` :ref:`dtype `: +Similarly, a ``CategoricalDtype`` can be used with a ``DataFrame`` to ensure that categories +are consistent among all columns. .. ipython:: python - df.dtypes + df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}) + cat_type = CategoricalDtype(categories=list('abcd'), + ordered=True) + df_cat = df.astype(cat_type) + df_cat['A'] + df_cat['B'] .. note:: - In contrast to R's `factor` function, categorical data is not converting input values to - strings and categories will end up the same data type as the original values. + To perform table-wise conversion, where all labels in the entire ``DataFrame`` are used as + categories for each column, the ``categories`` parameter can be determined programmatically by + ``categories = pd.unique(df.values.ravel())``. -.. note:: +If you already have ``codes`` and ``categories``, you can use the +:func:`~pandas.Categorical.from_codes` constructor to save the factorize step +during normal constructor mode: - In contrast to R's `factor` function, there is currently no way to assign/change labels at - creation time. Use `categories` to change the categories after creation time. +..
ipython:: python + + splitter = np.random.choice([0,1], 5, p=[0.5,0.5]) + s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) + + +Regaining Original Data +~~~~~~~~~~~~~~~~~~~~~~~ To get back to the original ``Series`` or NumPy array, use ``Series.astype(original_dtype)`` or ``np.asarray(categorical)``: @@ -136,14 +205,15 @@ To get back to the original ``Series`` or NumPy array, use s2.astype(str) np.asarray(s2) -If you already have `codes` and `categories`, you can use the -:func:`~pandas.Categorical.from_codes` constructor to save the factorize step -during normal constructor mode: +.. note:: -.. ipython:: python + In contrast to R's ``factor`` function, categorical data does not convert input values to + strings; categories will end up with the same data type as the original values. - splitter = np.random.choice([0,1], 5, p=[0.5,0.5]) - s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) +.. note:: + + In contrast to R's ``factor`` function, there is currently no way to assign/change labels at + creation time. Use ``categories`` to change the categories after creation time. .. _categorical.categoricaldtype: diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 542e62aa374be..e9ba073312064 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -268,6 +268,37 @@ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python df.assign(A=df.A+1, C= lambda df: df.A* -1) + +.. _whatsnew_0230.enhancements.astype_category: + +``DataFrame.astype`` performs column-wise conversion to ``Categorical`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`DataFrame.astype` can now perform column-wise conversion to ``Categorical`` by supplying the string ``'category'`` or +a :class:`~pandas.api.types.CategoricalDtype`. Previously, attempting this would raise a ``NotImplementedError``.
See the +:ref:`categorical.objectcreation` section of the documentation for more details and examples. (:issue:`12860`, :issue:`18099`) + +Supplying the string ``'category'`` performs column-wise conversion, with only labels appearing in a given column set as categories: + +.. ipython:: python + + df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}) + df = df.astype('category') + df['A'].dtype + df['B'].dtype + + +Supplying a ``CategoricalDtype`` will make the categories in each column consistent with the supplied dtype: + +.. ipython:: python + + from pandas.api.types import CategoricalDtype + df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}) + cdt = CategoricalDtype(categories=list('abcd'), ordered=True) + df = df.astype(cdt) + df['A'].dtype + df['B'].dtype + .. _whatsnew_0230.enhancements.other: Other Enhancements diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e1ed6ae9c8a6c..c4eb7dd7e7a7e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -18,6 +18,7 @@ is_number, is_integer, is_bool, is_bool_dtype, + is_categorical_dtype, is_numeric_dtype, is_datetime64_dtype, is_timedelta64_dtype, @@ -4429,14 +4430,18 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs): if col_name not in self: raise KeyError('Only a column name can be used for the ' 'key in a dtype mappings argument.') - from pandas import concat results = [] for col_name, col in self.iteritems(): if col_name in dtype: results.append(col.astype(dtype[col_name], copy=copy)) else: results.append(results.append(col.copy() if copy else col)) - return concat(results, axis=1, copy=False) + return pd.concat(results, axis=1, copy=False) + + elif is_categorical_dtype(dtype) and self.ndim > 1: + # GH 18099: columnwise conversion to categorical + results = (self[col].astype(dtype, copy=copy) for col in self) + return pd.concat(results, axis=1, copy=False) # else, only a single dtype is given new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors, 
diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index e9e5b2a447a4a..430d43019afc2 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -8,11 +8,11 @@ import numpy as np from pandas import (DataFrame, Series, date_range, Timedelta, Timestamp, - compat, concat, option_context) + Categorical, compat, concat, option_context) from pandas.compat import u from pandas import _np_version_under1p14 -from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.dtypes import DatetimeTZDtype, CategoricalDtype from pandas.tests.frame.common import TestData from pandas.util.testing import (assert_series_equal, assert_frame_equal, @@ -619,12 +619,21 @@ def test_astype_duplicate_col(self): expected = concat([a1_str, b, a2_str], axis=1) assert_frame_equal(result, expected) - @pytest.mark.parametrize('columns', [['x'], ['x', 'y'], ['x', 'y', 'z']]) - def test_categorical_astype_ndim_raises(self, columns): - # GH 18004 - msg = '> 1 ndim Categorical are not supported at this time' - with tm.assert_raises_regex(NotImplementedError, msg): - DataFrame(columns=columns).astype('category') + @pytest.mark.parametrize('dtype', [ + 'category', + CategoricalDtype(), + CategoricalDtype(ordered=True), + CategoricalDtype(ordered=False), + CategoricalDtype(categories=list('abcdef')), + CategoricalDtype(categories=list('edba'), ordered=False), + CategoricalDtype(categories=list('edcb'), ordered=True)], ids=repr) + def test_astype_categorical(self, dtype): + # GH 18099 + d = {'A': list('abbc'), 'B': list('bccd'), 'C': list('cdde')} + df = DataFrame(d) + result = df.astype(dtype) + expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d}) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("cls", [ pd.api.types.CategoricalDtype,