Skip to content

Commit

Permalink
BUG/API: .merge() and .join() on category dtype columns will now
Browse files Browse the repository at this point in the history
preserve the category dtype when possible

closes #10409
  • Loading branch information
jreback committed Feb 7, 2017
1 parent 8d57450 commit 4c67377
Show file tree
Hide file tree
Showing 7 changed files with 158 additions and 1 deletion.
24 changes: 24 additions & 0 deletions asv_bench/benchmarks/join_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,30 @@ def time_i8merge(self):
merge(self.left, self.right, how='outer')


class MergeCategoricals(object):
    """Benchmark ``merge`` on frames with object vs. ``category`` columns.

    Two pairs of frames are built from the same random data: one pair keeps
    the string columns as-is, the other converts them to ``category``.
    Both pairs are merged on the integer key column ``X``.
    """

    goal_time = 0.2

    def setup(self):
        """Create the object-dtype frames and their categorical twins."""
        shape = (10000,)
        key_values = range(0, 10)

        left = pd.DataFrame({
            'X': np.random.choice(key_values, size=shape),
            'Y': np.random.choice(['one', 'two', 'three'], size=shape),
        })
        right = pd.DataFrame({
            'X': np.random.choice(key_values, size=shape),
            'Z': np.random.choice(['jjj', 'kkk', 'sss'], size=shape),
        })
        self.left_object = left
        self.right_object = right

        # Same data, but the non-key string column is stored as category.
        left_as_cat = left['Y'].astype('category')
        self.left_cat = left.assign(Y=left_as_cat)
        right_as_cat = right['Z'].astype('category')
        self.right_cat = right.assign(Z=right_as_cat)

    def time_merge_object(self):
        """Time the merge of the plain (non-categorical) frames on ``X``."""
        merge(self.left_object, self.right_object, on='X')

    def time_merge_cat(self):
        """Time the merge of the categorical frames on ``X``."""
        merge(self.left_cat, self.right_cat, on='X')


#----------------------------------------------------------------------
# Ordered merge

Expand Down
6 changes: 5 additions & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,10 @@ Other API Changes
- ``pd.read_csv()`` will now raise a ``ValueError`` for the C engine if the quote character is larger than one byte (:issue:`11592`)
- ``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`)
- ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype``
- ``DataFrame.asof()`` will return a null-filled ``Series`` instead of the scalar ``NaN`` if a match is not found (:issue:`15118`)
- ``DataFrame.asof()`` will return a null-filled ``Series`` instead of the scalar ``NaN`` if a match is not found (:issue:`15118`)
- ``.merge()`` and ``.join()`` on ``category`` dtype columns will now preserve the category dtype when possible (:issue:`10409`)


.. _whatsnew_0200.deprecations:

Deprecations
Expand Down Expand Up @@ -409,6 +412,7 @@ Performance Improvements
- Improved performance of timeseries plotting with an irregular DatetimeIndex
(or with ``compat_x=True``) (:issue:`15073`).
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`)
- Improved performance of merge/join on ``category`` columns (:issue:`10409`)

- When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object.

Expand Down
2 changes: 2 additions & 0 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -5223,6 +5223,8 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
# External code requested filling/upcasting, bool values must
# be upcasted to object to avoid being upcasted to numeric.
values = self.block.astype(np.object_).values
elif self.block.is_categorical:
values = self.block.values
else:
# No dtype upcasting is done here, it will be performed during
# concatenation itself.
Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -4098,12 +4098,14 @@ def test_merge(self):
cright = right.copy()
cright['d'] = cright['d'].astype('category')
result = pd.merge(left, cright, how='left', left_on='b', right_on='c')
expected['d'] = expected['d'].astype('category', categories=['null'])
tm.assert_frame_equal(result, expected)

# cat-object
cleft = left.copy()
cleft['b'] = cleft['b'].astype('category')
result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c')
expected['b'] = expected['b'].astype('category')
tm.assert_frame_equal(result, expected)

# cat-cat
Expand Down
8 changes: 8 additions & 0 deletions pandas/tools/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
is_datetime64_dtype,
needs_i8_conversion,
is_int64_dtype,
is_categorical_dtype,
is_integer_dtype,
is_float_dtype,
is_integer,
Expand Down Expand Up @@ -1339,6 +1340,13 @@ def _factorize_keys(lk, rk, sort=True):
if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
lk = lk.values
rk = rk.values

# if we exactly match in categories, allow us to use codes
if (is_categorical_dtype(lk) and
is_categorical_dtype(rk) and
lk.is_dtype_equal(rk)):
return lk.codes, rk.codes, len(lk.categories)

if is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk):
klass = _hash.Int64Factorizer
lk = _ensure_int64(com._values_from_object(lk))
Expand Down
116 changes: 116 additions & 0 deletions pandas/tools/tests/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from pandas.util.testing import (assert_frame_equal,
assert_series_equal,
slow)
from pandas.types.dtypes import CategoricalDtype
from pandas import DataFrame, Index, MultiIndex, Series, Categorical
import pandas.util.testing as tm

Expand Down Expand Up @@ -1368,3 +1369,118 @@ def f():
def f():
household.join(log_return, how='outer')
self.assertRaises(NotImplementedError, f)


class TestMergeCategorical(tm.TestCase):
    """Checks that ``pd.merge`` keeps ``category`` dtype where expected."""

    _multiprocess_can_split_ = True

    def setUp(self):
        # Seed the RNG so the randomly drawn fixture frames are repeatable.
        np.random.seed(1234)
        self.left = DataFrame(
            {'X': np.random.choice(['foo', 'bar'], size=(10,)),
             'Y': np.random.choice(['one', 'two', 'three'], size=(10,))})

        self.right = DataFrame(
            {'X': np.random.choice(['foo', 'bar'], size=(10,)),
             'Z': np.random.choice(['jjj', 'kkk', 'sss'], size=(10,))})

    def _assert_sorted_dtypes(self, frame, dtypes, columns):
        """Compare ``frame.dtypes`` (sorted by column name) to expectations.

        ``dtypes`` and ``columns`` are parallel lists giving the expected
        dtype for each column label, in sorted-label order.
        """
        actual = frame.dtypes.sort_index()
        wanted = Series(dtypes, index=columns)
        assert_series_equal(actual, wanted)

    def test_identical(self):
        # GH 10409
        cat_left = self.left.assign(X=self.left.X.astype('category'))

        merged = pd.merge(cat_left, cat_left, on='X')
        self._assert_sorted_dtypes(
            merged,
            [CategoricalDtype(), np.dtype('O'), np.dtype('O')],
            ['X', 'Y_x', 'Y_y'])

    def test_other_columns(self):
        # non-merge columns should be preserved if possible
        cat_x = self.left.X.astype('category')
        both_cat = DataFrame({'X': cat_x, 'Y': cat_x})

        merged = pd.merge(both_cat, both_cat, on='X')
        self._assert_sorted_dtypes(
            merged,
            [CategoricalDtype(), CategoricalDtype(), CategoricalDtype()],
            ['X', 'Y_x', 'Y_y'])

        # different categories
        cat_x = self.left.X.astype('category')
        relabelled_y = cat_x.cat.set_categories(['bar', 'foo', 'bah'])
        left = DataFrame({'X': cat_x, 'Y': relabelled_y})

        right_y = pd.Series(['foo', 'foo']).astype(
            'category', categories=['foo', 'bar', 'baz'])
        right = self.right.drop_duplicates(['X'])
        right = right.assign(Y=right_y)

        merged = pd.merge(left, right, on='X')
        self._assert_sorted_dtypes(
            merged,
            [CategoricalDtype(), CategoricalDtype(), CategoricalDtype(),
             np.dtype('O')],
            ['X', 'Y_x', 'Y_y', 'Z'])

    def test_categories_same(self):
        # GH 10409
        cat_left = self.left.assign(X=self.left.X.astype('category'))
        cat_right = self.right.assign(X=self.right.X.astype('category'))

        merged = pd.merge(cat_left, cat_right, on='X')
        self._assert_sorted_dtypes(
            merged,
            [CategoricalDtype(), np.dtype('O'), np.dtype('O')],
            ['X', 'Y', 'Z'])

    def test_categories_different(self):

        r = self.right.drop_duplicates(['X'])
        key_and_values = ['X', 'Y', 'Z']

        # baseline: both sides use their original (inferred) categories
        left = self.left.assign(X=self.left.X.astype('category'))
        right = r.assign(X=r.X.astype('category'))
        merged = pd.merge(left, right, on='X')

        # swap the categories
        # but should still work (and return categorical)
        right = r.assign(X=r.X.astype('category', categories=['foo', 'bar']))
        result = pd.merge(left, right, on='X')
        tm.assert_index_equal(result.X.cat.categories,
                              pd.Index(['bar', 'foo']))

        assert_frame_equal(result, merged)

        self._assert_sorted_dtypes(
            result,
            [CategoricalDtype(), np.dtype('O'), np.dtype('O')],
            key_and_values)

        # swap the categories and ordered on one
        # but should still work (and return categorical)
        right = r.assign(X=r.X.astype('category', categories=['foo', 'bar'],
                                      ordered=True))
        result = pd.merge(left, right, on='X')
        tm.assert_index_equal(result.X.cat.categories,
                              pd.Index(['bar', 'foo']))

        assert_frame_equal(result, merged)

        self._assert_sorted_dtypes(
            result,
            [CategoricalDtype(), np.dtype('O'), np.dtype('O')],
            key_and_values)
1 change: 1 addition & 0 deletions pandas/tools/tests/test_merge_asof.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ def test_basic_categorical(self):
trades.ticker = trades.ticker.astype('category')
quotes = self.quotes.copy()
quotes.ticker = quotes.ticker.astype('category')
expected.ticker = expected.ticker.astype('category')

result = merge_asof(trades, quotes,
on='time',
Expand Down

0 comments on commit 4c67377

Please sign in to comment.