Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: merging an Integer EA rasises #23262

Merged
merged 15 commits into from
Dec 19, 2018
Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1051,6 +1051,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
- :meth:`DataFrame.stack` no longer converts to object dtype for DataFrames where each column has the same extension dtype. The output Series will have the same dtype as the columns (:issue:`23077`).
- :meth:`Series.unstack` and :meth:`DataFrame.unstack` no longer convert extension arrays to object-dtype ndarrays. Each column in the output ``DataFrame`` will now have the same dtype as the input (:issue:`23077`).
- Bug when grouping :meth:`Dataframe.groupby()` and aggregating on ``ExtensionArray`` it was not returning the actual ``ExtensionArray`` dtype (:issue:`23227`).
- Bug in :func:`pandas.merge` when merging on an extension array-backed column (:issue:`23020`).
- A default repr for :class:`pandas.api.extensions.ExtensionArray` is now provided (:issue:`23601`).

.. _whatsnew_0240.api.incompatibilities:
Expand Down
23 changes: 15 additions & 8 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@
from pandas.core.dtypes.common import (
ensure_float64, ensure_int64, ensure_object, is_array_like, is_bool,
is_bool_dtype, is_categorical_dtype, is_datetime64_dtype,
is_datetime64tz_dtype, is_datetimelike, is_dtype_equal, is_float_dtype,
is_int64_dtype, is_integer, is_integer_dtype, is_list_like, is_number,
is_numeric_dtype, needs_i8_conversion)
is_datetime64tz_dtype, is_datetimelike, is_dtype_equal,
is_extension_array_dtype, is_float_dtype, is_int64_dtype, is_integer,
is_integer_dtype, is_list_like, is_number, is_numeric_dtype,
needs_i8_conversion)
from pandas.core.dtypes.missing import isnull, na_value_for_dtype

from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta
Expand Down Expand Up @@ -1589,25 +1590,31 @@ def _right_outer_join(x, y, max_groups):


def _factorize_keys(lk, rk, sort=True):
# Some pre-processing for non-ndarray lk / rk
if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
lk = lk.values
rk = rk.values

# if we exactly match in categories, allow us to factorize on codes
if (is_categorical_dtype(lk) and
elif (is_categorical_dtype(lk) and
is_categorical_dtype(rk) and
lk.is_dtype_equal(rk)):
klass = libhashtable.Int64Factorizer

if lk.categories.equals(rk.categories):
# if we exactly match in categories, allow us to factorize on codes
rk = rk.codes
else:
# Same categories in different orders -> recode
rk = _recode_for_categories(rk.codes, rk.categories, lk.categories)

lk = ensure_int64(lk.codes)
rk = ensure_int64(rk)
elif is_integer_dtype(lk) and is_integer_dtype(rk):

elif (is_extension_array_dtype(lk.dtype) and
is_extension_array_dtype(rk.dtype) and
lk.dtype == rk.dtype):
lk, _ = lk._values_for_factorize()
rk, _ = rk._values_for_factorize()

if is_integer_dtype(lk) and is_integer_dtype(rk):
# GH#23917 TODO: needs tests for case where lk is integer-dtype
# and rk is datetime-dtype
klass = libhashtable.Int64Factorizer
Expand Down
32 changes: 32 additions & 0 deletions pandas/tests/extension/base/reshaping.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,38 @@ def test_merge(self, data, na_value):
dtype=data.dtype)})
self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']])

def test_merge_on_extension_array(self, data):
# GH 23020
a, b = data[:2]
key = type(data)._from_sequence([a, b], dtype=data.dtype)

df = pd.DataFrame({"key": key, "val": [1, 2]})
result = pd.merge(df, df, on='key')
expected = pd.DataFrame({"key": key,
"val_x": [1, 2],
"val_y": [1, 2]})
self.assert_frame_equal(result, expected)

# order
result = pd.merge(df.iloc[[1, 0]], df, on='key')
expected = expected.iloc[[1, 0]].reset_index(drop=True)
self.assert_frame_equal(result, expected)

def test_merge_on_extension_array_duplicates(self, data):
# GH 23020
a, b = data[:2]
key = type(data)._from_sequence([a, b, a], dtype=data.dtype)
df1 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
df2 = pd.DataFrame({"key": key, "val": [1, 2, 3]})

result = pd.merge(df1, df2, on='key')
expected = pd.DataFrame({
"key": key.take([0, 0, 0, 0, 1]),
"val_x": [1, 1, 3, 3, 2],
"val_y": [1, 3, 1, 3, 2],
})
self.assert_frame_equal(result, expected)

TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
@pytest.mark.parametrize("columns", [
["A", "B"],
pd.MultiIndex.from_tuples([('A', 'a'), ('A', 'b')],
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1326,6 +1326,16 @@ def test_merging_with_bool_or_int_cateorical_column(self, category_column,
CDT(categories, ordered=ordered))
assert_frame_equal(expected, result)

def test_merge_on_int_array(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

move to pandas/tests/extension/base/reshape (and use the fixtures)

# GH 23020
df = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'),
'B': 1})
result = pd.merge(df, df, on='A')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think can remove this?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess this does test merging on an NA key, which the generic test doesn't. So maybe leave it?

expected = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'),
'B_x': 1,
'B_y': 1})
assert_frame_equal(result, expected)


@pytest.fixture
def left_df():
Expand Down