Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: merging an Integer EA rasises #23262

Merged
merged 15 commits into from
Dec 19, 2018
Merged
Prev Previous commit
Next Next commit
Updates
* generic release note
* removed _values_for_factorize changes
* made tests depend on just the first two
* do the factorization first
  • Loading branch information
TomAugspurger committed Dec 18, 2018
commit 61f5e1592ab9412982dad415e3508eb5a8bb3978
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1051,7 +1051,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
- :meth:`DataFrame.stack` no longer converts to object dtype for DataFrames where each column has the same extension dtype. The output Series will have the same dtype as the columns (:issue:`23077`).
- :meth:`Series.unstack` and :meth:`DataFrame.unstack` no longer convert extension arrays to object-dtype ndarrays. Each column in the output ``DataFrame`` will now have the same dtype as the input (:issue:`23077`).
- Bug when grouping :meth:`Dataframe.groupby()` and aggregating on ``ExtensionArray`` it was not returning the actual ``ExtensionArray`` dtype (:issue:`23227`).
- Bug in :func:`pandas.merge` when merging on an Integer extension array (:issue:`23020`)
- Bug in :func:`pandas.merge` when merging on an extension array-backed column (:issue:`23020`).
- A default repr for :class:`pandas.api.extensions.ExtensionArray` is now provided (:issue:`23601`).

.. _whatsnew_0240.api.incompatibilities:
Expand Down
13 changes: 0 additions & 13 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -814,19 +814,6 @@ def _check_timedeltalike_freq_compat(self, other):
def _values_for_argsort(self):
return self._data

def _values_for_factorize(self):
return np.asarray(self), np.nan

def factorize(self, na_sentinel=-1):
from pandas.core.algorithms import factorize

arr, na_value = self._values_for_factorize()

labels, uniques = factorize(arr, na_sentinel=na_sentinel)

uniques = period_array(uniques)
return labels, uniques


PeriodArray._add_comparison_ops()

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -942,7 +942,7 @@ def unique(self):

def _values_for_factorize(self):
# Still override this for hash_pandas_object
return np.asarray(self, object), self.fill_value
return np.asarray(self), self.fill_value

def factorize(self, na_sentinel=-1):
# Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
Expand Down
15 changes: 9 additions & 6 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1594,6 +1594,15 @@ def _factorize_keys(lk, rk, sort=True):
lk = lk.values
rk = rk.values

elif (is_extension_array_dtype(lk.dtype) and
TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
is_extension_array_dtype(rk.dtype) and
lk.dtype == rk.dtype and
not is_categorical_dtype(lk)):
# Categorical handled separately, since unordered categories can
# may need to be recoded.
lk, _ = lk._values_for_factorize()
rk, _ = rk._values_for_factorize()

# if we exactly match in categories, allow us to factorize on codes
if (is_categorical_dtype(lk) and
is_categorical_dtype(rk) and
Expand All @@ -1608,12 +1617,6 @@ def _factorize_keys(lk, rk, sort=True):

lk = ensure_int64(lk.codes)
rk = ensure_int64(rk)
elif (is_extension_array_dtype(lk) and
is_extension_array_dtype(rk) and
lk.dtype == rk.dtype):
klass = libhashtable.Factorizer
lk, _ = lk._values_for_factorize()
rk, _ = rk._values_for_factorize()
elif is_integer_dtype(lk) and is_integer_dtype(rk):
# GH#23917 TODO: needs tests for case where lk is integer-dtype
# and rk is datetime-dtype
Expand Down
35 changes: 26 additions & 9 deletions pandas/tests/extension/base/reshaping.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,18 +175,35 @@ def test_merge(self, data, na_value):

def test_merge_on_extension_array(self, data):
# GH 23020
df1 = pd.DataFrame({'ext': [1, 2, 3],
'key': data[:3]})
a, b = data[:2]
key = type(data)._from_sequence([a, b], dtype=data.dtype)

df = pd.DataFrame({"key": key, "val": [1, 2]})
result = pd.merge(df, df, on='key')
expected = pd.DataFrame({"key": key,
"val_x": [1, 2],
"val_y": [1, 2]})
self.assert_frame_equal(result, expected)

res = pd.merge(df1, df1, on='key')
# order
result = pd.merge(df.iloc[[1, 0]], df, on='key')
expected = expected.iloc[[1, 0]].reset_index(drop=True)
self.assert_frame_equal(result, expected)

exp = pd.DataFrame(
{'key': data[:3],
'ext_x': [1, 2, 3],
'ext_y': [1, 2, 3]})
def test_merge_on_extension_array_duplicates(self, data):
# GH 23020
a, b = data[:2]
key = type(data)._from_sequence([a, b, a], dtype=data.dtype)
df1 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
df2 = pd.DataFrame({"key": key, "val": [1, 2, 3]})

self.assert_frame_equal(res, exp[['ext_x', 'key', 'ext_y']],
check_dtype=True)
result = pd.merge(df1, df2, on='key')
expected = pd.DataFrame({
"key": key.take([0, 0, 0, 0, 1]),
"val_x": [1, 1, 3, 3, 2],
"val_y": [1, 3, 1, 3, 2],
})
self.assert_frame_equal(result, expected)

TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
@pytest.mark.parametrize("columns", [
["A", "B"],
Expand Down