Updates

* generic release note * removed _values_for_factorize changes * made tests depend on just the first two * do the factorization first
pandas-dev · jreback · Dec 19, 2018 · Oct 21, 2018 · Nov 4, 2018 · Nov 11, 2018
commit 61f5e1592ab9412982dad415e3508eb5a8bb3978
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -1051,7 +1051,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
 - :meth:`DataFrame.stack` no longer converts to object dtype for DataFrames where each column has the same extension dtype. The output Series will have the same dtype as the columns (:issue:`23077`).
 - :meth:`Series.unstack` and :meth:`DataFrame.unstack` no longer convert extension arrays to object-dtype ndarrays. Each column in the output ``DataFrame`` will now have the same dtype as the input (:issue:`23077`).
 - Bug when grouping :meth:`Dataframe.groupby()` and aggregating on ``ExtensionArray`` it was not returning the actual ``ExtensionArray`` dtype (:issue:`23227`).
-- Bug in :func:`pandas.merge` when merging on an Integer extension array (:issue:`23020`)
+- Bug in :func:`pandas.merge` when merging on an extension array-backed column (:issue:`23020`).
 - A default repr for :class:`pandas.api.extensions.ExtensionArray` is now provided (:issue:`23601`).
 
 .. _whatsnew_0240.api.incompatibilities:

diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
@@ -814,19 +814,6 @@ def _check_timedeltalike_freq_compat(self, other):
     def _values_for_argsort(self):
         return self._data
 
-    def _values_for_factorize(self):
-        return np.asarray(self), np.nan
-
-    def factorize(self, na_sentinel=-1):
-        from pandas.core.algorithms import factorize
-
-        arr, na_value = self._values_for_factorize()
-
-        labels, uniques = factorize(arr, na_sentinel=na_sentinel)
-
-        uniques = period_array(uniques)
-        return labels, uniques
-
 
 PeriodArray._add_comparison_ops()
 

diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py
@@ -942,7 +942,7 @@ def unique(self):
 
     def _values_for_factorize(self):
         # Still override this for hash_pandas_object
-        return np.asarray(self, object), self.fill_value
+        return np.asarray(self), self.fill_value
 
     def factorize(self, na_sentinel=-1):
         # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]

diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -1594,6 +1594,15 @@ def _factorize_keys(lk, rk, sort=True):
         lk = lk.values
         rk = rk.values
 
+    elif (is_extension_array_dtype(lk.dtype) and
+          is_extension_array_dtype(rk.dtype) and
+          lk.dtype == rk.dtype and
+          not is_categorical_dtype(lk)):
+        # Categorical handled separately, since unordered categories can
+        # may need to be recoded.
+        lk, _ = lk._values_for_factorize()
+        rk, _ = rk._values_for_factorize()
+
     # if we exactly match in categories, allow us to factorize on codes
     if (is_categorical_dtype(lk) and
             is_categorical_dtype(rk) and
@@ -1608,12 +1617,6 @@ def _factorize_keys(lk, rk, sort=True):
 
         lk = ensure_int64(lk.codes)
         rk = ensure_int64(rk)
-    elif (is_extension_array_dtype(lk) and
-          is_extension_array_dtype(rk) and
-          lk.dtype == rk.dtype):
-        klass = libhashtable.Factorizer
-        lk, _ = lk._values_for_factorize()
-        rk, _ = rk._values_for_factorize()
     elif is_integer_dtype(lk) and is_integer_dtype(rk):
         # GH#23917 TODO: needs tests for case where lk is integer-dtype
         #  and rk is datetime-dtype

diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py
@@ -175,18 +175,35 @@ def test_merge(self, data, na_value):
 
     def test_merge_on_extension_array(self, data):
         # GH 23020
-        df1 = pd.DataFrame({'ext': [1, 2, 3],
-                            'key': data[:3]})
+        a, b = data[:2]
+        key = type(data)._from_sequence([a, b], dtype=data.dtype)
+
+        df = pd.DataFrame({"key": key, "val": [1, 2]})
+        result = pd.merge(df, df, on='key')
+        expected = pd.DataFrame({"key": key,
+                                 "val_x": [1, 2],
+                                 "val_y": [1, 2]})
+        self.assert_frame_equal(result, expected)
 
-        res = pd.merge(df1, df1, on='key')
+        # order
+        result = pd.merge(df.iloc[[1, 0]], df, on='key')
+        expected = expected.iloc[[1, 0]].reset_index(drop=True)
+        self.assert_frame_equal(result, expected)
 
-        exp = pd.DataFrame(
-            {'key': data[:3],
-             'ext_x': [1, 2, 3],
-             'ext_y': [1, 2, 3]})
+    def test_merge_on_extension_array_duplicates(self, data):
+        # GH 23020
+        a, b = data[:2]
+        key = type(data)._from_sequence([a, b, a], dtype=data.dtype)
+        df1 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
+        df2 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
 
-        self.assert_frame_equal(res, exp[['ext_x', 'key', 'ext_y']],
-                                check_dtype=True)
+        result = pd.merge(df1, df2, on='key')
+        expected = pd.DataFrame({
+            "key": key.take([0, 0, 0, 0, 1]),
+            "val_x": [1, 1, 3, 3, 2],
+            "val_y": [1, 3, 1, 3, 2],
+        })
+        self.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize("columns", [
         ["A", "B"],