From 3182e9b43a3e9961d253e6bc7ff42412824bffb8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 8 Aug 2024 17:25:05 +0200 Subject: [PATCH] String dtype: fix alignment sorting in case of python storage (#59448) * String dtype: fix alignment sorting in case of python storage * add test --- pandas/core/indexes/base.py | 5 ++++- pandas/tests/series/methods/test_align.py | 13 +++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 50f44cc728aea..d39c337fbb4b2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4884,7 +4884,10 @@ def _can_use_libjoin(self) -> bool: return ( isinstance(self.dtype, np.dtype) or isinstance(self._values, (ArrowExtensionArray, BaseMaskedArray)) - or self.dtype == "string[python]" + or ( + isinstance(self.dtype, StringDtype) + and self.dtype.storage == "python" + ) ) # Exclude index types where the conversion to numpy converts to object dtype, # which negates the performance benefit of libjoin diff --git a/pandas/tests/series/methods/test_align.py b/pandas/tests/series/methods/test_align.py index 620a8ae54fee3..f7e4dbe11f3d3 100644 --- a/pandas/tests/series/methods/test_align.py +++ b/pandas/tests/series/methods/test_align.py @@ -148,6 +148,19 @@ def test_align_periodindex(join_type): ts.align(ts[::2], join=join_type) +def test_align_stringindex(any_string_dtype): + left = Series(range(3), index=pd.Index(["a", "b", "d"], dtype=any_string_dtype)) + right = Series(range(3), index=pd.Index(["a", "b", "c"], dtype=any_string_dtype)) + result_left, result_right = left.align(right) + + expected_idx = pd.Index(["a", "b", "c", "d"], dtype=any_string_dtype) + expected_left = Series([0, 1, np.nan, 2], index=expected_idx) + expected_right = Series([0, 1, 2, np.nan], index=expected_idx) + + tm.assert_series_equal(result_left, expected_left) + tm.assert_series_equal(result_right, expected_right) + + def test_align_left_fewer_levels(): # GH#45224 left = Series([2], index=pd.MultiIndex.from_tuples([(1, 3)], names=["a", "c"]))