Skip to content

Commit

Permalink
BUG (string dtype): convert dictionary input to materialized string array in ArrowStringArray constructor (#59479)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche committed Oct 9, 2024
1 parent 138140d commit 3a362d8
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 12 deletions.
16 changes: 10 additions & 6 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,18 +125,22 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr

def __init__(self, values) -> None:
_chk_pyarrow_available()
if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string(
values.type
if isinstance(values, (pa.Array, pa.ChunkedArray)) and (
pa.types.is_string(values.type)
or (
pa.types.is_dictionary(values.type)
and (
pa.types.is_string(values.type.value_type)
or pa.types.is_large_string(values.type.value_type)
)
)
):
values = pc.cast(values, pa.large_string())

super().__init__(values)
self._dtype = StringDtype(storage=self._storage, na_value=self._na_value)

if not pa.types.is_large_string(self._pa_array.type) and not (
pa.types.is_dictionary(self._pa_array.type)
and pa.types.is_large_string(self._pa_array.type.value_type)
):
if not pa.types.is_large_string(self._pa_array.type):
raise ValueError(
"ArrowStringArray requires a PyArrow (chunked) array of "
"large_string type"
Expand Down
11 changes: 5 additions & 6 deletions pandas/tests/arrays/string_/test_string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,19 +88,18 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked):
ArrowStringArray(arr)


@pytest.mark.xfail(
reason="dict conversion does not seem to be implemented for large string in arrow"
)
@pytest.mark.parametrize("string_type", ["string", "large_string"])
@pytest.mark.parametrize("chunked", [True, False])
def test_constructor_valid_string_type_value_dictionary(chunked):
def test_constructor_valid_string_type_value_dictionary(string_type, chunked):
pa = pytest.importorskip("pyarrow")

arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode()
arr = pa.array(["1", "2", "3"], getattr(pa, string_type)()).dictionary_encode()
if chunked:
arr = pa.chunked_array(arr)

arr = ArrowStringArray(arr)
assert pa.types.is_string(arr._pa_array.type.value_type)
# dictionary type get converted to dense large string array
assert pa.types.is_large_string(arr._pa_array.type)


def test_constructor_from_list():
Expand Down

0 comments on commit 3a362d8

Please sign in to comment.