Skip to content

Commit

Permalink
String dtype: honor mode.string_storage option (and change default to None) (#59488)
Browse files Browse the repository at this point in the history

* String dtype: honor mode.string_storage option (and change default to None)

* fix test + explicitly test default

* use 'auto' instead of None
  • Loading branch information
jorisvandenbossche authored and WillAyd committed Aug 13, 2024
1 parent aa5d8f9 commit 015514b
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 18 deletions.
12 changes: 8 additions & 4 deletions pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,12 +136,16 @@ def __init__(
# infer defaults
if storage is None:
if na_value is not libmissing.NA:
if HAS_PYARROW:
storage = "pyarrow"
else:
storage = "python"
storage = get_option("mode.string_storage")
if storage == "auto":
if HAS_PYARROW:
storage = "pyarrow"
else:
storage = "python"
else:
storage = get_option("mode.string_storage")
if storage == "auto":
storage = "python"

if storage == "pyarrow_numpy":
# TODO raise a deprecation warning
Expand Down
7 changes: 3 additions & 4 deletions pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,13 +505,12 @@ def use_inf_as_na_cb(key) -> None:

string_storage_doc = """
: string
The default storage for StringDtype. This option is ignored if
``future.infer_string`` is set to True.
The default storage for StringDtype.
"""


def is_valid_string_storage(value: Any) -> None:
legal_values = ["python", "pyarrow"]
legal_values = ["auto", "python", "pyarrow"]
if value not in legal_values:
msg = "Value must be one of python|pyarrow"
if value == "pyarrow_numpy":
Expand All @@ -526,7 +525,7 @@ def is_valid_string_storage(value: Any) -> None:
with cf.config_prefix("mode"):
cf.register_option(
"string_storage",
"python",
"auto",
string_storage_doc,
# validator=is_one_of_factory(["python", "pyarrow"]),
validator=is_valid_string_storage,
Expand Down
10 changes: 4 additions & 6 deletions pandas/tests/arrays/string_/test_string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import numpy as np
import pytest

from pandas.compat import HAS_PYARROW
import pandas.util._test_decorators as td

import pandas as pd
Expand All @@ -27,11 +26,10 @@ def test_eq_all_na():
tm.assert_extension_array_equal(result, expected)


def test_config(string_storage, request, using_infer_string):
if using_infer_string and string_storage == "python" and HAS_PYARROW:
# string storage with na_value=NaN always uses pyarrow if available
# -> does not yet honor the option
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
def test_config(string_storage, using_infer_string):
# with the default string_storage setting
# always "python" at the moment
assert StringDtype().storage == "python"

with pd.option_context("string_storage", string_storage):
assert StringDtype().storage == string_storage
Expand Down
13 changes: 9 additions & 4 deletions pandas/tests/dtypes/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
import pytest

from pandas.compat import HAS_PYARROW
import pandas.util._test_decorators as td

from pandas.core.dtypes.astype import astype_array
Expand Down Expand Up @@ -802,13 +803,17 @@ def test_pandas_dtype_ea_not_instance():


def test_pandas_dtype_string_dtypes(string_storage):
# TODO(infer_string) remove skip if "python" is supported
pytest.importorskip("pyarrow")
with pd.option_context("future.infer_string", True):
# with the default string_storage setting
result = pandas_dtype("str")
assert result == pd.StringDtype(
"pyarrow" if HAS_PYARROW else "python", na_value=np.nan
)

with pd.option_context("future.infer_string", True):
with pd.option_context("string_storage", string_storage):
result = pandas_dtype("str")
# TODO(infer_string) hardcoded to pyarrow until python is supported
assert result == pd.StringDtype("pyarrow", na_value=np.nan)
assert result == pd.StringDtype(string_storage, na_value=np.nan)

with pd.option_context("future.infer_string", False):
with pd.option_context("string_storage", string_storage):
Expand Down

0 comments on commit 015514b

Please sign in to comment.