Skip to content

Commit

Permalink
TST (string dtype): remove usage of 'string[pyarrow_numpy]' alias (#5…
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche authored Sep 9, 2024
1 parent b717abb commit 83fd9ba
Show file tree
Hide file tree
Showing 29 changed files with 119 additions and 134 deletions.
28 changes: 28 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1272,6 +1272,34 @@ def string_dtype(request):
return request.param


@pytest.fixture(
    params=[
        ("python", pd.NA),
        pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
        pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
        ("python", np.nan),
    ],
    ids=[
        "string=string[python]",
        "string=string[pyarrow]",
        "string=str[pyarrow]",
        "string=str[python]",
    ],
)
def string_dtype_no_object(request):
    """
    Fixture yielding a ``pd.StringDtype`` for each (storage, na_value) pair.

    Parameters, in order (test id in parentheses):

    * storage "python", ``pd.NA`` ('string=string[python]')
    * storage "pyarrow", ``pd.NA`` ('string=string[pyarrow]'); skipped
      when pyarrow is not installed
    * storage "pyarrow", ``np.nan`` ('string=str[pyarrow]'); skipped
      when pyarrow is not installed
    * storage "python", ``np.nan`` ('string=str[python]')
    """
    # ``params`` deliberately holds plain (storage, na_value) tuples; the
    # StringDtype is only built here, inside the fixture body, so that test
    # collection does not import pyarrow.
    return pd.StringDtype(*request.param)


@pytest.fixture(
params=[
"string[python]",
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/apply/test_numba.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
DataFrame,
Index,
Expand All @@ -29,11 +30,10 @@ def test_numba_vs_python_noop(float_frame, apply_axis):

def test_numba_vs_python_string_index():
# GH#56189
pytest.importorskip("pyarrow")
df = DataFrame(
1,
index=Index(["a", "b"], dtype="string[pyarrow_numpy]"),
columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"),
index=Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
columns=Index(["x", "y"], dtype=pd.StringDtype(na_value=np.nan)),
)
func = lambda x: x
result = df.apply(func, engine="numba", axis=0)
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/arrays/string_/test_string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,10 +241,11 @@ def test_setitem_invalid_indexer_raises():
arr[[0, 1]] = ["foo", "bar", "baz"]


@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"])
def test_pickle_roundtrip(dtype):
@pytest.mark.parametrize("na_value", [pd.NA, np.nan])
def test_pickle_roundtrip(na_value):
# GH 42600
pytest.importorskip("pyarrow")
dtype = StringDtype("pyarrow", na_value=na_value)
expected = pd.Series(range(10), dtype=dtype)
expected_sliced = expected.head(2)
full_pickled = pickle.dumps(expected)
Expand Down
4 changes: 1 addition & 3 deletions pandas/tests/base/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,7 @@ def test_access_by_position(index_flat):
assert index[-1] == index[size - 1]

msg = f"index {size} is out of bounds for axis 0 with size {size}"
if is_dtype_equal(index.dtype, "string[pyarrow]") or is_dtype_equal(
index.dtype, "string[pyarrow_numpy]"
):
if isinstance(index.dtype, pd.StringDtype) and index.dtype.storage == "pyarrow":
msg = "index out of bounds"
with pytest.raises(IndexError, match=msg):
index[size]
Expand Down
10 changes: 3 additions & 7 deletions pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1864,13 +1864,11 @@ def test_adding_new_conditional_column() -> None:
("dtype", "infer_string"),
[
(object, False),
("string[pyarrow_numpy]", True),
(pd.StringDtype(na_value=np.nan), True),
],
)
def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None:
# https://github.com/pandas-dev/pandas/issues/56204
pytest.importorskip("pyarrow")

df = DataFrame({"a": [1, 2], "b": [3, 4]})
with pd.option_context("future.infer_string", infer_string):
df.loc[df["a"] == 1, "c"] = "1"
Expand All @@ -1880,16 +1878,14 @@ def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None:
tm.assert_frame_equal(df, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_add_new_column_infer_string():
# GH#55366
pytest.importorskip("pyarrow")
df = DataFrame({"x": [1]})
with pd.option_context("future.infer_string", True):
df.loc[df["x"] == 1, "y"] = "1"
expected = DataFrame(
{"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")},
columns=Index(["x", "y"], dtype=object),
{"x": [1], "y": Series(["1"], dtype=pd.StringDtype(na_value=np.nan))},
columns=Index(["x", "y"], dtype="str"),
)
tm.assert_frame_equal(df, expected)

Expand Down
14 changes: 7 additions & 7 deletions pandas/tests/frame/methods/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
)
from pandas.compat import HAS_PYARROW

import pandas as pd
from pandas import (
DataFrame,
Index,
Expand Down Expand Up @@ -502,14 +503,13 @@ def test_rank_mixed_axis_zero(self, data, expected):
result = df.rank(numeric_only=True)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
"dtype, exp_dtype",
[("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")],
)
def test_rank_string_dtype(self, dtype, exp_dtype):
def test_rank_string_dtype(self, string_dtype_no_object):
# GH#55362
pytest.importorskip("pyarrow")
obj = Series(["foo", "foo", None, "foo"], dtype=dtype)
obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object)
result = obj.rank(method="first")
exp_dtype = "Int64" if string_dtype_no_object.na_value is pd.NA else "float64"
if string_dtype_no_object.storage == "python":
# TODO nullable string[python] should also return nullable Int64
exp_dtype = "float64"
expected = Series([1, 2, None, 3], dtype=exp_dtype)
tm.assert_series_equal(result, expected)
7 changes: 2 additions & 5 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2655,8 +2655,7 @@ def test_construct_with_strings_and_none(self):

def test_frame_string_inference(self):
# GH#54430
pytest.importorskip("pyarrow")
dtype = "string[pyarrow_numpy]"
dtype = pd.StringDtype(na_value=np.nan)
expected = DataFrame(
{"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
)
Expand Down Expand Up @@ -2690,8 +2689,7 @@ def test_frame_string_inference(self):

def test_frame_string_inference_array_string_dtype(self):
# GH#54496
pytest.importorskip("pyarrow")
dtype = "string[pyarrow_numpy]"
dtype = pd.StringDtype(na_value=np.nan)
expected = DataFrame(
{"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
)
Expand All @@ -2715,7 +2713,6 @@ def test_frame_string_inference_array_string_dtype(self):

def test_frame_string_inference_block_dim(self):
# GH#55363
pytest.importorskip("pyarrow")
with pd.option_context("future.infer_string", True):
df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]]))
assert df._mgr.blocks[0].ndim == 2
Expand Down
13 changes: 2 additions & 11 deletions pandas/tests/groupby/methods/test_size.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@

from pandas._config import using_string_dtype

import pandas.util._test_decorators as td

from pandas import (
DataFrame,
Index,
Expand Down Expand Up @@ -79,16 +77,9 @@ def test_size_series_masked_type_returns_Int64(dtype):


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize(
"dtype",
[
object,
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
],
)
def test_size_strings(dtype):
def test_size_strings(any_string_dtype):
# GH#55627
dtype = any_string_dtype
df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype)
result = df.groupby("a")["b"].size()
exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64"
Expand Down
14 changes: 3 additions & 11 deletions pandas/tests/groupby/methods/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

from pandas import (
Categorical,
CategoricalIndex,
Expand Down Expand Up @@ -373,14 +371,6 @@ def test_against_frame_and_seriesgroupby(
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"dtype",
[
object,
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
],
)
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize(
"sort, ascending, expected_rows, expected_count, expected_group_size",
Expand All @@ -398,9 +388,10 @@ def test_compound(
expected_rows,
expected_count,
expected_group_size,
dtype,
any_string_dtype,
using_infer_string,
):
dtype = any_string_dtype
education_df = education_df.astype(dtype)
education_df.columns = education_df.columns.astype(dtype)
# Multiple groupby keys and as_index=False
Expand All @@ -417,6 +408,7 @@ def test_compound(
expected["proportion"] = expected_count
expected["proportion"] /= expected_group_size
if dtype == "string[pyarrow]":
# TODO(nullable) also string[python] should return nullable dtypes
expected["proportion"] = expected["proportion"].convert_dtypes()
else:
expected["count"] = expected_count
Expand Down
11 changes: 2 additions & 9 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2466,20 +2466,13 @@ def test_rolling_wrong_param_min_period():
test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum()


@pytest.mark.parametrize(
"dtype",
[
object,
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
],
)
def test_by_column_values_with_same_starting_value(dtype):
def test_by_column_values_with_same_starting_value(any_string_dtype):
# GH29635
df = DataFrame(
{
"Name": ["Thomas", "Thomas", "Thomas John"],
"Credit": [1200, 1300, 900],
"Mood": Series(["sad", "happy", "happy"], dtype=dtype),
"Mood": Series(["sad", "happy", "happy"], dtype=any_string_dtype),
}
)
aggregate_details = {"Mood": Series.mode, "Credit": "sum"}
Expand Down
5 changes: 2 additions & 3 deletions pandas/tests/groupby/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -714,10 +714,9 @@ def test_groupby_min_max_categorical(func):


@pytest.mark.parametrize("func", ["min", "max"])
def test_min_empty_string_dtype(func):
def test_min_empty_string_dtype(func, string_dtype_no_object):
# GH#55619
pytest.importorskip("pyarrow")
dtype = "string[pyarrow_numpy]"
dtype = string_dtype_no_object
df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0]
result = getattr(df.groupby("a"), func)()
expected = DataFrame(
Expand Down
4 changes: 1 addition & 3 deletions pandas/tests/indexes/base_class/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,7 @@ def test_construct_empty_tuples(self, tuple_list):

def test_index_string_inference(self):
# GH#54430
pytest.importorskip("pyarrow")
dtype = "string[pyarrow_numpy]"
expected = Index(["a", "b"], dtype=dtype)
expected = Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan))
with pd.option_context("future.infer_string", True):
ser = Index(["a", "b"])
tm.assert_index_equal(ser, expected)
Expand Down
7 changes: 3 additions & 4 deletions pandas/tests/indexes/base_class/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,11 @@ def test_insert_datetime_into_object(self, loc, val):
tm.assert_index_equal(result, expected)
assert type(expected[2]) is type(val)

def test_insert_none_into_string_numpy(self):
def test_insert_none_into_string_numpy(self, string_dtype_no_object):
# GH#55365
pytest.importorskip("pyarrow")
index = Index(["a", "b", "c"], dtype="string[pyarrow_numpy]")
index = Index(["a", "b", "c"], dtype=string_dtype_no_object)
result = index.insert(-1, None)
expected = Index(["a", "b", None, "c"], dtype="string[pyarrow_numpy]")
expected = Index(["a", "b", None, "c"], dtype=string_dtype_no_object)
tm.assert_index_equal(result, expected)

@pytest.mark.parametrize(
Expand Down
23 changes: 6 additions & 17 deletions pandas/tests/indexes/object/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
NA,
is_matching_na,
)
import pandas.util._test_decorators as td

import pandas as pd
from pandas import Index
Expand Down Expand Up @@ -160,14 +159,6 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2):


class TestSliceLocs:
# TODO(infer_string) parametrize over multiple string dtypes
@pytest.mark.parametrize(
"dtype",
[
"object",
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
],
)
@pytest.mark.parametrize(
"in_slice,expected",
[
Expand All @@ -191,24 +182,22 @@ class TestSliceLocs:
(pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc]
],
)
def test_slice_locs_negative_step(self, in_slice, expected, dtype):
index = Index(list("bcdxy"), dtype=dtype)
def test_slice_locs_negative_step(self, in_slice, expected, any_string_dtype):
index = Index(list("bcdxy"), dtype=any_string_dtype)

s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step)
result = index[s_start : s_stop : in_slice.step]
expected = Index(list(expected), dtype=dtype)
expected = Index(list(expected), dtype=any_string_dtype)
tm.assert_index_equal(result, expected)

# TODO(infer_string) parametrize over multiple string dtypes
@td.skip_if_no("pyarrow")
def test_slice_locs_negative_step_oob(self):
index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]")
def test_slice_locs_negative_step_oob(self, any_string_dtype):
index = Index(list("bcdxy"), dtype=any_string_dtype)

result = index[-10:5:1]
tm.assert_index_equal(result, index)

result = index[4:-10:-1]
expected = Index(list("yxdcb"), dtype="string[pyarrow_numpy]")
expected = Index(list("yxdcb"), dtype=any_string_dtype)
tm.assert_index_equal(result, expected)

def test_slice_locs_dup(self):
Expand Down
5 changes: 2 additions & 3 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -933,10 +933,9 @@ def test_isin_empty(self, empty):
result = index.isin(empty)
tm.assert_numpy_array_equal(expected, result)

@td.skip_if_no("pyarrow")
def test_isin_arrow_string_null(self):
def test_isin_string_null(self, string_dtype_no_object):
# GH#55821
index = Index(["a", "b"], dtype="string[pyarrow_numpy]")
index = Index(["a", "b"], dtype=string_dtype_no_object)
result = index.isin([None])
expected = np.array([False, False])
tm.assert_numpy_array_equal(result, expected)
Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/indexes/test_old_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,10 @@ def test_ensure_copied_data(self, index):
tm.assert_numpy_array_equal(
index._values._ndarray, result._values._ndarray, check_same="same"
)
elif index.dtype in ("string[pyarrow]", "string[pyarrow_numpy]"):
elif (
isinstance(index.dtype, StringDtype)
and index.dtype.storage == "pyarrow"
):
assert tm.shares_memory(result._values, index._values)
else:
raise NotImplementedError(index.dtype)
Expand Down
Loading

0 comments on commit 83fd9ba

Please sign in to comment.