diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index d392c84be66fe..d145836f3e596 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -380,7 +380,7 @@ jobs: fetch-depth: 0 - name: Set up Python Free-threading Version - uses: deadsnakes/action@v3.1.0 + uses: deadsnakes/action@v3.2.0 with: python-version: 3.13-dev nogil: true diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7ed5103b3b796..06078d8958492 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -75,9 +75,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Period.ordinal GL08" \ -i "pandas.PeriodDtype.freq SA01" \ -i "pandas.RangeIndex.from_range PR01,SA01" \ - -i "pandas.RangeIndex.start SA01" \ -i "pandas.RangeIndex.step SA01" \ - -i "pandas.RangeIndex.stop SA01" \ -i "pandas.Series.cat.add_categories PR01,PR02" \ -i "pandas.Series.cat.as_ordered PR01" \ -i "pandas.Series.cat.as_unordered PR01" \ @@ -92,10 +90,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.floor PR01,PR02" \ -i "pandas.Series.dt.freq GL08" \ -i "pandas.Series.dt.month_name PR01,PR02" \ - -i "pandas.Series.dt.nanoseconds SA01" \ -i "pandas.Series.dt.normalize PR01" \ -i "pandas.Series.dt.round PR01,PR02" \ - -i "pandas.Series.dt.seconds SA01" \ -i "pandas.Series.dt.strftime PR01,PR02" \ -i "pandas.Series.dt.to_period PR01,PR02" \ -i "pandas.Series.dt.total_seconds PR01" \ @@ -113,8 +109,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timedelta.resolution PR02" \ -i "pandas.Timedelta.to_timedelta64 SA01" \ -i "pandas.Timedelta.total_seconds SA01" \ - -i "pandas.TimedeltaIndex.nanoseconds SA01" \ - -i "pandas.TimedeltaIndex.seconds SA01" \ -i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \ -i "pandas.Timestamp.max PR02" \ -i "pandas.Timestamp.min PR02" \ @@ -123,13 +117,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.tzinfo GL08" \ -i 
"pandas.Timestamp.year GL08" \ -i "pandas.api.extensions.ExtensionArray.interpolate PR01,SA01" \ - -i "pandas.api.types.is_bool PR01,SA01" \ - -i "pandas.api.types.is_categorical_dtype SA01" \ - -i "pandas.api.types.is_complex PR01,SA01" \ - -i "pandas.api.types.is_complex_dtype SA01" \ - -i "pandas.api.types.is_datetime64_dtype SA01" \ - -i "pandas.api.types.is_datetime64_ns_dtype SA01" \ - -i "pandas.api.types.is_datetime64tz_dtype SA01" \ -i "pandas.api.types.is_dict_like PR07,SA01" \ -i "pandas.api.types.is_extension_array_dtype SA01" \ -i "pandas.api.types.is_file_like PR07,SA01" \ @@ -163,7 +150,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.DataFrameGroupBy.agg RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.aggregate RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.filter SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.groups SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.hist RT03" \ @@ -179,7 +165,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.SeriesGroupBy.__iter__ RT03,SA01" \ -i "pandas.core.groupby.SeriesGroupBy.agg RT03" \ -i "pandas.core.groupby.SeriesGroupBy.aggregate RT03" \ - -i "pandas.core.groupby.SeriesGroupBy.filter PR01,SA01" \ -i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \ -i "pandas.core.groupby.SeriesGroupBy.groups SA01" \ -i "pandas.core.groupby.SeriesGroupBy.indices SA01" \ diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 03355f655eb28..03b3a6b55dff6 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -103,8 +103,9 @@ Conversion Strings ^^^^^^^ - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) +- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and 
:class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`) - Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) - +- Interval ^^^^^^^^ diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 9a29ff4d49966..819318e119668 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -55,6 +55,7 @@ Other enhancements - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) +- :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`) - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) - Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e1a2a0142c52e..75f58f565dd6f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -733,7 +733,9 @@ cpdef ndarray[object] ensure_string_array( convert_na_value : bool, default True If False, existing na values will be used unchanged in the new array. copy : bool, default True - Whether to ensure that a new array is returned. + Whether to ensure that a new array is returned. When True, a new array + is always returned. 
When False, a new array is only returned when needed + to avoid mutating the input array. skipna : bool, default True Whether or not to coerce nulls to their stringified form (e.g. if False, NaN becomes 'nan'). @@ -762,11 +764,15 @@ cpdef ndarray[object] ensure_string_array( result = np.asarray(arr, dtype="object") - if copy and (result is arr or np.shares_memory(arr, result)): - # GH#54654 - result = result.copy() - elif not copy and result is arr: - already_copied = False + if result is arr or np.may_share_memory(arr, result): + # if np.asarray(..) did not make a copy of the input arr, we still need + # to do that to avoid mutating the input array + # GH#54654: share_memory check is needed for rare cases where np.asarray + # returns a new object without making a copy of the actual data + if copy: + result = result.copy() + else: + already_copied = False elif not copy and not result.flags.writeable: # Weird edge case where result is a view already_copied = False @@ -1123,10 +1129,21 @@ def is_bool(obj: object) -> bool: """ Return True if given object is boolean. + Parameters + ---------- + obj : object + Object to check. + Returns ------- bool + See Also + -------- + api.types.is_scalar : Check if the input is a scalar. + api.types.is_integer : Check if the input is an integer. + api.types.is_float : Check if the input is a float. + Examples -------- >>> pd.api.types.is_bool(True) @@ -1142,10 +1159,22 @@ def is_complex(obj: object) -> bool: """ Return True if given object is complex. + Parameters + ---------- + obj : object + Object to check. + Returns ------- bool + See Also + -------- + api.types.is_complex_dtype: Check whether the provided array or + dtype is of a complex dtype. + api.types.is_number: Check if the object is a number. + api.types.is_integer: Return True if given object is integer. 
+ Examples -------- >>> pd.api.types.is_complex(1 + 1j) diff --git a/pandas/conftest.py b/pandas/conftest.py index d11213f1164bc..222aefb4afda8 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1272,6 +1272,34 @@ def string_dtype(request): return request.param +@pytest.fixture( + params=[ + ("python", pd.NA), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ("python", np.nan), + ], + ids=[ + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + "string=str[python]", + ], +) +def string_dtype_no_object(request): + """ + Parametrized fixture for string dtypes. + * 'string[python]' (NA variant) + * 'string[pyarrow]' (NA variant) + * 'str' (NaN variant, with pyarrow) + * 'str' (NaN variant, without pyarrow) + """ + # need to instantiate the StringDtype here instead of in the params + # to avoid importing pyarrow during test collection + storage, na_value = request.param + return pd.StringDtype(storage, na_value) + + @pytest.fixture( params=[ "string[python]", diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 3058b4fb966f6..0db529b7262ee 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -11,6 +11,7 @@ from pandas.compat import ( pa_version_under10p1, + pa_version_under11p0, pa_version_under13p0, pa_version_under17p0, ) @@ -22,10 +23,7 @@ import pyarrow.compute as pc if TYPE_CHECKING: - from collections.abc import ( - Callable, - Sized, - ) + from collections.abc import Callable from pandas._typing import ( Scalar, @@ -34,7 +32,7 @@ class ArrowStringArrayMixin: - _pa_array: Sized + _pa_array: pa.ChunkedArray def __init__(self, *args, **kwargs) -> None: raise NotImplementedError @@ -127,13 +125,29 @@ def _str_get(self, i: int) -> Self: selected = pc.utf8_slice_codeunits( self._pa_array, start=start, stop=stop, step=step ) - null_value 
= pa.scalar( - None, - type=self._pa_array.type, # type: ignore[attr-defined] - ) + null_value = pa.scalar(None, type=self._pa_array.type) result = pc.if_else(not_out_of_bounds, selected, null_value) return type(self)(result) + def _str_slice( + self, start: int | None = None, stop: int | None = None, step: int | None = None + ) -> Self: + if pa_version_under11p0: + # GH#59724 + result = self._apply_elementwise(lambda val: val[start:stop:step]) + return type(self)(pa.chunked_array(result, type=self._pa_array.type)) + if start is None: + if step is not None and step < 0: + # GH#59710 + start = -1 + else: + start = 0 + if step is None: + step = 1 + return type(self)( + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) + ) + def _str_slice_replace( self, start: int | None = None, stop: int | None = None, repl: str | None = None ) -> Self: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a46156b6b7db3..be9cca50d09de 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -41,6 +41,7 @@ is_list_like, is_numeric_dtype, is_scalar, + pandas_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna @@ -2376,17 +2377,6 @@ def _str_rpartition(self, sep: str, expand: bool) -> Self: result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_slice( - self, start: int | None = None, stop: int | None = None, step: int | None = None - ) -> Self: - if start is None: - start = 0 - if step is None: - step = 1 - return type(self)( - pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) - ) - def _str_removeprefix(self, prefix: str): if not pa_version_under13p0: starts_with = pc.starts_with(self._pa_array, pattern=prefix) @@ -2428,7 +2418,9 @@ def _str_findall(self, pat: str, flags: int = 0) -> Self: result = self._apply_elementwise(predicate) return 
type(self)(pa.chunked_array(result)) - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): + if dtype is None: + dtype = np.bool_ split = pc.split_pattern(self._pa_array, sep) flattened_values = pc.list_flatten(split) uniques = flattened_values.unique() @@ -2438,7 +2430,15 @@ def _str_get_dummies(self, sep: str = "|"): n_cols = len(uniques) indices = pc.index_in(flattened_values, uniques_sorted).to_numpy() indices = indices + np.arange(n_rows).repeat(lengths) * n_cols - dummies = np.zeros(n_rows * n_cols, dtype=np.bool_) + _dtype = pandas_dtype(dtype) + dummies_dtype: NpDtype + if isinstance(_dtype, np.dtype): + dummies_dtype = _dtype + else: + dummies_dtype = np.bool_ + dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype) + if dtype == str: + dummies[:] = False dummies[indices] = True dummies = dummies.reshape((n_rows, n_cols)) result = type(self)(pa.array(list(dummies))) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c613a345686cc..8e0225b31e17b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2681,11 +2681,11 @@ def _str_map( result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype) return take_nd(result, codes, fill_value=na_value) - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): # sep may not be in categories. Just bail on this. 
from pandas.core.arrays import NumpyExtensionArray - return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep) + return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep, dtype) # ------------------------------------------------------------------------ # GroupBy Methods diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 50aa48ded58a5..db0b3e0237eb0 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -55,6 +55,7 @@ from pandas._typing import ( ArrayLike, Dtype, + NpDtype, Self, npt, ) @@ -305,6 +306,7 @@ def astype(self, dtype, copy: bool = True): _str_swapcase = ArrowStringArrayMixin._str_swapcase _str_slice_replace = ArrowStringArrayMixin._str_slice_replace _str_len = ArrowStringArrayMixin._str_len + _str_slice = ArrowStringArrayMixin._str_slice def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True @@ -351,13 +353,6 @@ def _str_repeat(self, repeats: int | Sequence[int]): else: return ArrowExtensionArray._str_repeat(self, repeats=repeats) - def _str_slice( - self, start: int | None = None, stop: int | None = None, step: int | None = None - ) -> Self: - if stop is None: - return super()._str_slice(start, stop, step) - return ArrowExtensionArray._str_slice(self, start=start, stop=stop, step=step) - def _str_removeprefix(self, prefix: str): if not pa_version_under13p0: return ArrowStringArrayMixin._str_removeprefix(self, prefix) @@ -379,12 +374,22 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): return super()._str_find(sub, start, end) return ArrowStringArrayMixin._str_find(self, sub, start, end) - def _str_get_dummies(self, sep: str = "|"): - dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep) + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): + if dtype is None: + dtype = np.int64 + dummies_pa, labels = 
ArrowExtensionArray(self._pa_array)._str_get_dummies( + sep, dtype + ) if len(labels) == 0: - return np.empty(shape=(0, 0), dtype=np.int64), labels + return np.empty(shape=(0, 0), dtype=dtype), labels dummies = np.vstack(dummies_pa.to_numpy()) - return dummies.astype(np.int64, copy=False), labels + _dtype = pandas_dtype(dtype) + dummies_dtype: NpDtype + if isinstance(_dtype, np.dtype): + dummies_dtype = _dtype + else: + dummies_dtype = np.bool_ + return dummies.astype(dummies_dtype, copy=False), labels def _convert_int_result(self, result): if self.dtype.na_value is np.nan: diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index c8a86ffc187d0..754ae277e359a 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -842,6 +842,11 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: seconds_docstring = textwrap.dedent( """Number of seconds (>= 0 and less than 1 day) for each element. + See Also + -------- + Series.dt.seconds : Return number of seconds for each element. + Series.dt.nanoseconds : Return number of nanoseconds for each element. + Examples -------- For Series: @@ -917,6 +922,11 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: nanoseconds_docstring = textwrap.dedent( """Number of nanoseconds (>= 0 and less than 1 microsecond) for each element. + See Also + -------- + Series.dt.seconds : Return number of seconds for each element. + Series.dt.microseconds : Return number of microseconds for each element. + Examples -------- For Series: diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index bcf1ade9b0320..16f6bd396fe93 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -279,6 +279,13 @@ def is_datetime64_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the datetime64 dtype. + See Also + -------- + api.types.is_datetime64_ns_dtype: Check whether the provided array or + dtype is of the datetime64[ns] dtype. 
+ api.types.is_datetime64_any_dtype: Check whether the provided array or + dtype is of the datetime64 dtype. + Examples -------- >>> from pandas.api.types import is_datetime64_dtype @@ -316,6 +323,13 @@ def is_datetime64tz_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of a DatetimeTZDtype dtype. + See Also + -------- + api.types.is_datetime64_dtype: Check whether an array-like or + dtype is of the datetime64 dtype. + api.types.is_datetime64_any_dtype: Check whether the provided array or + dtype is of the datetime64 dtype. + Examples -------- >>> from pandas.api.types import is_datetime64tz_dtype @@ -514,6 +528,12 @@ def is_categorical_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the Categorical dtype. + See Also + -------- + api.types.is_list_like: Check if the object is list-like. + api.types.is_complex_dtype: Check whether the provided array or + dtype is of a complex dtype. + Examples -------- >>> from pandas.api.types import is_categorical_dtype @@ -977,6 +997,13 @@ def is_datetime64_ns_dtype(arr_or_dtype) -> bool: bool Whether or not the array or dtype is of the datetime64[ns] dtype. + See Also + -------- + api.types.is_datetime64_dtype: Check whether an array-like or + dtype is of the datetime64 dtype. + api.types.is_datetime64_any_dtype: Check whether the provided array or + dtype is of the datetime64 dtype. + Examples -------- >>> from pandas.api.types import is_datetime64_ns_dtype @@ -1436,6 +1463,14 @@ def is_complex_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of a complex dtype. + See Also + -------- + api.types.is_complex: Return True if given object is complex. + api.types.is_numeric_dtype: Check whether the provided array or + dtype is of a numeric dtype. + api.types.is_integer_dtype: Check whether the provided array or + dtype is of an integer dtype. 
+ Examples -------- >>> from pandas.api.types import is_complex_dtype diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c112d9b6a4b54..230f61bab96df 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -600,15 +600,23 @@ def filter(self, func, dropna: bool = True, *args, **kwargs): ---------- func : function Criterion to apply to each group. Should return True or False. - dropna : bool + dropna : bool, optional Drop groups that do not pass the filter. True by default; if False, groups that evaluate False are filled with NaNs. + *args : tuple + Optional positional arguments to pass to `func`. + **kwargs : dict + Optional keyword arguments to pass to `func`. Returns ------- Series The filtered subset of the original Series. + See Also + -------- + DataFrameGroupBy.filter : Filter elements from groups based on criterion. + Notes ----- Functions that mutate the passed object can produce unexpected @@ -1943,9 +1951,9 @@ def filter(self, func, dropna: bool = True, *args, **kwargs) -> DataFrame: dropna : bool Drop groups that do not pass the filter. True by default; if False, groups that evaluate False are filled with NaNs. - *args + *args : tuple Additional positional arguments to pass to `func`. - **kwargs + **kwargs : dict Additional keyword arguments to pass to `func`. Returns ------- DataFrame The filtered subset of the original DataFrame. + See Also + -------- + SeriesGroupBy.filter : Filter elements from groups based on criterion. + Notes ----- Each subframe is endowed the attribute 'name' in case you need to know diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index b11ce6bd7b919..75d0dfbeb6f01 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -295,6 +295,16 @@ def start(self) -> int: """ The value of the `start` parameter (``0`` if this was not supplied). 
+ This property returns the starting value of the `RangeIndex`. If the `start` + value is not explicitly provided during the creation of the `RangeIndex`, + it defaults to 0. + + See Also + -------- + RangeIndex : Immutable index implementing a range-based index. + RangeIndex.stop : Returns the stop value of the `RangeIndex`. + RangeIndex.step : Returns the step value of the `RangeIndex`. + Examples -------- >>> idx = pd.RangeIndex(5) @@ -313,6 +323,17 @@ def stop(self) -> int: """ The value of the `stop` parameter. + This property returns the `stop` value of the RangeIndex, which defines the + upper (or lower, in case of negative steps) bound of the index range. The + `stop` value is exclusive, meaning the RangeIndex includes values up to but + not including this value. + + See Also + -------- + RangeIndex : Immutable index representing a range of integers. + RangeIndex.start : The start value of the RangeIndex. + RangeIndex.step : The step size between elements in the RangeIndex. + Examples -------- >>> idx = pd.RangeIndex(5) diff --git a/pandas/core/series.py b/pandas/core/series.py index 4f79e30f48f3c..0c26ce27c680c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -580,8 +580,15 @@ def __arrow_c_stream__(self, requested_schema=None): PyCapsule """ pa = import_optional_dependency("pyarrow", min_version="16.0.0") - ca = pa.chunked_array([pa.Array.from_pandas(self, type=requested_schema)]) - return ca.__arrow_c_stream__(requested_schema) + type = ( + pa.DataType._import_from_c_capsule(requested_schema) + if requested_schema is not None + else None + ) + ca = pa.array(self, type=type) + if not isinstance(ca, pa.ChunkedArray): + ca = pa.chunked_array([ca]) + return ca.__arrow_c_stream__() # ---------------------------------------------------------------------- diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index bdb88e981bcda..6d10365a1b968 100644 --- a/pandas/core/strings/accessor.py +++ 
b/pandas/core/strings/accessor.py @@ -26,6 +26,7 @@ from pandas.core.dtypes.common import ( ensure_object, is_bool_dtype, + is_extension_array_dtype, is_integer, is_list_like, is_object_dtype, @@ -54,6 +55,8 @@ Iterator, ) + from pandas._typing import NpDtype + from pandas import ( DataFrame, Index, @@ -2431,7 +2434,11 @@ def wrap( return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) - def get_dummies(self, sep: str = "|"): + def get_dummies( + self, + sep: str = "|", + dtype: NpDtype | None = None, + ): """ Return DataFrame of dummy/indicator variables for Series. @@ -2442,6 +2449,8 @@ def get_dummies(self, sep: str = "|"): ---------- sep : str, default "|" String to split on. + dtype : dtype, default np.int64 + Data type for new columns. Only a single dtype is allowed. Returns ------- @@ -2466,10 +2475,24 @@ def get_dummies(self, sep: str = "|"): 0 1 1 0 1 0 0 0 2 1 0 1 + + >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(dtype=bool) + a b c + 0 True True False + 1 False False False + 2 True False True """ + from pandas.core.frame import DataFrame + # we need to cast to Series of strings as only that has all # methods available for making the dummies... 
- result, name = self._data.array._str_get_dummies(sep) + result, name = self._data.array._str_get_dummies(sep, dtype) + if is_extension_array_dtype(dtype) or isinstance(dtype, ArrowDtype): + return self._wrap_result( + DataFrame(result, columns=name, dtype=dtype), + name=name, + returns_string=False, + ) return self._wrap_result( result, name=name, diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index 1281a03e297f9..97d906e3df077 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -16,6 +16,7 @@ import re from pandas._typing import ( + NpDtype, Scalar, Self, ) @@ -163,7 +164,7 @@ def _str_wrap(self, width: int, **kwargs): pass @abc.abstractmethod - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): pass @abc.abstractmethod diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index c6b18d7049c57..6211c7b528db9 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -18,6 +18,7 @@ import pandas._libs.ops as libops from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.missing import isna from pandas.core.strings.base import BaseStringArrayMethods @@ -398,9 +399,11 @@ def _str_wrap(self, width: int, **kwargs): tw = textwrap.TextWrapper(**kwargs) return self._str_map(lambda s: "\n".join(tw.wrap(s))) - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): from pandas import Series + if dtype is None: + dtype = np.int64 arr = Series(self).fillna("") try: arr = sep + arr + sep @@ -412,7 +415,13 @@ def _str_get_dummies(self, sep: str = "|"): tags.update(ts) tags2 = sorted(tags - {""}) - dummies = np.empty((len(arr), len(tags2)), dtype=np.int64) + _dtype = pandas_dtype(dtype) + dummies_dtype: NpDtype + if isinstance(_dtype, np.dtype): + 
dummies_dtype = _dtype + else: + dummies_dtype = np.bool_ + dummies = np.empty((len(arr), len(tags2)), dtype=dummies_dtype) def _isin(test_elements: str, element: str) -> bool: return element in test_elements diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index d86eeadbaa0fe..825d295043e69 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -5,6 +5,7 @@ import pandas.util._test_decorators as td +import pandas as pd from pandas import ( DataFrame, Index, @@ -29,11 +30,10 @@ def test_numba_vs_python_noop(float_frame, apply_axis): def test_numba_vs_python_string_index(): # GH#56189 - pytest.importorskip("pyarrow") df = DataFrame( 1, - index=Index(["a", "b"], dtype="string[pyarrow_numpy]"), - columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + index=Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), + columns=Index(["x", "y"], dtype=pd.StringDtype(na_value=np.nan)), ) func = lambda x: x result = df.apply(func, engine="numba", axis=0) diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 58ba340441d86..8e13dcf25ceba 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -222,9 +222,10 @@ def test_min_max(self, left_right_dtypes, index_or_series_or_array): res = arr_na.max(skipna=False) assert np.isnan(res) - res = arr_na.min(skipna=True) - assert res == MIN - assert type(res) == type(MIN) - res = arr_na.max(skipna=True) - assert res == MAX - assert type(res) == type(MAX) + for kws in [{"skipna": True}, {}]: + res = arr_na.min(**kws) + assert res == MIN + assert type(res) == type(MIN) + res = arr_na.max(**kws) + assert res == MAX + assert type(res) == type(MAX) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index b042cf632288b..d4363171788d4 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ 
b/pandas/tests/arrays/string_/test_string_arrow.py @@ -241,10 +241,11 @@ def test_setitem_invalid_indexer_raises(): arr[[0, 1]] = ["foo", "bar", "baz"] -@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"]) -def test_pickle_roundtrip(dtype): +@pytest.mark.parametrize("na_value", [pd.NA, np.nan]) +def test_pickle_roundtrip(na_value): # GH 42600 pytest.importorskip("pyarrow") + dtype = StringDtype("pyarrow", na_value=na_value) expected = pd.Series(range(10), dtype=dtype) expected_sliced = expected.head(2) full_pickled = pickle.dumps(expected) diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index bbd9b150b88a8..7819b7b75f065 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -183,9 +183,7 @@ def test_access_by_position(index_flat): assert index[-1] == index[size - 1] msg = f"index {size} is out of bounds for axis 0 with size {size}" - if is_dtype_equal(index.dtype, "string[pyarrow]") or is_dtype_equal( - index.dtype, "string[pyarrow_numpy]" - ): + if isinstance(index.dtype, pd.StringDtype) and index.dtype.storage == "pyarrow": msg = "index out of bounds" with pytest.raises(IndexError, match=msg): index[size] diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index de56d5e4a07ee..80c30f2d0c26e 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -7,7 +7,6 @@ from pandas.compat import HAS_PYARROW from pandas.compat.pyarrow import pa_version_under12p0 -import pandas.util._test_decorators as td from pandas import ( DataFrame, @@ -111,7 +110,8 @@ def test_astype_string_and_object_update_original(dtype, new_dtype): tm.assert_frame_equal(df2, df_orig) -def test_astype_string_copy_on_pickle_roundrip(): +def test_astype_str_copy_on_pickle_roundrip(): + # TODO(infer_string) this test can be removed after 3.0 (once str is the default) # https://github.com/pandas-dev/pandas/issues/54654 # ensure_string_array 
may alter array inplace base = Series(np.array([(1, 2), None, 1], dtype="object")) @@ -120,14 +120,22 @@ def test_astype_string_copy_on_pickle_roundrip(): tm.assert_series_equal(base, base_copy) -@td.skip_if_no("pyarrow") -def test_astype_string_read_only_on_pickle_roundrip(): +def test_astype_string_copy_on_pickle_roundrip(any_string_dtype): + # https://github.com/pandas-dev/pandas/issues/54654 + # ensure_string_array may alter array inplace + base = Series(np.array([(1, 2), None, 1], dtype="object")) + base_copy = pickle.loads(pickle.dumps(base)) + base_copy.astype(any_string_dtype) + tm.assert_series_equal(base, base_copy) + + +def test_astype_string_read_only_on_pickle_roundrip(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/54654 # ensure_string_array may alter read-only array inplace base = Series(np.array([(1, 2), None, 1], dtype="object")) base_copy = pickle.loads(pickle.dumps(base)) base_copy._values.flags.writeable = False - base_copy.astype("string[pyarrow]") + base_copy.astype(any_string_dtype) tm.assert_series_equal(base, base_copy) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index fc4f14882b9d7..f86d927ddda67 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2036,6 +2036,7 @@ def test_str_join_string_type(): [None, 2, None, ["ab", None]], [None, 2, 1, ["ab", None]], [1, 3, 1, ["bc", None]], + (None, None, -1, ["dcba", None]), ], ) def test_str_slice(start, stop, step, exp): diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 8ce4e8725d632..0723c3c70091c 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1864,13 +1864,11 @@ def test_adding_new_conditional_column() -> None: ("dtype", "infer_string"), [ (object, False), - ("string[pyarrow_numpy]", True), + (pd.StringDtype(na_value=np.nan), True), ], ) def 
test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: # https://github.com/pandas-dev/pandas/issues/56204 - pytest.importorskip("pyarrow") - df = DataFrame({"a": [1, 2], "b": [3, 4]}) with pd.option_context("future.infer_string", infer_string): df.loc[df["a"] == 1, "c"] = "1" @@ -1880,16 +1878,14 @@ def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: tm.assert_frame_equal(df, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_add_new_column_infer_string(): # GH#55366 - pytest.importorskip("pyarrow") df = DataFrame({"x": [1]}) with pd.option_context("future.infer_string", True): df.loc[df["x"] == 1, "y"] = "1" expected = DataFrame( - {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")}, - columns=Index(["x", "y"], dtype=object), + {"x": [1], "y": Series(["1"], dtype=pd.StringDtype(na_value=np.nan))}, + columns=Index(["x", "y"], dtype="str"), ) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 4b1435babe6b1..c1cdeaa6c10dd 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -14,6 +14,7 @@ ) from pandas.compat import HAS_PYARROW +import pandas as pd from pandas import ( DataFrame, Index, @@ -502,14 +503,13 @@ def test_rank_mixed_axis_zero(self, data, expected): result = df.rank(numeric_only=True) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "dtype, exp_dtype", - [("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")], - ) - def test_rank_string_dtype(self, dtype, exp_dtype): + def test_rank_string_dtype(self, string_dtype_no_object): # GH#55362 - pytest.importorskip("pyarrow") - obj = Series(["foo", "foo", None, "foo"], dtype=dtype) + obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object) result = obj.rank(method="first") + exp_dtype = "Int64" if string_dtype_no_object.na_value is 
pd.NA else "float64" + if string_dtype_no_object.storage == "python": + # TODO nullable string[python] should also return nullable Int64 + exp_dtype = "float64" expected = Series([1, 2, None, 3], dtype=exp_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 0176a36fe78d7..3d46e03547c38 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2655,8 +2655,7 @@ def test_construct_with_strings_and_none(self): def test_frame_string_inference(self): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) @@ -2690,8 +2689,7 @@ def test_frame_string_inference(self): def test_frame_string_inference_array_string_dtype(self): # GH#54496 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) @@ -2715,7 +2713,6 @@ def test_frame_string_inference_array_string_dtype(self): def test_frame_string_inference_block_dim(self): # GH#55363 - pytest.importorskip("pyarrow") with pd.option_context("future.infer_string", True): df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) assert df._mgr.blocks[0].ndim == 2 diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index edeac642551a0..91200f53e36bd 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -3,8 +3,6 @@ from pandas._config import using_string_dtype -import pandas.util._test_decorators as td - from pandas import ( DataFrame, Index, @@ -79,16 +77,9 @@ def test_size_series_masked_type_returns_Int64(dtype): @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", 
strict=False) -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], -) -def test_size_strings(dtype): +def test_size_strings(any_string_dtype): # GH#55627 + dtype = any_string_dtype df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype) result = df.groupby("a")["b"].size() exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64" diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index da3d626f2d777..8f8f7f64aba75 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -7,8 +7,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( Categorical, CategoricalIndex, @@ -373,14 +371,6 @@ def test_against_frame_and_seriesgroupby( tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], -) @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize( "sort, ascending, expected_rows, expected_count, expected_group_size", @@ -398,9 +388,10 @@ def test_compound( expected_rows, expected_count, expected_group_size, - dtype, + any_string_dtype, using_infer_string, ): + dtype = any_string_dtype education_df = education_df.astype(dtype) education_df.columns = education_df.columns.astype(dtype) # Multiple groupby keys and as_index=False @@ -417,6 +408,7 @@ def test_compound( expected["proportion"] = expected_count expected["proportion"] /= expected_group_size if dtype == "string[pyarrow]": + # TODO(nullable) also string[python] should return nullable dtypes expected["proportion"] = expected["proportion"].convert_dtypes() else: expected["count"] = expected_count 
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 11b874d0b1608..6393468fb8ccd 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2466,20 +2466,13 @@ def test_rolling_wrong_param_min_period(): test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum() -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - ], -) -def test_by_column_values_with_same_starting_value(dtype): +def test_by_column_values_with_same_starting_value(any_string_dtype): # GH29635 df = DataFrame( { "Name": ["Thomas", "Thomas", "Thomas John"], "Credit": [1200, 1300, 900], - "Mood": Series(["sad", "happy", "happy"], dtype=dtype), + "Mood": Series(["sad", "happy", "happy"], dtype=any_string_dtype), } ) aggregate_details = {"Mood": Series.mode, "Credit": "sum"} diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 8a421654cdf9b..a6ea1502103c5 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -714,10 +714,9 @@ def test_groupby_min_max_categorical(func): @pytest.mark.parametrize("func", ["min", "max"]) -def test_min_empty_string_dtype(func): +def test_min_empty_string_dtype(func, string_dtype_no_object): # GH#55619 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = string_dtype_no_object df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0] result = getattr(df.groupby("a"), func)() expected = DataFrame( diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index 6036eddce7a01..0896b97e8a40e 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -47,9 +47,7 @@ def test_construct_empty_tuples(self, tuple_list): def test_index_string_inference(self): # 
GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" - expected = Index(["a", "b"], dtype=dtype) + expected = Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)) with pd.option_context("future.infer_string", True): ser = Index(["a", "b"]) tm.assert_index_equal(ser, expected) diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index e17e39a334acc..56cdca49cb2b0 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -57,12 +57,11 @@ def test_insert_datetime_into_object(self, loc, val): tm.assert_index_equal(result, expected) assert type(expected[2]) is type(val) - def test_insert_none_into_string_numpy(self): + def test_insert_none_into_string_numpy(self, string_dtype_no_object): # GH#55365 - pytest.importorskip("pyarrow") - index = Index(["a", "b", "c"], dtype="string[pyarrow_numpy]") + index = Index(["a", "b", "c"], dtype=string_dtype_no_object) result = index.insert(-1, None) - expected = Index(["a", "b", None, "c"], dtype="string[pyarrow_numpy]") + expected = Index(["a", "b", None, "c"], dtype=string_dtype_no_object) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 2e9ba007a45c1..ea3d068a673e8 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -7,7 +7,6 @@ NA, is_matching_na, ) -import pandas.util._test_decorators as td import pandas as pd from pandas import Index @@ -160,14 +159,6 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): class TestSliceLocs: - # TODO(infer_string) parametrize over multiple string dtypes - @pytest.mark.parametrize( - "dtype", - [ - "object", - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - ], - ) @pytest.mark.parametrize( "in_slice,expected", [ @@ 
-191,24 +182,22 @@ class TestSliceLocs: (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] ], ) - def test_slice_locs_negative_step(self, in_slice, expected, dtype): - index = Index(list("bcdxy"), dtype=dtype) + def test_slice_locs_negative_step(self, in_slice, expected, any_string_dtype): + index = Index(list("bcdxy"), dtype=any_string_dtype) s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) result = index[s_start : s_stop : in_slice.step] - expected = Index(list(expected), dtype=dtype) + expected = Index(list(expected), dtype=any_string_dtype) tm.assert_index_equal(result, expected) - # TODO(infer_string) parametrize over multiple string dtypes - @td.skip_if_no("pyarrow") - def test_slice_locs_negative_step_oob(self): - index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]") + def test_slice_locs_negative_step_oob(self, any_string_dtype): + index = Index(list("bcdxy"), dtype=any_string_dtype) result = index[-10:5:1] tm.assert_index_equal(result, index) result = index[4:-10:-1] - expected = Index(list("yxdcb"), dtype="string[pyarrow_numpy]") + expected = Index(list("yxdcb"), dtype=any_string_dtype) tm.assert_index_equal(result, expected) def test_slice_locs_dup(self): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 486b24845d2ff..2b62b384930d6 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -933,10 +933,9 @@ def test_isin_empty(self, empty): result = index.isin(empty) tm.assert_numpy_array_equal(expected, result) - @td.skip_if_no("pyarrow") - def test_isin_arrow_string_null(self): + def test_isin_string_null(self, string_dtype_no_object): # GH#55821 - index = Index(["a", "b"], dtype="string[pyarrow_numpy]") + index = Index(["a", "b"], dtype=string_dtype_no_object) result = index.isin([None]) expected = np.array([False, False]) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/test_old_base.py 
b/pandas/tests/indexes/test_old_base.py index 75284a8f8fd47..cd3d599abd30e 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -295,7 +295,10 @@ def test_ensure_copied_data(self, index): tm.assert_numpy_array_equal( index._values._ndarray, result._values._ndarray, check_same="same" ) - elif index.dtype in ("string[pyarrow]", "string[pyarrow_numpy]"): + elif ( + isinstance(index.dtype, StringDtype) + and index.dtype.storage == "pyarrow" + ): assert tm.shares_memory(result._values, index._values) else: raise NotImplementedError(index.dtype) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 76910db941d36..38961345dc1f2 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -465,7 +465,7 @@ def test_non_str_names_w_duplicates(): ([1.0, 2.25, None], "Float32[pyarrow]", "float32"), ([True, False, None], "boolean", "bool"), ([True, False, None], "boolean[pyarrow]", "bool"), - (["much ado", "about", None], "string[pyarrow_numpy]", "large_string"), + (["much ado", "about", None], pd.StringDtype(na_value=np.nan), "large_string"), (["much ado", "about", None], "string[pyarrow]", "large_string"), ( [datetime(2020, 1, 1), datetime(2020, 1, 2), None], @@ -528,7 +528,11 @@ def test_pandas_nullable_with_missing_values( ([1.0, 2.25, 5.0], "Float32[pyarrow]", "float32"), ([True, False, False], "boolean", "bool"), ([True, False, False], "boolean[pyarrow]", "bool"), - (["much ado", "about", "nothing"], "string[pyarrow_numpy]", "large_string"), + ( + ["much ado", "about", "nothing"], + pd.StringDtype(na_value=np.nan), + "large_string", + ), (["much ado", "about", "nothing"], "string[pyarrow]", "large_string"), ( [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)], diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 3d07c0219691e..1c54232b8b510 100644 --- a/pandas/tests/io/json/test_pandas.py +++ 
b/pandas/tests/io/json/test_pandas.py @@ -2245,18 +2245,18 @@ def test_pyarrow_engine_lines_false(): def test_json_roundtrip_string_inference(orient): - pytest.importorskip("pyarrow") df = DataFrame( [["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"] ) out = df.to_json() with pd.option_context("future.infer_string", True): result = read_json(StringIO(out)) + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( [["a", "b"], ["c", "d"]], - dtype="string[pyarrow_numpy]", - index=Index(["row 1", "row 2"], dtype="string[pyarrow_numpy]"), - columns=Index(["col 1", "col 2"], dtype="string[pyarrow_numpy]"), + dtype=dtype, + index=Index(["row 1", "row 2"], dtype=dtype), + columns=Index(["col 1", "col 2"], dtype=dtype), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 07f29518b7881..b664423364f6b 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -547,8 +547,7 @@ def test_ea_int_avoid_overflow(all_parsers): def test_string_inference(all_parsers): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) data = """a,b x,1 @@ -568,8 +567,6 @@ def test_string_inference(all_parsers): @pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_]) def test_string_inference_object_dtype(all_parsers, dtype): # GH#56047 - pytest.importorskip("pyarrow") - data = """a,b x,a y,a @@ -583,7 +580,7 @@ def test_string_inference_object_dtype(all_parsers, dtype): "a": pd.Series(["x", "y", "z"], dtype=object), "b": pd.Series(["a", "a", "a"], dtype=object), }, - columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) @@ -593,9 +590,9 @@ def 
test_string_inference_object_dtype(all_parsers, dtype): expected = DataFrame( { "a": pd.Series(["x", "y", "z"], dtype=object), - "b": pd.Series(["a", "a", "a"], dtype="string[pyarrow_numpy]"), + "b": pd.Series(["a", "a", "a"], dtype=pd.StringDtype(na_value=np.nan)), }, - columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index dd3a0eabe95ae..8ae87d4bab52d 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -310,7 +310,6 @@ def test_read_hdf_series_mode_r(tmp_path, format, setup_path): def test_read_infer_string(tmp_path, setup_path): # GH#54431 - pytest.importorskip("pyarrow") df = DataFrame({"a": ["a", "b", None]}) path = tmp_path / setup_path df.to_hdf(path, key="data", format="table") @@ -318,8 +317,8 @@ def test_read_infer_string(tmp_path, setup_path): result = read_hdf(path, key="data", mode="r") expected = DataFrame( {"a": ["a", "b", None]}, - dtype="string[pyarrow_numpy]", - columns=Index(["a"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + columns=Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index a1f3babb1ae3b..9721d045b7b91 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -243,5 +243,7 @@ def test_string_inference(self, tmp_path): df.to_feather(path) with pd.option_context("future.infer_string", True): result = read_feather(path) - expected = pd.DataFrame(data={"a": ["x", "y"]}, dtype="string[pyarrow_numpy]") + expected = pd.DataFrame( + data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan) + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_orc.py 
b/pandas/tests/io/test_orc.py index 90133344fdfc9..efb3dffecd856 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -436,7 +436,7 @@ def test_string_inference(tmp_path): result = read_orc(path) expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype="string[pyarrow_numpy]", - columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index a29e479b7c9f1..4c2ea036f08dc 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1109,8 +1109,8 @@ def test_string_inference(self, tmp_path, pa): result = read_parquet(path, engine="pyarrow") expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype="string[pyarrow_numpy]", - index=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + index=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) @@ -1140,8 +1140,8 @@ def test_infer_string_large_string_type(self, tmp_path, pa): result = read_parquet(path) expected = pd.DataFrame( data={"a": [None, "b", "c"]}, - dtype="string[pyarrow_numpy]", - columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 980c88f070b89..c28a33069d23f 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -3809,7 +3809,6 @@ class Test(BaseModel): def test_read_sql_string_inference(sqlite_engine): conn = sqlite_engine # GH#54430 - pytest.importorskip("pyarrow") table = "test" df = DataFrame({"a": ["x", "y"]}) df.to_sql(table, con=conn, index=False, if_exists="replace") @@ -3817,7 +3816,7 @@ 
def test_read_sql_string_inference(sqlite_engine): with pd.option_context("future.infer_string", True): result = read_sql_table(table, conn) - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py index 8583d8bcc052c..17dae1879f3b8 100644 --- a/pandas/tests/libs/test_lib.py +++ b/pandas/tests/libs/test_lib.py @@ -1,3 +1,5 @@ +import pickle + import numpy as np import pytest @@ -283,3 +285,15 @@ def test_no_default_pickle(): # GH#40397 obj = tm.round_trip_pickle(lib.no_default) assert obj is lib.no_default + + +def test_ensure_string_array_copy(): + # ensure the original array is not modified in case of copy=False with + # pickle-roundtripped object dtype array + # https://github.com/pandas-dev/pandas/issues/54654 + arr = np.array(["a", None], dtype=object) + arr = pickle.loads(pickle.dumps(arr)) + result = lib.ensure_string_array(arr, copy=False) + assert not np.shares_memory(arr, result) + assert arr[1] is None + assert result[1] is np.nan diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 8af224f1ad64f..d3edee17366f7 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -10,8 +10,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import InvalidIndexError import pandas as pd @@ -47,18 +45,11 @@ def test_append_concat(self): assert isinstance(result.index, PeriodIndex) assert result.index[0] == s1.index[0] - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_concat_copy(self): df = DataFrame(np.random.default_rng(2).standard_normal((4, 3))) df2 = DataFrame(np.random.default_rng(2).integers(0, 10, size=4).reshape(4, 1)) df3 = DataFrame({5: "foo"}, index=range(4)) - # These are actual copies. 
- result = concat([df, df2, df3], axis=1) - for block in result._mgr.blocks: - assert block.values.base is not None - - # These are the same. result = concat([df, df2, df3], axis=1) for block in result._mgr.blocks: @@ -69,6 +60,8 @@ def test_concat_copy(self): assert arr.base is df2._mgr.blocks[0].values.base elif arr.dtype == object: assert arr.base is not None + elif arr.dtype == "string": + tm.shares_memory(arr, df3._mgr.blocks[0].values) # Float block was consolidated. df4 = DataFrame(np.random.default_rng(2).standard_normal((4, 1))) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 8d972087b0dff..f7b0876c5a605 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -3064,12 +3062,8 @@ def test_on_float_by_int(self): tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_merge_datatype_error_raises(self, using_infer_string): - if using_infer_string: - msg = "incompatible merge keys" - else: - msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype" + def test_merge_datatype_error_raises(self): + msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype" left = pd.DataFrame({"left_val": [1, 5, 10], "a": ["a", "b", "c"]}) right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7], "a": [1, 2, 3, 6, 7]}) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 27a34decae7b0..9ce2c925a368b 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td from 
pandas.core.dtypes.common import is_integer_dtype @@ -216,11 +214,10 @@ def test_dataframe_dummies_all_obj(self, df, sparse): tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_dataframe_dummies_string_dtype(self, df, using_infer_string): + def test_dataframe_dummies_string_dtype(self, df, any_string_dtype): # GH44965 df = df[["A", "B"]] - df = df.astype({"A": "object", "B": "string"}) + df = df.astype({"A": "str", "B": any_string_dtype}) result = get_dummies(df) expected = DataFrame( { @@ -231,8 +228,7 @@ def test_dataframe_dummies_string_dtype(self, df, using_infer_string): }, dtype=bool, ) - if not using_infer_string: - # infer_string returns numpy bools + if any_string_dtype == "string" and any_string_dtype.na_value is pd.NA: expected[["B_b", "B_c"]] = expected[["B_b", "B_c"]].astype("boolean") tm.assert_frame_equal(result, expected) @@ -712,19 +708,17 @@ def test_get_dummies_ea_dtype_dataframe(self, any_numeric_ea_and_arrow_dtype): ) tm.assert_frame_equal(result, expected) - @td.skip_if_no("pyarrow") - def test_get_dummies_ea_dtype(self): + @pytest.mark.parametrize("dtype_type", ["string", "category"]) + def test_get_dummies_ea_dtype(self, dtype_type, string_dtype_no_object): # GH#56273 - for dtype, exp_dtype in [ - ("string[pyarrow]", "boolean"), - ("string[pyarrow_numpy]", "bool"), - (CategoricalDtype(Index(["a"], dtype="string[pyarrow]")), "boolean"), - (CategoricalDtype(Index(["a"], dtype="string[pyarrow_numpy]")), "bool"), - ]: - df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1}) - result = get_dummies(df) - expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)}) - tm.assert_frame_equal(result, expected) + dtype = string_dtype_no_object + exp_dtype = "boolean" if dtype.na_value is pd.NA else "bool" + if dtype_type == "category": + dtype = CategoricalDtype(Index(["a"], dtype)) + df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1}) + result = 
get_dummies(df) + expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)}) + tm.assert_frame_equal(result, expected) @td.skip_if_no("pyarrow") def test_get_dummies_arrow_dtype(self): diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index be4f2ab4d183d..95aa5291cb45a 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -21,7 +19,7 @@ def df(): res = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) res["id1"] = (res["A"] > 0).astype(np.int64) @@ -83,7 +81,6 @@ def test_default_col_names(self, df): result2 = df.melt(id_vars=["id1", "id2"]) assert result2.columns.tolist() == ["id1", "id2", "variable", "value"] - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_value_vars(self, df): result3 = df.melt(id_vars=["id1", "id2"], value_vars="A") assert len(result3) == 10 @@ -100,7 +97,6 @@ def test_value_vars(self, df): ) tm.assert_frame_equal(result4, expected4) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("type_", (tuple, list, np.array)) def test_value_vars_types(self, type_, df): # GH 15348 @@ -178,7 +174,6 @@ def test_tuple_vars_fail_with_multiindex(self, id_vars, value_vars, df1): with pytest.raises(ValueError, match=msg): df1.melt(id_vars=id_vars, value_vars=value_vars) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_var_name(self, df, var_name): result5 = df.melt(var_name=var_name) assert result5.columns.tolist() == ["var", "value"] @@ -206,7 +201,6 @@ def test_custom_var_name(self, df, var_name): ) tm.assert_frame_equal(result9, expected9) - 
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_value_name(self, df, value_name): result10 = df.melt(value_name=value_name) assert result10.columns.tolist() == ["variable", "val"] @@ -236,7 +230,6 @@ def test_custom_value_name(self, df, value_name): ) tm.assert_frame_equal(result14, expected14) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_var_and_value_name(self, df, value_name, var_name): result15 = df.melt(var_name=var_name, value_name=value_name) assert result15.columns.tolist() == ["var", "val"] @@ -361,7 +354,6 @@ def test_melt_missing_columns_raises(self): with pytest.raises(KeyError, match=msg): df.melt(["A"], ["F"], col_level=0) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_melt_mixed_int_str_id_vars(self): # GH 29718 df = DataFrame({0: ["foo"], "a": ["bar"], "b": [1], "d": [2]}) @@ -369,6 +361,8 @@ def test_melt_mixed_int_str_id_vars(self): expected = DataFrame( {0: ["foo"] * 2, "a": ["bar"] * 2, "variable": list("bd"), "value": [1, 2]} ) + # the df's columns are mixed type and thus object -> preserves object dtype + expected["variable"] = expected["variable"].astype(object) tm.assert_frame_equal(result, expected) def test_melt_mixed_int_str_value_vars(self): @@ -1222,12 +1216,10 @@ def test_raise_of_column_name_value(self): ): df.melt(id_vars="value", value_name="value") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) - @pytest.mark.parametrize("dtype", ["O", "string"]) - def test_missing_stubname(self, dtype): + def test_missing_stubname(self, any_string_dtype): # GH46044 df = DataFrame({"id": ["1", "2"], "a-1": [100, 200], "a-2": [300, 400]}) - df = df.astype({"id": dtype}) + df = df.astype({"id": any_string_dtype}) result = wide_to_long( df, stubnames=["a", "b"], @@ -1243,15 +1235,16 @@ def test_missing_stubname(self, dtype): {"a": [100, 200, 300, 400], "b": [np.nan] * 4}, index=index, ) - new_level = 
expected.index.levels[0].astype(dtype) + new_level = expected.index.levels[0].astype(any_string_dtype) + if any_string_dtype == "object": + new_level = expected.index.levels[0].astype("str") expected.index = expected.index.set_levels(new_level, level=0) tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_wide_to_long_pyarrow_string_columns(): +def test_wide_to_long_string_columns(string_storage): # GH 57066 - pytest.importorskip("pyarrow") + string_dtype = pd.StringDtype(string_storage, na_value=np.nan) df = DataFrame( { "ID": {0: 1}, @@ -1261,17 +1254,17 @@ def test_wide_to_long_pyarrow_string_columns(): "D": {0: 1}, } ) - df.columns = df.columns.astype("string[pyarrow_numpy]") + df.columns = df.columns.astype(string_dtype) result = wide_to_long( df, stubnames="R", i="ID", j="UNPIVOTED", sep="_", suffix=".*" ) expected = DataFrame( [[1, 1], [1, 1], [1, 2]], - columns=Index(["D", "R"], dtype=object), + columns=Index(["D", "R"]), index=pd.MultiIndex.from_arrays( [ [1, 1, 1], - Index(["test1", "test2", "test3"], dtype="string[pyarrow_numpy]"), + Index(["test1", "test2", "test3"], dtype=string_dtype), ], names=["ID", "UNPIVOTED"], ), diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 8cfe565ebdd65..eccf676b87f89 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1068,7 +1068,6 @@ def test_margins_dtype_len(self, data): tm.assert_frame_equal(expected, result) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)]) def test_pivot_table_multiindex_only(self, cols): # GH 17038 @@ -1078,7 +1077,7 @@ def test_pivot_table_multiindex_only(self, cols): expected = DataFrame( [[4.0, 5.0, 6.0]], columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols), - index=Index(["v"], dtype=object), + index=Index(["v"], 
dtype="str" if cols == ("a", "b") else "object"), ) tm.assert_frame_equal(result, expected) @@ -2570,13 +2569,16 @@ def test_pivot_empty(self): expected = DataFrame(index=[], columns=[]) tm.assert_frame_equal(result, expected, check_names=False) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) - @pytest.mark.parametrize("dtype", [object, "string"]) - def test_pivot_integer_bug(self, dtype): - df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=dtype) + def test_pivot_integer_bug(self, any_string_dtype): + df = DataFrame( + data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=any_string_dtype + ) result = df.pivot(index=1, columns=0, values=2) - tm.assert_index_equal(result.columns, Index(["A", "B"], name=0, dtype=dtype)) + expected_columns = Index(["A", "B"], name=0, dtype=any_string_dtype) + if any_string_dtype == "object": + expected_columns = expected_columns.astype("str") + tm.assert_index_equal(result.columns, expected_columns) def test_pivot_index_none(self): # GH#3962 @@ -2658,7 +2660,9 @@ def test_pivot_columns_not_given(self): with pytest.raises(TypeError, match="missing 1 required keyword-only argument"): df.pivot() - @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_columns_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2674,7 +2678,9 @@ def test_pivot_columns_is_none(self): expected = DataFrame({1: 3}, index=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_index_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2688,7 +2694,9 @@ def test_pivot_index_is_none(self): expected = DataFrame(3, index=[1], columns=Index([2], name="b")) 
tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_values_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) diff --git a/pandas/tests/series/test_arrow_interface.py b/pandas/tests/series/test_arrow_interface.py index 34a2a638e4185..e73cf9bee6aeb 100644 --- a/pandas/tests/series/test_arrow_interface.py +++ b/pandas/tests/series/test_arrow_interface.py @@ -21,3 +21,41 @@ def test_series_arrow_interface(): ca = pa.chunked_array(s) expected = pa.chunked_array([[1, 4, 2]]) assert ca.equals(expected) + ca = pa.chunked_array(s, type=pa.int32()) + expected = pa.chunked_array([[1, 4, 2]], type=pa.int32()) + assert ca.equals(expected) + + +def test_series_arrow_interface_arrow_dtypes(): + s = pd.Series([1, 4, 2], dtype="Int64[pyarrow]") + + capsule = s.__arrow_c_stream__() + assert ( + ctypes.pythonapi.PyCapsule_IsValid( + ctypes.py_object(capsule), b"arrow_array_stream" + ) + == 1 + ) + + ca = pa.chunked_array(s) + expected = pa.chunked_array([[1, 4, 2]]) + assert ca.equals(expected) + ca = pa.chunked_array(s, type=pa.int32()) + expected = pa.chunked_array([[1, 4, 2]], type=pa.int32()) + assert ca.equals(expected) + + +def test_series_arrow_interface_stringdtype(): + s = pd.Series(["foo", "bar"], dtype="string[pyarrow]") + + capsule = s.__arrow_c_stream__() + assert ( + ctypes.pythonapi.PyCapsule_IsValid( + ctypes.py_object(capsule), b"arrow_array_stream" + ) + == 1 + ) + + ca = pa.chunked_array(s) + expected = pa.chunked_array([["foo", "bar"]], type=pa.large_string()) + assert ca.equals(expected) diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index 1586195e79a9d..8516018e8aa93 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -9,6 +9,7 @@ DataFrame, Index, Series, + StringDtype, 
bdate_range, ) import pandas._testing as tm @@ -514,7 +515,7 @@ def test_pyarrow_numpy_string_invalid(self): # GH#56008 pa = pytest.importorskip("pyarrow") ser = Series([False, True]) - ser2 = Series(["a", "b"], dtype="string[pyarrow_numpy]") + ser2 = Series(["a", "b"], dtype=StringDtype(na_value=np.nan)) result = ser == ser2 expected_eq = Series(False, index=ser.index) tm.assert_series_equal(result, expected_eq) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index bf01c4996bb32..ea9f89ed129aa 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -22,7 +22,7 @@ def using_pyarrow(dtype): - return dtype in ("string[pyarrow]", "string[pyarrow_numpy]") + return dtype == "string" and dtype.storage == "pyarrow" def test_contains(any_string_dtype): diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 31386e4e342ae..0656f505dc745 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -1,4 +1,7 @@ import numpy as np +import pytest + +import pandas.util._test_decorators as td from pandas import ( DataFrame, @@ -8,6 +11,11 @@ _testing as tm, ) +try: + import pyarrow as pa +except ImportError: + pa = None + def test_get_dummies(any_string_dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) @@ -32,22 +40,85 @@ def test_get_dummies_index(): tm.assert_index_equal(result, expected) -def test_get_dummies_with_name_dummy(any_string_dtype): - # GH 12180 - # Dummies named 'name' should work as expected - s = Series(["a", "b,name", "b"], dtype=any_string_dtype) - result = s.str.get_dummies(",") - expected = DataFrame([[1, 0, 0], [0, 1, 1], [0, 1, 0]], columns=["a", "b", "name"]) +# GH#47872 +@pytest.mark.parametrize( + "dtype", + [ + np.uint8, + np.int16, + np.uint16, + np.int32, + np.uint32, + np.int64, + np.uint64, + bool, + "Int8", + "Int16", + "Int32", + "Int64", + 
"boolean", + ], +) +def test_get_dummies_with_dtype(any_string_dtype, dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=dtype) + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=dtype + ) tm.assert_frame_equal(result, expected) -def test_get_dummies_with_name_dummy_index(): - # GH 12180 - # Dummies named 'name' should work as expected - idx = Index(["a|b", "name|c", "b|name"]) - result = idx.str.get_dummies("|") +# GH#47872 +@td.skip_if_no("pyarrow") +@pytest.mark.parametrize( + "dtype", + [ + "int8[pyarrow]", + "uint8[pyarrow]", + "int16[pyarrow]", + "uint16[pyarrow]", + "int32[pyarrow]", + "uint32[pyarrow]", + "int64[pyarrow]", + "uint64[pyarrow]", + "bool[pyarrow]", + ], +) +def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=dtype) + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], + columns=list("abc"), + dtype=dtype, + ) + tm.assert_frame_equal(result, expected) - expected = MultiIndex.from_tuples( - [(1, 1, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1)], names=("a", "b", "c", "name") + +# GH#47872 +def test_get_dummies_with_str_dtype(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=str) + expected = DataFrame( + [["T", "T", "F"], ["T", "F", "T"], ["F", "F", "F"]], + columns=list("abc"), + dtype=str, ) - tm.assert_index_equal(result, expected) + tm.assert_frame_equal(result, expected) + + +# GH#47872 +@td.skip_if_no("pyarrow") +def test_get_dummies_with_pa_str_dtype(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype="str[pyarrow]") + expected = DataFrame( + [ + ["true", "true", "false"], + ["true", "false", "true"], + ["false", "false", "false"], + ], + columns=list("abc"), + dtype="str[pyarrow]", + ) + 
tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 1ce46497c3c22..4995b448f7e94 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -394,6 +394,7 @@ def test_pipe_failures(any_string_dtype): (2, 5, None, ["foo", "bar", np.nan, "baz"]), (0, 3, -1, ["", "", np.nan, ""]), (None, None, -1, ["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"]), + (None, 2, -1, ["owtoo", "owtra", np.nan, "xuqza"]), (3, 10, 2, ["oto", "ato", np.nan, "aqx"]), (3, 0, -1, ["ofa", "aba", np.nan, "aba"]), ], diff --git a/pandas/tests/util/test_shares_memory.py b/pandas/tests/util/test_shares_memory.py index 00a897d574a07..8f1ac93b40247 100644 --- a/pandas/tests/util/test_shares_memory.py +++ b/pandas/tests/util/test_shares_memory.py @@ -1,3 +1,5 @@ +import numpy as np + import pandas.util._test_decorators as td import pandas as pd @@ -20,10 +22,10 @@ def test_shares_memory_string(): # GH#55823 import pyarrow as pa - obj = pd.array(["a", "b"], dtype="string[pyarrow]") + obj = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=pd.NA)) assert tm.shares_memory(obj, obj) - obj = pd.array(["a", "b"], dtype="string[pyarrow_numpy]") + obj = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=np.nan)) assert tm.shares_memory(obj, obj) obj = pd.array(["a", "b"], dtype=pd.ArrowDtype(pa.string()))