
POC: consistent NaN treatment for pyarrow dtypes #61732


Draft · wants to merge 2 commits into base: main
2 changes: 1 addition & 1 deletion pandas/_libs/parsers.pyx
@@ -1456,7 +1456,7 @@ def _maybe_upcast(
if isinstance(arr, IntegerArray) and arr.isna().all():
# use null instead of int64 in pyarrow
arr = arr.to_numpy(na_value=None)
- arr = ArrowExtensionArray(pa.array(arr, from_pandas=True))
+ arr = ArrowExtensionArray(pa.array(arr))

return arr
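
For context, from_pandas=True is what makes pyarrow treat float NaN as null; dropping it keeps NaN as an ordinary value. A minimal sketch of the distinction (plain pyarrow, not part of the diff):

import numpy as np
import pyarrow as pa

vals = [1.0, np.nan, None]
pa.array(vals, from_pandas=True)  # -> [1, null, null]: NaN coerced to null
pa.array(vals)                    # -> [1, nan, null]: NaN kept as a float value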

54 changes: 41 additions & 13 deletions pandas/core/arrays/arrow/array.py
@@ -16,6 +16,7 @@
import numpy as np

from pandas._libs import lib
+ from pandas._libs.missing import NA
from pandas._libs.tslibs import (
Timedelta,
Timestamp,
@@ -360,7 +361,7 @@ def _from_sequence_of_strings(
# duration to string casting behavior
mask = isna(scalars)
if not isinstance(strings, (pa.Array, pa.ChunkedArray)):
- strings = pa.array(strings, type=pa.string(), from_pandas=True)
+ strings = pa.array(strings, type=pa.string())
strings = pc.if_else(mask, None, strings)
try:
scalars = strings.cast(pa.int64())
@@ -381,7 +382,7 @@ def _from_sequence_of_strings(
if isinstance(strings, (pa.Array, pa.ChunkedArray)):
scalars = strings
else:
- scalars = pa.array(strings, type=pa.string(), from_pandas=True)
+ scalars = pa.array(strings, type=pa.string())
scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars)
scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars)
scalars = scalars.cast(pa.bool_())
@@ -393,6 +394,13 @@ def _from_sequence_of_strings(
from pandas.core.tools.numeric import to_numeric

scalars = to_numeric(strings, errors="raise")
+ if not pa.types.is_decimal(pa_type):
+     # TODO: figure out why doing this cast breaks with decimal dtype
+     #  in test_from_sequence_of_strings_pa_array
+     mask = strings.is_null()
+     scalars = pa.array(scalars, mask=np.array(mask), type=pa_type)
+     # TODO: could we just do strings.cast(pa_type)?

else:
raise NotImplementedError(
f"Converting strings to {pa_type} is not implemented."
@@ -435,7 +443,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
"""
if isinstance(value, pa.Scalar):
pa_scalar = value
- elif isna(value):
+ elif isna(value) and not lib.is_float(value):
pa_scalar = pa.scalar(None, type=pa_type)
else:
# Workaround https://github.com/apache/arrow/issues/37291
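
With the added lib.is_float check, a float NaN scalar no longer short-circuits to null and instead reaches the pa.scalar path below. In plain pyarrow terms:

import pyarrow as pa

pa.scalar(None, type=pa.float64())          # DoubleScalar: None (null)
pa.scalar(float("nan"), type=pa.float64())  # DoubleScalar: nan (a value)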
@@ -452,7 +460,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
value = value.as_unit(pa_type.unit)
value = value._value

- pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True)
+ pa_scalar = pa.scalar(value, type=pa_type)

if pa_type is not None and pa_scalar.type != pa_type:
pa_scalar = pa_scalar.cast(pa_type)
@@ -484,6 +492,13 @@ def _box_pa_array(
if copy:
value = value.copy()
pa_array = value.__arrow_array__()

elif hasattr(value, "__arrow_array__"):
# e.g. StringArray
if copy:
value = value.copy()
pa_array = value.__arrow_array__()

else:
if (
isinstance(value, np.ndarray)
@@ -510,19 +525,32 @@ def _box_pa_array(
value = to_timedelta(value, unit=pa_type.unit).as_unit(pa_type.unit)
value = value.to_numpy()

+ mask = None
+ if getattr(value, "dtype", None) is None or value.dtype.kind not in "mfM":
+     # similar to isna(value) but exclude NaN
[Review comment, Member]
If we're moving towards making NaN distinct from NA (NaN not a missing value), maybe we should eventually give isna a nan_as_null: bool = True argument.

[Reply, Member and Author]
That may be necessary, but I'm a little wary of it, since it might mean needing to add that argument everywhere that has a skipna keyword.
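
A minimal sketch of what such a keyword could look like; isna_sketch and its signature are hypothetical, for illustration only:

import numpy as np
from pandas._libs.missing import NA

def isna_sketch(values, nan_as_null: bool = True):
    # NA and None always count as missing
    out = np.array([v is NA or v is None for v in values], dtype=bool)
    if nan_as_null:
        # float NaN counts as missing only when requested
        out |= np.array(
            [isinstance(v, float) and v != v for v in values], dtype=bool
        )
    return out

isna_sketch([1.0, float("nan"), NA], nan_as_null=False)
# -> array([False, False,  True])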

+     # TODO: cythonize!
+     mask = np.array([x is NA or x is None for x in value], dtype=bool)

+ from_pandas = False
+ if pa.types.is_integer(pa_type):
+     # If user specifically asks to cast a numpy float array with NaNs
+     # to pyarrow integer, we'll treat those NaNs as NA
[Review comment on lines +536 to +537, mroeschke, Jun 30, 2025]
I would personally be in favor of a harder break: this should raise like PyArrow does, and a user who wants this behavior should fill NaNs first.

In [3]: pa.array(np.array([1.0, np.nan]), from_pandas=False, type=pa.int64())
ArrowInvalid: Float value nan was truncated converting to int64

[Reply, Member and Author]
I'd be fine with that. I just tried it out, expecting it to break a ton of tests, and I'm only seeing 147 failures (vs 89 without it), so not that bad.

Probably need to add the behavior of convert_dtypes to the checklist above. At the moment this branch doesn't change its behavior.
+     from_pandas = True
try:
- pa_array = pa.array(value, type=pa_type, from_pandas=True)
+ pa_array = pa.array(
+     value, type=pa_type, mask=mask, from_pandas=from_pandas
+ )
except (pa.ArrowInvalid, pa.ArrowTypeError):
# GH50430: let pyarrow infer type, then cast
- pa_array = pa.array(value, from_pandas=True)
+ pa_array = pa.array(value, mask=mask, from_pandas=from_pandas)

if pa_type is None and pa.types.is_duration(pa_array.type):
# Workaround https://github.com/apache/arrow/issues/37291
from pandas.core.tools.timedeltas import to_timedelta

value = to_timedelta(value)
value = value.to_numpy()
- pa_array = pa.array(value, type=pa_type, from_pandas=True)
+ pa_array = pa.array(value, type=pa_type)

if pa.types.is_duration(pa_array.type) and pa_array.null_count > 0:
# GH52843: upstream bug for duration types when originally
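
Rough illustration (not from the diff) of what the mask-based construction above achieves for object input: NA and None become null while NaN survives as a float:

import numpy as np
import pandas as pd
import pyarrow as pa

value = np.array([1.5, pd.NA, float("nan")], dtype=object)
mask = np.array([x is pd.NA or x is None for x in value], dtype=bool)
pa.array(value, mask=mask)  # -> [1.5, null, nan]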
@@ -1169,7 +1197,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
if not len(values):
return np.zeros(len(self), dtype=bool)

- result = pc.is_in(self._pa_array, value_set=pa.array(values, from_pandas=True))
+ result = pc.is_in(self._pa_array, value_set=pa.array(values))
# pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
# to False
return np.array(result, dtype=np.bool_)
@@ -2002,7 +2030,7 @@ def __setitem__(self, key, value) -> None:
raise ValueError("Length of indexer and values mismatch")
chunks = [
*self._pa_array[:key].chunks,
- pa.array([value], type=self._pa_array.type, from_pandas=True),
+ pa.array([value], type=self._pa_array.type),
*self._pa_array[key + 1 :].chunks,
]
data = pa.chunked_array(chunks).combine_chunks()
@@ -2056,7 +2084,7 @@ def _rank_calc(
pa_type = pa.float64()
else:
pa_type = pa.uint64()
- result = pa.array(ranked, type=pa_type, from_pandas=True)
+ result = pa.array(ranked, type=pa_type)
return result

data = self._pa_array.combine_chunks()
@@ -2308,7 +2336,7 @@ def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]:
right, right_type = _to_numpy_and_type(right)
pa_type = left_type or right_type
result = np.where(cond, left, right)
- return pa.array(result, type=pa_type, from_pandas=True)
+ return pa.array(result, type=pa_type)

@classmethod
def _replace_with_mask(
@@ -2351,7 +2379,7 @@ def _replace_with_mask(
replacements = replacements.as_py()
result = np.array(values, dtype=object)
result[mask] = replacements
- return pa.array(result, type=values.type, from_pandas=True)
+ return pa.array(result, type=values.type)

# ------------------------------------------------------------------
# GroupBy Methods
@@ -2430,7 +2458,7 @@ def _groupby_op(
return type(self)(pa_result)
else:
# DatetimeArray, TimedeltaArray
- pa_result = pa.array(result, from_pandas=True)
+ pa_result = pa.array(result)
return type(self)(pa_result)

def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
8 changes: 7 additions & 1 deletion pandas/core/arrays/string_.py
@@ -474,6 +474,12 @@ def _str_map_str_or_object(
if self.dtype.storage == "pyarrow":
import pyarrow as pa

+ # TODO: shouldn't this already be caught by the passed mask?
+ #  It isn't in test_extract_expand_capture_groups_index
+ # mask = mask | np.array(
+ #     [x is libmissing.NA for x in result], dtype=bool
+ # )

result = pa.array(
result, mask=mask, type=pa.large_string(), from_pandas=True
)
@@ -726,7 +732,7 @@ def __arrow_array__(self, type=None):

values = self._ndarray.copy()
values[self.isna()] = None
- return pa.array(values, type=type, from_pandas=True)
+ return pa.array(values, type=type)

def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]: # type: ignore[override]
arr = self._ndarray
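
Since the preceding line already replaces missing positions with None, from_pandas is redundant here: pyarrow always treats None as null. For example:

import numpy as np
import pyarrow as pa

values = np.array(["a", None, "b"], dtype=object)
pa.array(values, type=pa.string())  # -> ["a", null, "b"]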
19 changes: 18 additions & 1 deletion pandas/core/generic.py
@@ -9874,7 +9874,7 @@ def where(
def where(
self,
cond,
- other=np.nan,
+ other=lib.no_default,
*,
inplace: bool = False,
axis: Axis | None = None,
@@ -10032,6 +10032,23 @@ def where(
stacklevel=2,
)

+ if other is lib.no_default:
+     if self.ndim == 1:
+         if isinstance(self.dtype, ExtensionDtype):
+             other = self.dtype.na_value
+         else:
+             other = np.nan
+     else:
+         if self._mgr.nblocks == 1 and isinstance(
+             self._mgr.blocks[0].values.dtype, ExtensionDtype
+         ):
+             # FIXME: checking this is kludgy!
+             other = self._mgr.blocks[0].values.dtype.na_value
+         else:
+             # FIXME: the same problem we had with Series will now
+             #  show up column-by-column!
+             other = np.nan

other = common.apply_if_callable(other, self)
return self._where(cond, other, inplace=inplace, axis=axis, level=level)

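
A quick illustration of the new default (assuming this branch): with other left unspecified, the fill value follows the dtype's na_value instead of always being np.nan:

import pandas as pd

ser = pd.Series([1.0, 2.0], dtype="float64[pyarrow]")
ser.where(ser > 1)  # masked slot becomes the dtype's na_value (NA), not NaN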
1 change: 1 addition & 0 deletions pandas/tests/extension/base/setitem.py
@@ -422,6 +422,7 @@ def test_setitem_frame_2d_values(self, data):
df.iloc[:-1] = df.iloc[:-1].copy()
tm.assert_frame_equal(df, orig)

+ # FIXME: breaks for pyarrow float dtype because df.values changes NAs to NaN
df.iloc[:] = df.values
tm.assert_frame_equal(df, orig)

2 changes: 1 addition & 1 deletion pandas/tests/extension/test_arrow.py
@@ -731,7 +731,7 @@ def test_EA_types(self, engine, data, dtype_backend, request):
pytest.mark.xfail(reason="CSV parsers don't correctly handle binary")
)
df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))})
- csv_output = df.to_csv(index=False, na_rep=np.nan)
+ csv_output = df.to_csv(index=False, na_rep=np.nan)  # should be NA?
if pa.types.is_binary(pa_dtype):
csv_output = BytesIO(csv_output)
else:
6 changes: 4 additions & 2 deletions pandas/tests/groupby/test_reductions.py
@@ -381,8 +381,10 @@ def test_first_last_skipna(any_real_nullable_dtype, sort, skipna, how):
df = DataFrame(
{
"a": [2, 1, 1, 2, 3, 3],
"b": [na_value, 3.0, na_value, 4.0, np.nan, np.nan],
"c": [na_value, 3.0, na_value, 4.0, np.nan, np.nan],
# TODO: test that has mixed na_value and NaN either working for
# float or raising for int?
"b": [na_value, 3.0, na_value, 4.0, na_value, na_value],
"c": [na_value, 3.0, na_value, 4.0, na_value, na_value],
},
dtype=any_real_nullable_dtype,
)
9 changes: 9 additions & 0 deletions pandas/tests/series/methods/test_rank.py
@@ -276,6 +276,13 @@ def test_rank_tie_methods(self, ser, results, dtype, using_infer_string):

ser = ser if dtype is None else ser.astype(dtype)
result = ser.rank(method=method)
if dtype == "float64[pyarrow]":
# the NaNs are not treated as NA
exp = exp.copy()
if method == "average":
exp[np.isnan(ser)] = 9.5
elif method == "dense":
exp[np.isnan(ser)] = 6
tm.assert_series_equal(result, Series(exp, dtype=expected_dtype(dtype, method)))

@pytest.mark.parametrize("na_option", ["top", "bottom", "keep"])
@@ -321,6 +328,8 @@ def test_rank_tie_methods_on_infs_nans(
order = [ranks[1], ranks[0], ranks[2]]
elif na_option == "bottom":
order = [ranks[0], ranks[2], ranks[1]]
elif dtype == "float64[pyarrow]":
order = [ranks[0], [NA] * chunk, ranks[1]]
else:
order = [ranks[0], [np.nan] * chunk, ranks[1]]
expected = order if ascending else order[::-1]
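
The updated expectations reflect NaN ranking as an ordinary value that sorts after all numbers under float64[pyarrow], rather than propagating as missing. A sketch, assuming this branch:

import numpy as np
import pandas as pd

ser = pd.Series([1.0, np.nan, 2.0], dtype="float64[pyarrow]")
ser.rank()  # NaN gets rank 3.0 (largest) instead of a missing result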