-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
POC: consistent NaN treatment for pyarrow dtypes #61732
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,7 @@ | |
import numpy as np | ||
|
||
from pandas._libs import lib | ||
from pandas._libs.missing import NA | ||
from pandas._libs.tslibs import ( | ||
Timedelta, | ||
Timestamp, | ||
|
@@ -360,7 +361,7 @@ def _from_sequence_of_strings( | |
# duration to string casting behavior | ||
mask = isna(scalars) | ||
if not isinstance(strings, (pa.Array, pa.ChunkedArray)): | ||
strings = pa.array(strings, type=pa.string(), from_pandas=True) | ||
strings = pa.array(strings, type=pa.string()) | ||
strings = pc.if_else(mask, None, strings) | ||
try: | ||
scalars = strings.cast(pa.int64()) | ||
|
@@ -381,7 +382,7 @@ def _from_sequence_of_strings( | |
if isinstance(strings, (pa.Array, pa.ChunkedArray)): | ||
scalars = strings | ||
else: | ||
scalars = pa.array(strings, type=pa.string(), from_pandas=True) | ||
scalars = pa.array(strings, type=pa.string()) | ||
scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars) | ||
scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars) | ||
scalars = scalars.cast(pa.bool_()) | ||
|
@@ -393,6 +394,13 @@ def _from_sequence_of_strings( | |
from pandas.core.tools.numeric import to_numeric | ||
|
||
scalars = to_numeric(strings, errors="raise") | ||
if not pa.types.is_decimal(pa_type): | ||
# TODO: figure out why doing this cast breaks with decimal dtype | ||
# in test_from_sequence_of_strings_pa_array | ||
mask = strings.is_null() | ||
scalars = pa.array(scalars, mask=np.array(mask), type=pa_type) | ||
# TODO: could we just do strings.cast(pa_type)? | ||
|
||
else: | ||
raise NotImplementedError( | ||
f"Converting strings to {pa_type} is not implemented." | ||
|
@@ -435,7 +443,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: | |
""" | ||
if isinstance(value, pa.Scalar): | ||
pa_scalar = value | ||
elif isna(value): | ||
elif isna(value) and not lib.is_float(value): | ||
pa_scalar = pa.scalar(None, type=pa_type) | ||
else: | ||
# Workaround https://github.com/apache/arrow/issues/37291 | ||
|
@@ -452,7 +460,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: | |
value = value.as_unit(pa_type.unit) | ||
value = value._value | ||
|
||
pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True) | ||
pa_scalar = pa.scalar(value, type=pa_type) | ||
|
||
if pa_type is not None and pa_scalar.type != pa_type: | ||
pa_scalar = pa_scalar.cast(pa_type) | ||
|
@@ -484,6 +492,13 @@ def _box_pa_array( | |
if copy: | ||
value = value.copy() | ||
pa_array = value.__arrow_array__() | ||
|
||
elif hasattr(value, "__arrow_array__"): | ||
# e.g. StringArray | ||
if copy: | ||
value = value.copy() | ||
pa_array = value.__arrow_array__() | ||
|
||
else: | ||
if ( | ||
isinstance(value, np.ndarray) | ||
|
@@ -510,19 +525,32 @@ def _box_pa_array( | |
value = to_timedelta(value, unit=pa_type.unit).as_unit(pa_type.unit) | ||
value = value.to_numpy() | ||
|
||
mask = None | ||
if getattr(value, "dtype", None) is None or value.dtype.kind not in "mfM": | ||
# similar to isna(value) but exclude NaN | ||
# TODO: cythonize! | ||
mask = np.array([x is NA or x is None for x in value], dtype=bool) | ||
|
||
from_pandas = False | ||
if pa.types.is_integer(pa_type): | ||
# If user specifically asks to cast a numpy float array with NaNs | ||
# to pyarrow integer, we'll treat those NaNs as NA | ||
Comment on lines
+536
to
+537
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would personally be in favor of a harder break - this should raise like PyArrow does, and auser that want this behavior should fill NaN's first. In [3]: pa.array(np.array([1.0, np.nan]), from_pandas=False, type=pa.int64())
ArrowInvalid: Float value nan was truncated converting to int64 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd be fine with that. I just tried it out-- expecting it to break a ton of tests-- and I'm only seeing 147 failures (vs 89 without it), so not that bad. Probably need to add the behavior of |
||
from_pandas = True | ||
try: | ||
pa_array = pa.array(value, type=pa_type, from_pandas=True) | ||
pa_array = pa.array( | ||
value, type=pa_type, mask=mask, from_pandas=from_pandas | ||
) | ||
except (pa.ArrowInvalid, pa.ArrowTypeError): | ||
# GH50430: let pyarrow infer type, then cast | ||
pa_array = pa.array(value, from_pandas=True) | ||
pa_array = pa.array(value, mask=mask, from_pandas=from_pandas) | ||
|
||
if pa_type is None and pa.types.is_duration(pa_array.type): | ||
# Workaround https://github.com/apache/arrow/issues/37291 | ||
from pandas.core.tools.timedeltas import to_timedelta | ||
|
||
value = to_timedelta(value) | ||
value = value.to_numpy() | ||
pa_array = pa.array(value, type=pa_type, from_pandas=True) | ||
pa_array = pa.array(value, type=pa_type) | ||
|
||
if pa.types.is_duration(pa_array.type) and pa_array.null_count > 0: | ||
# GH52843: upstream bug for duration types when originally | ||
|
@@ -1169,7 +1197,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: | |
if not len(values): | ||
return np.zeros(len(self), dtype=bool) | ||
|
||
result = pc.is_in(self._pa_array, value_set=pa.array(values, from_pandas=True)) | ||
result = pc.is_in(self._pa_array, value_set=pa.array(values)) | ||
# pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls | ||
# to False | ||
return np.array(result, dtype=np.bool_) | ||
|
@@ -2002,7 +2030,7 @@ def __setitem__(self, key, value) -> None: | |
raise ValueError("Length of indexer and values mismatch") | ||
chunks = [ | ||
*self._pa_array[:key].chunks, | ||
pa.array([value], type=self._pa_array.type, from_pandas=True), | ||
pa.array([value], type=self._pa_array.type), | ||
*self._pa_array[key + 1 :].chunks, | ||
] | ||
data = pa.chunked_array(chunks).combine_chunks() | ||
|
@@ -2056,7 +2084,7 @@ def _rank_calc( | |
pa_type = pa.float64() | ||
else: | ||
pa_type = pa.uint64() | ||
result = pa.array(ranked, type=pa_type, from_pandas=True) | ||
result = pa.array(ranked, type=pa_type) | ||
return result | ||
|
||
data = self._pa_array.combine_chunks() | ||
|
@@ -2308,7 +2336,7 @@ def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]: | |
right, right_type = _to_numpy_and_type(right) | ||
pa_type = left_type or right_type | ||
result = np.where(cond, left, right) | ||
return pa.array(result, type=pa_type, from_pandas=True) | ||
return pa.array(result, type=pa_type) | ||
|
||
@classmethod | ||
def _replace_with_mask( | ||
|
@@ -2351,7 +2379,7 @@ def _replace_with_mask( | |
replacements = replacements.as_py() | ||
result = np.array(values, dtype=object) | ||
result[mask] = replacements | ||
return pa.array(result, type=values.type, from_pandas=True) | ||
return pa.array(result, type=values.type) | ||
|
||
# ------------------------------------------------------------------ | ||
# GroupBy Methods | ||
|
@@ -2430,7 +2458,7 @@ def _groupby_op( | |
return type(self)(pa_result) | ||
else: | ||
# DatetimeArray, TimedeltaArray | ||
pa_result = pa.array(result, from_pandas=True) | ||
pa_result = pa.array(result) | ||
return type(self)(pa_result) | ||
|
||
def _apply_elementwise(self, func: Callable) -> list[list[Any]]: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we're moving towards making `NaN` distinct from `NA` (NaN not a missing value), maybe we should eventually make `isna` have a `nan_as_null: bool = True` argument
argumentThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
that may be necessary, but I'm a little wary of it since it might mean needing to add that argument everywhere with a skipna keyword