From 844dc4a4fb8d213303085709aa4a3649400ed51a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Dec 2019 11:55:17 -0600 Subject: [PATCH] API: Uses pd.NA in IntegerArray (#29964) --- doc/source/user_guide/integer_na.rst | 28 +++++ doc/source/whatsnew/v1.0.0.rst | 58 ++++++++++ pandas/core/arrays/boolean.py | 6 +- pandas/core/arrays/integer.py | 116 +++++++++++++------ pandas/tests/arrays/test_integer.py | 149 ++++++++++++++++++------- pandas/tests/base/test_conversion.py | 2 +- pandas/tests/extension/test_integer.py | 27 +++-- 7 files changed, 298 insertions(+), 88 deletions(-) diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index 77568f3bcb244..a45d7a4fa1547 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -15,6 +15,10 @@ Nullable integer data type IntegerArray is currently experimental. Its API or implementation may change without warning. +.. versionchanged:: 1.0.0 + + Now uses :attr:`pandas.NA` as the missing value rather + than :attr:`numpy.nan`. In :ref:`missing_data`, we saw that pandas primarily uses ``NaN`` to represent missing data. Because ``NaN`` is a float, this forces an array of integers with @@ -23,6 +27,9 @@ much. But if your integer column is, say, an identifier, casting to float can be problematic. Some integers cannot even be represented as floating point numbers. +Construction +------------ + Pandas can represent integer data with possibly missing values using :class:`arrays.IntegerArray`. This is an :ref:`extension types <extending.extension-types>` implemented within pandas. @@ -39,6 +46,12 @@ NumPy's ``'int64'`` dtype: pd.array([1, 2, np.nan], dtype="Int64") +All NA-like values are replaced with :attr:`pandas.NA`. + +.. ipython:: python + + pd.array([1, 2, np.nan, None, pd.NA], dtype="Int64") + This array can be stored in a :class:`DataFrame` or :class:`Series` like any NumPy array. @@ -78,6 +91,9 @@ with the dtype. 
In the future, we may provide an option for :class:`Series` to infer a nullable-integer dtype. +Operations +---------- + Operations involving an integer array will behave similar to NumPy arrays. Missing values will be propagated, and the data will be coerced to another dtype if needed. @@ -123,3 +139,15 @@ Reduction and groupby operations such as 'sum' work as well. df.sum() df.groupby('B').A.sum() + +Scalar NA Value +--------------- + +:class:`arrays.IntegerArray` uses :attr:`pandas.NA` as its scalar +missing value. Slicing a single element that's missing will return +:attr:`pandas.NA`. + +.. ipython:: python + + a = pd.array([1, None], dtype="Int64") + a[1] diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index a6ba7770dadcc..8755abe642068 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -365,6 +365,64 @@ The following methods now also correctly output values for unobserved categories As a reminder, you can specify the ``dtype`` to disable all inference. +:class:`arrays.IntegerArray` now uses :attr:`pandas.NA` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`arrays.IntegerArray` now uses :attr:`pandas.NA` rather than +:attr:`numpy.nan` as its missing value marker (:issue:`29964`). + +*pandas 0.25.x* + +.. code-block:: python + + >>> a = pd.array([1, 2, None], dtype="Int64") + >>> a + <IntegerArray> + [1, 2, NaN] + Length: 3, dtype: Int64 + + >>> a[2] + nan + +*pandas 1.0.0* + +.. ipython:: python + + a = pd.array([1, 2, None], dtype="Int64") + a[2] + +See :ref:`missing_data.NA` for more on the differences between :attr:`pandas.NA` +and :attr:`numpy.nan`. + +:class:`arrays.IntegerArray` comparisons return :class:`arrays.BooleanArray` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Comparison operations on a :class:`arrays.IntegerArray` now return a +:class:`arrays.BooleanArray` rather than a NumPy array (:issue:`29964`). + +*pandas 0.25.x* + +.. 
code-block:: python + + >>> a = pd.array([1, 2, None], dtype="Int64") + >>> a + <IntegerArray> + [1, 2, NaN] + Length: 3, dtype: Int64 + + >>> a > 1 + array([False, True, False]) + +*pandas 1.0.0* + +.. ipython:: python + + a = pd.array([1, 2, None], dtype="Int64") + a > 1 + +Note that missing values now propagate, rather than always comparing unequal +like :attr:`numpy.nan`. See :ref:`missing_data.NA` for more. + By default :meth:`Categorical.min` now returns the minimum instead of np.nan ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 600165ad9ac13..7301c0ab434a0 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -730,7 +730,6 @@ def all(self, skipna: bool = True, **kwargs): @classmethod def _create_logical_method(cls, op): def logical_method(self, other): - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): # Rely on pandas to unbox and dispatch to us. return NotImplemented @@ -777,8 +776,11 @@ def logical_method(self, other): @classmethod def _create_comparison_method(cls, op): def cmp_method(self, other): + from pandas.arrays import IntegerArray - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): + if isinstance( + other, (ABCDataFrame, ABCSeries, ABCIndexClass, IntegerArray) + ): # Rely on pandas to unbox and dispatch to us. 
return NotImplemented diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index ee8b2c3bb723f..62f31addedc0b 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -1,10 +1,10 @@ import numbers -from typing import Type +from typing import Any, Tuple, Type import warnings import numpy as np -from pandas._libs import lib +from pandas._libs import lib, missing as libmissing from pandas.compat import set_function_name from pandas.util._decorators import cache_readonly @@ -44,7 +44,7 @@ class _IntegerDtype(ExtensionDtype): name: str base = None type: Type - na_value = np.nan + na_value = libmissing.NA def __repr__(self) -> str: sign = "U" if self.is_unsigned_integer else "" @@ -263,6 +263,11 @@ class IntegerArray(ExtensionArray, ExtensionOpsMixin): .. versionadded:: 0.24.0 + .. versionchanged:: 1.0.0 + + Now uses :attr:`pandas.NA` as the missing value rather + than :attr:`numpy.nan`. + .. warning:: IntegerArray is currently experimental, and its API or internal @@ -358,14 +363,6 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): def _from_factorized(cls, values, original): return integer_array(values, dtype=original.dtype) - def _formatter(self, boxed=False): - def fmt(x): - if isna(x): - return "NaN" - return str(x) - - return fmt - def __getitem__(self, item): if is_integer(item): if self._mask[item]: @@ -373,14 +370,30 @@ def __getitem__(self, item): return self._data[item] return type(self)(self._data[item], self._mask[item]) - def _coerce_to_ndarray(self): + def _coerce_to_ndarray(self, dtype=None, na_value=lib._no_default): """ coerce to an ndarary of object dtype """ + if dtype is None: + dtype = object + + if na_value is lib._no_default and is_float_dtype(dtype): + na_value = np.nan + elif na_value is lib._no_default: + na_value = libmissing.NA + + if is_integer_dtype(dtype): + # Specifically, a NumPy integer dtype, not a pandas integer dtype, + # since we're coercing to a numpy dtype by 
definition in this function. + if not self.isna().any(): + return self._data.astype(dtype) + else: + raise ValueError( + "cannot convert to integer NumPy array with missing values" + ) - # TODO(jreback) make this better - data = self._data.astype(object) - data[self._mask] = self._na_value + data = self._data.astype(dtype) + data[self._mask] = na_value return data __array_priority__ = 1000 # higher than ndarray so ops dispatch to us @@ -390,7 +403,7 @@ def __array__(self, dtype=None): the array interface, return my values We return an object array here to preserve our scalar values """ - return self._coerce_to_ndarray() + return self._coerce_to_ndarray(dtype=dtype) def __arrow_array__(self, type=None): """ @@ -506,7 +519,7 @@ def isna(self): @property def _na_value(self): - return np.nan + return self.dtype.na_value @classmethod def _concat_same_type(cls, to_concat): @@ -545,7 +558,7 @@ def astype(self, dtype, copy=True): return type(self)(result, mask=self._mask, copy=False) # coerce - data = self._coerce_to_ndarray() + data = self._coerce_to_ndarray(dtype=dtype) return astype_nansafe(data, dtype, copy=False) @property @@ -600,12 +613,19 @@ def value_counts(self, dropna=True): # w/o passing the dtype array = np.append(array, [self._mask.sum()]) index = Index( - np.concatenate([index.values, np.array([np.nan], dtype=object)]), + np.concatenate( + [index.values, np.array([self.dtype.na_value], dtype=object)] + ), dtype=object, ) return Series(array, index=index) + def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: + # TODO: https://github.com/pandas-dev/pandas/issues/30037 + # use masked algorithms, rather than object-dtype / np.nan. + return self._coerce_to_ndarray(na_value=np.nan), np.nan + def _values_for_argsort(self) -> np.ndarray: """Return values for sorting. 
@@ -629,9 +649,11 @@ def _create_comparison_method(cls, op): @unpack_zerodim_and_defer(op.__name__) def cmp_method(self, other): + from pandas.arrays import BooleanArray + mask = None - if isinstance(other, IntegerArray): + if isinstance(other, (BooleanArray, IntegerArray)): other, mask = other._data, other._mask elif is_list_like(other): @@ -643,25 +665,35 @@ def cmp_method(self, other): if len(self) != len(other): raise ValueError("Lengths must match to compare") - # numpy will show a DeprecationWarning on invalid elementwise - # comparisons, this will raise in the future - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "elementwise", FutureWarning) - with np.errstate(all="ignore"): - method = getattr(self._data, f"__{op_name}__") - result = method(other) + if other is libmissing.NA: + # numpy does not handle pd.NA well as "other" scalar (it returns + # a scalar False instead of an array) + # This may be fixed by NA.__array_ufunc__. Revisit this check + # once that's implemented. + result = np.zeros(self._data.shape, dtype="bool") + mask = np.ones(self._data.shape, dtype="bool") + else: + with warnings.catch_warnings(): + # numpy may show a FutureWarning: + # elementwise comparison failed; returning scalar instead, + # but in the future will perform elementwise comparison + # before returning NotImplemented. We fall back to the correct + # behavior today, so that should be fine to ignore. 
+ warnings.filterwarnings("ignore", "elementwise", FutureWarning) + with np.errstate(all="ignore"): + method = getattr(self._data, f"__{op_name}__") + result = method(other) if result is NotImplemented: result = invalid_comparison(self._data, other, op) # nans propagate if mask is None: - mask = self._mask + mask = self._mask.copy() else: mask = self._mask | mask - result[mask] = op_name == "ne" - return result + return BooleanArray(result, mask) name = f"__{op.__name__}__" return set_function_name(cmp_method, name, cls) @@ -673,7 +705,8 @@ def _reduce(self, name, skipna=True, **kwargs): # coerce to a nan-aware float if needed if mask.any(): data = self._data.astype("float64") - data[mask] = self._na_value + # We explicitly use NaN within reductions. + data[mask] = np.nan op = getattr(nanops, "nan" + name) result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) @@ -739,12 +772,13 @@ def integer_arithmetic_method(self, other): raise TypeError("can only perform ops with numeric values") else: - if not (is_float(other) or is_integer(other)): + if not (is_float(other) or is_integer(other) or other is libmissing.NA): raise TypeError("can only perform ops with numeric values") - # nans propagate if omask is None: mask = self._mask.copy() + if other is libmissing.NA: + mask |= True else: mask = self._mask | omask @@ -754,20 +788,23 @@ def integer_arithmetic_method(self, other): # x ** 0 is 1. if omask is not None: mask = np.where((other == 0) & ~omask, False, mask) - else: + elif other is not libmissing.NA: mask = np.where(other == 0, False, mask) elif op_name == "rpow": # 1 ** x is 1. if omask is not None: mask = np.where((other == 1) & ~omask, False, mask) - else: + elif other is not libmissing.NA: mask = np.where(other == 1, False, mask) # x ** 0 is 1. 
mask = np.where((self._data == 0) & ~self._mask, False, mask) - with np.errstate(all="ignore"): - result = op(self._data, other) + if other is libmissing.NA: + result = np.ones_like(self._data) + else: + with np.errstate(all="ignore"): + result = op(self._data, other) # divmod returns a tuple if op_name == "divmod": @@ -790,6 +827,11 @@ def integer_arithmetic_method(self, other): _dtype_docstring = """ An ExtensionDtype for {dtype} integer data. +.. versionchanged:: 1.0.0 + + Now uses :attr:`pandas.NA` as its missing value, + rather than :attr:`numpy.nan`. + Attributes ---------- None diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index e534c93c69f68..f9b002d4409ce 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -90,7 +90,7 @@ def test_repr_dtype(dtype, expected): def test_repr_array(): result = repr(integer_array([1, None, 3])) - expected = "\n[1, NaN, 3]\nLength: 3, dtype: Int64" + expected = "\n[1, NA, 3]\nLength: 3, dtype: Int64" assert result == expected @@ -98,9 +98,9 @@ def test_repr_array_long(): data = integer_array([1, 2, None] * 1000) expected = ( "\n" - "[ 1, 2, NaN, 1, 2, NaN, 1, 2, NaN, 1,\n" + "[ 1, 2, NA, 1, 2, NA, 1, 2, NA, 1,\n" " ...\n" - " NaN, 1, 2, NaN, 1, 2, NaN, 1, 2, NaN]\n" + " NA, 1, 2, NA, 1, 2, NA, 1, 2, NA]\n" "Length: 3000, dtype: Int64" ) result = repr(data) @@ -108,13 +108,17 @@ def test_repr_array_long(): class TestConstructors: + def test_uses_pandas_na(self): + a = pd.array([1, None], dtype=pd.Int64Dtype()) + assert a[1] is pd.NA + def test_from_dtype_from_float(self, data): # construct from our dtype & string dtype dtype = data.dtype # from float expected = pd.Series(data) - result = pd.Series(np.array(data).astype("float"), dtype=str(dtype)) + result = pd.Series(np.array(data, dtype="float"), dtype=str(dtype)) tm.assert_series_equal(result, expected) # from int / list @@ -156,10 +160,13 @@ def _check_op(self, s, op_name, other, exc=None): # 1 
** na is na, so need to unmask those if op_name == "__pow__": - mask = np.where(s == 1, False, mask) + mask = np.where(~s.isna() & (s == 1), False, mask) elif op_name == "__rpow__": - mask = np.where(other == 1, False, mask) + other_is_one = other == 1 + if isinstance(other_is_one, pd.Series): + other_is_one = other_is_one.fillna(False) + mask = np.where(other_is_one, False, mask) # float result type or float op if ( @@ -208,20 +215,27 @@ def _check_op_integer(self, result, expected, mask, s, op_name, other): else: expected = expected.fillna(0) else: - expected[(s.values == 0) & ((expected == 0) | expected.isna())] = 0 + expected[ + (s.values == 0).fillna(False) + & ((expected == 0).fillna(False) | expected.isna()) + ] = 0 try: - expected[(expected == np.inf) | (expected == -np.inf)] = fill_value + expected[ + ((expected == np.inf) | (expected == -np.inf)).fillna(False) + ] = fill_value original = expected expected = expected.astype(s.dtype) except ValueError: expected = expected.astype(float) - expected[(expected == np.inf) | (expected == -np.inf)] = fill_value + expected[ + ((expected == np.inf) | (expected == -np.inf)).fillna(False) + ] = fill_value original = expected expected = expected.astype(s.dtype) - expected[mask] = np.nan + expected[mask] = pd.NA # assert that the expected astype is ok # (skip for unsigned as they have wrap around) @@ -255,21 +269,18 @@ def test_arith_integer_array(self, data, all_arithmetic_operators): def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # scalar op = all_arithmetic_operators - s = pd.Series(data) self._check_op(s, op, 1, exc=TypeError) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): # frame & scalar op = all_arithmetic_operators - df = pd.DataFrame({"A": data}) self._check_op(df, op, 1, exc=TypeError) def test_arith_series_with_array(self, data, all_arithmetic_operators): # ndarray & other series op = all_arithmetic_operators - s = pd.Series(data) other = np.ones(len(s), 
dtype=s.dtype.type) self._check_op(s, op, other, exc=TypeError) @@ -359,9 +370,9 @@ def test_pow_scalar(self): expected = pd.array([0, 1, None, 2], dtype="Int64") tm.assert_extension_array_equal(result, expected) - # result = a ** pd.NA - # expected = pd.array([None, 1, None, None], dtype="Int64") - # tm.assert_extension_array_equal(result, expected) + result = a ** pd.NA + expected = pd.array([None, 1, None, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) result = a ** np.nan expected = np.array([np.nan, 1, np.nan, np.nan], dtype="float64") @@ -376,9 +387,9 @@ def test_pow_scalar(self): expected = pd.array([1, 1, 1, 1], dtype="Int64") tm.assert_extension_array_equal(result, expected) - # result = pd.NA ** a - # expected = pd.array([1, None, None, None], dtype="Int64") - # tm.assert_extension_array_equal(result, expected) + result = pd.NA ** a + expected = pd.array([1, None, None, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) result = np.nan ** a expected = np.array([1, np.nan, np.nan, np.nan], dtype="float64") @@ -406,10 +417,10 @@ def _compare_other(self, data, op_name, other): # array result = pd.Series(op(data, other)) - expected = pd.Series(op(data._data, other)) + expected = pd.Series(op(data._data, other), dtype="boolean") # fill the nan locations - expected[data._mask] = op_name == "__ne__" + expected[data._mask] = pd.NA tm.assert_series_equal(result, expected) @@ -417,22 +428,61 @@ def _compare_other(self, data, op_name, other): s = pd.Series(data) result = op(s, other) - expected = pd.Series(data._data) - expected = op(expected, other) + expected = op(pd.Series(data._data), other) # fill the nan locations - expected[data._mask] = op_name == "__ne__" + expected[data._mask] = pd.NA + expected = expected.astype("boolean") tm.assert_series_equal(result, expected) - def test_compare_scalar(self, data, all_compare_operators): - op_name = all_compare_operators - self._compare_other(data, op_name, 0) + 
@pytest.mark.parametrize("other", [True, False, pd.NA, -1, 0, 1]) + def test_scalar(self, other, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([1, 0, None], dtype="Int64") + + result = op(a, other) + + if other is pd.NA: + expected = pd.array([None, None, None], dtype="boolean") + else: + values = op(a._data, other) + expected = pd.arrays.BooleanArray(values, a._mask, copy=True) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = pd.NA + tm.assert_extension_array_equal(a, pd.array([1, 0, None], dtype="Int64")) + + def test_array(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([0, 1, 2, None, None, None], dtype="Int64") + b = pd.array([0, 1, None, 0, 1, None], dtype="Int64") + + result = op(a, b) + values = op(a._data, b._data) + mask = a._mask | b._mask - def test_compare_array(self, data, all_compare_operators): - op_name = all_compare_operators - other = pd.Series([0] * len(data)) - self._compare_other(data, op_name, other) + expected = pd.arrays.BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = pd.NA + tm.assert_extension_array_equal( + a, pd.array([0, 1, 2, None, None, None], dtype="Int64") + ) + tm.assert_extension_array_equal( + b, pd.array([0, 1, None, 0, 1, None], dtype="Int64") + ) + + def test_compare_with_booleanarray(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([True, False, None] * 3, dtype="boolean") + b = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype="Int64") + other = pd.array([False] * 3 + [True] * 3 + [None] * 3, dtype="boolean") + expected = op(a, other) + result = op(a, b) + tm.assert_extension_array_equal(result, expected) def test_no_shared_mask(self, data): result = data + 1 @@ -442,20 +492,21 @@ def test_compare_to_string(self, 
any_nullable_int_dtype): # GH 28930 s = pd.Series([1, None], dtype=any_nullable_int_dtype) result = s == "a" - expected = pd.Series([False, False]) + expected = pd.Series([False, pd.NA], dtype="boolean") self.assert_series_equal(result, expected) def test_compare_to_int(self, any_nullable_int_dtype, all_compare_operators): # GH 28930 - s1 = pd.Series([1, 2, 3], dtype=any_nullable_int_dtype) - s2 = pd.Series([1, 2, 3], dtype="int") + s1 = pd.Series([1, None, 3], dtype=any_nullable_int_dtype) + s2 = pd.Series([1, None, 3], dtype="float") method = getattr(s1, all_compare_operators) result = method(2) method = getattr(s2, all_compare_operators) - expected = method(2) + expected = method(2).astype("boolean") + expected[s2.isna()] = pd.NA self.assert_series_equal(result, expected) @@ -543,6 +594,17 @@ def test_astype(self, all_data): expected = pd.Series(np.asarray(mixed)) tm.assert_series_equal(result, expected) + def test_astype_to_larger_numpy(self): + a = pd.array([1, 2], dtype="Int32") + result = a.astype("int64") + expected = np.array([1, 2], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + a = pd.array([1, 2], dtype="UInt32") + result = a.astype("uint64") + expected = np.array([1, 2], dtype="uint64") + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("dtype", [Int8Dtype(), "Int8", UInt32Dtype(), "UInt32"]) def test_astype_specific_casting(self, dtype): s = pd.Series([1, 2, 3], dtype="Int64") @@ -572,12 +634,17 @@ def test_construct_cast_invalid(self, dtype): with pytest.raises(TypeError, match=msg): pd.Series(arr).astype(dtype) + def test_coerce_to_ndarray_float_NA_rasies(self): + a = pd.array([0, 1, 2], dtype="Int64") + with pytest.raises(TypeError, match="NAType"): + a._coerce_to_ndarray(dtype="float", na_value=pd.NA) + def test_frame_repr(data_missing): df = pd.DataFrame({"A": data_missing}) result = repr(df) - expected = " A\n0 NaN\n1 1" + expected = " A\n0 NA\n1 1" assert result == expected @@ -593,7 +660,7 @@ def 
test_conversions(data_missing): # we assert that we are exactly equal # including type conversions of scalars result = df["A"].astype("object").values - expected = np.array([np.nan, 1], dtype=object) + expected = np.array([pd.NA, 1], dtype=object) tm.assert_numpy_array_equal(result, expected) for r, e in zip(result, expected): @@ -756,7 +823,7 @@ def test_cross_type_arithmetic(): tm.assert_series_equal(result, expected) result = (df.A + df.C) * 3 == 12 - expected = pd.Series([False, True, False]) + expected = pd.Series([False, True, None], dtype="boolean") tm.assert_series_equal(result, expected) result = df.A + df.B @@ -820,7 +887,7 @@ def test_reduce_to_float(op): def test_astype_nansafe(): # see gh-22343 arr = integer_array([np.nan, 1, 2], dtype="Int8") - msg = "cannot convert float NaN to integer" + msg = "cannot convert to integer NumPy array with missing values" with pytest.raises(ValueError, match=msg): arr.astype("uint32") @@ -895,7 +962,9 @@ def test_arrow_array(data): import pyarrow as pa arr = pa.array(data) - expected = pa.array(list(data), type=data.dtype.name.lower(), from_pandas=True) + expected = np.array(data, dtype=object) + expected[data.isna()] = None + expected = pa.array(expected, type=data.dtype.name.lower(), from_pandas=True) assert arr.equals(expected) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 8fa52af832907..4b6349a505509 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -315,7 +315,7 @@ def test_array_multiindex_raises(): ), ( pd.core.arrays.integer_array([0, np.nan]), - np.array([0, np.nan], dtype=object), + np.array([0, pd.NA], dtype=object), ), ( pd.core.arrays.IntervalArray.from_breaks([0, 1, 2]), diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index d051345fdd12d..8e54543e5437c 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -34,7 +34,7 @@ 
def make_data(): - return list(range(1, 9)) + [np.nan] + list(range(10, 98)) + [np.nan] + [99, 100] + return list(range(1, 9)) + [pd.NA] + list(range(10, 98)) + [pd.NA] + [99, 100] @pytest.fixture( @@ -65,7 +65,7 @@ def data_for_twos(dtype): @pytest.fixture def data_missing(dtype): - return integer_array([np.nan, 1], dtype=dtype) + return integer_array([pd.NA, 1], dtype=dtype) @pytest.fixture @@ -75,18 +75,18 @@ def data_for_sorting(dtype): @pytest.fixture def data_missing_for_sorting(dtype): - return integer_array([1, np.nan, 0], dtype=dtype) + return integer_array([1, pd.NA, 0], dtype=dtype) @pytest.fixture def na_cmp(): - # we are np.nan - return lambda x, y: np.isnan(x) and np.isnan(y) + # we are pd.NA + return lambda x, y: x is pd.NA and y is pd.NA @pytest.fixture def na_value(): - return np.nan + return pd.NA @pytest.fixture @@ -94,7 +94,7 @@ def data_for_grouping(dtype): b = 1 a = 0 c = 2 - na = np.nan + na = pd.NA return integer_array([b, b, na, na, a, a, b, c], dtype=dtype) @@ -129,7 +129,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): expected = s.combine(other, op) if op_name in ("__rtruediv__", "__truediv__", "__div__"): - expected = expected.astype(float) + expected = expected.fillna(np.nan).astype(float) if op_name == "__rtruediv__": # TODO reverse operators result in object dtype result = result.astype(float) @@ -142,6 +142,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): # combine method result in 'biggest' (int64) dtype expected = expected.astype(s.dtype) pass + if (op_name == "__rpow__") and isinstance(other, pd.Series): # TODO pow on Int arrays gives different result with NA # see https://github.com/pandas-dev/pandas/issues/22022 @@ -162,6 +163,16 @@ def test_error(self, data, all_arithmetic_operators): class TestComparisonOps(base.BaseComparisonOpsTests): + def _check_op(self, s, op, other, op_name, exc=NotImplementedError): + if exc is None: + result = op(s, other) + # Override to do the astype 
to boolean + expected = s.combine(other, op).astype("boolean") + self.assert_series_equal(result, expected) + else: + with pytest.raises(exc): + op(s, other) + def check_opname(self, s, op_name, other, exc=None): super().check_opname(s, op_name, other, exc=None)