diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 760da36a30075..c32eda4928da7 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -652,7 +652,9 @@ class Rank: ] def setup(self, dtype): - self.df = DataFrame(np.random.randn(10000, 10), columns=range(10), dtype=dtype) + self.df = DataFrame( + np.random.randn(10000, 10).astype(dtype), columns=range(10), dtype=dtype + ) def time_rank(self, dtype): self.df.rank() diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 987a19cf99dd6..88c69335b39f4 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -700,6 +700,7 @@ Deprecations - Deprecated passing arguments as positional in :meth:`DataFrame.reset_index` (other than ``"level"``) and :meth:`Series.reset_index` (:issue:`41485`) - Deprecated construction of :class:`Series` or :class:`DataFrame` with ``DatetimeTZDtype`` data and ``datetime64[ns]`` dtype. Use ``Series(data).dt.tz_localize(None)`` instead (:issue:`41555`,:issue:`33401`) - Deprecated behavior of :class:`Series` construction with large-integer values and small-integer dtype silently overflowing; use ``Series(data).astype(dtype)`` instead (:issue:`41734`) +- Deprecated behavior of :class:`DataFrame` construction with floating data and integer dtype casting even when lossy; in a future version this will remain floating, matching :class:`Series` behavior (:issue:`41770`) - Deprecated inference of ``timedelta64[ns]``, ``datetime64[ns]``, or ``DatetimeTZDtype`` dtypes in :class:`Series` construction when data containing strings is passed and no ``dtype`` is passed (:issue:`33558`) - In a future version, constructing :class:`Series` or :class:`DataFrame` with ``datetime64[ns]`` data and ``DatetimeTZDtype`` will treat the data as wall-times instead of as UTC times (matching DatetimeIndex behavior). 
To treat the data as UTC times, use ``pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(dtype.tz)`` or ``pd.Series(data.view("int64"), dtype=dtype)`` (:issue:`33401`) - Deprecated passing arguments as positional in :meth:`DataFrame.set_axis` and :meth:`Series.set_axis` (other than ``"labels"``) (:issue:`41485`) @@ -945,6 +946,7 @@ Indexing - Bug in :meth:`DataFrame.loc` returning :class:`MultiIndex` in wrong order if indexer has duplicates (:issue:`40978`) - Bug in :meth:`DataFrame.__setitem__` raising ``TypeError`` when using a str subclass as the column name with a :class:`DatetimeIndex` (:issue:`37366`) - Bug in :meth:`PeriodIndex.get_loc` failing to raise ``KeyError`` when given a :class:`Period` with a mismatched ``freq`` (:issue:`41670`) +- Bug in ``.loc.__getitem__`` with a :class:`UInt64Index` and negative-integer keys raising ``OverflowError`` instead of ``KeyError`` in some cases, wrapping around to positive integers in others (:issue:`41777`) Missing ^^^^^^^ diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index f7cec262ca302..3351bb7cac7d6 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -106,7 +106,8 @@ cdef class IndexEngine: try: return self.mapping.get_item(val) - except (TypeError, ValueError): + except (TypeError, ValueError, OverflowError): + # GH#41775 OverflowError e.g. if we are uint64 and val is -1 raise KeyError(val) cdef inline _get_loc_duplicates(self, object val): diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index f91b96dc1b1dc..0c8f2baabc804 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -190,11 +190,7 @@ def maybe_indices_to_slice( max_len: int, ) -> slice | np.ndarray: ... # np.ndarray[np.uint8] -def clean_index_list(obj: list) -> tuple[ - list | np.ndarray, # np.ndarray[object | np.int64 | np.uint64] - bool, -]: ... - +def is_all_arraylike(obj: list) -> bool: ... 
# ----------------------------------------------------------------- # Functions which in reality take memoryviews diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4b5ef3e909a00..1a07b76583fca 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -740,19 +740,15 @@ cpdef ndarray[object] ensure_string_array( return result -@cython.wraparound(False) -@cython.boundscheck(False) -def clean_index_list(obj: list): +def is_all_arraylike(obj: list) -> bool: """ - Utility used in ``pandas.core.indexes.api.ensure_index``. + Should we treat these as levels of a MultiIndex, as opposed to Index items? """ cdef: Py_ssize_t i, n = len(obj) object val bint all_arrays = True - # First check if we have a list of arraylikes, in which case we will - # pass them to MultiIndex.from_arrays for i in range(n): val = obj[i] if not (isinstance(val, list) or @@ -762,31 +758,7 @@ def clean_index_list(obj: list): all_arrays = False break - if all_arrays: - return obj, all_arrays - - # don't force numpy coerce with nan's - inferred = infer_dtype(obj, skipna=False) - if inferred in ['string', 'bytes', 'mixed', 'mixed-integer']: - return np.asarray(obj, dtype=object), 0 - elif inferred in ['integer']: - # we infer an integer but it *could* be a uint64 - - arr = np.asarray(obj) - if arr.dtype.kind not in ["i", "u"]: - # eg [0, uint64max] gets cast to float64, - # but then we know we have either uint64 or object - if (arr < 0).any(): - # TODO: similar to maybe_cast_to_integer_array - return np.asarray(obj, dtype="object"), 0 - - # GH#35481 - guess = np.asarray(obj, dtype="uint64") - return guess, 0 - - return arr, 0 - - return np.asarray(obj), 0 + return all_arrays # ------------------------------------------------------------------------------ diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index aaf58f1fcb150..40f23c25a1e99 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -30,9 +30,13 @@ from pandas.core.dtypes.common 
import ( is_datetime64_dtype, is_datetime64tz_dtype, + is_float_dtype, + is_integer_dtype, is_period_dtype, is_sequence, is_timedelta64_dtype, + is_unsigned_integer_dtype, + pandas_dtype, ) import pandas as pd @@ -41,11 +45,14 @@ CategoricalIndex, DataFrame, DatetimeIndex, + Float64Index, Index, + Int64Index, IntervalIndex, MultiIndex, RangeIndex, Series, + UInt64Index, bdate_range, ) from pandas._testing._io import ( # noqa:F401 @@ -292,12 +299,32 @@ def makeBoolIndex(k=10, name=None): return Index([False, True] + [False] * (k - 2), name=name) +def makeNumericIndex(k=10, name=None, *, dtype): + dtype = pandas_dtype(dtype) + assert isinstance(dtype, np.dtype) + + if is_integer_dtype(dtype): + values = np.arange(k, dtype=dtype) + if is_unsigned_integer_dtype(dtype): + values += 2 ** (dtype.itemsize * 8 - 1) + elif is_float_dtype(dtype): + values = np.random.random_sample(k) - np.random.random_sample(1) + values.sort() + values = values * (10 ** np.random.randint(0, 9)) + else: + raise NotImplementedError(f"wrong dtype {dtype}") + + return Index(values, dtype=dtype, name=name) + + def makeIntIndex(k=10, name=None): - return Index(list(range(k)), name=name) + base_idx = makeNumericIndex(k, name=name, dtype="int64") + return Int64Index(base_idx) def makeUIntIndex(k=10, name=None): - return Index([2 ** 63 + i for i in range(k)], name=name) + base_idx = makeNumericIndex(k, name=name, dtype="uint64") + return UInt64Index(base_idx) def makeRangeIndex(k=10, name=None, **kwargs): @@ -305,8 +332,8 @@ def makeRangeIndex(k=10, name=None, **kwargs): def makeFloatIndex(k=10, name=None): - values = sorted(np.random.random_sample(k)) - np.random.random_sample(1) - return Index(values * (10 ** np.random.randint(0, 9)), name=name) + base_idx = makeNumericIndex(k, name=name, dtype="float64") + return Float64Index(base_idx) def makeDateIndex(k: int = 10, freq="B", name=None, **kwargs) -> DatetimeIndex: diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 
2d695458e32e6..96d010b487a79 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -308,7 +308,7 @@ def assert_index_equal( """ __tracebackhide__ = True - def _check_types(left, right, obj="Index"): + def _check_types(left, right, obj="Index") -> None: if not exact: return diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index 031ff6a7665d4..7cf34635ce9c1 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -342,7 +342,7 @@ def reconstruct(result): result, **reconstruct_axes, **reconstruct_kwargs, copy=False ) # TODO: When we support multiple values in __finalize__, this - # should pass alignable to `__fianlize__` instead of self. + # should pass alignable to `__finalize__` instead of self. # Then `np.add(a, b)` would consider attrs from both a and b # when a and b are NDFrames. if len(alignable) == 1: diff --git a/pandas/core/construction.py b/pandas/core/construction.py index edaa53cd55042..c877d27fd2392 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -24,6 +24,7 @@ Dtype, DtypeObj, ) +from pandas.errors import IntCastingNaNError from pandas.core.dtypes.base import ( ExtensionDtype, @@ -511,7 +512,24 @@ def sanitize_array( # possibility of nan -> garbage try: subarr = _try_cast(data, dtype, copy, True) + except IntCastingNaNError: + subarr = np.array(data, copy=copy) except ValueError: + if not raise_cast_failure: + # i.e. called via DataFrame constructor + warnings.warn( + "In a future version, passing float-dtype values and an " + "integer dtype to DataFrame will retain floating dtype " + "if they cannot be cast losslessly (matching Series behavior). " + "To retain the old behavior, use DataFrame(data).astype(dtype)", + FutureWarning, + stacklevel=4, + ) + # GH#40110 until the deprecation is enforced, we _dont_ + # ignore the dtype for DataFrame, and _do_ cast even though + # it is lossy. 
+ dtype = cast(np.dtype, dtype) + return np.array(data, dtype=dtype, copy=copy) subarr = np.array(data, copy=copy) else: # we will try to copy by-definition here diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 161572f3f1ac3..177b1ccd166cb 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -2088,7 +2088,13 @@ def maybe_cast_to_integer_array( if is_unsigned_integer_dtype(dtype) and (arr < 0).any(): raise OverflowError("Trying to coerce negative values to unsigned integers") - if is_float_dtype(arr.dtype) or is_object_dtype(arr.dtype): + if is_float_dtype(arr.dtype): + if not np.isfinite(arr).all(): + raise IntCastingNaNError( + "Cannot convert non-finite values (NA or inf) to integer" + ) + raise ValueError("Trying to coerce float values to integers") + if is_object_dtype(arr.dtype): raise ValueError("Trying to coerce float values to integers") if casted.dtype < arr.dtype: @@ -2102,6 +2108,17 @@ def maybe_cast_to_integer_array( ) return casted + if arr.dtype.kind in ["m", "M"]: + # test_constructor_maskedarray_nonfloat + warnings.warn( + f"Constructing Series or DataFrame from {arr.dtype} values and " + f"dtype={dtype} is deprecated and will raise in a future version. " + "Use values.view(dtype) instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + return casted + # No known cases that get here, but raising explicitly to cover our bases. raise ValueError(f"values cannot be losslessly cast to {dtype}") diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 49dc71954fd8f..99816237155b3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8315,8 +8315,8 @@ def rank( How to rank NaN values: * keep: assign NaN rank to NaN values - * top: assign smallest rank to NaN values if ascending - * bottom: assign highest rank to NaN values if ascending. 
+ * top: assign lowest rank to NaN values + * bottom: assign highest rank to NaN values ascending : bool, default True Whether or not the elements should be ranked in ascending order. diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 124903446220d..457ec8ab702d5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,6 +1,5 @@ from __future__ import annotations -from copy import copy as copy_func from datetime import datetime import functools from itertools import zip_longest @@ -3721,7 +3720,7 @@ def _convert_listlike_indexer(self, keyarr): else: keyarr = self._convert_arr_indexer(keyarr) - indexer = self._convert_list_indexer(keyarr) + indexer = None return indexer, keyarr def _convert_arr_indexer(self, keyarr) -> np.ndarray: @@ -3739,22 +3738,6 @@ def _convert_arr_indexer(self, keyarr) -> np.ndarray: """ return com.asarray_tuplesafe(keyarr) - def _convert_list_indexer(self, keyarr): - """ - Convert a list-like indexer to the appropriate dtype. - - Parameters - ---------- - keyarr : Index (or sub-class) - Indexer to convert. - kind : iloc, loc, optional - - Returns - ------- - positional indexer or None - """ - return None - @final def _invalid_indexer(self, form: str_t, key) -> TypeError: """ @@ -5410,6 +5393,7 @@ def _find_common_type_compat(self, target) -> DtypeObj: return np.dtype("object") dtype = find_common_type([self.dtype, target_dtype]) + if dtype.kind in ["i", "u"]: # TODO: what about reversed with self being categorical? 
if ( @@ -6312,21 +6296,15 @@ def ensure_index(index_like: AnyArrayLike | Sequence, copy: bool = False) -> Ind # check in clean_index_list index_like = list(index_like) - converted, all_arrays = lib.clean_index_list(index_like) - - if len(converted) > 0 and all_arrays: + if len(index_like) and lib.is_all_arraylike(index_like): from pandas.core.indexes.multi import MultiIndex - return MultiIndex.from_arrays(converted) + return MultiIndex.from_arrays(index_like) else: - index_like = converted + return Index(index_like, copy=copy, tupleize_cols=False) else: - # clean_index_list does the equivalent of copying - # so only need to do this if not list instance - if copy: - index_like = copy_func(index_like) - return Index(index_like) + return Index(index_like, copy=copy) def ensure_has_len(seq): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 1541885887dab..554cf33e22555 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -525,18 +525,6 @@ def _get_indexer_non_unique( indexer, missing = self._engine.get_indexer_non_unique(codes) return ensure_platform_int(indexer), ensure_platform_int(missing) - @doc(Index._convert_list_indexer) - def _convert_list_indexer(self, keyarr): - # Return our indexer or raise if all of the values are not included in - # the categories - - if self.categories._defer_to_indexing: - # See tests.indexing.interval.test_interval:test_loc_getitem_frame - indexer = self.categories._convert_list_indexer(keyarr) - return Index(self.codes).get_indexer_for(indexer) - - return self.get_indexer_for(keyarr) - # -------------------------------------------------------------------- def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 5f24eb0cfaad6..3dc46f04d1d45 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -49,7 +49,6 @@ TimedeltaArray, ) from 
pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin -import pandas.core.common as com import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( Index, @@ -599,7 +598,7 @@ def _convert_arr_indexer(self, keyarr): try: return self._data._validate_listlike(keyarr, allow_object=True) except (ValueError, TypeError): - return com.asarray_tuplesafe(keyarr) + return super()._convert_arr_indexer(keyarr) class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin): diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 06ab7fdbcf872..58c2b3e26ce06 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -815,20 +815,6 @@ def _maybe_cast_slice_bound(self, label, side: str, kind=lib.no_default): self._deprecated_arg(kind, "kind", "_maybe_cast_slice_bound") return getattr(self, side)._maybe_cast_slice_bound(label, side) - @Appender(Index._convert_list_indexer.__doc__) - def _convert_list_indexer(self, keyarr): - """ - we are passed a list-like indexer. Return the - indexer for matching intervals. 
- """ - locs = self.get_indexer_for(keyarr) - - # we have missing values - if (locs == -1).any(): - raise KeyError(keyarr[locs == -1].tolist()) - - return locs - def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: if not isinstance(dtype, IntervalDtype): return False diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index de7c522b4fbec..ea2d5d9eec6ac 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -37,7 +37,6 @@ ) from pandas.core.dtypes.generic import ABCSeries -import pandas.core.common as com from pandas.core.indexes.base import ( Index, maybe_extract_name, @@ -107,20 +106,22 @@ def _can_hold_na(self) -> bool: else: return False - @cache_readonly + _engine_types: dict[np.dtype, type[libindex.IndexEngine]] = { + np.dtype(np.int8): libindex.Int8Engine, + np.dtype(np.int16): libindex.Int16Engine, + np.dtype(np.int32): libindex.Int32Engine, + np.dtype(np.int64): libindex.Int64Engine, + np.dtype(np.uint8): libindex.UInt8Engine, + np.dtype(np.uint16): libindex.UInt16Engine, + np.dtype(np.uint32): libindex.UInt32Engine, + np.dtype(np.uint64): libindex.UInt64Engine, + np.dtype(np.float32): libindex.Float32Engine, + np.dtype(np.float64): libindex.Float64Engine, + } + + @property def _engine_type(self): - return { - np.int8: libindex.Int8Engine, - np.int16: libindex.Int16Engine, - np.int32: libindex.Int32Engine, - np.int64: libindex.Int64Engine, - np.uint8: libindex.UInt8Engine, - np.uint16: libindex.UInt16Engine, - np.uint32: libindex.UInt32Engine, - np.uint64: libindex.UInt64Engine, - np.float32: libindex.Float32Engine, - np.float64: libindex.Float64Engine, - }[self.dtype.type] + return self._engine_types[self.dtype] @cache_readonly def inferred_type(self) -> str: @@ -250,21 +251,6 @@ def _maybe_cast_slice_bound(self, label, side: str, kind=lib.no_default): # we will try to coerce to integers return self._maybe_cast_indexer(label) - @doc(Index._convert_arr_indexer) - def _convert_arr_indexer(self, 
keyarr) -> np.ndarray: - if not is_unsigned_integer_dtype(self.dtype): - return super()._convert_arr_indexer(keyarr) - - # Cast the indexer to uint64 if possible so that the values returned - # from indexing are also uint64. - dtype = None - if is_integer_dtype(keyarr) or ( - lib.infer_dtype(keyarr, skipna=False) == "integer" - ): - dtype = np.dtype(np.uint64) - - return com.asarray_tuplesafe(keyarr, dtype=dtype) - # ---------------------------------------------------------------- @doc(Index._shallow_copy) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index d5555561088eb..66de374121fb0 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -51,8 +51,11 @@ length_of_indexer, ) from pandas.core.indexes.api import ( + CategoricalIndex, Index, + IntervalIndex, MultiIndex, + ensure_index, ) if TYPE_CHECKING: @@ -1297,6 +1300,11 @@ def _get_listlike_indexer(self, key, axis: int): keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr) self._validate_read_indexer(keyarr, indexer, axis) + + if isinstance(ax, (IntervalIndex, CategoricalIndex)): + # take instead of reindex to preserve dtype. For IntervalIndex + # this is to map integers to the Intervals they match to. 
+ keyarr = ax.take(indexer) return keyarr, indexer def _validate_read_indexer(self, key, indexer, axis: int): @@ -1329,13 +1337,22 @@ def _validate_read_indexer(self, key, indexer, axis: int): missing = (missing_mask).sum() if missing: + ax = self.obj._get_axis(axis) + + # TODO: remove special-case; this is just to keep exception + # message tests from raising while debugging + use_interval_msg = isinstance(ax, IntervalIndex) or ( + isinstance(ax, CategoricalIndex) + and isinstance(ax.categories, IntervalIndex) + ) + if missing == len(indexer): axis_name = self.obj._get_axis_name(axis) + if use_interval_msg: + key = list(key) raise KeyError(f"None of [{key}] are in the [{axis_name}]") - ax = self.obj._get_axis(axis) - - not_found = list(set(key) - set(ax)) + not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique()) raise KeyError(f"{not_found} not in index") diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 270eddf2bd3a5..81bf3ca4ba07a 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -22,11 +22,9 @@ DtypeObj, Manager, ) -from pandas.errors import IntCastingNaNError from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, - construct_1d_ndarray_preserving_na, maybe_cast_to_datetime, maybe_convert_platform, maybe_infer_to_datetimelike, @@ -303,22 +301,12 @@ def ndarray_to_mgr( shape = values.shape flat = values.ravel() - if not is_integer_dtype(dtype): - # TODO: skipping integer_dtype is needed to keep the tests passing, - # not clear it is correct - # Note: we really only need _try_cast, but keeping to exposed funcs - values = sanitize_array( - flat, None, dtype=dtype, copy=copy, raise_cast_failure=True - ) - else: - try: - values = construct_1d_ndarray_preserving_na( - flat, dtype=dtype, copy=False - ) - except IntCastingNaNError: - # following Series, we ignore the dtype and retain floating - # values instead of casting nans to meaningless 
ints - pass + # GH#40110 see similar check inside sanitize_array + rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f") + + values = sanitize_array( + flat, None, dtype=dtype, copy=copy, raise_cast_failure=rcf + ) values = values.reshape(shape) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index c36552f59da71..8b7070e945439 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -214,7 +214,7 @@ def test_api(self): + self.funcs_to + self.private_modules ) - self.check(pd, checkthese, self.ignored) + self.check(namespace=pd, expected=checkthese, ignored=self.ignored) def test_depr(self): deprecated_list = ( diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index 26e785a2796b1..cabe766a4e9eb 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -23,12 +23,12 @@ def test_unique(index_or_series_obj): if isinstance(obj, pd.MultiIndex): expected = pd.MultiIndex.from_tuples(unique_values) expected.names = obj.names - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected, exact=True) elif isinstance(obj, pd.Index): expected = pd.Index(unique_values, dtype=obj.dtype) if is_datetime64tz_dtype(obj.dtype): expected = expected.normalize() - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected, exact=True) else: expected = np.array(unique_values) tm.assert_numpy_array_equal(result, expected) @@ -67,7 +67,7 @@ def test_unique_null(null_obj, index_or_series_obj): if is_datetime64tz_dtype(obj.dtype): result = result.normalize() expected = expected.normalize() - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected, exact=True) else: expected = np.array(unique_values, dtype=obj.dtype) tm.assert_numpy_array_equal(result, expected) @@ -118,7 +118,7 @@ def test_unique_bad_unicode(idx_or_series_w_bad_unicode): if isinstance(obj, pd.Index): expected = pd.Index(["\ud83d"], 
dtype=object) - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected, exact=True) else: expected = np.array(["\ud83d"], dtype=object) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 6e176310da6b4..dac3c0382df01 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -603,7 +603,7 @@ def test_sort_index_level_large_cardinality(self): # GH#2684 (int64) index = MultiIndex.from_arrays([np.arange(4000)] * 3) - df = DataFrame(np.random.randn(4000), index=index, dtype=np.int64) + df = DataFrame(np.random.randn(4000).astype("int64"), index=index) # it works! result = df.sort_index(level=0) @@ -611,7 +611,7 @@ def test_sort_index_level_large_cardinality(self): # GH#2684 (int32) index = MultiIndex.from_arrays([np.arange(4000)] * 3) - df = DataFrame(np.random.randn(4000), index=index, dtype=np.int32) + df = DataFrame(np.random.randn(4000).astype("int32"), index=index) # it works! 
result = df.sort_index(level=0) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 769b08373b890..5156d0371e9b7 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -714,7 +714,9 @@ def create_cols(name): np.random.randn(100, 5), dtype="float64", columns=create_cols("float") ) df_int = DataFrame( - np.random.randn(100, 5), dtype="int64", columns=create_cols("int") + np.random.randn(100, 5).astype("int64"), + dtype="int64", + columns=create_cols("int"), ) df_bool = DataFrame(True, index=df_float.index, columns=create_cols("bool")) df_object = DataFrame( @@ -765,7 +767,7 @@ def test_to_csv_dups_cols(self): tm.assert_frame_equal(result, df) df_float = DataFrame(np.random.randn(1000, 3), dtype="float64") - df_int = DataFrame(np.random.randn(1000, 3), dtype="int64") + df_int = DataFrame(np.random.randn(1000, 3)).astype("int64") df_bool = DataFrame(True, index=df_float.index, columns=range(3)) df_object = DataFrame("foo", index=df_float.index, columns=range(3)) df_dt = DataFrame(Timestamp("20010101"), index=df_float.index, columns=range(3)) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 784969c199c9f..6e0013c196760 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -10,6 +10,7 @@ import functools import itertools import re +import warnings import numpy as np import numpy.ma as ma @@ -999,7 +1000,17 @@ def test_constructor_maskedarray_nonfloat(self): assert isna(frame).values.all() # cast type - frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64) + msg = r"datetime64\[ns\] values and dtype=int64" + with tm.assert_produces_warning(FutureWarning, match=msg): + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + category=DeprecationWarning, + message="elementwise comparison failed", + ) + frame = DataFrame( + mat, 
columns=["A", "B", "C"], index=[1, 2], dtype=np.int64 + ) assert frame.values.dtype == np.int64 # Check non-masked values @@ -2484,6 +2495,27 @@ def test_nested_list_columns(self): tm.assert_frame_equal(result, expected) +class TestDataFrameConstructorWithDtypeCoercion: + def test_floating_values_integer_dtype(self): + # GH#40110 make DataFrame behavior with arraylike floating data and + # inty dtype match Series behavior + + arr = np.random.randn(10, 5) + + msg = "if they cannot be cast losslessly" + with tm.assert_produces_warning(FutureWarning, match=msg): + DataFrame(arr, dtype="i8") + + with tm.assert_produces_warning(None): + # if they can be cast losslessly, no warning + DataFrame(arr.round(), dtype="i8") + + # with NaNs, we already have the correct behavior, so no warning + arr[0, 0] = np.nan + with tm.assert_produces_warning(None): + DataFrame(arr, dtype="i8") + + class TestDataFrameConstructorWithDatetimeTZ: @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) def test_construction_preserves_tzaware_dtypes(self, tz): diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index c9a39eb460cf4..d010426bee53e 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -294,7 +294,7 @@ def test_multi_dtype2(self): def test_dups_across_blocks(self, using_array_manager): # dups across blocks df_float = DataFrame(np.random.randn(10, 3), dtype="float64") - df_int = DataFrame(np.random.randn(10, 3), dtype="int64") + df_int = DataFrame(np.random.randn(10, 3).astype("int64")) df_bool = DataFrame(True, index=df_float.index, columns=df_float.columns) df_object = DataFrame("foo", index=df_float.index, columns=df_float.columns) df_dt = DataFrame( diff --git a/pandas/tests/indexes/numeric/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py index c796a25faf0a6..9572aeaf41c91 100644 --- a/pandas/tests/indexes/numeric/test_numeric.py +++ 
b/pandas/tests/indexes/numeric/test_numeric.py @@ -531,6 +531,14 @@ def test_constructor(self, dtype): res = Index([1, 2 ** 63 + 1], dtype=dtype) tm.assert_index_equal(res, idx) + @pytest.mark.xfail(reason="https://github.com/numpy/numpy/issues/19146") + def test_constructor_does_not_cast_to_float(self): + # https://github.com/numpy/numpy/issues/19146 + values = [0, np.iinfo(np.uint64).max] + + result = UInt64Index(values) + assert list(result) == values + @pytest.mark.parametrize( "box", diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index f75e4af888643..d7abaf0b5dfbe 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1622,6 +1622,18 @@ def test_ensure_index_mixed_closed_intervals(self): expected = Index(intervals, dtype=object) tm.assert_index_equal(result, expected) + def test_ensure_index_uint64(self): + # with both 0 and a large-uint64, np.array will infer to float64 + # https://github.com/numpy/numpy/issues/19146 + # but a more accurate choice would be uint64 + values = [0, np.iinfo(np.uint64).max] + + result = ensure_index(values) + assert list(result) == values + + expected = Index(values, dtype="uint64") + tm.assert_index_equal(result, expected) + def test_get_combined_index(self): result = _get_combined_index([]) expected = Index([]) diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index 395e9297a8fde..503e39041a49f 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -65,10 +65,10 @@ def test_getitem_non_matching(self, series_with_interval_index, indexer_sl): # this is a departure from our current # indexing scheme, but simpler - with pytest.raises(KeyError, match=r"^\[-1\]$"): + with pytest.raises(KeyError, match=r"\[-1\] not in index"): indexer_sl(ser)[[-1, 3, 4, 5]] - with pytest.raises(KeyError, match=r"^\[-1\]$"): + with pytest.raises(KeyError, 
match=r"\[-1\] not in index"): indexer_sl(ser)[[-1, 3]] @pytest.mark.slow @@ -107,11 +107,11 @@ def test_loc_getitem_frame(self): expected = df.take([4, 5, 4, 5]) tm.assert_frame_equal(result, expected) - with pytest.raises(KeyError, match=r"^\[10\]$"): + with pytest.raises(KeyError, match=r"None of \[\[10\]\] are"): df.loc[[10]] # partial missing - with pytest.raises(KeyError, match=r"^\[10\]$"): + with pytest.raises(KeyError, match=r"\[10\] not in index"): df.loc[[10, 4]] diff --git a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py index 34dc5d604e90d..aad6523357df6 100644 --- a/pandas/tests/indexing/interval/test_interval_new.py +++ b/pandas/tests/indexing/interval/test_interval_new.py @@ -150,7 +150,8 @@ def test_loc_with_overlap(self, indexer_sl): with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='right')")): indexer_sl(ser)[Interval(3, 5)] - with pytest.raises(KeyError, match=r"^\[Interval\(3, 5, closed='right'\)\]$"): + msg = r"None of \[\[Interval\(3, 5, closed='right'\)\]\]" + with pytest.raises(KeyError, match=msg): indexer_sl(ser)[[Interval(3, 5)]] # slices with interval (only exact matches) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 6f4949267c00c..26f2ba577d184 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -134,7 +134,10 @@ def test_setitem_series_int8(self, val, exp_dtype, request): ) request.node.add_marker(mark) - exp = pd.Series([1, val, 3, 4], dtype=np.int8) + warn = None if exp_dtype is np.int8 else FutureWarning + msg = "Values are too large to be losslessly cast to int8" + with tm.assert_produces_warning(warn, match=msg): + exp = pd.Series([1, val, 3, 4], dtype=np.int8) self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) @pytest.mark.parametrize( diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 
772aa97c47233..9c6a39c991912 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -338,7 +338,7 @@ def test_multitype_list_index_access(self): # GH 10610 df = DataFrame(np.random.random((10, 5)), columns=["a"] + [20, 21, 22, 23]) - with pytest.raises(KeyError, match=re.escape("'[-8, 26] not in index'")): + with pytest.raises(KeyError, match=re.escape("'[26, -8] not in index'")): df[[22, 26, -8]] assert df[21].shape[0] == df.shape[0] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index ab868a3d3713d..dcccd42c52c8c 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1010,18 +1010,32 @@ def test_loc_copy_vs_view(self): def test_loc_uint64(self): # GH20722 # Test whether loc accept uint64 max value as index. - s = Series([1, 2], index=[np.iinfo("uint64").max - 1, np.iinfo("uint64").max]) + umax = np.iinfo("uint64").max + ser = Series([1, 2], index=[umax - 1, umax]) - result = s.loc[np.iinfo("uint64").max - 1] - expected = s.iloc[0] + result = ser.loc[umax - 1] + expected = ser.iloc[0] assert result == expected - result = s.loc[[np.iinfo("uint64").max - 1]] - expected = s.iloc[[0]] + result = ser.loc[[umax - 1]] + expected = ser.iloc[[0]] tm.assert_series_equal(result, expected) - result = s.loc[[np.iinfo("uint64").max - 1, np.iinfo("uint64").max]] - tm.assert_series_equal(result, s) + result = ser.loc[[umax - 1, umax]] + tm.assert_series_equal(result, ser) + + def test_loc_uint64_disallow_negative(self): + # GH#41775 + umax = np.iinfo("uint64").max + ser = Series([1, 2], index=[umax - 1, umax]) + + with pytest.raises(KeyError, match="-1"): + # don't wrap around + ser.loc[-1] + + with pytest.raises(KeyError, match="-1"): + # don't wrap around + ser.loc[[-1]] def test_loc_setitem_empty_append_expands_rows(self): # GH6173, various appends to an empty dataframe diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py index 
0b1f807f2da63..5b7e90fe16d8f 100644 --- a/pandas/tests/libs/test_lib.py +++ b/pandas/tests/libs/test_lib.py @@ -206,15 +206,3 @@ def test_no_default_pickle(): # GH#40397 obj = tm.round_trip_pickle(lib.no_default) assert obj is lib.no_default - - -def test_clean_index_list(): - # with both 0 and a large-uint64, np.array will infer to float64 - # https://github.com/numpy/numpy/issues/19146 - # but a more accurate choice would be uint64 - values = [0, np.iinfo(np.uint64).max] - - result, _ = lib.clean_index_list(values) - - expected = np.array(values, dtype="uint64") - tm.assert_numpy_array_equal(result, expected, check_dtype=True)