Skip to content

BUG: Index.get_loc(np.nan) non-unique non-monotonic #43711

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Sep 28, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,8 @@ Indexing
- Bug in :meth:`DataFrame.drop` where the error message did not show missing labels with commas when raising ``KeyError`` (:issue:`42881`)
- Bug in :meth:`DataFrame.query` where method calls in query strings led to errors when the ``numexpr`` package was installed. (:issue:`22435`)
- Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where sorted result did not count indexes containing ``np.nan`` (:issue:`28984`)

- Bug in indexing on a non-unique object-dtype :class:`Index` with an NA scalar (e.g. ``np.nan``) (:issue:`43711`)
-

Missing
^^^^^^^
Expand Down
35 changes: 33 additions & 2 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,11 @@ from pandas._libs import (
algos,
hashtable as _hash,
)
from pandas._libs.missing import checknull

from pandas._libs.missing cimport (
checknull,
is_matching_na,
)


cdef inline bint is_definitely_invalid_key(object val):
Expand Down Expand Up @@ -146,9 +150,17 @@ cdef class IndexEngine:
cdef:
ndarray[uint8_t, ndim=1, cast=True] indexer

indexer = self.values == val
indexer = self._get_bool_indexer(val)
return self._unpack_bool_indexer(indexer, val)

cdef ndarray _get_bool_indexer(self, object val):
"""
Return an ndarray[bool] of locations where val matches self.values.

If val is not NA, this is equivalent to `self.values == val`.

This base-class version always raises NotImplementedError; the
concrete engine subclasses are expected to provide the implementation.
"""
raise NotImplementedError("Implemented by subclasses")

cdef _unpack_bool_indexer(self,
ndarray[uint8_t, ndim=1, cast=True] indexer,
object val):
Expand Down Expand Up @@ -420,6 +432,25 @@ cdef class ObjectEngine(IndexEngine):
raise KeyError(val) from err
return loc

cdef ndarray _get_bool_indexer(self, object val):
"""
Return a boolean ndarray marking where self.values matches val.

A non-NA val is compared with ``==``. An NA val is instead compared
element by element using is_matching_na.
"""
# We need to check for equality and for matching NAs
cdef:
ndarray values = self.values

if not checknull(val):
# Non-NA key: plain elementwise equality is sufficient.
return values == val

cdef:
ndarray[uint8_t] result = np.empty(len(values), dtype=np.uint8)
Py_ssize_t i
object item

# NA key: test each element with is_matching_na instead of ``==``.
for i in range(len(values)):
item = values[i]
result[i] = is_matching_na(item, val)

# Reinterpret the uint8 result as a bool array for the caller.
return result.view(bool)


cdef class DatetimeEngine(Int64Engine):

Expand Down
6 changes: 3 additions & 3 deletions pandas/_libs/index_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,7 @@ cdef class {{name}}Engine(IndexEngine):
cdef void _call_map_locations(self, ndarray[{{dtype}}_t] values):
self.mapping.map_locations(values)

cdef _maybe_get_bool_indexer(self, object val):
# Returns ndarray[bool] or int
cdef ndarray _get_bool_indexer(self, object val):
cdef:
ndarray[uint8_t, ndim=1, cast=True] indexer
ndarray[{{dtype}}_t, ndim=1] values
Expand All @@ -71,6 +70,7 @@ cdef class {{name}}Engine(IndexEngine):
# when trying to cast it to ndarray
raise KeyError(val)

return self._unpack_bool_indexer(indexer, val)
return indexer


{{endfor}}
22 changes: 21 additions & 1 deletion pandas/tests/indexes/base_class/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
import pytest

import pandas as pd
from pandas import Index
from pandas import (
Index,
NaT,
)
import pandas._testing as tm


Expand Down Expand Up @@ -56,3 +59,20 @@ def test_get_loc_tuple_monotonic_above_size_cutoff(self):

res = oidx.get_loc(tup)
assert res == loc

def test_get_loc_nan_object_dtype_nonmonotonic_nonunique(self):
# GH#43711
# case that goes through _maybe_get_bool_indexer
idx = Index(["foo", np.nan, None, "foo", 1.0, None], dtype=object)

# we don't raise KeyError on nan; the single match is returned as
# an integer position
res = idx.get_loc(np.nan)
assert res == 1

# we only match on None, not on np.nan; with two matches the result
# is a boolean mask
res = idx.get_loc(None)
expected = np.array([False, False, True, False, False, True])
tm.assert_numpy_array_equal(res, expected)

# we don't match at all on mismatched NA
with pytest.raises(KeyError, match="NaT"):
idx.get_loc(NaT)