Skip to content

Commit

Permalink
PERF: improve MultiIndex get_loc performance (pandas-dev#16346)
Browse files Browse the repository at this point in the history
* PERF: improve hash collision check for single MI labels
* PERF: specialized hash function for single tuples
  • Loading branch information
jorisvandenbossche authored May 17, 2017
1 parent a3021ea commit 34ebad8
Show file tree
Hide file tree
Showing 7 changed files with 113 additions and 7 deletions.
12 changes: 12 additions & 0 deletions asv_bench/benchmarks/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,12 +227,24 @@ def time_multiindex_get_indexer(self):
# asv timing benchmark: one get_loc lookup of a single 3-element tuple key
# on self.mi_large (the index is created outside this method).
def time_multiindex_large_get_loc(self):
self.mi_large.get_loc((999, 19, 'Z'))

# asv timing benchmark: the same key as time_multiindex_large_get_loc, but
# looked up 1000 times in a row on self.mi_large, so that repeated lookups
# dominate the measurement rather than the first call.
def time_multiindex_large_get_loc_warm(self):
for _ in range(1000):
self.mi_large.get_loc((999, 19, 'Z'))

# asv timing benchmark: one get_loc lookup of a single 3-element tuple key
# on self.mi_med (the index is created outside this method).
def time_multiindex_med_get_loc(self):
self.mi_med.get_loc((999, 9, 'A'))

# asv timing benchmark: the same key as time_multiindex_med_get_loc, but
# looked up 1000 times in a row on self.mi_med, so that repeated lookups
# dominate the measurement rather than the first call.
def time_multiindex_med_get_loc_warm(self):
for _ in range(1000):
self.mi_med.get_loc((999, 9, 'A'))

# asv timing benchmark: one get_loc lookup of a single 3-element tuple key
# on self.mi_small (the index is created outside this method).
# NOTE(review): this benchmark is named "string" while its repeated-lookup
# counterpart on the same index is named "small"; renaming would change the
# recorded benchmark name, so it is left as is -- confirm whether the
# mismatch is intentional.
def time_multiindex_string_get_loc(self):
self.mi_small.get_loc((99, 'A', 'A'))

# asv timing benchmark: the same key as time_multiindex_string_get_loc, but
# looked up 1000 times in a row on self.mi_small, so that repeated lookups
# dominate the measurement rather than the first call.
def time_multiindex_small_get_loc_warm(self):
for _ in range(1000):
self.mi_small.get_loc((99, 'A', 'A'))

def time_is_monotonic(self):
self.miint.is_monotonic

Expand Down
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.20.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,10 @@ Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Performance regression fix when indexing with a list-like (:issue:`16285`)
- Performance regression fix for small MultiIndexes (:issuse:`16319`)
- Performance regression fix for MultiIndexes (:issue:`16319`, :issue:`16346`)
- Improved performance of ``.clip()`` with scalar arguments (:issue:`15400`)


.. _whatsnew_0202.bug_fixes:

Bug Fixes
Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/hashtable.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ cdef class MultiIndexHashTable(HashTable):

cpdef get_item(self, object val)
cpdef set_item(self, object key, Py_ssize_t val)
cdef inline void _check_for_collision(self, Py_ssize_t loc, object label)


cdef class StringHashTable(HashTable):
cdef kh_str_t *table
Expand Down
19 changes: 17 additions & 2 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ Template for each `dtype` helper function for hashtable
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""

from lib cimport is_null_datetimelike


#----------------------------------------------------------------------
# VectorData
#----------------------------------------------------------------------
Expand Down Expand Up @@ -921,6 +924,19 @@ cdef class MultiIndexHashTable(HashTable):
"hash collision\nlocs:\n{}\n"
"result:\n{}\nmi:\n{}".format(alocs, result, mi))

cdef inline void _check_for_collision(self, Py_ssize_t loc, object label):
# Single-label (tuple) version of _check_for_collisions above: validate
# that the entry stored at position `loc` in self.mi really is `label`,
# i.e. that the hash-table hit was not produced by a hash collision.
#
# NOTE(review): this is declared `cdef ... void` with no `except` clause.
# In Cython releases before 3.0 an exception raised inside such a function
# is printed and discarded instead of propagating, so the AssertionError
# below may never reach get_item's caller. Fixing it would require adding
# `except *` both here and on the matching declaration in hashtable.pxd
# -- confirm against the Cython version in use.

result = self.mi[loc]

# Elements are compared pairwise; a pair counts as matching when the two
# values are equal or when is_null_datetimelike flags both of them.
if not all(l == r or (is_null_datetimelike(l)
and is_null_datetimelike(r))
for l, r in zip(result, label)):
# NOTE(review): the value printed under the "mi:" heading is `label`
# (the looked-up key), not self.mi.
raise AssertionError(
"hash collision\nloc:\n{}\n"
"result:\n{}\nmi:\n{}".format(loc, result, label))

def __contains__(self, object key):
try:
self.get_item(key)
Expand All @@ -939,8 +955,7 @@ cdef class MultiIndexHashTable(HashTable):
k = kh_get_uint64(self.table, value)
if k != self.table.n_buckets:
loc = self.table.vals[k]
locs = np.array([loc], dtype=np.int64)
self._check_for_collisions(locs, key)
self._check_for_collision(loc, key)
return loc
else:
raise KeyError(key)
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -748,7 +748,7 @@ def _hashed_indexing_key(self, key):
we need to stringify if we have mixed levels
"""
from pandas.core.util.hashing import hash_tuples
from pandas.core.util.hashing import hash_tuples, hash_tuple

if not isinstance(key, tuple):
return hash_tuples(key)
Expand All @@ -762,7 +762,7 @@ def f(k, stringify):
return k
key = tuple([f(k, stringify)
for k, stringify in zip(key, self._have_mixed_levels)])
return hash_tuples(key)
return hash_tuple(key)

@Appender(base._shared_docs['duplicated'] % _index_doc_kwargs)
def duplicated(self, keep='first'):
Expand Down
56 changes: 55 additions & 1 deletion pandas/core/util/hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,17 @@
import itertools

import numpy as np
from pandas._libs import hashing
from pandas._libs import hashing, tslib
from pandas.core.dtypes.generic import (
ABCMultiIndex,
ABCIndexClass,
ABCSeries,
ABCDataFrame)
from pandas.core.dtypes.common import (
is_categorical_dtype, is_list_like)
from pandas.core.dtypes.missing import isnull
from pandas.core.dtypes.cast import infer_dtype_from_scalar


# 16 byte long hashing key
_default_hash_key = '0123456789123456'
Expand Down Expand Up @@ -164,6 +167,29 @@ def hash_tuples(vals, encoding='utf8', hash_key=None):
return h


def hash_tuple(val, encoding='utf8', hash_key=None):
    """
    Hash a single tuple efficiently.

    Every element of the tuple is hashed on its own with ``_hash_scalar``
    and the per-element hashes are then merged with
    ``_combine_hash_arrays``.

    Parameters
    ----------
    val : single tuple
    encoding : string, default 'utf8'
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    hash
        The first element of the array produced by
        ``_combine_hash_arrays``.
    """
    # lazily yield one hash array per tuple element
    element_hashes = (
        _hash_scalar(item, encoding=encoding, hash_key=hash_key)
        for item in val
    )
    num_elements = len(val)

    combined = _combine_hash_arrays(element_hashes, num_elements)
    return combined[0]


def _hash_categorical(c, encoding, hash_key):
"""
Hash a Categorical by hashing its categories, and then mapping the codes
Expand Down Expand Up @@ -276,3 +302,31 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
vals *= np.uint64(0x94d049bb133111eb)
vals ^= vals >> 31
return vals


def _hash_scalar(val, encoding='utf8', hash_key=None):
    """
    Hash a single scalar value.

    Parameters
    ----------
    val : scalar
    encoding : string, default 'utf8'
    hash_key : string key to encode, passed through to ``hash_array``

    Returns
    -------
    1d uint64 numpy array of hash value, of length 1
    """
    if isnull(val):
        # missing values all map to the largest uint64, which keeps this
        # consistent with the _hash_categorical implementation
        null_hash = np.iinfo(np.uint64).max
        return np.array([null_hash], dtype='u8')

    is_tz_aware = getattr(val, 'tzinfo', None) is not None
    if is_tz_aware:
        # tz-aware datetimes are hashed via their underlying naive UTC
        # value, not as the tz-aware object or pd extension type (which is
        # what infer_dtype_from_scalar would produce)
        stamp = val if isinstance(val, tslib.Timestamp) \
            else tslib.Timestamp(val)
        val = stamp.tz_convert(None)

    inferred_dtype, unboxed = infer_dtype_from_scalar(val)
    single = np.array([unboxed], dtype=inferred_dtype)

    return hash_array(single, hash_key=hash_key, encoding=encoding,
                      categorize=False)
24 changes: 23 additions & 1 deletion pandas/tests/util/test_hashing.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import pytest
import datetime

from warnings import catch_warnings
import numpy as np
import pandas as pd

from pandas import DataFrame, Series, Index, MultiIndex
from pandas.util import hash_array, hash_pandas_object
from pandas.core.util.hashing import hash_tuples
from pandas.core.util.hashing import hash_tuples, hash_tuple, _hash_scalar
import pandas.util.testing as tm


Expand Down Expand Up @@ -79,6 +80,27 @@ def test_hash_tuples(self):
result = hash_tuples(tups[0])
assert result == expected[0]

def test_hash_tuple(self):
    # hash_tuple on one tuple must agree with hash_tuples applied to a
    # one-element list holding that same tuple
    cases = [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'),
             ('A', pd.Timestamp("2012-01-01"))]
    for tup in cases:
        result = hash_tuple(tup)
        via_hash_tuples = hash_tuples([tup])
        expected = via_hash_tuples[0]
        assert result == expected

def test_hash_scalar(self):
    # _hash_scalar must produce the same hash as hash_array applied to a
    # one-element object array holding the same value
    scalars = [
        1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"),
        pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
        datetime.datetime(2012, 1, 1),
        pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(),
        pd.Timedelta('1 days'), datetime.timedelta(1),
        pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1),
        np.nan, pd.NaT, None,
    ]
    for val in scalars:
        result = _hash_scalar(val)
        boxed = np.array([val], dtype=object)
        expected = hash_array(boxed, categorize=True)
        assert result[0] == expected[0]

def test_hash_tuples_err(self):

for val in [5, 'foo', pd.Timestamp('20130101')]:
Expand Down

0 comments on commit 34ebad8

Please sign in to comment.