PERF: improve MultiIndex get_loc performance #16346


Merged

Changes from 1 commit
feedback
jorisvandenbossche committed May 16, 2017
commit 664d2b35bd0ec240c4d638ccf85f21761dcf8fee
2 changes: 2 additions & 0 deletions pandas/_libs/hashtable.pxd
@@ -38,6 +38,8 @@ cdef class MultiIndexHashTable(HashTable):

cpdef get_item(self, object val)
cpdef set_item(self, object key, Py_ssize_t val)
cdef inline void _check_for_collision(self, Py_ssize_t loc, object label)


cdef class StringHashTable(HashTable):
cdef kh_str_t *table
10 changes: 7 additions & 3 deletions pandas/_libs/hashtable_class_helper.pxi.in
@@ -4,7 +4,8 @@ Template for each `dtype` helper function for hashtable
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""

from pandas.core.dtypes.missing import array_equivalent
from lib cimport is_null_datetimelike


#----------------------------------------------------------------------
# VectorData
@@ -923,12 +924,15 @@ cdef class MultiIndexHashTable(HashTable):
"hash collision\nlocs:\n{}\n"
"result:\n{}\nmi:\n{}".format(alocs, result, mi))

def _check_for_collision(self, Py_ssize_t loc, object label):
cdef inline void _check_for_collision(self, Py_ssize_t loc, object label):
# validate that the loc maps to the actual value
# version of _check_for_collisions above for single label (tuple)

result = self.mi[loc]
if not array_equivalent(result, label):

if not all(l == r or (is_null_datetimelike(l)
and is_null_datetimelike(r))
for l, r in zip(result, label)):
raise AssertionError(
"hash collision\nloc:\n{}\n"
"result:\n{}\nmi:\n{}".format(loc, result, label))
75 changes: 73 additions & 2 deletions pandas/core/util/hashing.py
@@ -5,13 +5,16 @@

import numpy as np
from pandas._libs import hashing
from pandas.compat import string_and_binary_types, text_type
from pandas.core.dtypes.generic import (
ABCMultiIndex,
ABCIndexClass,
ABCSeries,
ABCDataFrame)
from pandas.core.dtypes.common import (
is_categorical_dtype, is_list_like)
from pandas.core.dtypes.missing import isnull


# 16 byte long hashing key
_default_hash_key = '0123456789123456'
@@ -179,9 +182,17 @@ def hash_tuple(val, encoding='utf8', hash_key=None):
hash

"""
hashes = (hash_array(np.array([v]), encoding=encoding, hash_key=hash_key,
categorize=False)
#def to_array(v):
# dtype, arr = infer_dtype_from_array([v])
# return np.asarray(arr, dtype=dtype)

#hashes = (hash_array(to_array(v), encoding=encoding, hash_key=hash_key,
# categorize=False)
# for v in val)

hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key)
for v in val)

h = _combine_hash_arrays(hashes, len(val))[0]

return h
@@ -299,3 +310,63 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
vals *= np.uint64(0x94d049bb133111eb)
vals ^= vals >> 31
return vals


def _hash_scalar(val, encoding='utf8', hash_key=None):
"""
Contributor

this is duplicating lots of code. I'm sure there is a way to do this a bit more generically.

Member Author

Yes, I know, but this was mainly to test.
And as it turns out, this makes it a lot faster than using hash_array (the commented-out part in hash_tuple), so I am not sure how to solve that. In principle I can put the common parts in helper functions (e.g. the redistributing part, as sketched below), but for most of it that is not possible, as there are slight differences.
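
For reference, the "redistributing part" mentioned here is the bit-mixing finalizer that appears at the end of both hash_array and the scalar path; a minimal sketch of how it could be shared (the helper name is hypothetical):

def _redistribute_hash(vals):
    # spread the intermediate 64-bit hashes across the full uint64
    # space (the same splitmix64-style finalizer used by hash_array)
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals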

Contributor

what does all of this buy you? (IOW, can you post updated timings?)

maintaining a separate code path for scalars will cause future issues. these will need to be kept in sync with the array hashing if any code changes are made. you can easily share code here, which would make this more palatable.

Member Author

In [4]: %timeit pd.core.util.hashing.hash_tuple2((999, np.nan, 'E'))
380 µs ± 60.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

In [5]: %timeit pd.core.util.hashing.hash_tuple((999, np.nan, 'E'))
81.8 µs ± 3.53 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)

hash_tuple2 uses hash_array (the commented-out version in the current branch); hash_tuple uses _hash_scalar.

Contributor

ok, it's prob reasonable, but I'm willing to sacrifice some perf to get some shared code (IOW, something between old and new is prob a good compromise vs repeating lots of code)

Member Author

OK, I pushed a new version that almost fully reuses the hash_array function for the actual hashing, and only has some specific logic before that to convert the scalar to a proper array.
This reduces a lot of the code duplication, and has only a minor perf impact.

Apart from that, it still has a separate code path to first convert the scalar to an array, which might be a bit brittle, and I agree is certainly not ideal for code maintenance, but using something more general (e.g. infer_dtype) gives a big perf penalty.

Hash scalar value

Returns
-------
1d uint64 numpy array of hash value, of length 1
"""

if hash_key is None:
hash_key = _default_hash_key

if isnull(val):
# this is to be consistent with the _hash_categorical implementation
return np.array([np.iinfo(np.uint64).max], dtype='u8')

Contributor

if you need to handle datetime w/tz directly (IOW, we basically ignore it), then I would:

if getattr(val, 'tzinfo', None) is not None:
    val = val.tz_localize(None)

I suppose an option to ignore tz is fine for infer_dtype_from_scalar, but if you add it I would rename, document and test it.
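
For context: tz_localize(None) drops the timezone while keeping the wall time, so a tz-aware Timestamp would go through the same code path as a naive one (illustrative):

In [1]: pd.Timestamp("2012-01-01", tz='Europe/Brussels').tz_localize(None)
Out[1]: Timestamp('2012-01-01 00:00:00')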

Member Author

yes, I can certainly do that check here as well.

It is maybe better to keep the custom logic here, as the keyword added to infer_dtype_from_scalar would not be used anywhere else.

Contributor

yeah, I think it is better locally

if isinstance(val, string_and_binary_types + (text_type,)):
vals = np.array([val], dtype=object)
string_like = True
else:
vals = np.array([val])
string_like = False

dtype = vals.dtype

#dtype, vals = infer_dtype_from_array([vals])
#if dtype == np.object_:
# vals = np.asarray(vals, dtype='object')
# dtype = vals.dtype

# we'll be working with everything as 64-bit values, so handle this
# 128-bit value early
if np.issubdtype(dtype, np.complex128):
return hash_array(vals.real) + 23 * hash_array(vals.imag)

# First, turn whatever array this is into unsigned 64-bit ints, if we can
# manage it.
elif dtype == np.bool_:
vals = vals.astype('u8')
elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
vals = vals.view('i8').astype('u8', copy=False)
elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
else:
if not string_like:
from pandas import Index
vals = Index(vals).values
Member Author

This is also a bit of an ugly part. I use Index(vals) to get the correct type coercion (e.g. for a Timestamp object, to ensure it does the same as hash_array). But I don't do this at the beginning, because it is much slower than needed for simple numerical values.
It would be nice to have a utility function to convert a list of values to an array with the dtype "how pandas wants it" (but maybe that exists and I just don't know about it).
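
A quick illustration of the coercion difference described above (assuming a plain Timestamp):

In [1]: np.array([pd.Timestamp('2012-01-01')]).dtype
Out[1]: dtype('O')

In [2]: pd.Index([pd.Timestamp('2012-01-01')]).values.dtype
Out[2]: dtype('<M8[ns]')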

Contributor

> It would be nice to have a utility function to convert a list of values to an array with the dtype

this is what the Series/Index constructors do. About the most complicated code that exists.

return hash_array(vals, hash_key=hash_key, encoding=encoding,
categorize=False)
vals = hashing.hash_object_array(vals, hash_key, encoding)

# Then, redistribute these 64-bit ints within the space of 64-bit ints
vals ^= vals >> 30
vals *= np.uint64(0xbf58476d1ce4e5b9)
vals ^= vals >> 27
vals *= np.uint64(0x94d049bb133111eb)
vals ^= vals >> 31
return vals
20 changes: 15 additions & 5 deletions pandas/tests/util/test_hashing.py
@@ -6,7 +6,7 @@

from pandas import DataFrame, Series, Index, MultiIndex
from pandas.util import hash_array, hash_pandas_object
from pandas.core.util.hashing import hash_tuples, hash_tuple
from pandas.core.util.hashing import hash_tuples, hash_tuple, _hash_scalar
import pandas.util.testing as tm


@@ -81,10 +81,20 @@ def test_hash_tuples(self):

def test_hash_tuple(self):
# test equivalence between hash_tuples and hash_tuple
tup = (1, 'one')
result = hash_tuple(tup)
expected = hash_tuples([tup])[0]
assert result == expected
for tup in [(1, 'one'), (1, np.nan)]:
Contributor

test with pd.NaT as well
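
A sketch of what the extended test could look like (the extra tuple is illustrative):

for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A')]:
    assert hash_tuple(tup) == hash_tuples([tup])[0]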

result = hash_tuple(tup)
expected = hash_tuples([tup])[0]
assert result == expected

def test_hash_scalar(self):
for val in [1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"),
pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
pd.Period('2012-01-01', freq='D'), pd.Timedelta('1 days'),
pd.Interval(0, 1), np.nan, pd.NaT, None]:
result = _hash_scalar(val)
expected = hash_array(np.array([val], dtype=object),
categorize=True)
assert result[0] == expected[0]

def test_hash_tuples_err(self):
