Skip to content

CLN: prepare unifying hashtable.factorize and .unique; add doc-strings #22986

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 33 commits into from
Oct 18, 2018
Merged
Changes from 1 commit
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
640162f
Fix ASV import error
h-vetinari Oct 3, 2018
31d0dc5
Add return_inverse to hashtable.unique
h-vetinari Sep 27, 2018
c5e5147
Pure copy/paste: Group unique/factorize functions next to each other
h-vetinari Sep 30, 2018
9918d52
Unify hashtable.factorize and .unique
h-vetinari Oct 3, 2018
52ae84e
Force compilation of different code paths
h-vetinari Oct 4, 2018
dbe4e0e
Add separate functions for return_inverse=False
h-vetinari Oct 4, 2018
8481e19
Finish split in _unique_with_inverse and _unique_no_inverse
h-vetinari Oct 4, 2018
27ceb4d
Add cython.wraparound(False)
h-vetinari Oct 4, 2018
f5cd5e9
Merge remote-tracking branch 'upstream/master' into re_factor_ize
h-vetinari Oct 6, 2018
b1705a9
Unmove unique-implementation (review jreback)
h-vetinari Oct 6, 2018
a6ed5dd
Undo line artefacts
h-vetinari Oct 6, 2018
17752ce
Merge remote-tracking branch 'upstream/master' into re_factor_ize
h-vetinari Oct 7, 2018
19eaf32
Clean up test_algos.test_vector_resize
h-vetinari Oct 7, 2018
ce7626f
Add test for hashtable.unique (esp. for return_inverse=True)
h-vetinari Oct 7, 2018
7b9014f
Review (jreback)
h-vetinari Oct 7, 2018
471c4da
Fix typo
h-vetinari Oct 8, 2018
9d45378
Small fixes
h-vetinari Oct 8, 2018
00b2ccb
Review (jorisvandenbossche)
h-vetinari Oct 8, 2018
8687315
Merge remote-tracking branch 'upstream/master' into re_factor_ize
h-vetinari Oct 11, 2018
a267d4a
Review (jorisvandenbossche)
h-vetinari Oct 11, 2018
7f1bb40
Improve comment
h-vetinari Oct 11, 2018
9593992
Test for writable; expand comments
h-vetinari Oct 12, 2018
08d7f50
Simplify factorize test
h-vetinari Oct 12, 2018
d91be98
Add simple test
h-vetinari Oct 12, 2018
e27ec9a
Tiny fixes
h-vetinari Oct 12, 2018
d825be0
Remove idx_duplicated from test (now unnecessary)
h-vetinari Oct 12, 2018
1a342d0
Review (jreback)
h-vetinari Oct 14, 2018
28e0441
Merge remote-tracking branch 'upstream/master' into re_factor_ize
h-vetinari Oct 15, 2018
d65e4fd
Merge remote-tracking branch 'upstream/master' into re_factor_ize
h-vetinari Oct 15, 2018
bca615c
Merge remote-tracking branch 'upstream/master' into re_factor_ize
h-vetinari Oct 17, 2018
3438727
Review (jreback)
h-vetinari Oct 17, 2018
facc111
Merge remote-tracking branch 'upstream/master' into re_factor_ize
h-vetinari Oct 18, 2018
6d0e86b
Retrigger Circle
h-vetinari Oct 18, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Unmove unique-implementation (review jreback)
  • Loading branch information
h-vetinari committed Oct 6, 2018
commit b1705a995b561b9f36f0edab10f3f1bb3984b606
156 changes: 78 additions & 78 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -415,30 +415,6 @@ cdef class {{name}}HashTable(HashTable):

return uniques.to_array(), np.asarray(labels)

@cython.boundscheck(False)
@cython.wraparound(False)
def _unique_no_inverse(self, const {{dtype}}_t[:] values):
# define separate functions without inverse for performance
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
{{dtype}}_t val
khiter_t k
{{name}}Vector uniques = {{name}}Vector()
{{name}}VectorData *ud
ud = uniques.data
with nogil:
for i in range(n):
val = values[i]
k = kh_get_{{dtype}}(self.table, val)
if k == self.table.n_buckets:
kh_put_{{dtype}}(self.table, val, &ret)
if needs_resize(ud):
with gil:
uniques.resize()
append_data_{{dtype}}(ud, val)
return uniques.to_array()

def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
if return_inverse:
return self._unique_with_inverse(values, uniques={{name}}Vector(),
Expand Down Expand Up @@ -503,6 +479,30 @@ cdef class {{name}}HashTable(HashTable):

return np.asarray(labels), arr_uniques

@cython.boundscheck(False)
@cython.wraparound(False)
def _unique_no_inverse(self, const {{dtype}}_t[:] values):
# define separate functions without inverse for performance
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
{{dtype}}_t val
khiter_t k
{{name}}Vector uniques = {{name}}Vector()
{{name}}VectorData *ud
ud = uniques.data
with nogil:
for i in range(n):
val = values[i]
k = kh_get_{{dtype}}(self.table, val)
if k == self.table.n_buckets:
kh_put_{{dtype}}(self.table, val, &ret)
if needs_resize(ud):
with gil:
uniques.resize()
append_data_{{dtype}}(ud, val)
return uniques.to_array()

{{endfor}}


Expand Down Expand Up @@ -582,6 +582,41 @@ cdef class StringHashTable(HashTable):
free(vecs)
return labels

@cython.boundscheck(False)
@cython.wraparound(False)
def _unique_no_inverse(self, ndarray[object] values):
# define separate functions without inverse for performance
cdef:
Py_ssize_t i, count, n = len(values)
int64_t[:] uindexer
int ret = 0
object val
ObjectVector uniques
khiter_t k
const char *v
const char **vecs
vecs = <const char **> malloc(n * sizeof(char *))
uindexer = np.empty(n, dtype=np.int64)
for i in range(n):
val = values[i]
v = util.get_c_string(val)
vecs[i] = v
count = 0
with nogil:
for i in range(n):
v = vecs[i]
k = kh_get_str(self.table, v)
if k == self.table.n_buckets:
kh_put_str(self.table, v, &ret)
uindexer[count] = i
count += 1
free(vecs)
# uniques
uniques = ObjectVector()
for i in range(count):
uniques.append(values[uindexer[i]])
return uniques.to_array()

@cython.boundscheck(False)
def lookup(self, ndarray[object] values):
cdef:
Expand Down Expand Up @@ -705,41 +740,6 @@ cdef class StringHashTable(HashTable):

return uniques.to_array(), np.asarray(labels)

@cython.boundscheck(False)
@cython.wraparound(False)
def _unique_no_inverse(self, ndarray[object] values):
# define separate functions without inverse for performance
cdef:
Py_ssize_t i, count, n = len(values)
int64_t[:] uindexer
int ret = 0
object val
ObjectVector uniques
khiter_t k
const char *v
const char **vecs
vecs = <const char **> malloc(n * sizeof(char *))
uindexer = np.empty(n, dtype=np.int64)
for i in range(n):
val = values[i]
v = util.get_c_string(val)
vecs[i] = v
count = 0
with nogil:
for i in range(n):
v = vecs[i]
k = kh_get_str(self.table, v)
if k == self.table.n_buckets:
kh_put_str(self.table, v, &ret)
uindexer[count] = i
count += 1
free(vecs)
# uniques
uniques = ObjectVector()
for i in range(count):
uniques.append(values[uindexer[i]])
return uniques.to_array()

def unique(self, ndarray[object] values, bint return_inverse=False):
if return_inverse:
return self._unique_with_inverse(values, uniques=ObjectVector(),
Expand Down Expand Up @@ -845,6 +845,25 @@ cdef class PyObjectHashTable(HashTable):

return np.asarray(locs)

@cython.boundscheck(False)
@cython.wraparound(False)
def _unique_no_inverse(self, ndarray[object] values):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i am bothered by the naming, call these just _unique, you can leave _unique_with_inverse, though add a comment that makes it clear this is less performant as its doing more ops. bonus for doc-strings on these.

# define separate functions without inverse for performance
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
object val
khiter_t k
ObjectVector uniques = ObjectVector()
for i in range(n):
val = values[i]
hash(val)
k = kh_get_pymap(self.table, <PyObject*>val)
if k == self.table.n_buckets:
kh_put_pymap(self.table, <PyObject*>val, &ret)
uniques.append(val)
return uniques.to_array()

@cython.boundscheck(False)
@cython.wraparound(False)
def _unique_with_inverse(self, ndarray[object] values,
Expand Down Expand Up @@ -886,25 +905,6 @@ cdef class PyObjectHashTable(HashTable):

return uniques.to_array(), np.asarray(labels)

@cython.boundscheck(False)
@cython.wraparound(False)
def _unique_no_inverse(self, ndarray[object] values):
# define separate functions without inverse for performance
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
object val
khiter_t k
ObjectVector uniques = ObjectVector()
for i in range(n):
val = values[i]
hash(val)
k = kh_get_pymap(self.table, <PyObject*>val)
if k == self.table.n_buckets:
kh_put_pymap(self.table, <PyObject*>val, &ret)
uniques.append(val)
return uniques.to_array()

def unique(self, ndarray[object] values, bint return_inverse=False):
if return_inverse:
return self._unique_with_inverse(values, uniques=ObjectVector(),
Expand Down