Skip to content

Commit 24527a4

Browse files
Commit message: ensure_platform_int earlier
1 parent: 8b8d6a2 · commit: 24527a4

File tree

8 files changed: 68 additions and 45 deletions.

pandas/_libs/algos.pyx

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ def is_lexsorted(list_of_arrays: list) -> bint:
183183

184184
@cython.boundscheck(False)
185185
@cython.wraparound(False)
186-
def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups):
186+
def groupsort_indexer(const intp_t[:] index, Py_ssize_t ngroups):
187187
"""
188188
Compute a 1-d indexer.
189189
@@ -192,7 +192,7 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups):
192192
193193
Parameters
194194
----------
195-
index: int64 ndarray
195+
index: np.ndarray[np.intp]
196196
Mappings from group -> position.
197197
ngroups: int64
198198
Number of groups.
@@ -210,13 +210,13 @@ def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups):
210210
"""
211211
cdef:
212212
Py_ssize_t i, loc, label, n
213-
ndarray[int64_t] counts, where
214-
ndarray[intp_t] indexer
213+
ndarray[int64_t] counts
214+
ndarray[intp_t] indexer, where
215215

216216
counts = np.zeros(ngroups + 1, dtype=np.int64)
217217
n = len(index)
218218
indexer = np.zeros(n, dtype=np.intp)
219-
where = np.zeros(ngroups + 1, dtype=np.int64)
219+
where = np.zeros(ngroups + 1, dtype=np.intp)
220220

221221
with nogil:
222222

pandas/_libs/algos_take_helper.pxi.in

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,32 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
88
# take_1d, take_2d
99
# ----------------------------------------------------------------------
1010

11+
12+
@cython.wraparound(False)
13+
@cython.boundscheck(False)
14+
def take_1d_intp_intp(
15+
const intp_t[:] values,
16+
const intp_t[:] indexer,
17+
intp_t[::1] out,
18+
intp_t fill_value=-1,
19+
):
20+
cdef:
21+
Py_ssize_t i, n, idx
22+
intp_t fv
23+
24+
n = indexer.shape[0]
25+
26+
fv = fill_value
27+
28+
with nogil:
29+
for i in range(n):
30+
idx = indexer[i]
31+
if idx == -1:
32+
out[i] = fv
33+
else:
34+
out[i] = values[idx]
35+
36+
1137
{{py:
1238

1339
# c_type_in, c_type_out

pandas/_libs/groupby.pyx

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ from pandas._libs.util cimport (
3737
)
3838

3939
from pandas._libs.algos import (
40+
ensure_platform_int,
4041
groupsort_indexer,
4142
rank_1d,
4243
take_2d_axis1_float64_float64,
@@ -150,7 +151,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
150151
ngroups = len(counts)
151152
N, K = (<object>values).shape
152153

153-
indexer, _counts = groupsort_indexer(labels, ngroups)
154+
indexer, _counts = groupsort_indexer(ensure_platform_int(labels), ngroups)
154155
counts[:] = _counts[1:]
155156

156157
data = np.empty((K, N), dtype=np.float64)

pandas/_libs/join.pyx

Lines changed: 29 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,9 @@ from numpy cimport (
2121
cnp.import_array()
2222

2323
from pandas._libs.algos import (
24-
ensure_int64,
25-
ensure_platform_int,
2624
groupsort_indexer,
2725
take_1d_int64_int64,
26+
take_1d_intp_intp,
2827
)
2928

3029

@@ -34,16 +33,16 @@ def inner_join(const intp_t[:] left, const intp_t[:] right,
3433
cdef:
3534
Py_ssize_t i, j, k, count = 0
3635
ndarray[intp_t] left_sorter, right_sorter
37-
ndarray[int64_t] left_count, right_count
38-
ndarray[int64_t] left_indexer, right_indexer
39-
int64_t lc, rc
36+
ndarray[intp_t] left_count, right_count
37+
ndarray[intp_t] left_indexer, right_indexer
38+
intp_t lc, rc
4039
Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0
4140
Py_ssize_t offset
4241

4342
# NA group in location 0
4443

45-
left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups)
46-
right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups)
44+
left_sorter, left_count = groupsort_indexer(left, max_groups)
45+
right_sorter, right_count = groupsort_indexer(right, max_groups)
4746

4847
with nogil:
4948
# First pass, determine size of result set, do not use the NA group
@@ -58,8 +57,8 @@ def inner_join(const intp_t[:] left, const intp_t[:] right,
5857
left_pos = left_count[0]
5958
right_pos = right_count[0]
6059

61-
left_indexer = np.empty(count, dtype=np.int64)
62-
right_indexer = np.empty(count, dtype=np.int64)
60+
left_indexer = np.empty(count, dtype=np.intp)
61+
right_indexer = np.empty(count, dtype=np.intp)
6362

6463
with nogil:
6564
for i in range(1, max_groups + 1):
@@ -85,17 +84,17 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right,
8584
Py_ssize_t max_groups, bint sort=True):
8685
cdef:
8786
Py_ssize_t i, j, k, count = 0
88-
ndarray[int64_t] left_count, right_count
87+
ndarray[intp_t] left_count, right_count
8988
ndarray[intp_t] rev, left_sorter, right_sorter
90-
ndarray[int64_t] left_indexer, right_indexer
91-
int64_t lc, rc
89+
ndarray[intp_t] left_indexer, right_indexer
90+
intp_t lc, rc
9291
Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0
9392
Py_ssize_t offset
9493

9594
# NA group in location 0
9695

97-
left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups)
98-
right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups)
96+
left_sorter, left_count = groupsort_indexer(left, max_groups)
97+
right_sorter, right_count = groupsort_indexer(right, max_groups)
9998

10099
with nogil:
101100
# First pass, determine size of result set, do not use the NA group
@@ -109,8 +108,8 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right,
109108
left_pos = left_count[0]
110109
right_pos = right_count[0]
111110

112-
left_indexer = np.empty(count, dtype=np.int64)
113-
right_indexer = np.empty(count, dtype=np.int64)
111+
left_indexer = np.empty(count, dtype=np.intp)
112+
right_indexer = np.empty(count, dtype=np.intp)
114113

115114
with nogil:
116115
for i in range(1, max_groups + 1):
@@ -142,11 +141,10 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right,
142141
# this is a short-cut to avoid groupsort_indexer
143142
# otherwise, the `else` path also works in this case
144143
rev = np.empty(len(left), dtype=np.intp)
145-
rev.put(ensure_platform_int(left_sorter), np.arange(len(left)))
144+
rev.put(left_sorter, np.arange(len(left)))
146145
else:
147146
rev, _ = groupsort_indexer(left_indexer, len(left))
148147

149-
rev = ensure_platform_int(rev)
150148
right_indexer = right_indexer.take(rev)
151149
left_indexer = left_indexer.take(rev)
152150

@@ -159,16 +157,16 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right,
159157
cdef:
160158
Py_ssize_t i, j, k, count = 0
161159
ndarray[intp_t] left_sorter, right_sorter
162-
ndarray[int64_t] left_count, right_count
163-
ndarray[int64_t] left_indexer, right_indexer
164-
int64_t lc, rc
165-
int64_t left_pos = 0, right_pos = 0
160+
ndarray[intp_t] left_count, right_count
161+
ndarray[intp_t] left_indexer, right_indexer
162+
intp_t lc, rc
163+
intp_t left_pos = 0, right_pos = 0
166164
Py_ssize_t offset, position = 0
167165

168166
# NA group in location 0
169167

170-
left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups)
171-
right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups)
168+
left_sorter, left_count = groupsort_indexer(left, max_groups)
169+
right_sorter, right_count = groupsort_indexer(right, max_groups)
172170

173171
with nogil:
174172
# First pass, determine size of result set, do not use the NA group
@@ -185,8 +183,8 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right,
185183
left_pos = left_count[0]
186184
right_pos = right_count[0]
187185

188-
left_indexer = np.empty(count, dtype=np.int64)
189-
right_indexer = np.empty(count, dtype=np.int64)
186+
left_indexer = np.empty(count, dtype=np.intp)
187+
right_indexer = np.empty(count, dtype=np.intp)
190188

191189
with nogil:
192190
for i in range(1, max_groups + 1):
@@ -217,19 +215,17 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right,
217215
_get_result_indexer(right_sorter, right_indexer))
218216

219217

220-
cdef ndarray[int64_t] _get_result_indexer(
221-
ndarray[intp_t] sorter, ndarray[int64_t] indexer
218+
cdef ndarray[intp_t] _get_result_indexer(
219+
ndarray[intp_t] sorter, ndarray[intp_t] indexer
222220
):
223221
if len(sorter) > 0:
224222
# cython-only equivalent to
225223
# `res = algos.take_nd(sorter, indexer, fill_value=-1)`
226-
res = np.empty(len(indexer), dtype=np.int64)
227-
take_1d_int64_int64(ensure_int64(sorter), ensure_platform_int(indexer), res, -1)
228-
# FIXME: sorter is intp_t, not int64_t, opposite for indexer;
229-
# will this break on 32bit builds?
224+
res = np.empty(len(indexer), dtype=np.intp)
225+
take_1d_intp_intp(sorter, indexer, res, -1)
230226
else:
231227
# length-0 case
232-
res = np.empty(len(indexer), dtype=np.int64)
228+
res = np.empty(len(indexer), dtype=np.intp)
233229
res[:] = -1
234230

235231
return res

pandas/core/arrays/categorical.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1980,7 +1980,7 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]:
19801980
"""
19811981
categories = self.categories
19821982
r, counts = libalgos.groupsort_indexer(
1983-
self.codes.astype("int64", copy=False), categories.size
1983+
libalgos.ensure_platform_int(self.codes), categories.size
19841984
)
19851985
counts = counts.cumsum()
19861986
_result = (r[start:end] for start, end in zip(counts, counts[1:]))

pandas/core/indexes/base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4153,7 +4153,7 @@ def _get_leaf_sorter(labels: List[np.ndarray]) -> np.ndarray:
41534153
return np.empty(0, dtype=np.intp)
41544154

41554155
if len(labels) == 1:
4156-
return get_group_index_sorter(labels[0])
4156+
return get_group_index_sorter(ensure_platform_int(labels[0]))
41574157

41584158
# find indexers of beginning of each set of
41594159
# same-key labels w.r.t all but last level
@@ -4223,7 +4223,7 @@ def _get_leaf_sorter(labels: List[np.ndarray]) -> np.ndarray:
42234223
if level == 0: # outer most level, take the fast route
42244224
ngroups = 1 + new_lev_codes.max()
42254225
left_indexer, counts = libalgos.groupsort_indexer(
4226-
ensure_int64(new_lev_codes), ngroups
4226+
new_lev_codes, ngroups
42274227
)
42284228

42294229
# missing values are placed first; drop them!

pandas/core/sorting.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -605,7 +605,7 @@ def get_group_index_sorter(
605605
(alpha + beta * ngroups) < (count * np.log(count)) # type: ignore[operator]
606606
)
607607
if do_groupsort:
608-
sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups)
608+
sorter, _ = algos.groupsort_indexer(ensure_platform_int(group_index), ngroups)
609609
# sorter _should_ already be intp, but mypy is not yet able to verify
610610
else:
611611
sorter = group_index.argsort(kind="mergesort")

pandas/tests/test_algos.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2116,8 +2116,8 @@ def test_is_lexsorted():
21162116

21172117

21182118
def test_groupsort_indexer():
2119-
a = np.random.randint(0, 1000, 100).astype(np.int64)
2120-
b = np.random.randint(0, 1000, 100).astype(np.int64)
2119+
a = np.random.randint(0, 1000, 100).astype(np.intp)
2120+
b = np.random.randint(0, 1000, 100).astype(np.intp)
21212121

21222122
result = libalgos.groupsort_indexer(a, 1000)[0]
21232123

Comments (0): this commit has no comments.