Skip to content

Commit 6d708f6

Browse files
committed
Added comments to groupby_helper
1 parent b947ffa commit 6d708f6

File tree

1 file changed

+40
-1
lines changed

1 file changed

+40
-1
lines changed

pandas/_libs/groupby_helper.pxi.in

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,9 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
470470
keep_na = na_option == 'keep'
471471
N, K = (<object> values).shape
472472

473+
# Copy values into new array in order to fill missing data
474+
# with mask, without obfuscating location of missing data
475+
# in values array
473476
masked_vals = np.array(values[:, 0], copy=True)
474477
{{if name=='int64'}}
475478
mask = (masked_vals == {{nan_val}}).astype(np.uint8)
@@ -493,20 +496,47 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
493496
order = (masked_vals, ~mask, labels)
494497
np.putmask(masked_vals, mask, nan_fill_val)
495498

499+
# lexsort using labels, then mask, then actual values
500+
# each label corresponds to a different group value,
501+
# the mask helps you differentiate missing values before
502+
# performing sort on the actual values
496503
_as = np.lexsort(order)
497504

498505
if not ascending:
499506
_as = _as[::-1]
500507

501508
with nogil:
509+
# Loop over the length of the value array
510+
# each incremental i value can be looked up in the _as array
511+
# that we sorted previously, which gives us the location of
512+
# that sorted value for retrieval back from the original
513+
# values / masked_vals arrays
502514
for i in range(N):
515+
# dups and sum_ranks will be incremented each loop where
516+
# the value / group remains the same, and should be reset
517+
# when either of those changes.
518+
# They are used to calculate tiebreakers
503519
dups += 1
504520
sum_ranks += i - grp_start + 1
505521

522+
# if keep_na, check for missing values and assign back
523+
# to the result where appropriate
506524
if keep_na and masked_vals[_as[i]] == nan_fill_val:
507525
grp_na_count += 1
508526
out[_as[i], 0] = nan
509527
else:
528+
# this implementation is inefficient because it will
529+
# continue overwriting previously encountered dups
530+
# i.e. if 5 duplicated values are encountered it will
531+
# write to the result as follows (assumes avg tiebreaker, dups first in group):
532+
# 1
533+
# 1.5 1.5
534+
# 2 2 2
535+
# 2.5 2.5 2.5 2.5
536+
# 3 3 3 3 3
537+
#
538+
# could potentially be optimized to only write to the
539+
# result once the last duplicate value is encountered
510540
if tiebreak == TIEBREAK_AVERAGE:
511541
for j in range(i - dups + 1, i + 1):
512542
out[_as[j], 0] = sum_ranks / dups
@@ -526,6 +556,11 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
526556
for j in range(i - dups + 1, i + 1):
527557
out[_as[j], 0] = grp_vals_seen
528558

559+
# look forward to the next value (using the sorting in _as)
560+
# if the value does not equal the current value then we need to
561+
# reset the dups and sum_ranks, knowing that a new value is coming
562+
# up. the conditional also needs to handle nan equality and the
563+
# end of iteration
529564
{{if name=='int64'}}
530565
if (i == N - 1 or (
531566
(masked_vals[_as[i]] != masked_vals[_as[i+1]]) and not
@@ -543,7 +578,11 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
543578
val_start = i
544579
grp_vals_seen += 1
545580

546-
# Move to the next group, cleaning up any values
581+
# Similar to the previous conditional, check now if we are moving to a
582+
# new group. If so, keep track of the index where the new group occurs,
583+
# so the tiebreaker calculations can subtract that from their position.
584+
# If the pct flag is True, go back and overwrite the result for
585+
# the group to be divided by the size of the group (excluding na values)
547586
if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
548587
if pct:
549588
for j in range(grp_start, i + 1):

0 commit comments

Comments
 (0)