@@ -470,6 +470,9 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
470
470
keep_na = na_option == 'keep'
471
471
N, K = (<object> values).shape
472
472
473
+ # Copy values into new array in order to fill missing data
474
+ # with mask, without obfuscating location of missing data
475
+ # in values array
473
476
masked_vals = np.array(values[:, 0], copy=True)
474
477
{{if name=='int64'}}
475
478
mask = (masked_vals == {{nan_val}}).astype(np.uint8)
@@ -493,20 +496,47 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
493
496
order = (masked_vals, ~mask, labels)
494
497
np.putmask(masked_vals, mask, nan_fill_val)
495
498
499
+ # lexsort using labels, then mask, then actual values
500
+ # each label corresponds to a different group value,
501
+ # the mask helps you differentiate missing values before
502
+ # performing sort on the actual values
496
503
_as = np.lexsort(order)
497
504
498
505
if not ascending:
499
506
_as = _as[::-1]
500
507
501
508
with nogil:
509
+ # Loop over the length of the value array
510
+ # each incremental i value can be looked up in the _as array
511
+ # that we sorted previously, which gives us the location of
512
+ # that sorted value for retrieval back from the original
513
+ # values / masked_vals arrays
502
514
for i in range(N):
515
+ # dups and sum_ranks will be incremented each loop where
516
+ # the value / group remains the same, and should be reset
517
+ # when either of those change
518
+ # Used to calculate tiebreakers
503
519
dups += 1
504
520
sum_ranks += i - grp_start + 1
505
521
522
+ # if keep_na, check for missing values and assign back
523
+ # to the result where appropriate
506
524
if keep_na and masked_vals[_as[i]] == nan_fill_val:
507
525
grp_na_count += 1
508
526
out[_as[i], 0] = nan
509
527
else:
528
+ # this implementation is inefficient because it will
529
+ # continue overwriting previously encountered dups
530
+ # i.e. if 5 duplicated values are encountered it will
531
+ # write to the result as follows (assumes avg tiebreaker):
532
+ # 1
533
+ # 1.5 1.5
534
+ # 2 2 2
535
+ # 2.5 2.5 2.5 2.5
536
+ # 3 3 3 3 3
537
+ #
538
+ # could potentially be optimized to only write to the
539
+ # result once the last duplicate value is encountered
510
540
if tiebreak == TIEBREAK_AVERAGE:
511
541
for j in range(i - dups + 1, i + 1):
512
542
out[_as[j], 0] = sum_ranks / dups
@@ -526,6 +556,11 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
526
556
for j in range(i - dups + 1, i + 1):
527
557
out[_as[j], 0] = grp_vals_seen
528
558
559
+ # look forward to the next value (using the sorting in _as)
560
+ # if the value does not equal the current value then we need to
561
+ # reset the dups and sum_ranks, knowing that a new value is coming
562
+ # up. the conditional also needs to handle nan equality and the
563
+ # end of iteration
529
564
{{if name=='int64'}}
530
565
if (i == N - 1 or (
531
566
(masked_vals[_as[i]] != masked_vals[_as[i+1]]) and not
@@ -543,7 +578,11 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
543
578
val_start = i
544
579
grp_vals_seen += 1
545
580
546
- # Move to the next group, cleaning up any values
581
+ # Similar to the previous conditional, check now if we are moving to a
582
+ # new group. If so, keep track of the index where the new group occurs,
583
+ # so the tiebreaker calculations can decrement that from their position.
584
+ # If the pct flag is True, go back and overwrite the result for
585
+ # the group to be divided by the size of the group (excluding na values)
547
586
if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
548
587
if pct:
549
588
for j in range(grp_start, i + 1):
0 commit comments