forked from nodejs/node
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathalphaindex.cpp
1235 lines (1086 loc) · 41.5 KB
/
alphaindex.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2009-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_COLLATION
#include "unicode/alphaindex.h"
#include "unicode/coll.h"
#include "unicode/localpointer.h"
#include "unicode/normalizer2.h"
#include "unicode/tblcoll.h"
#include "unicode/uchar.h"
#include "unicode/ulocdata.h"
#include "unicode/uniset.h"
#include "unicode/uobject.h"
#include "unicode/usetiter.h"
#include "unicode/utf16.h"
#include "cmemory.h"
#include "cstring.h"
#include "uassert.h"
#include "uvector.h"
#include "uvectr64.h"
//#include <string>
//#include <iostream>
U_NAMESPACE_BEGIN
namespace {
/**
* Prefix string for Chinese index buckets.
* See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
*/
const UChar BASE[1] = { 0xFDD0 };
const int32_t BASE_LENGTH = 1;
UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
const UnicodeString &one, const UnicodeString &other);
} // namespace
static int32_t U_CALLCONV
collatorComparator(const void *context, const void *left, const void *right);
static int32_t U_CALLCONV
recordCompareFn(const void *context, const void *left, const void *right);
// Deleter callback for a UVector of Record pointers:
// destroys the Record that the untyped element pointer refers to.
static void U_CALLCONV
alphaIndex_deleteRecord(void *obj) {
    AlphabeticIndex::Record *record = static_cast<AlphabeticIndex::Record *>(obj);
    delete record;
}
namespace {
/**
 * Returns a heap-allocated string whose ownership passes to the caller.
 *
 * If `owned` already holds a string, that object is released from `owned`
 * and returned (and `s` is not copied). Otherwise a new copy of `s` is allocated.
 *
 * @param s         the string to copy when `owned` is empty
 * @param owned     optional pre-allocated string; emptied if it is used
 * @param errorCode in/out error code; set to U_MEMORY_ALLOCATION_ERROR if
 *                  the copy cannot be allocated
 * @return the string, or NULL if errorCode was already a failure on entry
 *         or the allocation failed
 */
UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned,
                           UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return NULL;
    }
    UnicodeString *result;
    if (owned.isValid()) {
        result = owned.orphan();
    } else {
        result = new UnicodeString(s);
        if (result == NULL) {
            errorCode = U_MEMORY_ALLOCATION_ERROR;
        }
    }
    return result;
}
/** Returns element i of the vector, cast to a UnicodeString pointer. */
inline UnicodeString *getString(const UVector &list, int32_t i) {
    void *element = list[i];
    return static_cast<UnicodeString *>(element);
}
/** Returns element i of the vector, cast to a Bucket pointer. */
inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) {
    void *element = list[i];
    return static_cast<AlphabeticIndex::Bucket *>(element);
}
/** Returns element i of the vector, cast to a Record pointer. */
inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) {
    void *element = list[i];
    return static_cast<AlphabeticIndex::Record *>(element);
}
/**
 * Binary search over a vector of UnicodeString pointers that is sorted
 * according to `coll`; modeled on Java's
 * Collections.binarySearch(List, String, Comparator).
 *
 * Errors reported by the collator comparison are not propagated:
 * each comparison uses its own local error code.
 *
 * @param list sorted vector whose elements are UnicodeString pointers
 * @param s    the string to look for
 * @param coll the collator that defines the sort order
 * @return the index >= 0 at which an equal string was found, or a negative
 *         value r such that ~r is the position at which s should be inserted
 */
int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) {
    if (list.size() == 0) {
        return ~0;
    }
    int32_t lo = 0;
    int32_t hi = list.size();
    while (true) {
        const int32_t mid = (lo + hi) / 2;
        const UnicodeString *candidate = static_cast<UnicodeString *>(list.elementAt(mid));
        UErrorCode errorCode = U_ZERO_ERROR;
        const UCollationResult cmp = coll.compare(s, *candidate, errorCode);
        if (cmp == UCOL_EQUAL) {
            return mid;
        }
        // When mid == lo the search range has shrunk to a single element,
        // so the comparison result decides the insertion point.
        const UBool rangeExhausted = (mid == lo);
        if (cmp < 0) {
            if (rangeExhausted) {
                return ~lo;  // insert s before *candidate
            }
            hi = mid;
        } else {
            if (rangeExhausted) {
                return ~(lo + 1);  // insert s after *candidate
            }
            lo = mid;
        }
    }
}
} // namespace
// BucketList lives outside the anonymous namespace because only Clang
// seems to support its use in other classes from there.
// It does not need U_I18N_API either, because it is not used from outside the i18n library.
/**
 * Holds two vectors of buckets: the complete list (visible and invisible)
 * and the list of only the visible ones. Both may be the same object.
 */
class BucketList : public UObject {
public:
    /**
     * Stores both vector pointers and numbers the visible buckets:
     * the bucket at position i of publicBucketList gets displayIndex_ == i.
     */
    BucketList(UVector *bucketList, UVector *publicBucketList)
            : bucketList_(bucketList), immutableVisibleList_(publicBucketList) {
        for (int32_t i = 0; i < publicBucketList->size(); ++i) {
            getBucket(*publicBucketList, i)->displayIndex_ = i;
        }
    }

    // The virtual destructor must not be inline.
    // See ticket #8454 for details.
    virtual ~BucketList();

    /** Returns the number of visible buckets. */
    int32_t getBucketCount() const {
        return immutableVisibleList_->size();
    }

    /**
     * Finds the bucket for `name` and returns its display index.
     *
     * Binary-searches the complete bucket list for the last bucket whose
     * lower boundary does not compare greater than `name`. If that bucket
     * redirects to a display bucket, the display bucket's index is returned.
     *
     * @param name                the string to look up
     * @param collatorPrimaryOnly collator used for the boundary comparisons
     * @param errorCode           passed through to the collator comparisons
     */
    int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly,
                           UErrorCode &errorCode) {
        int32_t lo = 0;
        int32_t hi = bucketList_->size();
        while ((lo + 1) < hi) {
            const int32_t mid = (lo + hi) / 2;
            const AlphabeticIndex::Bucket *probe = getBucket(*bucketList_, mid);
            if (collatorPrimaryOnly.compare(name, probe->lowerBoundary_, errorCode) < 0) {
                hi = mid;
            } else {
                lo = mid;
            }
        }
        const AlphabeticIndex::Bucket *found = getBucket(*bucketList_, lo);
        const AlphabeticIndex::Bucket *shown =
            (found->displayBucket_ != NULL) ? found->displayBucket_ : found;
        return shown->displayIndex_;
    }

    /** All of the buckets, visible and invisible. */
    UVector *bucketList_;
    /** Just the visible buckets. */
    UVector *immutableVisibleList_;
};
// Deletes the complete bucket list, and the visible list as well
// unless both members point to the same vector.
BucketList::~BucketList() {
    const UBool hasSeparateVisibleList = (immutableVisibleList_ != bucketList_);
    delete bucketList_;
    if (hasSeparateVisibleList) {
        delete immutableVisibleList_;
    }
}
// Destroys the bucket list and the primary-strength collator held by this index.
AlphabeticIndex::ImmutableIndex::~ImmutableIndex() {
delete buckets_;
delete collatorPrimaryOnly_;
}
// Returns the number of visible buckets, as reported by the underlying BucketList.
int32_t
AlphabeticIndex::ImmutableIndex::getBucketCount() const {
    const int32_t visibleCount = buckets_->getBucketCount();
    return visibleCount;
}
// Returns the display index of the bucket that `name` belongs to.
// Delegates to BucketList::getBucketIndex() with this index's primary-only collator.
int32_t
AlphabeticIndex::ImmutableIndex::getBucketIndex(
        const UnicodeString &name, UErrorCode &errorCode) const {
    const Collator &primaryCollator = *collatorPrimaryOnly_;
    return buckets_->getBucketIndex(name, primaryCollator, errorCode);
}
// Returns the visible bucket at `index`,
// or NULL if `index` is outside [0, getBucketCount()).
const AlphabeticIndex::Bucket *
AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const {
    if (index < 0 || index >= buckets_->getBucketCount()) {
        return NULL;
    }
    return icu::getBucket(*buckets_->immutableVisibleList_, index);
}
// Constructs an index for a locale.
// All pointer members start out NULL, the label iterator index starts at -1,
// and the maximum label count defaults to 99; the rest of the setup
// (including creating the collator, since collator_ is NULL here) happens in init().
AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status)
: inputList_(NULL),
labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
maxLabelCount_(99),
initialLabels_(NULL), firstCharsInScripts_(NULL),
collator_(NULL), collatorPrimaryOnly_(NULL),
buckets_(NULL) {
init(&locale, status);
}
// Constructs an index around a caller-supplied collator.
// The pointer is stored in collator_, which the destructor deletes,
// so this object takes ownership of `collator`.
// init() is called without a locale; it reports U_ILLEGAL_ARGUMENT_ERROR
// if `collator` is NULL.
AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status)
: inputList_(NULL),
labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
maxLabelCount_(99),
initialLabels_(NULL), firstCharsInScripts_(NULL),
collator_(collator), collatorPrimaryOnly_(NULL),
buckets_(NULL) {
init(NULL, status);
}
// Releases everything this index owns: both collators, the script boundary
// strings, the bucket list, the input record list, and the initial label set.
// Members that were never created are NULL, which delete accepts.
AlphabeticIndex::~AlphabeticIndex() {
delete collator_;
delete collatorPrimaryOnly_;
delete firstCharsInScripts_;
delete buckets_;
delete inputList_;
delete initialLabels_;
}
// Adds the given set of label candidates to the initial labels and
// discards any previously built buckets so that they are rebuilt on demand.
// Does nothing if `status` already indicates a failure.
// Returns *this to allow call chaining.
AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
    if (U_SUCCESS(status)) {
        initialLabels_->addAll(additions);
        clearBuckets();
    }
    return *this;
}
// Adds the index exemplar characters of `locale` to the initial labels
// (see addIndexExemplars()) and discards any previously built buckets.
// Note that the buckets are cleared regardless of the resulting status.
// Returns *this to allow call chaining.
AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
addIndexExemplars(locale, status);
clearBuckets();
return *this;
}
/**
 * Builds a new ImmutableIndex from the current labels and collator.
 *
 * The returned index gets its own freshly created BucketList and its own
 * clone of the primary-strength collator, so it is independent of this object.
 *
 * @param errorCode in/out error code. A failure reported while creating the
 *                  bucket list is passed through unchanged;
 *                  U_MEMORY_ALLOCATION_ERROR is set if cloning the collator
 *                  or allocating the ImmutableIndex fails.
 * @return the new index (owned by the caller), or NULL on failure
 */
AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return NULL; }
    // In C++, the ImmutableIndex must own its copy of the BucketList,
    // even if it contains no records, for proper memory management.
    // We could clone the buckets_ if they are not NULL,
    // but that would be worth it only if this method is called multiple times,
    // or called after using the old-style bucket iterator API.
    LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode));
    // Keep the specific error from createBucketList() instead of overwriting it
    // with U_MEMORY_ALLOCATION_ERROR, and do not bother cloning the collator.
    if (U_FAILURE(errorCode)) { return NULL; }
    LocalPointer<RuleBasedCollator> coll(collatorPrimaryOnly_->clone());
    if (immutableBucketList.isNull() || coll.isNull()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias());
    if (immIndex == NULL) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    // The ImmutableIndex adopted its parameter objects.
    immutableBucketList.orphan();
    coll.orphan();
    return immIndex;
}
// Builds the buckets if necessary and returns the number of visible buckets,
// or 0 if `status` indicates a failure.
int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
    initBuckets(status);
    return U_FAILURE(status) ? 0 : buckets_->getBucketCount();
}
// Returns the number of records in the input list,
// or 0 if `status` indicates a failure or no input list exists yet.
int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
    const UBool haveRecords = U_SUCCESS(status) && inputList_ != NULL;
    return haveRecords ? inputList_->size() : 0;
}
// Fills indexCharacters with the label strings to use for buckets.
//
// Candidates come from initialLabels_. Each accepted candidate is inserted in
// the order defined by collatorPrimaryOnly_, so the vector stays sorted.
// Candidates are dropped if they sort before the first script boundary,
// at or after the last script boundary (the overflow boundary), or if they are
// multi-code point strings that sort the same as their separated characters.
// Finally, the list is thinned out if it is larger than maxLabelCount_.
//
// indexCharacters: output vector; it must have a deleter because it takes
//                  ownership of the heap-allocated strings inserted here.
// errorCode:       in/out error code; the function stops early on failure.
void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const {
U_ASSERT(indexCharacters.hasDeleter());
const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode);
if (U_FAILURE(errorCode)) { return; }
// The first and last entries of the sorted script boundary list delimit
// the range of strings that can become labels.
const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0);
const UnicodeString &overflowBoundary =
*getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1);
// We make a sorted array of elements.
// Some of the input may be redundant.
// That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
// We filter out those cases.
UnicodeSetIterator iter(*initialLabels_);
while (U_SUCCESS(errorCode) && iter.next()) {
const UnicodeString *item = &iter.getString();
// ownedItem is only set when a modified copy of the item is needed (star-stripping below);
// ownedString() later hands over that copy instead of allocating another one.
LocalPointer<UnicodeString> ownedItem;
UBool checkDistinct;
int32_t itemLength = item->length();
if (!item->hasMoreChar32Than(0, itemLength, 1)) {
// At most one code point: nothing to separate, so no distinctness check.
checkDistinct = FALSE;
} else if(item->charAt(itemLength - 1) == 0x2a && // '*'
item->charAt(itemLength - 2) != 0x2a) {
// Use a label if it is marked with one trailing star,
// even if the label string sorts the same when all contractions are suppressed.
// The star itself is removed from the label.
ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1));
item = ownedItem.getAlias();
if (item == NULL) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
checkDistinct = FALSE;
} else {
checkDistinct = TRUE;
}
if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {
// Ignore a primary-ignorable or non-alphabetic index character.
} else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) {
// Ignore an index character that will land in the overflow bucket.
} else if (checkDistinct &&
collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) {
// Ignore a multi-code point index character that does not sort distinctly
// from the sequence of its separate characters.
} else {
int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_);
if (insertionPoint < 0) {
// Not present yet: a negative result encodes the insertion position as ~index.
indexCharacters.insertElementAt(
ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode);
} else {
// A primary-equal label is already in the list; replace it only if the new one is preferable.
const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);
if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) {
indexCharacters.setElementAt(
ownedString(*item, ownedItem, errorCode), insertionPoint);
}
}
}
}
if (U_FAILURE(errorCode)) { return; }
// if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
int32_t size = indexCharacters.size() - 1;
if (size > maxLabelCount_) {
int32_t count = 0;
int32_t old = -1;
// count numbers the original elements (1-based); i is the position in the shrinking vector,
// so it only advances when the current element is kept.
for (int32_t i = 0; i < indexCharacters.size();) {
++count;
// bump is the scaled-down slot for this element; an element whose slot
// equals that of the previously kept element is removed.
int32_t bump = count * maxLabelCount_ / size;
if (bump == old) {
indexCharacters.removeElementAt(i);
} else {
old = bump;
++i;
}
}
}
}
namespace {
/**
 * Converts an internal index string into the label to display.
 *
 * Strings that do not start with BASE are returned unchanged.
 * For strings that do, the BASE prefix is dropped:
 * - if the code unit after BASE is in U+2801..U+28FF (a stroke count),
 *   the label is the decimal value of (unit - 0x2800) followed by U+5283;
 * - otherwise the label is the remainder of the string after BASE.
 *
 * @param current the index string
 * @param temp    scratch string that receives the label when one has to be built
 * @return a reference to either `current` or `temp`
 */
const UnicodeString &fixLabel(const UnicodeString &current, UnicodeString &temp) {
    if (!current.startsWith(BASE, BASE_LENGTH)) {
        return current;
    }
    UChar rest = current.charAt(BASE_LENGTH);
    if (0x2800 < rest && rest <= 0x28FF) {  // stroke count
        // count is in 1..255, so at most three decimal digits are needed.
        // Digits are produced from least to most significant and prepended.
        int32_t count = rest-0x2800;
        temp.setTo((UChar)(0x30 + count % 10));
        if (count >= 10) {
            count /= 10;
            temp.insert(0, (UChar)(0x30 + count % 10));
            if (count >= 10) {
                count /= 10;
                temp.insert(0, (UChar)(0x30 + count));
            }
        }
        return temp.append((UChar)0x5283);
    }
    return temp.setTo(current, BASE_LENGTH);
}
/**
 * Tells whether the string maps to more than one collation element whose
 * primary weight (the upper 32 bits of the element) is greater than variableTop.
 *
 * @param coll        collator that supplies the collation elements
 * @param variableTop primaries at or below this value are not counted
 * @param s           the string to examine
 * @param ces         scratch vector; cleared and refilled with the elements of s
 * @param errorCode   in/out error code
 * @return TRUE if at least two such elements exist; FALSE otherwise or on failure
 */
UBool hasMultiplePrimaryWeights(
        const RuleBasedCollator &coll, uint32_t variableTop,
        const UnicodeString &s, UVector64 &ces, UErrorCode &errorCode) {
    ces.removeAllElements();
    coll.internalGetCEs(s, ces, errorCode);
    if (U_FAILURE(errorCode)) {
        return FALSE;
    }
    int32_t primaryCount = 0;
    for (int32_t i = 0; i < ces.size(); ++i) {
        const int64_t element = ces.elementAti(i);
        const uint32_t primary = (uint32_t)(element >> 32);
        if (primary <= variableTop) {
            continue;  // primary ignorable
        }
        if (++primaryCount > 1) {
            return TRUE;
        }
    }
    return FALSE;
}
} // namespace
/**
 * Creates a new BucketList from the current labels.
 *
 * The complete list starts with an underflow bucket, followed by one bucket
 * per label from initLabels(), with inflow buckets inserted where scripts are
 * skipped, and (if there is at least one label) a final overflow bucket.
 * Some buckets are invisible: they redirect to another bucket via
 * displayBucket_. If there are any, a second vector with only the visible
 * buckets is created; otherwise the same vector serves as both lists.
 *
 * @param errorCode in/out error code
 * @return the new BucketList, which the caller must take ownership of,
 *         or NULL with errorCode set on failure
 */
BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {
    // Initialize indexCharacters.
    UVector indexCharacters(errorCode);
    indexCharacters.setDeleter(uprv_deleteUObject);
    initLabels(indexCharacters, errorCode);
    if (U_FAILURE(errorCode)) { return NULL; }

    // Variables for hasMultiplePrimaryWeights().
    UVector64 ces(errorCode);
    uint32_t variableTop;
    if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) {
        variableTop = collatorPrimaryOnly_->getVariableTop(errorCode);
    } else {
        variableTop = 0;
    }
    UBool hasInvisibleBuckets = FALSE;

    // Helper arrays for Chinese Pinyin collation.
    // Slot k holds the bucket for the letter 'A' + k, if there is one.
    Bucket *asciiBuckets[26] = {
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
    };
    Bucket *pinyinBuckets[26] = {
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
    };
    UBool hasPinyin = FALSE;

    LocalPointer<UVector> bucketList(new UVector(errorCode), errorCode);
    if (U_FAILURE(errorCode)) {
        return NULL;
    }
    bucketList->setDeleter(uprv_deleteUObject);

    // underflow bucket
    LocalPointer<Bucket> bucket(new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW), errorCode);
    if (U_FAILURE(errorCode)) {
        return NULL;
    }
    bucketList->adoptElement(bucket.orphan(), errorCode);
    if (U_FAILURE(errorCode)) { return NULL; }

    UnicodeString temp;

    // fix up the list, adding underflow, additions, overflow
    // Insert inflow labels as needed.
    int32_t scriptIndex = -1;
    const UnicodeString *scriptUpperBoundary = &emptyString_;
    for (int32_t i = 0; i < indexCharacters.size(); ++i) {
        UnicodeString &current = *getString(indexCharacters, i);
        if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) {
            // We crossed the script boundary into a new script.
            const UnicodeString &inflowBoundary = *scriptUpperBoundary;
            UBool skippedScript = FALSE;
            // Advance to the first script boundary that is greater than the current label.
            for (;;) {
                scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex);
                if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) {
                    break;
                }
                skippedScript = TRUE;
            }
            if (skippedScript && bucketList->size() > 1) {
                // We are skipping one or more scripts,
                // and we are not just getting out of the underflow label.
                bucket.adoptInsteadAndCheckErrorCode(
                    new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW), errorCode);
                bucketList->adoptElement(bucket.orphan(), errorCode);
                if (U_FAILURE(errorCode)) { return nullptr; }
            }
        }
        // Add a bucket with the current label.
        bucket.adoptInsteadAndCheckErrorCode(
            new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL), errorCode);
        bucketList->adoptElement(bucket.orphan(), errorCode);
        if (U_FAILURE(errorCode)) { return nullptr; }
        // Remember ASCII and Pinyin buckets for Pinyin redirects.
        UChar c;
        if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) {  // A-Z
            asciiBuckets[c - 0x41] = (Bucket *)bucketList->lastElement();
        } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) &&
                0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) {
            pinyinBuckets[c - 0x41] = (Bucket *)bucketList->lastElement();
            hasPinyin = TRUE;
        }
        // Check for multiple primary weights.
        if (!current.startsWith(BASE, BASE_LENGTH) &&
                hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, current,
                                          ces, errorCode) &&
                current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) {
            // "AE-ligature" or "Sch" etc.
            // Search backwards, starting just before the bucket that was added above.
            // The loop always ends: the underflow bucket at index 0 is not a normal bucket.
            for (int32_t j = bucketList->size() - 2;; --j) {
                Bucket *singleBucket = getBucket(*bucketList, j);
                if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
                    // There is no single-character bucket since the last
                    // underflow or inflow label.
                    break;
                }
                if (singleBucket->displayBucket_ == NULL &&
                        !hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop,
                                                   singleBucket->lowerBoundary_,
                                                   ces, errorCode)) {
                    // Add an invisible bucket that redirects strings greater than the expansion
                    // to the previous single-character bucket.
                    // For example, after ... Q R S Sch we add Sch\uFFFF->S
                    // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
                    bucket.adoptInsteadAndCheckErrorCode(new Bucket(emptyString_,
                        UnicodeString(current).append((UChar)0xFFFF),
                        U_ALPHAINDEX_NORMAL),
                        errorCode);
                    if (U_FAILURE(errorCode)) {
                        return NULL;
                    }
                    bucket->displayBucket_ = singleBucket;
                    bucketList->adoptElement(bucket.orphan(), errorCode);
                    if (U_FAILURE(errorCode)) { return nullptr; }
                    hasInvisibleBuckets = TRUE;
                    break;
                }
            }
        }
    }
    if (U_FAILURE(errorCode)) { return NULL; }
    if (bucketList->size() == 1) {
        // No real labels, show only the underflow label.
        // The same vector serves as both the complete and the visible list.
        BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
        if (bl == NULL) {
            errorCode = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        bucketList.orphan();
        return bl;
    }
    // overflow bucket
    bucket.adoptInsteadAndCheckErrorCode(
        new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW), errorCode);
    bucketList->adoptElement(bucket.orphan(), errorCode);  // final
    if (U_FAILURE(errorCode)) { return nullptr; }

    if (hasPinyin) {
        // Redirect Pinyin buckets.
        // Each Pinyin bucket redirects to the ASCII bucket of the same letter or,
        // if that does not exist, to the closest preceding ASCII bucket.
        Bucket *asciiBucket = NULL;
        for (int32_t i = 0; i < 26; ++i) {
            if (asciiBuckets[i] != NULL) {
                asciiBucket = asciiBuckets[i];
            }
            if (pinyinBuckets[i] != NULL && asciiBucket != NULL) {
                pinyinBuckets[i]->displayBucket_ = asciiBucket;
                hasInvisibleBuckets = TRUE;
            }
        }
    }

    if (U_FAILURE(errorCode)) { return NULL; }
    if (!hasInvisibleBuckets) {
        // All buckets are visible: the same vector serves as both lists.
        BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
        if (bl == NULL) {
            errorCode = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        bucketList.orphan();
        return bl;
    }
    // Merge inflow buckets that are visually adjacent.
    // Iterate backwards: Merge inflow into overflow rather than the other way around.
    int32_t i = bucketList->size() - 1;
    Bucket *nextBucket = getBucket(*bucketList, i);
    while (--i > 0) {
        Bucket *bucket = getBucket(*bucketList, i);
        if (bucket->displayBucket_ != NULL) {
            continue;  // skip invisible buckets
        }
        if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) {
            if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
                bucket->displayBucket_ = nextBucket;
                continue;
            }
        }
        nextBucket = bucket;
    }

    LocalPointer<UVector> publicBucketList(new UVector(errorCode), errorCode);
    if (U_FAILURE(errorCode)) {
        return NULL;
    }
    // Do not call publicBucketList->setDeleter():
    // This vector shares its objects with the bucketList.
    for (int32_t j = 0; j < bucketList->size(); ++j) {
        Bucket *bucket = getBucket(*bucketList, j);
        if (bucket->displayBucket_ == NULL) {
            publicBucketList->addElement(bucket, errorCode);
        }
    }
    if (U_FAILURE(errorCode)) { return NULL; }
    BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias());
    if (bl == NULL) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    // The BucketList now owns both vectors.
    bucketList.orphan();
    publicBucketList.orphan();
    return bl;
}
/**
* Creates an index, and buckets and sorts the list of records into the index.
*
* Does nothing if errorCode already indicates a failure or if the buckets
* already exist. After creating the bucket list, returns early if there are
* no input records to distribute.
*/
void AlphabeticIndex::initBuckets(UErrorCode &errorCode) {
if (U_FAILURE(errorCode) || buckets_ != NULL) {
return;
}
buckets_ = createBucketList(errorCode);
if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) {
return;
}
// Sort the records by name.
// Stable sort preserves input order of collation duplicates.
inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode);
// Now, we traverse all of the input, which is now sorted.
// If the item doesn't go in the current bucket, we find the next bucket that contains it.
// This makes the process order n*log(n), since we just sort the list and then do a linear process.
// However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
// we need to improve it for that case.
// Start in the first bucket of the complete list (visible and invisible buckets).
// upperBoundary is the lower boundary of the following bucket,
// or NULL once the current bucket is the last one.
Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0);
int32_t bucketIndex = 1;
Bucket *nextBucket;
const UnicodeString *upperBoundary;
if (bucketIndex < buckets_->bucketList_->size()) {
nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
upperBoundary = &nextBucket->lowerBoundary_;
} else {
nextBucket = NULL;
upperBoundary = NULL;
}
for (int32_t i = 0; i < inputList_->size(); ++i) {
Record *r = getRecord(*inputList_, i);
// if the current bucket isn't the right one, find the one that is
// We have a special flag for the last bucket so that we don't look any further
while (upperBoundary != NULL &&
collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) {
currentBucket = nextBucket;
// now reset the boundary that we compare against
if (bucketIndex < buckets_->bucketList_->size()) {
nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
upperBoundary = &nextBucket->lowerBoundary_;
} else {
upperBoundary = NULL;
}
}
// now put the record into the bucket.
// An invisible bucket passes its records on to the bucket it redirects to.
Bucket *bucket = currentBucket;
if (bucket->displayBucket_ != NULL) {
bucket = bucket->displayBucket_;
}
// The per-bucket record vector is created lazily, on the first record.
// No deleter is set on it here.
if (bucket->records_ == NULL) {
LocalPointer<UVector> records(new UVector(errorCode), errorCode);
if (U_FAILURE(errorCode)) {
return;
}
bucket->records_ = records.orphan();
}
bucket->records_->addElement(r, errorCode);
}
}
// Discards the current bucket list, if there is one,
// and resets the bucket iterator. Does nothing when no buckets exist.
void AlphabeticIndex::clearBuckets() {
    if (buckets_ == NULL) {
        return;
    }
    delete buckets_;
    buckets_ = NULL;
    internalResetBucketIterator();
}
// Puts the bucket iterator back into its initial state:
// no current bucket, and the label index at -1.
void AlphabeticIndex::internalResetBucketIterator() {
    currentBucket_ = NULL;
    labelsIterIndex_ = -1;
}
// Adds index characters for `locale` to initialLabels_.
//
// If the locale data has an explicit index exemplar set, that set is added as is.
// Otherwise index characters are synthesized from the standard exemplar set:
// a-z is filled in, Hangul syllables and Ethiopic are reduced to short lists,
// and every resulting string is uppercased before it is added.
//
// status: in/out error code; nothing is added if opening the locale data fails
//         or the standard exemplar set cannot be read.
void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
if (U_FAILURE(status)) {
return;
}
UnicodeSet exemplars;
ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
if (U_SUCCESS(status)) {
initialLabels_->addAll(exemplars);
return;
}
status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR
// The locale data did not include explicit Index characters.
// Synthesize a set of them from the locale's standard exemplar characters.
ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
if (U_FAILURE(status)) {
return;
}
// question: should we add auxiliary exemplars?
// If the set has any of a-z, or is empty, make sure that all of a-z is included.
if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.isEmpty()) {
exemplars.add(0x61, 0x7A);
}
if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables
// cut down to small list
exemplars.remove(0xAC00, 0xD7A3).
add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
add(0xD30C).add(0xD558);
}
if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block
// cut down to small list
// make use of the fact that Ethiopic is allocated in 8's, where
// the base is 0 mod 8.
// Keep only those characters of the fixed list that the locale actually uses.
// NOTE(review): the status from parsing this pattern is not checked here.
UnicodeSet ethiopic(UnicodeString(u"[ሀለሐመሠረሰሸቀቈቐቘበቨተቸኀኈነኘአከኰኸዀወዐዘዠየደዸጀገጐጘጠጨጰጸፀፈፐፘ]"), status);
ethiopic.retainAll(exemplars);
exemplars.remove(u'ሀ', 0x137F).addAll(ethiopic);
}
// Upper-case any that aren't already so.
// (We only do this for synthesized index characters.)
UnicodeSetIterator it(exemplars);
UnicodeString upperC;
while (it.next()) {
const UnicodeString &exemplarC = it.getString();
upperC = exemplarC;
upperC.toUpper(locale);
initialLabels_->add(upperC);
}
}
// Adds the collator's contractions that start with BASE[0] (the prefix for
// Chinese index buckets) to the initial labels.
// If one of them ends with a letter A-Z (a Pinyin label),
// the ASCII letters A-Z are added as labels as well.
// Returns FALSE if errorCode indicates a failure or there are no such
// contractions; TRUE otherwise.
UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {
    UnicodeSet contractions;
    collatorPrimaryOnly_->internalAddContractions(BASE[0], contractions, errorCode);
    if (U_FAILURE(errorCode) || contractions.isEmpty()) {
        return FALSE;
    }
    initialLabels_->addAll(contractions);
    for (UnicodeSetIterator iter(contractions); iter.next();) {
        const UnicodeString &contraction = iter.getString();
        U_ASSERT (contraction.startsWith(BASE, BASE_LENGTH));
        const UChar last = contraction.charAt(contraction.length() - 1);
        if (last < 0x41 || last > 0x5A) {
            continue;  // does not end with A-Z
        }
        // There are Pinyin labels, add ASCII A-Z labels as well.
        initialLabels_->add(0x41, 0x5A);  // A-Z
        break;
    }
    return TRUE;
}
/*
* Return the string with interspersed CGJs. Input must have more than 2 codepoints.
*/
static const UChar CGJ = 0x034F;
// Returns a copy of `item` with a CGJ inserted between
// each pair of adjacent code points. An empty input yields an empty result.
UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
    UnicodeString result;
    const int32_t length = item.length();
    int32_t i = 0;
    while (i < length) {
        if (i > 0) {
            // Not the first code point: separate it from the previous one.
            result.append(CGJ);
        }
        const UChar32 cp = item.char32At(i);
        result.append(cp);
        i = item.moveIndex32(i, 1);
    }
    return result;
}
// Always returns false; the argument is ignored.
bool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
return false;
}
// Always returns false; the argument is ignored.
// NOTE(review): operator== also always returns false, so neither a == b nor
// a != b is ever true. This looks deliberate (comparison unsupported?) — confirm.
bool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
return false;
}
// Returns a reference to the collator held in collator_ (no copy is made).
const RuleBasedCollator &AlphabeticIndex::getCollator() const {
return *collator_;
}
// Returns the label used for inflow buckets.
const UnicodeString &AlphabeticIndex::getInflowLabel() const {
return inflowLabel_;
}
// Returns the label used for the overflow bucket.
const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
return overflowLabel_;
}
// Returns the label used for the underflow bucket.
const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
return underflowLabel_;
}
// Sets the label for inflow buckets and discards any previously built buckets
// so that they are rebuilt with the new label. The status parameter is unused.
// Returns *this to allow call chaining.
AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
inflowLabel_ = label;
clearBuckets();
return *this;
}
// Sets the label for the overflow bucket and discards any previously built buckets
// so that they are rebuilt with the new label. The status parameter is unused.
// Returns *this to allow call chaining.
AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
overflowLabel_ = label;
clearBuckets();
return *this;
}
// Sets the label for the underflow bucket and discards any previously built buckets
// so that they are rebuilt with the new label. The status parameter is unused.
// Returns *this to allow call chaining.
AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
underflowLabel_ = label;
clearBuckets();
return *this;
}
// Returns the current limit on the number of labels (99 unless changed
// via setMaxLabelCount()).
int32_t AlphabeticIndex::getMaxLabelCount() const {
return maxLabelCount_;
}
// Sets the limit on the number of labels and discards any previously built buckets.
// Does nothing if `status` already indicates a failure.
// A value <= 0 is rejected with U_ILLEGAL_ARGUMENT_ERROR and changes nothing.
// Returns *this to allow call chaining.
AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
    if (U_SUCCESS(status)) {
        if (maxLabelCount > 0) {
            maxLabelCount_ = maxLabelCount;
            clearBuckets();
        } else {
            status = U_ILLEGAL_ARGUMENT_ERROR;
        }
    }
    return *this;
}
//
// init() - Common code for constructors.
//
// locale: the locale to create a collator and index exemplars for; may be NULL
//         if the constructor already stored a collator in collator_.
// status: in/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR if neither a locale
//         nor a collator is available, or if no usable script boundary strings remain;
//         U_UNSUPPORTED_ERROR if the locale's collator is not a RuleBasedCollator;
//         U_MEMORY_ALLOCATION_ERROR if an allocation fails.
//
// On an early return some members may remain NULL; the destructor handles that.
//
void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
if (U_FAILURE(status)) { return; }
if (locale == NULL && collator_ == NULL) {
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
initialLabels_ = new UnicodeSet();
if (initialLabels_ == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
// All three special labels default to the ellipsis character.
inflowLabel_.setTo((UChar)0x2026); // Ellipsis
overflowLabel_ = inflowLabel_;
underflowLabel_ = inflowLabel_;
if (collator_ == NULL) {
// No collator was supplied: create one for the locale.
Collator *coll = Collator::createInstance(*locale, status);
if (U_FAILURE(status)) {
delete coll;
return;
}
if (coll == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
// Only RuleBasedCollator is supported; anything else is deleted and rejected.
collator_ = dynamic_cast<RuleBasedCollator *>(coll);
if (collator_ == NULL) {
delete coll;
status = U_UNSUPPORTED_ERROR;
return;
}
}
// Bucket boundaries are compared with a separate clone at primary strength.
collatorPrimaryOnly_ = collator_->clone();
if (collatorPrimaryOnly_ == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status);
firstCharsInScripts_ = firstStringsInScript(status);
if (U_FAILURE(status)) { return; }
firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status);
// Guard against a degenerate collator where
// some script boundary strings are primary ignorable.
// Such strings compare equal to the empty string and, after sorting,
// sit at the front of the list, from where they are removed one at a time.
for (;;) {
if (U_FAILURE(status)) { return; }
if (firstCharsInScripts_->isEmpty()) {
// AlphabeticIndex requires some non-ignorable script boundary strings.
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
if (collatorPrimaryOnly_->compare(
*static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)),
emptyString_, status) == UCOL_EQUAL) {
firstCharsInScripts_->removeElementAt(0);
} else {
break;
}
}
// Chinese index characters, which are specific to each of the several Chinese tailorings,
// take precedence over the single locale data exemplar set per language.
if (!addChineseIndexCharacters(status) && locale != NULL) {
addIndexExemplars(*locale, status);
}
}
//
// Comparison function for sorting a UVector of UnicodeString pointers with a collator.
//
// context:     the Collator to compare with
// left, right: pointers to the UElements being compared
//
// Identical pointers (including two NULLs) compare equal,
// and a NULL string sorts after any non-NULL string.
// Errors from the collator comparison are not reported.
//
static int32_t U_CALLCONV
collatorComparator(const void *context, const void *left, const void *right) {
    const UnicodeString *lhs =
        static_cast<const UnicodeString *>(static_cast<const UElement *>(left)->pointer);
    const UnicodeString *rhs =
        static_cast<const UnicodeString *>(static_cast<const UElement *>(right)->pointer);
    if (lhs == rhs) {
        // Also covers the case where both are NULL.
        return 0;
    }
    if (lhs == NULL) {
        return 1;
    }
    if (rhs == NULL) {
        return -1;
    }
    UErrorCode errorCode = U_ZERO_ERROR;
    return static_cast<const Collator *>(context)->compare(*lhs, *rhs, errorCode);
}
//
// Comparison function for sorting a UVector of Record pointers with a collator.
//
// context:     the Collator to compare with
// left, right: pointers to the UElements being compared
//
// Records are ordered by comparing their names.
// Errors from the collator comparison are not reported.
//
static int32_t U_CALLCONV
recordCompareFn(const void *context, const void *left, const void *right) {
    const AlphabeticIndex::Record *lhs = static_cast<const AlphabeticIndex::Record *>(
        static_cast<const UElement *>(left)->pointer);
    const AlphabeticIndex::Record *rhs = static_cast<const AlphabeticIndex::Record *>(
        static_cast<const UElement *>(right)->pointer);
    UErrorCode errorCode = U_ZERO_ERROR;
    return static_cast<const Collator *>(context)->compare(lhs->name_, rhs->name_, errorCode);
}
UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
if (U_FAILURE(status)) {
return NULL;
}
LocalPointer<UVector> dest(new UVector(status), status);
if (U_FAILURE(status)) {
return NULL;
}
dest->setDeleter(uprv_deleteUObject);
// Fetch the script-first-primary contractions which are defined in the root collator.
// They all start with U+FDD1.
UnicodeSet set;
collatorPrimaryOnly_->internalAddContractions(0xFDD1, set, status);
if (U_FAILURE(status)) {
return NULL;
}
if (set.isEmpty()) {
status = U_UNSUPPORTED_ERROR;
return NULL;
}
UnicodeSetIterator iter(set);
while (iter.next()) {
const UnicodeString &boundary = iter.getString();