1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2009-2014, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
10 #include "unicode/utypes.h"
12 #if !UCONFIG_NO_COLLATION
14 #include "unicode/alphaindex.h"
15 #include "unicode/coll.h"
16 #include "unicode/localpointer.h"
17 #include "unicode/normalizer2.h"
18 #include "unicode/tblcoll.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ulocdata.h"
21 #include "unicode/uniset.h"
22 #include "unicode/uobject.h"
23 #include "unicode/usetiter.h"
24 #include "unicode/utf16.h"
40 * Prefix string for Chinese index buckets.
41 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
43 const UChar BASE[1] = { 0xFDD0 };
44 const int32_t BASE_LENGTH = 1;
46 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
47 const UnicodeString &one, const UnicodeString &other);
51 static int32_t U_CALLCONV
52 collatorComparator(const void *context, const void *left, const void *right);
54 static int32_t U_CALLCONV
55 recordCompareFn(const void *context, const void *left, const void *right);
57 // UVector<Record *> support function, delete a Record.
58 static void U_CALLCONV
59 alphaIndex_deleteRecord(void *obj) {
60 delete static_cast<AlphabeticIndex::Record *>(obj);
65 UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned,
66 UErrorCode &errorCode) {
67 if (U_FAILURE(errorCode)) { return NULL; }
68 if (owned.isValid()) {
69 return owned.orphan();
71 UnicodeString *p = new UnicodeString(s);
73 errorCode = U_MEMORY_ALLOCATION_ERROR;
78 inline UnicodeString *getString(const UVector &list, int32_t i) {
79 return static_cast<UnicodeString *>(list[i]);
82 inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) {
83 return static_cast<AlphabeticIndex::Bucket *>(list[i]);
86 inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) {
87 return static_cast<AlphabeticIndex::Record *>(list[i]);
91 * Like Java Collections.binarySearch(List, String, Comparator).
93 * @return the index>=0 where the item was found,
94 * or the index<0 for inserting the string at ~index in sorted order
96 int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) {
97 if (list.size() == 0) { return ~0; }
99 int32_t limit = list.size();
101 int32_t i = (start + limit) / 2;
102 const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i));
103 UErrorCode errorCode = U_ZERO_ERROR;
104 UCollationResult cmp = coll.compare(s, *si, errorCode);
105 if (cmp == UCOL_EQUAL) {
107 } else if (cmp < 0) {
109 return ~start; // insert s before *si
114 return ~(start + 1); // insert s after *si
123 // The BucketList is not in the anonymous namespace because only Clang
124 // seems to support its use in other classes from there.
125 // However, we also don't need U_I18N_API because it is not used from outside the i18n library.
126 class BucketList : public UObject {
128 BucketList(UVector *bucketList, UVector *publicBucketList)
129 : bucketList_(bucketList), immutableVisibleList_(publicBucketList) {
130 int32_t displayIndex = 0;
131 for (int32_t i = 0; i < publicBucketList->size(); ++i) {
132 getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++;
136 // The virtual destructor must not be inline.
137 // See ticket #8454 for details.
138 virtual ~BucketList();
140 int32_t getBucketCount() const {
141 return immutableVisibleList_->size();
144 int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly,
145 UErrorCode &errorCode) {
148 int32_t limit = bucketList_->size();
149 while ((start + 1) < limit) {
150 int32_t i = (start + limit) / 2;
151 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i);
152 UCollationResult nameVsBucket =
153 collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode);
154 if (nameVsBucket < 0) {
160 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start);
161 if (bucket->displayBucket_ != NULL) {
162 bucket = bucket->displayBucket_;
164 return bucket->displayIndex_;
167 /** All of the buckets, visible and invisible. */
168 UVector *bucketList_;
169 /** Just the visible buckets. */
170 UVector *immutableVisibleList_;
173 BucketList::~BucketList() {
175 if (immutableVisibleList_ != bucketList_) {
176 delete immutableVisibleList_;
180 AlphabeticIndex::ImmutableIndex::~ImmutableIndex() {
182 delete collatorPrimaryOnly_;
186 AlphabeticIndex::ImmutableIndex::getBucketCount() const {
187 return buckets_->getBucketCount();
191 AlphabeticIndex::ImmutableIndex::getBucketIndex(
192 const UnicodeString &name, UErrorCode &errorCode) const {
193 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode);
196 const AlphabeticIndex::Bucket *
197 AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const {
198 if (0 <= index && index < buckets_->getBucketCount()) {
199 return icu::getBucket(*buckets_->immutableVisibleList_, index);
205 AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status)
207 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
209 initialLabels_(NULL), firstCharsInScripts_(NULL),
210 collator_(NULL), collatorPrimaryOnly_(NULL),
212 init(&locale, status);
216 AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status)
218 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
220 initialLabels_(NULL), firstCharsInScripts_(NULL),
221 collator_(collator), collatorPrimaryOnly_(NULL),
228 AlphabeticIndex::~AlphabeticIndex() {
230 delete collatorPrimaryOnly_;
231 delete firstCharsInScripts_;
234 delete initialLabels_;
238 AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
239 if (U_FAILURE(status)) {
242 initialLabels_->addAll(additions);
248 AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
249 addIndexExemplars(locale, status);
255 AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) {
256 if (U_FAILURE(errorCode)) { return NULL; }
257 // In C++, the ImmutableIndex must own its copy of the BucketList,
258 // even if it contains no records, for proper memory management.
259 // We could clone the buckets_ if they are not NULL,
260 // but that would be worth it only if this method is called multiple times,
261 // or called after using the old-style bucket iterator API.
262 LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode));
263 LocalPointer<RuleBasedCollator> coll(
264 static_cast<RuleBasedCollator *>(collatorPrimaryOnly_->clone()));
265 if (immutableBucketList.isNull() || coll.isNull()) {
266 errorCode = U_MEMORY_ALLOCATION_ERROR;
269 ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias());
270 if (immIndex == NULL) {
271 errorCode = U_MEMORY_ALLOCATION_ERROR;
274 // The ImmutableIndex adopted its parameter objects.
275 immutableBucketList.orphan();
280 int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
282 if (U_FAILURE(status)) {
285 return buckets_->getBucketCount();
289 int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
290 if (U_FAILURE(status) || inputList_ == NULL) {
293 return inputList_->size();
296 void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const {
297 const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode);
298 if (U_FAILURE(errorCode)) { return; }
300 const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0);
301 const UnicodeString &overflowBoundary =
302 *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1);
304 // We make a sorted array of elements.
305 // Some of the input may be redundant.
306 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
307 // We filter out those cases.
308 UnicodeSetIterator iter(*initialLabels_);
309 while (iter.next()) {
310 const UnicodeString *item = &iter.getString();
311 LocalPointer<UnicodeString> ownedItem;
313 int32_t itemLength = item->length();
314 if (!item->hasMoreChar32Than(0, itemLength, 1)) {
315 checkDistinct = FALSE;
316 } else if(item->charAt(itemLength - 1) == 0x2a && // '*'
317 item->charAt(itemLength - 2) != 0x2a) {
318 // Use a label if it is marked with one trailing star,
319 // even if the label string sorts the same when all contractions are suppressed.
320 ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1));
321 item = ownedItem.getAlias();
323 errorCode = U_MEMORY_ALLOCATION_ERROR;
326 checkDistinct = FALSE;
328 checkDistinct = TRUE;
330 if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {
331 // Ignore a primary-ignorable or non-alphabetic index character.
332 } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) {
333 // Ignore an index character that will land in the overflow bucket.
334 } else if (checkDistinct &&
335 collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) {
336 // Ignore a multi-code point index character that does not sort distinctly
337 // from the sequence of its separate characters.
339 int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_);
340 if (insertionPoint < 0) {
341 indexCharacters.insertElementAt(
342 ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode);
344 const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);
345 if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) {
346 indexCharacters.setElementAt(
347 ownedString(*item, ownedItem, errorCode), insertionPoint);
352 if (U_FAILURE(errorCode)) { return; }
354 // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
356 int32_t size = indexCharacters.size() - 1;
357 if (size > maxLabelCount_) {
360 for (int32_t i = 0; i < indexCharacters.size();) {
362 int32_t bump = count * maxLabelCount_ / size;
364 indexCharacters.removeElementAt(i);
375 const UnicodeString &fixLabel(const UnicodeString ¤t, UnicodeString &temp) {
376 if (!current.startsWith(BASE, BASE_LENGTH)) {
379 UChar rest = current.charAt(BASE_LENGTH);
380 if (0x2800 < rest && rest <= 0x28FF) { // stroke count
381 int32_t count = rest-0x2800;
382 temp.setTo((UChar)(0x30 + count % 10));
385 temp.insert(0, (UChar)(0x30 + count % 10));
388 temp.insert(0, (UChar)(0x30 + count));
391 return temp.append((UChar)0x5283);
393 return temp.setTo(current, BASE_LENGTH);
396 UBool hasMultiplePrimaryWeights(
397 const RuleBasedCollator &coll, uint32_t variableTop,
398 const UnicodeString &s, UVector64 &ces, UErrorCode &errorCode) {
399 ces.removeAllElements();
400 coll.internalGetCEs(s, ces, errorCode);
401 if (U_FAILURE(errorCode)) { return FALSE; }
402 UBool seenPrimary = FALSE;
403 for (int32_t i = 0; i < ces.size(); ++i) {
404 int64_t ce = ces.elementAti(i);
405 uint32_t p = (uint32_t)(ce >> 32);
406 if (p > variableTop) {
407 // not primary ignorable
419 BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {
420 // Initialize indexCharacters.
421 UVector indexCharacters(errorCode);
422 indexCharacters.setDeleter(uprv_deleteUObject);
423 initLabels(indexCharacters, errorCode);
424 if (U_FAILURE(errorCode)) { return NULL; }
426 // Variables for hasMultiplePrimaryWeights().
427 UVector64 ces(errorCode);
428 uint32_t variableTop;
429 if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) {
430 variableTop = collatorPrimaryOnly_->getVariableTop(errorCode);
434 UBool hasInvisibleBuckets = FALSE;
436 // Helper arrays for Chinese Pinyin collation.
437 Bucket *asciiBuckets[26] = {
438 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
439 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
441 Bucket *pinyinBuckets[26] = {
442 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
443 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
445 UBool hasPinyin = FALSE;
447 LocalPointer<UVector> bucketList(new UVector(errorCode), errorCode);
448 if (U_FAILURE(errorCode)) {
451 bucketList->setDeleter(uprv_deleteUObject);
454 Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW);
455 if (bucket == NULL) {
456 errorCode = U_MEMORY_ALLOCATION_ERROR;
459 bucketList->addElement(bucket, errorCode);
460 if (U_FAILURE(errorCode)) { return NULL; }
464 // fix up the list, adding underflow, additions, overflow
465 // Insert inflow labels as needed.
466 int32_t scriptIndex = -1;
467 const UnicodeString *scriptUpperBoundary = &emptyString_;
468 for (int32_t i = 0; i < indexCharacters.size(); ++i) {
469 UnicodeString ¤t = *getString(indexCharacters, i);
470 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) {
471 // We crossed the script boundary into a new script.
472 const UnicodeString &inflowBoundary = *scriptUpperBoundary;
473 UBool skippedScript = FALSE;
475 scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex);
476 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) {
479 skippedScript = TRUE;
481 if (skippedScript && bucketList->size() > 1) {
482 // We are skipping one or more scripts,
483 // and we are not just getting out of the underflow label.
484 bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW);
485 if (bucket == NULL) {
486 errorCode = U_MEMORY_ALLOCATION_ERROR;
489 bucketList->addElement(bucket, errorCode);
492 // Add a bucket with the current label.
493 bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL);
494 if (bucket == NULL) {
495 errorCode = U_MEMORY_ALLOCATION_ERROR;
498 bucketList->addElement(bucket, errorCode);
499 // Remember ASCII and Pinyin buckets for Pinyin redirects.
501 if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z
502 asciiBuckets[c - 0x41] = bucket;
503 } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) &&
504 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) {
505 pinyinBuckets[c - 0x41] = bucket;
508 // Check for multiple primary weights.
509 if (!current.startsWith(BASE, BASE_LENGTH) &&
510 hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, current,
512 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) {
513 // "AE-ligature" or "Sch" etc.
514 for (int32_t i = bucketList->size() - 2;; --i) {
515 Bucket *singleBucket = getBucket(*bucketList, i);
516 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
517 // There is no single-character bucket since the last
518 // underflow or inflow label.
521 if (singleBucket->displayBucket_ == NULL &&
522 !hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop,
523 singleBucket->lowerBoundary_,
525 // Add an invisible bucket that redirects strings greater than the expansion
526 // to the previous single-character bucket.
527 // For example, after ... Q R S Sch we add Sch\uFFFF->S
528 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
529 bucket = new Bucket(emptyString_,
530 UnicodeString(current).append((UChar)0xFFFF),
531 U_ALPHAINDEX_NORMAL);
532 if (bucket == NULL) {
533 errorCode = U_MEMORY_ALLOCATION_ERROR;
536 bucket->displayBucket_ = singleBucket;
537 bucketList->addElement(bucket, errorCode);
538 hasInvisibleBuckets = TRUE;
544 if (U_FAILURE(errorCode)) { return NULL; }
545 if (bucketList->size() == 1) {
546 // No real labels, show only the underflow label.
547 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
549 errorCode = U_MEMORY_ALLOCATION_ERROR;
556 bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW);
557 if (bucket == NULL) {
558 errorCode = U_MEMORY_ALLOCATION_ERROR;
561 bucketList->addElement(bucket, errorCode); // final
564 // Redirect Pinyin buckets.
565 Bucket *asciiBucket = NULL;
566 for (int32_t i = 0; i < 26; ++i) {
567 if (asciiBuckets[i] != NULL) {
568 asciiBucket = asciiBuckets[i];
570 if (pinyinBuckets[i] != NULL && asciiBucket != NULL) {
571 pinyinBuckets[i]->displayBucket_ = asciiBucket;
572 hasInvisibleBuckets = TRUE;
577 if (U_FAILURE(errorCode)) { return NULL; }
578 if (!hasInvisibleBuckets) {
579 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
581 errorCode = U_MEMORY_ALLOCATION_ERROR;
587 // Merge inflow buckets that are visually adjacent.
588 // Iterate backwards: Merge inflow into overflow rather than the other way around.
589 int32_t i = bucketList->size() - 1;
590 Bucket *nextBucket = getBucket(*bucketList, i);
592 bucket = getBucket(*bucketList, i);
593 if (bucket->displayBucket_ != NULL) {
594 continue; // skip invisible buckets
596 if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) {
597 if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
598 bucket->displayBucket_ = nextBucket;
605 LocalPointer<UVector> publicBucketList(new UVector(errorCode), errorCode);
606 if (U_FAILURE(errorCode)) {
609 // Do not call publicBucketList->setDeleter():
610 // This vector shares its objects with the bucketList.
611 for (int32_t i = 0; i < bucketList->size(); ++i) {
612 bucket = getBucket(*bucketList, i);
613 if (bucket->displayBucket_ == NULL) {
614 publicBucketList->addElement(bucket, errorCode);
617 if (U_FAILURE(errorCode)) { return NULL; }
618 BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias());
620 errorCode = U_MEMORY_ALLOCATION_ERROR;
624 publicBucketList.orphan();
629 * Creates an index, and buckets and sorts the list of records into the index.
631 void AlphabeticIndex::initBuckets(UErrorCode &errorCode) {
632 if (U_FAILURE(errorCode) || buckets_ != NULL) {
635 buckets_ = createBucketList(errorCode);
636 if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) {
640 // Sort the records by name.
641 // Stable sort preserves input order of collation duplicates.
642 inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode);
644 // Now, we traverse all of the input, which is now sorted.
645 // If the item doesn't go in the current bucket, we find the next bucket that contains it.
646 // This makes the process order n*log(n), since we just sort the list and then do a linear process.
647 // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
648 // we need to improve it for that case.
650 Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0);
651 int32_t bucketIndex = 1;
653 const UnicodeString *upperBoundary;
654 if (bucketIndex < buckets_->bucketList_->size()) {
655 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
656 upperBoundary = &nextBucket->lowerBoundary_;
659 upperBoundary = NULL;
661 for (int32_t i = 0; i < inputList_->size(); ++i) {
662 Record *r = getRecord(*inputList_, i);
663 // if the current bucket isn't the right one, find the one that is
664 // We have a special flag for the last bucket so that we don't look any further
665 while (upperBoundary != NULL &&
666 collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) {
667 currentBucket = nextBucket;
668 // now reset the boundary that we compare against
669 if (bucketIndex < buckets_->bucketList_->size()) {
670 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
671 upperBoundary = &nextBucket->lowerBoundary_;
673 upperBoundary = NULL;
676 // now put the record into the bucket.
677 Bucket *bucket = currentBucket;
678 if (bucket->displayBucket_ != NULL) {
679 bucket = bucket->displayBucket_;
681 if (bucket->records_ == NULL) {
682 bucket->records_ = new UVector(errorCode);
683 if (bucket->records_ == NULL) {
684 errorCode = U_MEMORY_ALLOCATION_ERROR;
688 bucket->records_->addElement(r, errorCode);
692 void AlphabeticIndex::clearBuckets() {
693 if (buckets_ != NULL) {
696 internalResetBucketIterator();
700 void AlphabeticIndex::internalResetBucketIterator() {
701 labelsIterIndex_ = -1;
702 currentBucket_ = NULL;
706 void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
707 LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
708 if (U_FAILURE(status)) {
712 UnicodeSet exemplars;
713 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
714 if (U_SUCCESS(status)) {
715 initialLabels_->addAll(exemplars);
718 status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR
720 // The locale data did not include explicit Index characters.
721 // Synthesize a set of them from the locale's standard exemplar characters.
722 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
723 if (U_FAILURE(status)) {
727 // question: should we add auxiliary exemplars?
728 if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) {
729 exemplars.add(0x61, 0x7A);
731 if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables
732 // cut down to small list
733 exemplars.remove(0xAC00, 0xD7A3).
734 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
735 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
736 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
737 add(0xD30C).add(0xD558);
739 if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block
740 // cut down to small list
741 // make use of the fact that Ethiopic is allocated in 8's, where
742 // the base is 0 mod 8.
744 UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status);
745 UnicodeSetIterator it(ethiopic);
746 while (it.next() && !it.isString()) {
747 if ((it.getCodepoint() & 0x7) != 0) {
748 exemplars.remove(it.getCodepoint());
753 // Upper-case any that aren't already so.
754 // (We only do this for synthesized index characters.)
755 UnicodeSetIterator it(exemplars);
756 UnicodeString upperC;
758 const UnicodeString &exemplarC = it.getString();
760 upperC.toUpper(locale);
761 initialLabels_->add(upperC);
765 UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {
766 UnicodeSet contractions;
767 collatorPrimaryOnly_->internalAddContractions(BASE[0], contractions, errorCode);
768 if (U_FAILURE(errorCode) || contractions.isEmpty()) { return FALSE; }
769 initialLabels_->addAll(contractions);
770 UnicodeSetIterator iter(contractions);
771 while (iter.next()) {
772 const UnicodeString &s = iter.getString();
773 U_ASSERT (s.startsWith(BASE, BASE_LENGTH));
774 UChar c = s.charAt(s.length() - 1);
775 if (0x41 <= c && c <= 0x5A) { // A-Z
776 // There are Pinyin labels, add ASCII A-Z labels as well.
777 initialLabels_->add(0x41, 0x5A); // A-Z
786 * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
788 static const UChar CGJ = 0x034F;
789 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
790 UnicodeString result;
791 if (item.length() == 0) {
796 UChar32 cp = item.char32At(i);
798 i = item.moveIndex32(i, 1);
799 if (i >= item.length()) {
808 UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
813 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
818 const RuleBasedCollator &AlphabeticIndex::getCollator() const {
823 const UnicodeString &AlphabeticIndex::getInflowLabel() const {
827 const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
828 return overflowLabel_;
832 const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
833 return underflowLabel_;
837 AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
838 inflowLabel_ = label;
844 AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
845 overflowLabel_ = label;
851 AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
852 underflowLabel_ = label;
858 int32_t AlphabeticIndex::getMaxLabelCount() const {
859 return maxLabelCount_;
863 AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
864 if (U_FAILURE(status)) {
867 if (maxLabelCount <= 0) {
868 status = U_ILLEGAL_ARGUMENT_ERROR;
871 maxLabelCount_ = maxLabelCount;
878 // init() - Common code for constructors.
881 void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
882 if (U_FAILURE(status)) { return; }
883 if (locale == NULL && collator_ == NULL) {
884 status = U_ILLEGAL_ARGUMENT_ERROR;
888 initialLabels_ = new UnicodeSet();
889 if (initialLabels_ == NULL) {
890 status = U_MEMORY_ALLOCATION_ERROR;
894 inflowLabel_.setTo((UChar)0x2026); // Ellipsis
895 overflowLabel_ = inflowLabel_;
896 underflowLabel_ = inflowLabel_;
898 if (collator_ == NULL) {
899 Collator *coll = Collator::createInstance(*locale, status);
900 if (U_FAILURE(status)) {
905 status = U_MEMORY_ALLOCATION_ERROR;
908 collator_ = dynamic_cast<RuleBasedCollator *>(coll);
909 if (collator_ == NULL) {
911 status = U_UNSUPPORTED_ERROR;
915 collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone());
916 if (collatorPrimaryOnly_ == NULL) {
917 status = U_MEMORY_ALLOCATION_ERROR;
920 collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status);
921 firstCharsInScripts_ = firstStringsInScript(status);
922 if (U_FAILURE(status)) { return; }
923 firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status);
924 // Guard against a degenerate collator where
925 // some script boundary strings are primary ignorable.
927 if (U_FAILURE(status)) { return; }
928 if (firstCharsInScripts_->isEmpty()) {
929 // AlphabeticIndex requires some non-ignorable script boundary strings.
930 status = U_ILLEGAL_ARGUMENT_ERROR;
933 if (collatorPrimaryOnly_->compare(
934 *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)),
935 emptyString_, status) == UCOL_EQUAL) {
936 firstCharsInScripts_->removeElementAt(0);
942 // Chinese index characters, which are specific to each of the several Chinese tailorings,
943 // take precedence over the single locale data exemplar set per language.
944 if (!addChineseIndexCharacters(status) && locale != NULL) {
945 addIndexExemplars(*locale, status);
951 // Comparison function for UVector<UnicodeString *> sorting with a collator.
953 static int32_t U_CALLCONV
954 collatorComparator(const void *context, const void *left, const void *right) {
955 const UElement *leftElement = static_cast<const UElement *>(left);
956 const UElement *rightElement = static_cast<const UElement *>(right);
957 const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer);
958 const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer);
960 if (leftString == rightString) {
961 // Catches case where both are NULL
964 if (leftString == NULL) {
967 if (rightString == NULL) {
970 const Collator *col = static_cast<const Collator *>(context);
971 UErrorCode errorCode = U_ZERO_ERROR;
972 return col->compare(*leftString, *rightString, errorCode);
976 // Comparison function for UVector<Record *> sorting with a collator.
978 static int32_t U_CALLCONV
979 recordCompareFn(const void *context, const void *left, const void *right) {
980 const UElement *leftElement = static_cast<const UElement *>(left);
981 const UElement *rightElement = static_cast<const UElement *>(right);
982 const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer);
983 const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer);
984 const Collator *col = static_cast<const Collator *>(context);
985 UErrorCode errorCode = U_ZERO_ERROR;
986 return col->compare(leftRec->name_, rightRec->name_, errorCode);
989 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
990 if (U_FAILURE(status)) {
993 LocalPointer<UVector> dest(new UVector(status), status);
994 if (U_FAILURE(status)) {
997 dest->setDeleter(uprv_deleteUObject);
998 // Fetch the script-first-primary contractions which are defined in the root collator.
999 // They all start with U+FDD1.
1001 collatorPrimaryOnly_->internalAddContractions(0xFDD1, set, status);
1002 if (U_FAILURE(status)) {
1005 if (set.isEmpty()) {
1006 status = U_UNSUPPORTED_ERROR;
1009 UnicodeSetIterator iter(set);
1010 while (iter.next()) {
1011 const UnicodeString &boundary = iter.getString();
1012 uint32_t gcMask = U_GET_GC_MASK(boundary.char32At(1));
1013 if ((gcMask & (U_GC_L_MASK | U_GC_CN_MASK)) == 0) {
1014 // Ignore boundaries for the special reordering groups.
1015 // Take only those for "real scripts" (where the sample character is a Letter,
1016 // and the one for unassigned implicit weights (Cn).
1019 UnicodeString *s = new UnicodeString(boundary);
1021 status = U_MEMORY_ALLOCATION_ERROR;
1024 dest->addElement(s, status);
1026 return dest.orphan();
1033 * Returns true if one index character string is "better" than the other.
1034 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
1035 * better, and otherwise binary-less-than is better.
1037 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
1038 const UnicodeString &one, const UnicodeString &other) {
1039 // This is called with primary-equal strings, but never with one.equals(other).
1040 UErrorCode status = U_ZERO_ERROR;
1041 UnicodeString n1 = nfkdNormalizer.normalize(one, status);
1042 UnicodeString n2 = nfkdNormalizer.normalize(other, status);
1043 if (U_FAILURE(status)) { return FALSE; }
1044 int32_t result = n1.countChar32() - n2.countChar32();
1048 result = n1.compareCodePointOrder(n2);
1052 return one.compareCodePointOrder(other) < 0;
1058 // Constructor & Destructor for AlphabeticIndex::Record
1060 // Records are internal only, instances are not directly surfaced in the public API.
1061 // This class is mostly struct-like, with all public fields.
1063 AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data)
1064 : name_(name), data_(data) {}
1066 AlphabeticIndex::Record::~Record() {
1070 AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
1071 if (U_FAILURE(status)) {
1074 if (inputList_ == NULL) {
1075 inputList_ = new UVector(status);
1076 if (inputList_ == NULL) {
1077 status = U_MEMORY_ALLOCATION_ERROR;
1080 inputList_->setDeleter(alphaIndex_deleteRecord);
1082 Record *r = new Record(name, data);
1084 status = U_MEMORY_ALLOCATION_ERROR;
1087 inputList_->addElement(r, status);
1091 //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
1092 // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
1097 AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
1098 if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) {
1099 inputList_->removeAllElements();
1105 int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
1106 initBuckets(status);
1107 if (U_FAILURE(status)) {
1110 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status);
1114 int32_t AlphabeticIndex::getBucketIndex() const {
1115 return labelsIterIndex_;
1119 UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
1120 if (U_FAILURE(status)) {
1123 if (buckets_ == NULL && currentBucket_ != NULL) {
1124 status = U_ENUM_OUT_OF_SYNC_ERROR;
1127 initBuckets(status);
1128 if (U_FAILURE(status)) {
1132 if (labelsIterIndex_ >= buckets_->getBucketCount()) {
1133 labelsIterIndex_ = buckets_->getBucketCount();
1136 currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_);
1137 resetRecordIterator();
1141 const UnicodeString &AlphabeticIndex::getBucketLabel() const {
1142 if (currentBucket_ != NULL) {
1143 return currentBucket_->label_;
1145 return emptyString_;
1150 UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
1151 if (currentBucket_ != NULL) {
1152 return currentBucket_->labelType_;
1154 return U_ALPHAINDEX_NORMAL;
1159 int32_t AlphabeticIndex::getBucketRecordCount() const {
1160 if (currentBucket_ != NULL && currentBucket_->records_ != NULL) {
1161 return currentBucket_->records_->size();
1168 AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
1169 if (U_FAILURE(status)) {
1172 internalResetBucketIterator();
1177 UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
1178 if (U_FAILURE(status)) {
1181 if (currentBucket_ == NULL) {
1182 // We are trying to iterate over the items in a bucket, but there is no
1183 // current bucket from the enumeration of buckets.
1184 status = U_INVALID_STATE_ERROR;
1187 if (buckets_ == NULL) {
1188 status = U_ENUM_OUT_OF_SYNC_ERROR;
1191 if (currentBucket_->records_ == NULL) {
1195 if (itemsIterIndex_ >= currentBucket_->records_->size()) {
1196 itemsIterIndex_ = currentBucket_->records_->size();
1203 const UnicodeString &AlphabeticIndex::getRecordName() const {
1204 const UnicodeString *retStr = &emptyString_;
1205 if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
1206 itemsIterIndex_ >= 0 &&
1207 itemsIterIndex_ < currentBucket_->records_->size()) {
1208 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1209 retStr = &item->name_;
1214 const void *AlphabeticIndex::getRecordData() const {
1215 const void *retPtr = NULL;
1216 if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
1217 itemsIterIndex_ >= 0 &&
1218 itemsIterIndex_ < currentBucket_->records_->size()) {
1219 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1220 retPtr = item->data_;
1226 AlphabeticIndex & AlphabeticIndex::resetRecordIterator() {
1227 itemsIterIndex_ = -1;
1233 AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
1234 const UnicodeString &lowerBoundary,
1235 UAlphabeticIndexLabelType type)
1236 : label_(label), lowerBoundary_(lowerBoundary), labelType_(type),
1237 displayBucket_(NULL), displayIndex_(-1),
1242 AlphabeticIndex::Bucket::~Bucket() {
1248 #endif // !UCONFIG_NO_COLLATION