2 *******************************************************************************
3 * Copyright (C) 2009-2013, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
12 #include "unicode/alphaindex.h"
13 #include "unicode/coleitr.h"
14 #include "unicode/coll.h"
15 #include "unicode/localpointer.h"
16 #include "unicode/normalizer2.h"
17 #include "unicode/tblcoll.h"
18 #include "unicode/ulocdata.h"
19 #include "unicode/uniset.h"
20 #include "unicode/uobject.h"
21 #include "unicode/usetiter.h"
22 #include "unicode/utf16.h"
32 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
39 * Prefix string for Chinese index buckets.
40 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
42 const UChar BASE[1] = { 0xFDD0 };
43 const int32_t BASE_LENGTH = 1;
45 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
46 const UnicodeString &one, const UnicodeString &other);
50 static int32_t U_CALLCONV
51 collatorComparator(const void *context, const void *left, const void *right);
53 static int32_t U_CALLCONV
54 recordCompareFn(const void *context, const void *left, const void *right);
56 // UVector<Record *> support function, delete a Record.
57 static void U_CALLCONV
58 alphaIndex_deleteRecord(void *obj) {
59 delete static_cast<AlphabeticIndex::Record *>(obj);
64 UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned,
65 UErrorCode &errorCode) {
66 if (U_FAILURE(errorCode)) { return NULL; }
67 if (owned.isValid()) {
68 return owned.orphan();
70 UnicodeString *p = new UnicodeString(s);
72 errorCode = U_MEMORY_ALLOCATION_ERROR;
77 inline UnicodeString *getString(const UVector &list, int32_t i) {
78 return static_cast<UnicodeString *>(list[i]);
81 inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) {
82 return static_cast<AlphabeticIndex::Bucket *>(list[i]);
85 inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) {
86 return static_cast<AlphabeticIndex::Record *>(list[i]);
90 * Like Java Collections.binarySearch(List, String, Comparator).
92 * @return the index>=0 where the item was found,
93 * or the index<0 for inserting the string at ~index in sorted order
95 int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) {
96 if (list.size() == 0) { return ~0; }
98 int32_t limit = list.size();
100 int32_t i = (start + limit) / 2;
101 const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i));
102 UErrorCode errorCode = U_ZERO_ERROR;
103 UCollationResult cmp = coll.compare(s, *si, errorCode);
104 if (cmp == UCOL_EQUAL) {
106 } else if (cmp < 0) {
108 return ~start; // insert s before *si
113 return ~(start + 1); // insert s after *si
122 // The BucketList is not in the anonymous namespace because only Clang
123 // seems to support its use in other classes from there.
124 // However, we also don't need U_I18N_API because it is not used from outside the i18n library.
125 class BucketList : public UObject {
127 BucketList(UVector *bucketList, UVector *publicBucketList)
128 : bucketList_(bucketList), immutableVisibleList_(publicBucketList) {
129 int32_t displayIndex = 0;
130 for (int32_t i = 0; i < publicBucketList->size(); ++i) {
131 getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++;
135 // The virtual destructor must not be inline.
136 // See ticket #8454 for details.
137 virtual ~BucketList();
139 int32_t getBucketCount() const {
140 return immutableVisibleList_->size();
143 int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly,
144 UErrorCode &errorCode) {
147 int32_t limit = bucketList_->size();
148 while ((start + 1) < limit) {
149 int32_t i = (start + limit) / 2;
150 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i);
151 UCollationResult nameVsBucket =
152 collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode);
153 if (nameVsBucket < 0) {
159 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start);
160 if (bucket->displayBucket_ != NULL) {
161 bucket = bucket->displayBucket_;
163 return bucket->displayIndex_;
166 /** All of the buckets, visible and invisible. */
167 UVector *bucketList_;
168 /** Just the visible buckets. */
169 UVector *immutableVisibleList_;
172 BucketList::~BucketList() {
174 if (immutableVisibleList_ != bucketList_) {
175 delete immutableVisibleList_;
179 AlphabeticIndex::ImmutableIndex::~ImmutableIndex() {
181 delete collatorPrimaryOnly_;
185 AlphabeticIndex::ImmutableIndex::getBucketCount() const {
186 return buckets_->getBucketCount();
190 AlphabeticIndex::ImmutableIndex::getBucketIndex(
191 const UnicodeString &name, UErrorCode &errorCode) const {
192 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode);
195 const AlphabeticIndex::Bucket *
196 AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const {
197 if (0 <= index && index < buckets_->getBucketCount()) {
198 return icu::getBucket(*buckets_->immutableVisibleList_, index);
204 AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status)
206 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
208 initialLabels_(NULL), firstCharsInScripts_(NULL),
209 collator_(NULL), collatorPrimaryOnly_(NULL),
211 init(&locale, status);
215 AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status)
217 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
219 initialLabels_(NULL), firstCharsInScripts_(NULL),
220 collator_(collator), collatorPrimaryOnly_(NULL),
227 AlphabeticIndex::~AlphabeticIndex() {
229 delete collatorPrimaryOnly_;
230 delete firstCharsInScripts_;
233 delete initialLabels_;
237 AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
238 if (U_FAILURE(status)) {
241 initialLabels_->addAll(additions);
247 AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
248 addIndexExemplars(locale, status);
254 AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) {
255 if (U_FAILURE(errorCode)) { return NULL; }
256 // In C++, the ImmutableIndex must own its copy of the BucketList,
257 // even if it contains no records, for proper memory management.
258 // We could clone the buckets_ if they are not NULL,
259 // but that would be worth it only if this method is called multiple times,
260 // or called after using the old-style bucket iterator API.
261 LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode));
262 LocalPointer<RuleBasedCollator> coll(
263 static_cast<RuleBasedCollator *>(collatorPrimaryOnly_->clone()));
264 if (immutableBucketList.isNull() || coll.isNull()) {
265 errorCode = U_MEMORY_ALLOCATION_ERROR;
268 ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias());
269 if (immIndex == NULL) {
270 errorCode = U_MEMORY_ALLOCATION_ERROR;
273 // The ImmutableIndex adopted its parameter objects.
274 immutableBucketList.orphan();
279 int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
281 if (U_FAILURE(status)) {
284 return buckets_->getBucketCount();
288 int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
289 if (U_FAILURE(status) || inputList_ == NULL) {
292 return inputList_->size();
295 void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const {
296 const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode);
297 if (U_FAILURE(errorCode)) { return; }
299 const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0);
300 const UnicodeString &overflowBoundary =
301 *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1);
303 // We make a sorted array of elements.
304 // Some of the input may be redundant.
305 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
306 // We filter out those cases.
307 UnicodeSetIterator iter(*initialLabels_);
308 while (iter.next()) {
309 const UnicodeString *item = &iter.getString();
310 LocalPointer<UnicodeString> ownedItem;
312 int32_t itemLength = item->length();
313 if (!item->hasMoreChar32Than(0, itemLength, 1)) {
314 checkDistinct = FALSE;
315 } else if(item->charAt(itemLength - 1) == 0x2a && // '*'
316 item->charAt(itemLength - 2) != 0x2a) {
317 // Use a label if it is marked with one trailing star,
318 // even if the label string sorts the same when all contractions are suppressed.
319 ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1));
320 item = ownedItem.getAlias();
322 errorCode = U_MEMORY_ALLOCATION_ERROR;
325 checkDistinct = FALSE;
327 checkDistinct = TRUE;
329 if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {
330 // Ignore a primary-ignorable or non-alphabetic index character.
331 } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) {
332 // Ignore an index characters that will land in the overflow bucket.
333 } else if (checkDistinct &&
334 collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) {
335 // Ignore a multi-code point index character that does not sort distinctly
336 // from the sequence of its separate characters.
338 int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_);
339 if (insertionPoint < 0) {
340 indexCharacters.insertElementAt(
341 ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode);
343 const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);
344 if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) {
345 indexCharacters.setElementAt(
346 ownedString(*item, ownedItem, errorCode), insertionPoint);
351 if (U_FAILURE(errorCode)) { return; }
353 // if the result is still too large, cut down to maxCount elements, by removing every nth element
355 int32_t size = indexCharacters.size() - 1;
356 if (size > maxLabelCount_) {
359 for (int32_t i = 0; i < indexCharacters.size();) {
361 int32_t bump = count * maxLabelCount_ / size;
363 indexCharacters.removeElementAt(i);
374 const UnicodeString &fixLabel(const UnicodeString ¤t, UnicodeString &temp) {
375 if (!current.startsWith(BASE, BASE_LENGTH)) {
378 UChar rest = current.charAt(BASE_LENGTH);
379 if (0x2800 < rest && rest <= 0x28FF) { // stroke count
380 int32_t count = rest-0x2800;
381 temp.setTo((UChar)(0x30 + count % 10));
384 temp.insert(0, (UChar)(0x30 + count % 10));
387 temp.insert(0, (UChar)(0x30 + count));
390 return temp.append((UChar)0x5283);
392 return temp.setTo(current, BASE_LENGTH);
395 UBool hasMultiplePrimaryWeights(
396 CollationElementIterator &cei, int32_t variableTop,
397 const UnicodeString &s, UErrorCode &errorCode) {
398 cei.setText(s, errorCode);
399 UBool seenPrimary = FALSE;
401 int32_t ce32 = cei.next(errorCode);
402 if (ce32 == CollationElementIterator::NULLORDER) {
405 int32_t p = CollationElementIterator::primaryOrder(ce32);
406 if (p > variableTop && (ce32 & 0xc0) != 0xc0) {
407 // not primary ignorable, and not a continuation CE
419 BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {
420 // Initialize indexCharacters.
421 UVector indexCharacters(errorCode);
422 indexCharacters.setDeleter(uprv_deleteUObject);
423 initLabels(indexCharacters, errorCode);
424 if (U_FAILURE(errorCode)) { return NULL; }
426 // Variables for hasMultiplePrimaryWeights().
427 LocalPointer<CollationElementIterator> cei(
428 collatorPrimaryOnly_->createCollationElementIterator(emptyString_));
430 errorCode = U_MEMORY_ALLOCATION_ERROR;
434 if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) {
435 variableTop = CollationElementIterator::primaryOrder(
436 (int32_t)collatorPrimaryOnly_->getVariableTop(errorCode));
440 UBool hasInvisibleBuckets = FALSE;
442 // Helper arrays for Chinese Pinyin collation.
443 Bucket *asciiBuckets[26] = {
444 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
445 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
447 Bucket *pinyinBuckets[26] = {
448 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
449 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
451 UBool hasPinyin = FALSE;
453 LocalPointer<UVector> bucketList(new UVector(errorCode));
454 if (bucketList.isNull()) {
455 errorCode = U_MEMORY_ALLOCATION_ERROR;
458 bucketList->setDeleter(uprv_deleteUObject);
461 Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW);
462 if (bucket == NULL) {
463 errorCode = U_MEMORY_ALLOCATION_ERROR;
466 bucketList->addElement(bucket, errorCode);
467 if (U_FAILURE(errorCode)) { return NULL; }
471 // fix up the list, adding underflow, additions, overflow
472 // Insert inflow labels as needed.
473 int32_t scriptIndex = -1;
474 const UnicodeString *scriptUpperBoundary = &emptyString_;
475 for (int32_t i = 0; i < indexCharacters.size(); ++i) {
476 UnicodeString ¤t = *getString(indexCharacters, i);
477 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) {
478 // We crossed the script boundary into a new script.
479 const UnicodeString &inflowBoundary = *scriptUpperBoundary;
480 UBool skippedScript = FALSE;
482 scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex);
483 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) {
486 skippedScript = TRUE;
488 if (skippedScript && bucketList->size() > 1) {
489 // We are skipping one or more scripts,
490 // and we are not just getting out of the underflow label.
491 bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW);
492 if (bucket == NULL) {
493 errorCode = U_MEMORY_ALLOCATION_ERROR;
496 bucketList->addElement(bucket, errorCode);
499 // Add a bucket with the current label.
500 bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL);
501 if (bucket == NULL) {
502 errorCode = U_MEMORY_ALLOCATION_ERROR;
505 bucketList->addElement(bucket, errorCode);
506 // Remember ASCII and Pinyin buckets for Pinyin redirects.
508 if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z
509 asciiBuckets[c - 0x41] = bucket;
510 } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) &&
511 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) {
512 pinyinBuckets[c - 0x41] = bucket;
515 // Check for multiple primary weights.
516 if (!current.startsWith(BASE, BASE_LENGTH) &&
517 hasMultiplePrimaryWeights(*cei, variableTop, current, errorCode) &&
518 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) {
519 // "AE-ligature" or "Sch" etc.
520 for (int32_t i = bucketList->size() - 2;; --i) {
521 Bucket *singleBucket = getBucket(*bucketList, i);
522 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
523 // There is no single-character bucket since the last
524 // underflow or inflow label.
527 if (singleBucket->displayBucket_ == NULL &&
528 !hasMultiplePrimaryWeights(
529 *cei, variableTop, singleBucket->lowerBoundary_, errorCode)) {
530 // Add an invisible bucket that redirects strings greater than the expansion
531 // to the previous single-character bucket.
532 // For example, after ... Q R S Sch we add Sch\uFFFF->S
533 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
534 bucket = new Bucket(emptyString_,
535 UnicodeString(current).append((UChar)0xFFFF),
536 U_ALPHAINDEX_NORMAL);
537 if (bucket == NULL) {
538 errorCode = U_MEMORY_ALLOCATION_ERROR;
541 bucket->displayBucket_ = singleBucket;
542 bucketList->addElement(bucket, errorCode);
543 hasInvisibleBuckets = TRUE;
549 if (U_FAILURE(errorCode)) { return NULL; }
550 if (bucketList->size() == 1) {
551 // No real labels, show only the underflow label.
552 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
554 errorCode = U_MEMORY_ALLOCATION_ERROR;
561 bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW);
562 if (bucket == NULL) {
563 errorCode = U_MEMORY_ALLOCATION_ERROR;
566 bucketList->addElement(bucket, errorCode); // final
569 // Redirect Pinyin buckets.
570 Bucket *asciiBucket = NULL;
571 for (int32_t i = 0; i < 26; ++i) {
572 if (asciiBuckets[i] != NULL) {
573 asciiBucket = asciiBuckets[i];
575 if (pinyinBuckets[i] != NULL && asciiBucket != NULL) {
576 pinyinBuckets[i]->displayBucket_ = asciiBucket;
577 hasInvisibleBuckets = TRUE;
582 if (U_FAILURE(errorCode)) { return NULL; }
583 if (!hasInvisibleBuckets) {
584 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
586 errorCode = U_MEMORY_ALLOCATION_ERROR;
592 // Merge inflow buckets that are visually adjacent.
593 // Iterate backwards: Merge inflow into overflow rather than the other way around.
594 int32_t i = bucketList->size() - 1;
595 Bucket *nextBucket = getBucket(*bucketList, i);
597 bucket = getBucket(*bucketList, i);
598 if (bucket->displayBucket_ != NULL) {
599 continue; // skip invisible buckets
601 if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) {
602 if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
603 bucket->displayBucket_ = nextBucket;
610 LocalPointer<UVector> publicBucketList(new UVector(errorCode));
611 if (bucketList.isNull()) {
612 errorCode = U_MEMORY_ALLOCATION_ERROR;
615 // Do not call publicBucketList->setDeleter():
616 // This vector shares its objects with the bucketList.
617 for (int32_t i = 0; i < bucketList->size(); ++i) {
618 bucket = getBucket(*bucketList, i);
619 if (bucket->displayBucket_ == NULL) {
620 publicBucketList->addElement(bucket, errorCode);
623 if (U_FAILURE(errorCode)) { return NULL; }
624 BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias());
626 errorCode = U_MEMORY_ALLOCATION_ERROR;
630 publicBucketList.orphan();
635 * Creates an index, and buckets and sorts the list of records into the index.
637 void AlphabeticIndex::initBuckets(UErrorCode &errorCode) {
638 if (U_FAILURE(errorCode) || buckets_ != NULL) {
641 buckets_ = createBucketList(errorCode);
642 if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) {
646 // Sort the records by name.
647 // Stable sort preserves input order of collation duplicates.
648 inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode);
650 // Now, we traverse all of the input, which is now sorted.
651 // If the item doesn't go in the current bucket, we find the next bucket that contains it.
652 // This makes the process order n*log(n), since we just sort the list and then do a linear process.
653 // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
654 // we need to improve it for that case.
656 Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0);
657 int32_t bucketIndex = 1;
659 const UnicodeString *upperBoundary;
660 if (bucketIndex < buckets_->bucketList_->size()) {
661 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
662 upperBoundary = &nextBucket->lowerBoundary_;
665 upperBoundary = NULL;
667 for (int32_t i = 0; i < inputList_->size(); ++i) {
668 Record *r = getRecord(*inputList_, i);
669 // if the current bucket isn't the right one, find the one that is
670 // We have a special flag for the last bucket so that we don't look any further
671 while (upperBoundary != NULL &&
672 collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) {
673 currentBucket = nextBucket;
674 // now reset the boundary that we compare against
675 if (bucketIndex < buckets_->bucketList_->size()) {
676 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
677 upperBoundary = &nextBucket->lowerBoundary_;
679 upperBoundary = NULL;
682 // now put the record into the bucket.
683 Bucket *bucket = currentBucket;
684 if (bucket->displayBucket_ != NULL) {
685 bucket = bucket->displayBucket_;
687 if (bucket->records_ == NULL) {
688 bucket->records_ = new UVector(errorCode);
689 if (bucket->records_ == NULL) {
690 errorCode = U_MEMORY_ALLOCATION_ERROR;
694 bucket->records_->addElement(r, errorCode);
698 void AlphabeticIndex::clearBuckets() {
699 if (buckets_ != NULL) {
702 internalResetBucketIterator();
706 void AlphabeticIndex::internalResetBucketIterator() {
707 labelsIterIndex_ = -1;
708 currentBucket_ = NULL;
712 void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
713 if (U_FAILURE(status)) { return; }
714 // Chinese index characters, which are specific to each of the several Chinese tailorings,
715 // take precedence over the single locale data exemplar set per language.
716 const char *language = locale.getLanguage();
717 if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 ||
718 uprv_strcmp(language, "ko") == 0) {
719 // TODO: This should be done regardless of the language, but it's expensive.
720 // We should add a Collator function (can be @internal)
721 // to enumerate just the contractions that start with a given code point or string.
722 if (addChineseIndexCharacters(status) || U_FAILURE(status)) {
727 LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
728 if (U_FAILURE(status)) {
732 UnicodeSet exemplars;
733 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
734 if (U_SUCCESS(status)) {
735 initialLabels_->addAll(exemplars);
738 status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR
740 // The locale data did not include explicit Index characters.
741 // Synthesize a set of them from the locale's standard exemplar characters.
742 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
743 if (U_FAILURE(status)) {
747 // question: should we add auxiliary exemplars?
748 if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) {
749 exemplars.add(0x61, 0x7A);
751 if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables
752 // cut down to small list
753 exemplars.remove(0xAC00, 0xD7A3).
754 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
755 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
756 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
757 add(0xD30C).add(0xD558);
759 if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block
760 // cut down to small list
761 // make use of the fact that Ethiopic is allocated in 8's, where
762 // the base is 0 mod 8.
764 UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status);
765 UnicodeSetIterator it(ethiopic);
766 while (it.next() && !it.isString()) {
767 if ((it.getCodepoint() & 0x7) != 0) {
768 exemplars.remove(it.getCodepoint());
773 // Upper-case any that aren't already so.
774 // (We only do this for synthesized index characters.)
775 UnicodeSetIterator it(exemplars);
776 UnicodeString upperC;
778 const UnicodeString &exemplarC = it.getString();
780 upperC.toUpper(locale);
781 initialLabels_->add(upperC);
785 UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {
786 UnicodeSet contractions;
787 ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(),
788 contractions.toUSet(), NULL, FALSE, &errorCode);
789 if (U_FAILURE(errorCode)) { return FALSE; }
790 UnicodeString firstHanBoundary;
791 UBool hasPinyin = FALSE;
792 UnicodeSetIterator iter(contractions);
793 while (iter.next()) {
794 const UnicodeString &s = iter.getString();
795 if (s.startsWith(BASE, BASE_LENGTH)) {
796 initialLabels_->add(s);
797 if (firstHanBoundary.isEmpty() ||
798 collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode) < 0) {
799 firstHanBoundary = s;
801 UChar c = s.charAt(s.length() - 1);
802 if (0x41 <= c && c <= 0x5A) { // A-Z
808 initialLabels_->add(0x41, 0x5A); // A-Z
810 if (!firstHanBoundary.isEmpty()) {
811 // The hardcoded list of script boundaries includes U+4E00
812 // which is tailored to not be the first primary
813 // in all Chinese tailorings except "unihan".
814 // Replace U+4E00 with the first boundary string from the tailoring.
815 // TODO: This becomes obsolete when the root collator gets
816 // reliable script-first-primary mappings.
817 int32_t hanIndex = binarySearch(
818 *firstCharsInScripts_, UnicodeString((UChar)0x4E00), *collatorPrimaryOnly_);
820 UnicodeString *fh = new UnicodeString(firstHanBoundary);
822 errorCode = U_MEMORY_ALLOCATION_ERROR;
825 firstCharsInScripts_->setElementAt(fh, hanIndex);
835 * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
837 static const UChar CGJ = 0x034F;
838 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
839 UnicodeString result;
840 if (item.length() == 0) {
845 UChar32 cp = item.char32At(i);
847 i = item.moveIndex32(i, 1);
848 if (i >= item.length()) {
857 UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
862 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
867 const RuleBasedCollator &AlphabeticIndex::getCollator() const {
868 // There are no known non-RuleBasedCollator collators, and none ever expected.
869 // But, in case that changes, better a null pointer than a wrong type.
870 return *dynamic_cast<RuleBasedCollator *>(collator_);
874 const UnicodeString &AlphabeticIndex::getInflowLabel() const {
878 const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
879 return overflowLabel_;
883 const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
884 return underflowLabel_;
888 AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
889 inflowLabel_ = label;
895 AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
896 overflowLabel_ = label;
902 AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
903 underflowLabel_ = label;
909 int32_t AlphabeticIndex::getMaxLabelCount() const {
910 return maxLabelCount_;
914 AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
915 if (U_FAILURE(status)) {
918 if (maxLabelCount <= 0) {
919 status = U_ILLEGAL_ARGUMENT_ERROR;
922 maxLabelCount_ = maxLabelCount;
929 // init() - Common code for constructors.
932 void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
933 if (U_FAILURE(status)) { return; }
934 if (locale == NULL && collator_ == NULL) {
935 status = U_ILLEGAL_ARGUMENT_ERROR;
939 initialLabels_ = new UnicodeSet();
940 if (initialLabels_ == NULL) {
941 status = U_MEMORY_ALLOCATION_ERROR;
945 inflowLabel_.setTo((UChar)0x2026); // Ellipsis
946 overflowLabel_ = inflowLabel_;
947 underflowLabel_ = inflowLabel_;
949 if (collator_ == NULL) {
950 collator_ = static_cast<RuleBasedCollator *>(Collator::createInstance(*locale, status));
951 if (U_FAILURE(status)) { return; }
952 if (collator_ == NULL) {
953 status = U_MEMORY_ALLOCATION_ERROR;
957 collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone());
958 if (collatorPrimaryOnly_ == NULL) {
959 status = U_MEMORY_ALLOCATION_ERROR;
962 collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status);
963 firstCharsInScripts_ = firstStringsInScript(status);
964 if (U_FAILURE(status)) { return; }
965 firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status);
966 UnicodeString _4E00((UChar)0x4E00);
967 UnicodeString _1100((UChar)0x1100);
968 UnicodeString _1112((UChar)0x1112);
969 if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 &&
970 collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) {
971 // The standard Korean tailoring sorts Hanja (Han characters)
972 // as secondary differences from Hangul syllables.
973 // This makes U+4E00 not useful as a Han-script boundary.
974 // TODO: This becomes obsolete when the root collator gets
975 // reliable script-first-primary mappings.
976 int32_t hanIndex = binarySearch(
977 *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_);
979 firstCharsInScripts_->removeElementAt(hanIndex);
982 // Guard against a degenerate collator where
983 // some script boundary strings are primary ignorable.
985 if (U_FAILURE(status)) { return; }
986 if (firstCharsInScripts_->isEmpty()) {
987 // AlphabeticIndex requires some non-ignorable script boundary strings.
988 status = U_ILLEGAL_ARGUMENT_ERROR;
991 if (collatorPrimaryOnly_->compare(
992 *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)),
993 emptyString_, status) == UCOL_EQUAL) {
994 firstCharsInScripts_->removeElementAt(0);
1000 if (locale != NULL) {
1001 addIndexExemplars(*locale, status);
1007 // Comparison function for UVector<UnicodeString *> sorting with a collator.
1009 static int32_t U_CALLCONV
1010 collatorComparator(const void *context, const void *left, const void *right) {
1011 const UElement *leftElement = static_cast<const UElement *>(left);
1012 const UElement *rightElement = static_cast<const UElement *>(right);
1013 const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer);
1014 const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer);
1016 if (leftString == rightString) {
1017 // Catches case where both are NULL
1020 if (leftString == NULL) {
1023 if (rightString == NULL) {
1026 const Collator *col = static_cast<const Collator *>(context);
1027 UErrorCode errorCode = U_ZERO_ERROR;
1028 return col->compare(*leftString, *rightString, errorCode);
1032 // Comparison function for UVector<Record *> sorting with a collator.
1034 static int32_t U_CALLCONV
1035 recordCompareFn(const void *context, const void *left, const void *right) {
1036 const UElement *leftElement = static_cast<const UElement *>(left);
1037 const UElement *rightElement = static_cast<const UElement *>(right);
1038 const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer);
1039 const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer);
1040 const Collator *col = static_cast<const Collator *>(context);
1041 UErrorCode errorCode = U_ZERO_ERROR;
1042 return col->compare(leftRec->name_, rightRec->name_, errorCode);
1047 * This list contains one character per script that has the
1048 * lowest primary weight for that script in the root collator.
1049 * This list will be copied and sorted to account for script reordering.
1051 * <p>TODO: This is fragile. If the first character of a script is tailored
1052 * so that it does not map to the script's lowest primary weight any more,
1053 * then the buckets will be off.
1054 * There are hacks in the code to handle the known CJK tailorings of U+4E00.
1056 * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a.
1058 * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in
1059 * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java
1061 static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = {
1063 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0,
1064 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0,
1065 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0,
1066 0xAAF2, 0, // Meetei Mayek
1067 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0,
1068 U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada
1069 U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri
1071 0xD802, 0xDE00, 0, 0x0E01, 0,
1073 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0,
1074 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0,
1075 U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma
1076 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0,
1077 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0,
1078 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0,
1079 U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao
1081 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0,
1083 U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng
1084 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0,
1085 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0,
1086 U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive
1087 U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs
1089 // TODO: The overflow bucket's lowerBoundary string should be the
1090 // first item after the last reordering group in the collator's script order.
1091 // This should normally be the first Unicode code point
1092 // that is unassigned (U+0378 in Unicode 6.3) and untailored.
1093 // However, at least up to ICU 51 the Hani reordering group includes
1094 // unassigned code points,
1095 // and there is no stable string for the start of the trailing-weights range.
1096 // The only known string that sorts "high" is U+FFFF.
1097 // When ICU separates Hani vs. unassigned reordering groups, we need to fix this,
1098 // and fix relevant test code.
1099 // Ideally, FractionalUCA.txt will have a "script first primary"
1100 // for unassigned code points.
1104 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
1105 if (U_FAILURE(status)) {
1108 UVector *dest = new UVector(status);
1110 status = U_MEMORY_ALLOCATION_ERROR;
1113 dest->setDeleter(uprv_deleteUObject);
1114 const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS;
1115 const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS);
1117 if (U_FAILURE(status)) {
1120 UnicodeString *str = new UnicodeString(src, -1);
1122 status = U_MEMORY_ALLOCATION_ERROR;
1125 dest->addElement(str, status);
1126 src += str->length() + 1;
1127 } while (src < limit);
1135 * Returns true if one index character string is "better" than the other.
1136 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
1137 * better, and otherwise binary-less-than is better.
1139 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
1140 const UnicodeString &one, const UnicodeString &other) {
1141 // This is called with primary-equal strings, but never with one.equals(other).
1142 UErrorCode status = U_ZERO_ERROR;
1143 UnicodeString n1 = nfkdNormalizer.normalize(one, status);
1144 UnicodeString n2 = nfkdNormalizer.normalize(other, status);
1145 if (U_FAILURE(status)) { return FALSE; }
1146 int32_t result = n1.countChar32() - n2.countChar32();
1150 result = n1.compareCodePointOrder(n2);
1154 return one.compareCodePointOrder(other) < 0;
1160 // Constructor & Destructor for AlphabeticIndex::Record
1162 // Records are internal only, instances are not directly surfaced in the public API.
1163 // This class is mostly struct-like, with all public fields.
1165 AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data)
1166 : name_(name), data_(data) {}
1168 AlphabeticIndex::Record::~Record() {
1172 AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
1173 if (U_FAILURE(status)) {
1176 if (inputList_ == NULL) {
1177 inputList_ = new UVector(status);
1178 if (inputList_ == NULL) {
1179 status = U_MEMORY_ALLOCATION_ERROR;
1182 inputList_->setDeleter(alphaIndex_deleteRecord);
1184 Record *r = new Record(name, data);
1186 status = U_MEMORY_ALLOCATION_ERROR;
1189 inputList_->addElement(r, status);
1193 //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
1194 // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
1199 AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
1200 if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) {
1201 inputList_->removeAllElements();
1207 int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
1208 initBuckets(status);
1209 if (U_FAILURE(status)) {
1212 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status);
1216 int32_t AlphabeticIndex::getBucketIndex() const {
1217 return labelsIterIndex_;
1221 UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
1222 if (U_FAILURE(status)) {
1225 if (buckets_ == NULL && currentBucket_ != NULL) {
1226 status = U_ENUM_OUT_OF_SYNC_ERROR;
1229 initBuckets(status);
1230 if (U_FAILURE(status)) {
1234 if (labelsIterIndex_ >= buckets_->getBucketCount()) {
1235 labelsIterIndex_ = buckets_->getBucketCount();
1238 currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_);
1239 resetRecordIterator();
1243 const UnicodeString &AlphabeticIndex::getBucketLabel() const {
1244 if (currentBucket_ != NULL) {
1245 return currentBucket_->label_;
1247 return emptyString_;
1252 UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
1253 if (currentBucket_ != NULL) {
1254 return currentBucket_->labelType_;
1256 return U_ALPHAINDEX_NORMAL;
1261 int32_t AlphabeticIndex::getBucketRecordCount() const {
1262 if (currentBucket_ != NULL && currentBucket_->records_ != NULL) {
1263 return currentBucket_->records_->size();
1270 AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
1271 if (U_FAILURE(status)) {
1274 internalResetBucketIterator();
1279 UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
1280 if (U_FAILURE(status)) {
1283 if (currentBucket_ == NULL) {
1284 // We are trying to iterate over the items in a bucket, but there is no
1285 // current bucket from the enumeration of buckets.
1286 status = U_INVALID_STATE_ERROR;
1289 if (buckets_ == NULL) {
1290 status = U_ENUM_OUT_OF_SYNC_ERROR;
1293 if (currentBucket_->records_ == NULL) {
1297 if (itemsIterIndex_ >= currentBucket_->records_->size()) {
1298 itemsIterIndex_ = currentBucket_->records_->size();
1305 const UnicodeString &AlphabeticIndex::getRecordName() const {
1306 const UnicodeString *retStr = &emptyString_;
1307 if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
1308 itemsIterIndex_ >= 0 &&
1309 itemsIterIndex_ < currentBucket_->records_->size()) {
1310 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1311 retStr = &item->name_;
1316 const void *AlphabeticIndex::getRecordData() const {
1317 const void *retPtr = NULL;
1318 if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
1319 itemsIterIndex_ >= 0 &&
1320 itemsIterIndex_ < currentBucket_->records_->size()) {
1321 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1322 retPtr = item->data_;
1328 AlphabeticIndex & AlphabeticIndex::resetRecordIterator() {
1329 itemsIterIndex_ = -1;
1335 AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
1336 const UnicodeString &lowerBoundary,
1337 UAlphabeticIndexLabelType type)
1338 : label_(label), lowerBoundary_(lowerBoundary), labelType_(type),
1339 displayBucket_(NULL), displayIndex_(-1),
1344 AlphabeticIndex::Bucket::~Bucket() {