src/third_party/icu/source/i18n/alphaindex.cpp

   1 /*
   2 *******************************************************************************
   3 * Copyright (C) 2009-2013, International Business Machines Corporation and
   4 * others. All Rights Reserved.
   5 *******************************************************************************
   6 */
   7
   8 #include "unicode/utypes.h"
   9
  10 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
  11
  12 #include "unicode/alphaindex.h"
  13 #include "unicode/coleitr.h"
  14 #include "unicode/coll.h"
  15 #include "unicode/localpointer.h"
  16 #include "unicode/normalizer2.h"
  17 #include "unicode/tblcoll.h"
  18 #include "unicode/ulocdata.h"
  19 #include "unicode/uniset.h"
  20 #include "unicode/uobject.h"
  21 #include "unicode/usetiter.h"
  22 #include "unicode/utf16.h"
  23
  24 #include "cmemory.h"
  25 #include "cstring.h"
  26 #include "uassert.h"
  27 #include "uvector.h"
  28
  29 //#include <string>
  30 //#include <iostream>
  31
  32 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  33
  34 U_NAMESPACE_BEGIN
  35
  36 namespace {
  37
  38 /**
  39  * Prefix string for Chinese index buckets.
  40  * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
  41  */
  42 const UChar BASE[1] = { 0xFDD0 };
  43 const int32_t BASE_LENGTH = 1;
  44
  45 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
  46                                 const UnicodeString &one, const UnicodeString &other);
  47
  48 }  // namespace
  49
  50 static int32_t U_CALLCONV
  51 collatorComparator(const void *context, const void *left, const void *right);
  52
  53 static int32_t U_CALLCONV
  54 recordCompareFn(const void *context, const void *left, const void *right);
  55
  56 //  UVector<Record *> support function, delete a Record.
  57 static void U_CALLCONV
  58 alphaIndex_deleteRecord(void *obj) {
  59     delete static_cast<AlphabeticIndex::Record *>(obj);
  60 }
  61
  62 namespace {
  63
  64 UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned,
  65                            UErrorCode &errorCode) {
  66     if (U_FAILURE(errorCode)) { return NULL; }
  67     if (owned.isValid()) {
  68         return owned.orphan();
  69     }
  70     UnicodeString *p = new UnicodeString(s);
  71     if (p == NULL) {
  72         errorCode = U_MEMORY_ALLOCATION_ERROR;
  73     }
  74     return p;
  75 }
  76
  77 inline UnicodeString *getString(const UVector &list, int32_t i) {
  78     return static_cast<UnicodeString *>(list[i]);
  79 }
  80
  81 inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) {
  82     return static_cast<AlphabeticIndex::Bucket *>(list[i]);
  83 }
  84
  85 inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) {
  86     return static_cast<AlphabeticIndex::Record *>(list[i]);
  87 }
  88
  89 /**
  90  * Like Java Collections.binarySearch(List, String, Comparator).
  91  *
  92  * @return the index>=0 where the item was found,
  93  *         or the index<0 for inserting the string at ~index in sorted order
  94  */
  95 int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) {
  96     if (list.size() == 0) { return ~0; }
  97     int32_t start = 0;
  98     int32_t limit = list.size();
  99     for (;;) {
 100         int32_t i = (start + limit) / 2;
 101         const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i));
 102         UErrorCode errorCode = U_ZERO_ERROR;
 103         UCollationResult cmp = coll.compare(s, *si, errorCode);
 104         if (cmp == UCOL_EQUAL) {
 105             return i;
 106         } else if (cmp < 0) {
 107             if (i == start) {
 108                 return ~start;  // insert s before *si
 109             }
 110             limit = i;
 111         } else {
 112             if (i == start) {
 113                 return ~(start + 1);  // insert s after *si
 114             }
 115             start = i;
 116         }
 117     }
 118 }
 119
 120 }  // namespace
 121
 122 // The BucketList is not in the anonymous namespace because only Clang
 123 // seems to support its use in other classes from there.
 124 // However, we also don't need U_I18N_API because it is not used from outside the i18n library.
 125 class BucketList : public UObject {
 126 public:
 127     BucketList(UVector *bucketList, UVector *publicBucketList)
 128             : bucketList_(bucketList), immutableVisibleList_(publicBucketList) {
 129         int32_t displayIndex = 0;
 130         for (int32_t i = 0; i < publicBucketList->size(); ++i) {
 131             getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++;
 132         }
 133     }
 134
 135     // The virtual destructor must not be inline.
 136     // See ticket #8454 for details.
 137     virtual ~BucketList();
 138
 139     int32_t getBucketCount() const {
 140         return immutableVisibleList_->size();
 141     }
 142
 143     int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly,
 144                            UErrorCode &errorCode) {
 145         // binary search
 146         int32_t start = 0;
 147         int32_t limit = bucketList_->size();
 148         while ((start + 1) < limit) {
 149             int32_t i = (start + limit) / 2;
 150             const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i);
 151             UCollationResult nameVsBucket =
 152                 collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode);
 153             if (nameVsBucket < 0) {
 154                 limit = i;
 155             } else {
 156                 start = i;
 157             }
 158         }
 159         const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start);
 160         if (bucket->displayBucket_ != NULL) {
 161             bucket = bucket->displayBucket_;
 162         }
 163         return bucket->displayIndex_;
 164     }
 165
 166     /** All of the buckets, visible and invisible. */
 167     UVector *bucketList_;
 168     /** Just the visible buckets. */
 169     UVector *immutableVisibleList_;
 170 };
 171
 172 BucketList::~BucketList() {
 173     delete bucketList_;
 174     if (immutableVisibleList_ != bucketList_) {
 175         delete immutableVisibleList_;
 176     }
 177 }
 178
 179 AlphabeticIndex::ImmutableIndex::~ImmutableIndex() {
 180     delete buckets_;
 181     delete collatorPrimaryOnly_;
 182 }
 183
 184 int32_t
 185 AlphabeticIndex::ImmutableIndex::getBucketCount() const {
 186     return buckets_->getBucketCount();
 187 }
 188
 189 int32_t
 190 AlphabeticIndex::ImmutableIndex::getBucketIndex(
 191         const UnicodeString &name, UErrorCode &errorCode) const {
 192     return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode);
 193 }
 194
 195 const AlphabeticIndex::Bucket *
 196 AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const {
 197     if (0 <= index && index < buckets_->getBucketCount()) {
 198         return icu::getBucket(*buckets_->immutableVisibleList_, index);
 199     } else {
 200         return NULL;
 201     }
 202 }
 203
 204 AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status)
 205         : inputList_(NULL),
 206           labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
 207           maxLabelCount_(99),
 208           initialLabels_(NULL), firstCharsInScripts_(NULL),
 209           collator_(NULL), collatorPrimaryOnly_(NULL),
 210           buckets_(NULL) {
 211     init(&locale, status);
 212 }
 213
 214
 215 AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status)
 216         : inputList_(NULL),
 217           labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
 218           maxLabelCount_(99),
 219           initialLabels_(NULL), firstCharsInScripts_(NULL),
 220           collator_(collator), collatorPrimaryOnly_(NULL),
 221           buckets_(NULL) {
 222     init(NULL, status);
 223 }
 224
 225
 226
 227 AlphabeticIndex::~AlphabeticIndex() {
 228     delete collator_;
 229     delete collatorPrimaryOnly_;
 230     delete firstCharsInScripts_;
 231     delete buckets_;
 232     delete inputList_;
 233     delete initialLabels_;
 234 }
 235
 236
 237 AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
 238     if (U_FAILURE(status)) {
 239         return *this;
 240     }
 241     initialLabels_->addAll(additions);
 242     clearBuckets();
 243     return *this;
 244 }
 245
 246
 247 AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
 248     addIndexExemplars(locale, status);
 249     clearBuckets();
 250     return *this;
 251 }
 252
 253
 254 AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) {
 255     if (U_FAILURE(errorCode)) { return NULL; }
 256     // In C++, the ImmutableIndex must own its copy of the BucketList,
 257     // even if it contains no records, for proper memory management.
 258     // We could clone the buckets_ if they are not NULL,
 259     // but that would be worth it only if this method is called multiple times,
 260     // or called after using the old-style bucket iterator API.
 261     LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode));
 262     LocalPointer<RuleBasedCollator> coll(
 263         static_cast<RuleBasedCollator *>(collatorPrimaryOnly_->clone()));
 264     if (immutableBucketList.isNull() || coll.isNull()) {
 265         errorCode = U_MEMORY_ALLOCATION_ERROR;
 266         return NULL;
 267     }
 268     ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias());
 269     if (immIndex == NULL) {
 270         errorCode = U_MEMORY_ALLOCATION_ERROR;
 271         return NULL;
 272     }
 273     // The ImmutableIndex adopted its parameter objects.
 274     immutableBucketList.orphan();
 275     coll.orphan();
 276     return immIndex;
 277 }
 278
 279 int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
 280     initBuckets(status);
 281     if (U_FAILURE(status)) {
 282         return 0;
 283     }
 284     return buckets_->getBucketCount();
 285 }
 286
 287
 288 int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
 289     if (U_FAILURE(status) || inputList_ == NULL) {
 290         return 0;
 291     }
 292     return inputList_->size();
 293 }
 294
 295 void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const {
 296     const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode);
 297     if (U_FAILURE(errorCode)) { return; }
 298
 299     const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0);
 300     const UnicodeString &overflowBoundary =
 301         *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1);
 302
 303     // We make a sorted array of elements.
 304     // Some of the input may be redundant.
 305     // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
 306     // We filter out those cases.
 307     UnicodeSetIterator iter(*initialLabels_);
 308     while (iter.next()) {
 309         const UnicodeString *item = &iter.getString();
 310         LocalPointer<UnicodeString> ownedItem;
 311         UBool checkDistinct;
 312         int32_t itemLength = item->length();
 313         if (!item->hasMoreChar32Than(0, itemLength, 1)) {
 314             checkDistinct = FALSE;
 315         } else if(item->charAt(itemLength - 1) == 0x2a &&  // '*'
 316                 item->charAt(itemLength - 2) != 0x2a) {
 317             // Use a label if it is marked with one trailing star,
 318             // even if the label string sorts the same when all contractions are suppressed.
 319             ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1));
 320             item = ownedItem.getAlias();
 321             if (item == NULL) {
 322                 errorCode = U_MEMORY_ALLOCATION_ERROR;
 323                 return;
 324             }
 325             checkDistinct = FALSE;
 326         } else {
 327             checkDistinct = TRUE;
 328         }
 329         if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {
 330             // Ignore a primary-ignorable or non-alphabetic index character.
 331         } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) {
 332             // Ignore an index characters that will land in the overflow bucket.
 333         } else if (checkDistinct &&
 334                 collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) {
 335             // Ignore a multi-code point index character that does not sort distinctly
 336             // from the sequence of its separate characters.
 337         } else {
 338             int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_);
 339             if (insertionPoint < 0) {
 340                 indexCharacters.insertElementAt(
 341                     ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode);
 342             } else {
 343                 const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);
 344                 if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) {
 345                     indexCharacters.setElementAt(
 346                         ownedString(*item, ownedItem, errorCode), insertionPoint);
 347                 }
 348             }
 349         }
 350     }
 351     if (U_FAILURE(errorCode)) { return; }
 352
 353     // if the result is still too large, cut down to maxCount elements, by removing every nth element
 354
 355     int32_t size = indexCharacters.size() - 1;
 356     if (size > maxLabelCount_) {
 357         int32_t count = 0;
 358         int32_t old = -1;
 359         for (int32_t i = 0; i < indexCharacters.size();) {
 360             ++count;
 361             int32_t bump = count * maxLabelCount_ / size;
 362             if (bump == old) {
 363                 indexCharacters.removeElementAt(i);
 364             } else {
 365                 old = bump;
 366                 ++i;
 367             }
 368         }
 369     }
 370 }
 371
 372 namespace {
 373
 374 const UnicodeString &fixLabel(const UnicodeString &current, UnicodeString &temp) {
 375     if (!current.startsWith(BASE, BASE_LENGTH)) {
 376         return current;
 377     }
 378     UChar rest = current.charAt(BASE_LENGTH);
 379     if (0x2800 < rest && rest <= 0x28FF) { // stroke count
 380         int32_t count = rest-0x2800;
 381         temp.setTo((UChar)(0x30 + count % 10));
 382         if (count >= 10) {
 383             count /= 10;
 384             temp.insert(0, (UChar)(0x30 + count % 10));
 385             if (count >= 10) {
 386                 count /= 10;
 387                 temp.insert(0, (UChar)(0x30 + count));
 388             }
 389         }
 390         return temp.append((UChar)0x5283);
 391     }
 392     return temp.setTo(current, BASE_LENGTH);
 393 }
 394
 395 UBool hasMultiplePrimaryWeights(
 396         CollationElementIterator &cei, int32_t variableTop,
 397         const UnicodeString &s, UErrorCode &errorCode) {
 398     cei.setText(s, errorCode);
 399     UBool seenPrimary = FALSE;
 400     for (;;) {
 401         int32_t ce32 = cei.next(errorCode);
 402         if (ce32 == CollationElementIterator::NULLORDER) {
 403             break;
 404         }
 405         int32_t p = CollationElementIterator::primaryOrder(ce32);
 406         if (p > variableTop && (ce32 & 0xc0) != 0xc0) {
 407             // not primary ignorable, and not a continuation CE
 408             if (seenPrimary) {
 409                 return TRUE;
 410             }
 411             seenPrimary = TRUE;
 412         }
 413     }
 414     return FALSE;
 415 }
 416
 417 }  // namespace
 418
 419 BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {
 420     // Initialize indexCharacters.
 421     UVector indexCharacters(errorCode);
 422     indexCharacters.setDeleter(uprv_deleteUObject);
 423     initLabels(indexCharacters, errorCode);
 424     if (U_FAILURE(errorCode)) { return NULL; }
 425
 426     // Variables for hasMultiplePrimaryWeights().
 427     LocalPointer<CollationElementIterator> cei(
 428         collatorPrimaryOnly_->createCollationElementIterator(emptyString_));
 429     if (cei.isNull()) {
 430         errorCode = U_MEMORY_ALLOCATION_ERROR;
 431         return NULL;
 432     }
 433     int32_t variableTop;
 434     if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) {
 435         variableTop = CollationElementIterator::primaryOrder(
 436             (int32_t)collatorPrimaryOnly_->getVariableTop(errorCode));
 437     } else {
 438         variableTop = 0;
 439     }
 440     UBool hasInvisibleBuckets = FALSE;
 441
 442     // Helper arrays for Chinese Pinyin collation.
 443     Bucket *asciiBuckets[26] = {
 444         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 445         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
 446     };
 447     Bucket *pinyinBuckets[26] = {
 448         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 449         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
 450     };
 451     UBool hasPinyin = FALSE;
 452
 453     LocalPointer<UVector> bucketList(new UVector(errorCode));
 454     if (bucketList.isNull()) {
 455         errorCode = U_MEMORY_ALLOCATION_ERROR;
 456         return NULL;
 457     }
 458     bucketList->setDeleter(uprv_deleteUObject);
 459
 460     // underflow bucket
 461     Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW);
 462     if (bucket == NULL) {
 463         errorCode = U_MEMORY_ALLOCATION_ERROR;
 464         return NULL;
 465     }
 466     bucketList->addElement(bucket, errorCode);
 467     if (U_FAILURE(errorCode)) { return NULL; }
 468
 469     UnicodeString temp;
 470
 471     // fix up the list, adding underflow, additions, overflow
 472     // Insert inflow labels as needed.
 473     int32_t scriptIndex = -1;
 474     const UnicodeString *scriptUpperBoundary = &emptyString_;
 475     for (int32_t i = 0; i < indexCharacters.size(); ++i) {
 476         UnicodeString &current = *getString(indexCharacters, i);
 477         if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) {
 478             // We crossed the script boundary into a new script.
 479             const UnicodeString &inflowBoundary = *scriptUpperBoundary;
 480             UBool skippedScript = FALSE;
 481             for (;;) {
 482                 scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex);
 483                 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) {
 484                     break;
 485                 }
 486                 skippedScript = TRUE;
 487             }
 488             if (skippedScript && bucketList->size() > 1) {
 489                 // We are skipping one or more scripts,
 490                 // and we are not just getting out of the underflow label.
 491                 bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW);
 492                 if (bucket == NULL) {
 493                     errorCode = U_MEMORY_ALLOCATION_ERROR;
 494                     return NULL;
 495                 }
 496                 bucketList->addElement(bucket, errorCode);
 497             }
 498         }
 499         // Add a bucket with the current label.
 500         bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL);
 501         if (bucket == NULL) {
 502             errorCode = U_MEMORY_ALLOCATION_ERROR;
 503             return NULL;
 504         }
 505         bucketList->addElement(bucket, errorCode);
 506         // Remember ASCII and Pinyin buckets for Pinyin redirects.
 507         UChar c;
 508         if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) {  // A-Z
 509             asciiBuckets[c - 0x41] = bucket;
 510         } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) &&
 511                 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) {
 512             pinyinBuckets[c - 0x41] = bucket;
 513             hasPinyin = TRUE;
 514         }
 515         // Check for multiple primary weights.
 516         if (!current.startsWith(BASE, BASE_LENGTH) &&
 517                 hasMultiplePrimaryWeights(*cei, variableTop, current, errorCode) &&
 518                 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) {
 519             // "AE-ligature" or "Sch" etc.
 520             for (int32_t i = bucketList->size() - 2;; --i) {
 521                 Bucket *singleBucket = getBucket(*bucketList, i);
 522                 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
 523                     // There is no single-character bucket since the last
 524                     // underflow or inflow label.
 525                     break;
 526                 }
 527                 if (singleBucket->displayBucket_ == NULL &&
 528                         !hasMultiplePrimaryWeights(
 529                             *cei, variableTop, singleBucket->lowerBoundary_, errorCode)) {
 530                     // Add an invisible bucket that redirects strings greater than the expansion
 531                     // to the previous single-character bucket.
 532                     // For example, after ... Q R S Sch we add Sch\uFFFF->S
 533                     // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
 534                     bucket = new Bucket(emptyString_,
 535                         UnicodeString(current).append((UChar)0xFFFF),
 536                         U_ALPHAINDEX_NORMAL);
 537                     if (bucket == NULL) {
 538                         errorCode = U_MEMORY_ALLOCATION_ERROR;
 539                         return NULL;
 540                     }
 541                     bucket->displayBucket_ = singleBucket;
 542                     bucketList->addElement(bucket, errorCode);
 543                     hasInvisibleBuckets = TRUE;
 544                     break;
 545                 }
 546             }
 547         }
 548     }
 549     if (U_FAILURE(errorCode)) { return NULL; }
 550     if (bucketList->size() == 1) {
 551         // No real labels, show only the underflow label.
 552         BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
 553         if (bl == NULL) {
 554             errorCode = U_MEMORY_ALLOCATION_ERROR;
 555             return NULL;
 556         }
 557         bucketList.orphan();
 558         return bl;
 559     }
 560     // overflow bucket
 561     bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW);
 562     if (bucket == NULL) {
 563         errorCode = U_MEMORY_ALLOCATION_ERROR;
 564         return NULL;
 565     }
 566     bucketList->addElement(bucket, errorCode); // final
 567
 568     if (hasPinyin) {
 569         // Redirect Pinyin buckets.
 570         Bucket *asciiBucket = NULL;
 571         for (int32_t i = 0; i < 26; ++i) {
 572             if (asciiBuckets[i] != NULL) {
 573                 asciiBucket = asciiBuckets[i];
 574             }
 575             if (pinyinBuckets[i] != NULL && asciiBucket != NULL) {
 576                 pinyinBuckets[i]->displayBucket_ = asciiBucket;
 577                 hasInvisibleBuckets = TRUE;
 578             }
 579         }
 580     }
 581
 582     if (U_FAILURE(errorCode)) { return NULL; }
 583     if (!hasInvisibleBuckets) {
 584         BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
 585         if (bl == NULL) {
 586             errorCode = U_MEMORY_ALLOCATION_ERROR;
 587             return NULL;
 588         }
 589         bucketList.orphan();
 590         return bl;
 591     }
 592     // Merge inflow buckets that are visually adjacent.
 593     // Iterate backwards: Merge inflow into overflow rather than the other way around.
 594     int32_t i = bucketList->size() - 1;
 595     Bucket *nextBucket = getBucket(*bucketList, i);
 596     while (--i > 0) {
 597         bucket = getBucket(*bucketList, i);
 598         if (bucket->displayBucket_ != NULL) {
 599             continue;  // skip invisible buckets
 600         }
 601         if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) {
 602             if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
 603                 bucket->displayBucket_ = nextBucket;
 604                 continue;
 605             }
 606         }
 607         nextBucket = bucket;
 608     }
 609
 610     LocalPointer<UVector> publicBucketList(new UVector(errorCode));
 611     if (bucketList.isNull()) {
 612         errorCode = U_MEMORY_ALLOCATION_ERROR;
 613         return NULL;
 614     }
 615     // Do not call publicBucketList->setDeleter():
 616     // This vector shares its objects with the bucketList.
 617     for (int32_t i = 0; i < bucketList->size(); ++i) {
 618         bucket = getBucket(*bucketList, i);
 619         if (bucket->displayBucket_ == NULL) {
 620             publicBucketList->addElement(bucket, errorCode);
 621         }
 622     }
 623     if (U_FAILURE(errorCode)) { return NULL; }
 624     BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias());
 625     if (bl == NULL) {
 626         errorCode = U_MEMORY_ALLOCATION_ERROR;
 627         return NULL;
 628     }
 629     bucketList.orphan();
 630     publicBucketList.orphan();
 631     return bl;
 632 }
 633
 634 /**
 635  * Creates an index, and buckets and sorts the list of records into the index.
 636  */
 637 void AlphabeticIndex::initBuckets(UErrorCode &errorCode) {
 638     if (U_FAILURE(errorCode) || buckets_ != NULL) {
 639         return;
 640     }
 641     buckets_ = createBucketList(errorCode);
 642     if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) {
 643         return;
 644     }
 645
 646     // Sort the records by name.
 647     // Stable sort preserves input order of collation duplicates.
 648     inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode);
 649
 650     // Now, we traverse all of the input, which is now sorted.
 651     // If the item doesn't go in the current bucket, we find the next bucket that contains it.
 652     // This makes the process order n*log(n), since we just sort the list and then do a linear process.
 653     // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
 654     // we need to improve it for that case.
 655
 656     Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0);
 657     int32_t bucketIndex = 1;
 658     Bucket *nextBucket;
 659     const UnicodeString *upperBoundary;
 660     if (bucketIndex < buckets_->bucketList_->size()) {
 661         nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
 662         upperBoundary = &nextBucket->lowerBoundary_;
 663     } else {
 664         nextBucket = NULL;
 665         upperBoundary = NULL;
 666     }
 667     for (int32_t i = 0; i < inputList_->size(); ++i) {
 668         Record *r = getRecord(*inputList_, i);
 669         // if the current bucket isn't the right one, find the one that is
 670         // We have a special flag for the last bucket so that we don't look any further
 671         while (upperBoundary != NULL &&
 672                 collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) {
 673             currentBucket = nextBucket;
 674             // now reset the boundary that we compare against
 675             if (bucketIndex < buckets_->bucketList_->size()) {
 676                 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
 677                 upperBoundary = &nextBucket->lowerBoundary_;
 678             } else {
 679                 upperBoundary = NULL;
 680             }
 681         }
 682         // now put the record into the bucket.
 683         Bucket *bucket = currentBucket;
 684         if (bucket->displayBucket_ != NULL) {
 685             bucket = bucket->displayBucket_;
 686         }
 687         if (bucket->records_ == NULL) {
 688             bucket->records_ = new UVector(errorCode);
 689             if (bucket->records_ == NULL) {
 690                 errorCode = U_MEMORY_ALLOCATION_ERROR;
 691                 return;
 692             }
 693         }
 694         bucket->records_->addElement(r, errorCode);
 695     }
 696 }
 697
 698 void AlphabeticIndex::clearBuckets() {
 699     if (buckets_ != NULL) {
 700         delete buckets_;
 701         buckets_ = NULL;
 702         internalResetBucketIterator();
 703     }
 704 }
 705
 706 void AlphabeticIndex::internalResetBucketIterator() {
 707     labelsIterIndex_ = -1;
 708     currentBucket_ = NULL;
 709 }
 710
 711
 712 void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
 713     if (U_FAILURE(status)) { return; }
 714     // Chinese index characters, which are specific to each of the several Chinese tailorings,
 715     // take precedence over the single locale data exemplar set per language.
 716     const char *language = locale.getLanguage();
 717     if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 ||
 718             uprv_strcmp(language, "ko") == 0) {
 719         // TODO: This should be done regardless of the language, but it's expensive.
 720         // We should add a Collator function (can be @internal)
 721         // to enumerate just the contractions that start with a given code point or string.
 722         if (addChineseIndexCharacters(status) || U_FAILURE(status)) {
 723             return;
 724         }
 725     }
 726
 727     LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
 728     if (U_FAILURE(status)) {
 729         return;
 730     }
 731
 732     UnicodeSet exemplars;
 733     ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
 734     if (U_SUCCESS(status)) {
 735         initialLabels_->addAll(exemplars);
 736         return;
 737     }
 738     status = U_ZERO_ERROR;  // Clear out U_MISSING_RESOURCE_ERROR
 739
 740     // The locale data did not include explicit Index characters.
 741     // Synthesize a set of them from the locale's standard exemplar characters.
 742     ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
 743     if (U_FAILURE(status)) {
 744         return;
 745     }
 746
 747     // question: should we add auxiliary exemplars?
 748     if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) {
 749         exemplars.add(0x61, 0x7A);
 750     }
 751     if (exemplars.containsSome(0xAC00, 0xD7A3)) {  // Hangul syllables
 752         // cut down to small list
 753         exemplars.remove(0xAC00, 0xD7A3).
 754             add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
 755             add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
 756             add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
 757             add(0xD30C).add(0xD558);
 758     }
 759     if (exemplars.containsSome(0x1200, 0x137F)) {  // Ethiopic block
 760         // cut down to small list
 761         // make use of the fact that Ethiopic is allocated in 8's, where
 762         // the base is 0 mod 8.
 763         UnicodeSet ethiopic(
 764             UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status);
 765         UnicodeSetIterator it(ethiopic);
 766         while (it.next() && !it.isString()) {
 767             if ((it.getCodepoint() & 0x7) != 0) {
 768                 exemplars.remove(it.getCodepoint());
 769             }
 770         }
 771     }
 772
 773     // Upper-case any that aren't already so.
 774     //   (We only do this for synthesized index characters.)
 775     UnicodeSetIterator it(exemplars);
 776     UnicodeString upperC;
 777     while (it.next()) {
 778         const UnicodeString &exemplarC = it.getString();
 779         upperC = exemplarC;
 780         upperC.toUpper(locale);
 781         initialLabels_->add(upperC);
 782     }
 783 }
 784
 785 UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {
 786     UnicodeSet contractions;
 787     ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(),
 788                                       contractions.toUSet(), NULL, FALSE, &errorCode);
 789     if (U_FAILURE(errorCode)) { return FALSE; }
 790     UnicodeString firstHanBoundary;
 791     UBool hasPinyin = FALSE;
 792     UnicodeSetIterator iter(contractions);
 793     while (iter.next()) {
 794         const UnicodeString &s = iter.getString();
 795         if (s.startsWith(BASE, BASE_LENGTH)) {
 796             initialLabels_->add(s);
 797             if (firstHanBoundary.isEmpty() ||
 798                     collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode) < 0) {
 799                 firstHanBoundary = s;
 800             }
 801             UChar c = s.charAt(s.length() - 1);
 802             if (0x41 <= c && c <= 0x5A) {  // A-Z
 803                 hasPinyin = TRUE;
 804             }
 805         }
 806     }
 807     if (hasPinyin) {
 808         initialLabels_->add(0x41, 0x5A);  // A-Z
 809     }
 810     if (!firstHanBoundary.isEmpty()) {
 811         // The hardcoded list of script boundaries includes U+4E00
 812         // which is tailored to not be the first primary
 813         // in all Chinese tailorings except "unihan".
 814         // Replace U+4E00 with the first boundary string from the tailoring.
 815         // TODO: This becomes obsolete when the root collator gets
 816         // reliable script-first-primary mappings.
 817         int32_t hanIndex = binarySearch(
 818                 *firstCharsInScripts_, UnicodeString((UChar)0x4E00), *collatorPrimaryOnly_);
 819         if (hanIndex >= 0) {
 820             UnicodeString *fh = new UnicodeString(firstHanBoundary);
 821             if (fh == NULL) {
 822                 errorCode = U_MEMORY_ALLOCATION_ERROR;
 823                 return FALSE;
 824             }
 825             firstCharsInScripts_->setElementAt(fh, hanIndex);
 826         }
 827         return TRUE;
 828     } else {
 829         return FALSE;
 830     }
 831 }
 832
 833
 834 /*
 835  * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
 836  */
 837 static const UChar CGJ = 0x034F;
 838 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
 839     UnicodeString result;
 840     if (item.length() == 0) {
 841         return result;
 842     }
 843     int32_t i = 0;
 844     for (;;) {
 845         UChar32  cp = item.char32At(i);
 846         result.append(cp);
 847         i = item.moveIndex32(i, 1);
 848         if (i >= item.length()) {
 849             break;
 850         }
 851         result.append(CGJ);
 852     }
 853     return result;
 854 }
 855
 856
 857 UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
 858     return FALSE;
 859 }
 860
 861
 862 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
 863     return FALSE;
 864 }
 865
 866
 867 const RuleBasedCollator &AlphabeticIndex::getCollator() const {
 868     // There are no known non-RuleBasedCollator collators, and none ever expected.
 869     // But, in case that changes, better a null pointer than a wrong type.
 870     return *dynamic_cast<RuleBasedCollator *>(collator_);
 871 }
 872
 873
 874 const UnicodeString &AlphabeticIndex::getInflowLabel() const {
 875     return inflowLabel_;
 876 }
 877
 878 const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
 879     return overflowLabel_;
 880 }
 881
 882
 883 const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
 884     return underflowLabel_;
 885 }
 886
 887
 888 AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
 889     inflowLabel_ = label;
 890     clearBuckets();
 891     return *this;
 892 }
 893
 894
 895 AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
 896     overflowLabel_ = label;
 897     clearBuckets();
 898     return *this;
 899 }
 900
 901
 902 AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
 903     underflowLabel_ = label;
 904     clearBuckets();
 905     return *this;
 906 }
 907
 908
 909 int32_t AlphabeticIndex::getMaxLabelCount() const {
 910     return maxLabelCount_;
 911 }
 912
 913
 914 AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
 915     if (U_FAILURE(status)) {
 916         return *this;
 917     }
 918     if (maxLabelCount <= 0) {
 919         status = U_ILLEGAL_ARGUMENT_ERROR;
 920         return *this;
 921     }
 922     maxLabelCount_ = maxLabelCount;
 923     clearBuckets();
 924     return *this;
 925 }
 926
 927
 928 //
 929 //  init() - Common code for constructors.
 930 //
 931
 932 void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
 933     if (U_FAILURE(status)) { return; }
 934     if (locale == NULL && collator_ == NULL) {
 935         status = U_ILLEGAL_ARGUMENT_ERROR;
 936         return;
 937     }
 938
 939     initialLabels_         = new UnicodeSet();
 940     if (initialLabels_ == NULL) {
 941         status = U_MEMORY_ALLOCATION_ERROR;
 942         return;
 943     }
 944
 945     inflowLabel_.setTo((UChar)0x2026);    // Ellipsis
 946     overflowLabel_ = inflowLabel_;
 947     underflowLabel_ = inflowLabel_;
 948
 949     if (collator_ == NULL) {
 950         collator_ = static_cast<RuleBasedCollator *>(Collator::createInstance(*locale, status));
 951         if (U_FAILURE(status)) { return; }
 952         if (collator_ == NULL) {
 953             status = U_MEMORY_ALLOCATION_ERROR;
 954             return;
 955         }
 956     }
 957     collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone());
 958     if (collatorPrimaryOnly_ == NULL) {
 959         status = U_MEMORY_ALLOCATION_ERROR;
 960         return;
 961     }
 962     collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status);
 963     firstCharsInScripts_ = firstStringsInScript(status);
 964     if (U_FAILURE(status)) { return; }
 965     firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status);
 966     UnicodeString _4E00((UChar)0x4E00);
 967     UnicodeString _1100((UChar)0x1100);
 968     UnicodeString _1112((UChar)0x1112);
 969     if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 &&
 970             collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) {
 971         // The standard Korean tailoring sorts Hanja (Han characters)
 972         // as secondary differences from Hangul syllables.
 973         // This makes U+4E00 not useful as a Han-script boundary.
 974         // TODO: This becomes obsolete when the root collator gets
 975         // reliable script-first-primary mappings.
 976         int32_t hanIndex = binarySearch(
 977                 *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_);
 978         if (hanIndex >= 0) {
 979             firstCharsInScripts_->removeElementAt(hanIndex);
 980         }
 981     }
 982     // Guard against a degenerate collator where
 983     // some script boundary strings are primary ignorable.
 984     for (;;) {
 985         if (U_FAILURE(status)) { return; }
 986         if (firstCharsInScripts_->isEmpty()) {
 987             // AlphabeticIndex requires some non-ignorable script boundary strings.
 988             status = U_ILLEGAL_ARGUMENT_ERROR;
 989             return;
 990         }
 991         if (collatorPrimaryOnly_->compare(
 992                 *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)),
 993                 emptyString_, status) == UCOL_EQUAL) {
 994             firstCharsInScripts_->removeElementAt(0);
 995         } else {
 996             break;
 997         }
 998     }
 999
1000     if (locale != NULL) {
1001         addIndexExemplars(*locale, status);
1002     }
1003 }
1004
1005
1006 //
1007 //  Comparison function for UVector<UnicodeString *> sorting with a collator.
1008 //
1009 static int32_t U_CALLCONV
1010 collatorComparator(const void *context, const void *left, const void *right) {
1011     const UElement *leftElement = static_cast<const UElement *>(left);
1012     const UElement *rightElement = static_cast<const UElement *>(right);
1013     const UnicodeString *leftString  = static_cast<const UnicodeString *>(leftElement->pointer);
1014     const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer);
1015
1016     if (leftString == rightString) {
1017         // Catches case where both are NULL
1018         return 0;
1019     }
1020     if (leftString == NULL) {
1021         return 1;
1022     };
1023     if (rightString == NULL) {
1024         return -1;
1025     }
1026     const Collator *col = static_cast<const Collator *>(context);
1027     UErrorCode errorCode = U_ZERO_ERROR;
1028     return col->compare(*leftString, *rightString, errorCode);
1029 }
1030
1031 //
1032 //  Comparison function for UVector<Record *> sorting with a collator.
1033 //
1034 static int32_t U_CALLCONV
1035 recordCompareFn(const void *context, const void *left, const void *right) {
1036     const UElement *leftElement = static_cast<const UElement *>(left);
1037     const UElement *rightElement = static_cast<const UElement *>(right);
1038     const AlphabeticIndex::Record *leftRec  = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer);
1039     const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer);
1040     const Collator *col = static_cast<const Collator *>(context);
1041     UErrorCode errorCode = U_ZERO_ERROR;
1042     return col->compare(leftRec->name_, rightRec->name_, errorCode);
1043 }
1044
1045
1046 /**
1047  * This list contains one character per script that has the
1048  * lowest primary weight for that script in the root collator.
1049  * This list will be copied and sorted to account for script reordering.
1050  *
1051  * <p>TODO: This is fragile. If the first character of a script is tailored
1052  * so that it does not map to the script's lowest primary weight any more,
1053  * then the buckets will be off.
1054  * There are hacks in the code to handle the known CJK tailorings of U+4E00.
1055  *
1056  * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a.
1057  *
1058  * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in
1059  * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java
1060  */
1061 static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] =  {
1062     0x41, 0, 0x03B1, 0,
1063     0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0,
1064     0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0,
1065     0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0,
1066     0xAAF2, 0,  // Meetei Mayek
1067     0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0,
1068     U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0,  // Sharada
1069     U16_LEAD(0x11680), U16_TRAIL(0x11680), 0,  // Takri
1070     0x1B83, 0,
1071     0xD802, 0xDE00, 0, 0x0E01, 0,
1072     0x0EDE, 0,  // Lao
1073     0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0,
1074     0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0,
1075     U16_LEAD(0x11103), U16_TRAIL(0x11103), 0,  // Chakma
1076     0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0,
1077     0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0,
1078     0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0,
1079     U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0,  // Miao
1080     0xD800, 0xDE80, 0,
1081     0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0,
1082     0xD801, 0xDC80, 0,
1083     U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0,  // Sora Sompeng
1084     0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0,
1085     0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0,
1086     U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0,  // Meroitic Cursive
1087     U16_LEAD(0x10980), U16_TRAIL(0x10980), 0,  // Meroitic Hieroglyphs
1088     0x4E00, 0,
1089     // TODO: The overflow bucket's lowerBoundary string should be the
1090     // first item after the last reordering group in the collator's script order.
1091     // This should normally be the first Unicode code point
1092     // that is unassigned (U+0378 in Unicode 6.3) and untailored.
1093     // However, at least up to ICU 51 the Hani reordering group includes
1094     // unassigned code points,
1095     // and there is no stable string for the start of the trailing-weights range.
1096     // The only known string that sorts "high" is U+FFFF.
1097     // When ICU separates Hani vs. unassigned reordering groups, we need to fix this,
1098     // and fix relevant test code.
1099     // Ideally, FractionalUCA.txt will have a "script first primary"
1100     // for unassigned code points.
1101     0xFFFF, 0
1102 };
1103
1104 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
1105     if (U_FAILURE(status)) {
1106         return NULL;
1107     }
1108     UVector *dest = new UVector(status);
1109     if (dest == NULL) {
1110         status = U_MEMORY_ALLOCATION_ERROR;
1111         return NULL;
1112     }
1113     dest->setDeleter(uprv_deleteUObject);
1114     const UChar *src  = HACK_FIRST_CHARS_IN_SCRIPTS;
1115     const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS);
1116     do {
1117         if (U_FAILURE(status)) {
1118             return dest;
1119         }
1120         UnicodeString *str = new UnicodeString(src, -1);
1121         if (str == NULL) {
1122             status = U_MEMORY_ALLOCATION_ERROR;
1123             return dest;
1124         }
1125         dest->addElement(str, status);
1126         src += str->length() + 1;
1127     } while (src < limit);
1128     return dest;
1129 }
1130
1131
1132 namespace {
1133
1134 /**
1135  * Returns true if one index character string is "better" than the other.
1136  * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
1137  * better, and otherwise binary-less-than is better.
1138  */
1139 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
1140                                 const UnicodeString &one, const UnicodeString &other) {
1141     // This is called with primary-equal strings, but never with one.equals(other).
1142     UErrorCode status = U_ZERO_ERROR;
1143     UnicodeString n1 = nfkdNormalizer.normalize(one, status);
1144     UnicodeString n2 = nfkdNormalizer.normalize(other, status);
1145     if (U_FAILURE(status)) { return FALSE; }
1146     int32_t result = n1.countChar32() - n2.countChar32();
1147     if (result != 0) {
1148         return result < 0;
1149     }
1150     result = n1.compareCodePointOrder(n2);
1151     if (result != 0) {
1152         return result < 0;
1153     }
1154     return one.compareCodePointOrder(other) < 0;
1155 }
1156
1157 }  // namespace
1158
1159 //
1160 //  Constructor & Destructor for AlphabeticIndex::Record
1161 //
1162 //     Records are internal only, instances are not directly surfaced in the public API.
1163 //     This class is mostly struct-like, with all public fields.
1164
1165 AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data)
1166         : name_(name), data_(data) {}
1167
1168 AlphabeticIndex::Record::~Record() {
1169 }
1170
1171
1172 AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
1173     if (U_FAILURE(status)) {
1174         return *this;
1175     }
1176     if (inputList_ == NULL) {
1177         inputList_ = new UVector(status);
1178         if (inputList_ == NULL) {
1179             status = U_MEMORY_ALLOCATION_ERROR;
1180             return *this;
1181         }
1182         inputList_->setDeleter(alphaIndex_deleteRecord);
1183     }
1184     Record *r = new Record(name, data);
1185     if (r == NULL) {
1186         status = U_MEMORY_ALLOCATION_ERROR;
1187         return *this;
1188     }
1189     inputList_->addElement(r, status);
1190     clearBuckets();
1191     //std::string ss;
1192     //std::string ss2;
1193     //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
1194     //             "   sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
1195     return *this;
1196 }
1197
1198
1199 AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
1200     if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) {
1201         inputList_->removeAllElements();
1202         clearBuckets();
1203     }
1204     return *this;
1205 }
1206
1207 int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
1208     initBuckets(status);
1209     if (U_FAILURE(status)) {
1210         return 0;
1211     }
1212     return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status);
1213 }
1214
1215
1216 int32_t AlphabeticIndex::getBucketIndex() const {
1217     return labelsIterIndex_;
1218 }
1219
1220
1221 UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
1222     if (U_FAILURE(status)) {
1223         return FALSE;
1224     }
1225     if (buckets_ == NULL && currentBucket_ != NULL) {
1226         status = U_ENUM_OUT_OF_SYNC_ERROR;
1227         return FALSE;
1228     }
1229     initBuckets(status);
1230     if (U_FAILURE(status)) {
1231         return FALSE;
1232     }
1233     ++labelsIterIndex_;
1234     if (labelsIterIndex_ >= buckets_->getBucketCount()) {
1235         labelsIterIndex_ = buckets_->getBucketCount();
1236         return FALSE;
1237     }
1238     currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_);
1239     resetRecordIterator();
1240     return TRUE;
1241 }
1242
1243 const UnicodeString &AlphabeticIndex::getBucketLabel() const {
1244     if (currentBucket_ != NULL) {
1245         return currentBucket_->label_;
1246     } else {
1247         return emptyString_;
1248     }
1249 }
1250
1251
1252 UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
1253     if (currentBucket_ != NULL) {
1254         return currentBucket_->labelType_;
1255     } else {
1256         return U_ALPHAINDEX_NORMAL;
1257     }
1258 }
1259
1260
1261 int32_t AlphabeticIndex::getBucketRecordCount() const {
1262     if (currentBucket_ != NULL && currentBucket_->records_ != NULL) {
1263         return currentBucket_->records_->size();
1264     } else {
1265         return 0;
1266     }
1267 }
1268
1269
1270 AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
1271     if (U_FAILURE(status)) {
1272         return *this;
1273     }
1274     internalResetBucketIterator();
1275     return *this;
1276 }
1277
1278
1279 UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
1280     if (U_FAILURE(status)) {
1281         return FALSE;
1282     }
1283     if (currentBucket_ == NULL) {
1284         // We are trying to iterate over the items in a bucket, but there is no
1285         // current bucket from the enumeration of buckets.
1286         status = U_INVALID_STATE_ERROR;
1287         return FALSE;
1288     }
1289     if (buckets_ == NULL) {
1290         status = U_ENUM_OUT_OF_SYNC_ERROR;
1291         return FALSE;
1292     }
1293     if (currentBucket_->records_ == NULL) {
1294         return FALSE;
1295     }
1296     ++itemsIterIndex_;
1297     if (itemsIterIndex_ >= currentBucket_->records_->size()) {
1298         itemsIterIndex_  = currentBucket_->records_->size();
1299         return FALSE;
1300     }
1301     return TRUE;
1302 }
1303
1304
1305 const UnicodeString &AlphabeticIndex::getRecordName() const {
1306     const UnicodeString *retStr = &emptyString_;
1307     if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
1308         itemsIterIndex_ >= 0 &&
1309         itemsIterIndex_ < currentBucket_->records_->size()) {
1310             Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1311             retStr = &item->name_;
1312     }
1313     return *retStr;
1314 }
1315
1316 const void *AlphabeticIndex::getRecordData() const {
1317     const void *retPtr = NULL;
1318     if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
1319         itemsIterIndex_ >= 0 &&
1320         itemsIterIndex_ < currentBucket_->records_->size()) {
1321             Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
1322             retPtr = item->data_;
1323     }
1324     return retPtr;
1325 }
1326
1327
1328 AlphabeticIndex & AlphabeticIndex::resetRecordIterator() {
1329     itemsIterIndex_ = -1;
1330     return *this;
1331 }
1332
1333
1334
1335 AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
1336                                 const UnicodeString &lowerBoundary,
1337                                 UAlphabeticIndexLabelType type)
1338         : label_(label), lowerBoundary_(lowerBoundary), labelType_(type),
1339           displayBucket_(NULL), displayIndex_(-1),
1340           records_(NULL) {
1341 }
1342
1343
1344 AlphabeticIndex::Bucket::~Bucket() {
1345     delete records_;
1346 }
1347
1348 U_NAMESPACE_END
1349
1350 #endif