source/common/unistr.cpp

   1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 ******************************************************************************
   5 * Copyright (C) 1999-2016, International Business Machines Corporation and
   6 * others. All Rights Reserved.
   7 ******************************************************************************
   8 *
   9 * File unistr.cpp
  10 *
  11 * Modification History:
  12 *
  13 *   Date        Name        Description
  14 *   09/25/98    stephen     Creation.
  15 *   04/20/99    stephen     Overhauled per 4/16 code review.
  16 *   07/09/99    stephen     Renamed {hi,lo},{byte,word} to icu_X for HP/UX
  17 *   11/18/99    aliu        Added handleReplaceBetween() to make inherit from
  18 *                           Replaceable.
  19 *   06/25/01    grhoten     Removed the dependency on iostream
  20 ******************************************************************************
  21 */
  22
  23 #include "unicode/utypes.h"
  24 #include "unicode/appendable.h"
  25 #include "unicode/putil.h"
  26 #include "cstring.h"
  27 #include "cmemory.h"
  28 #include "unicode/ustring.h"
  29 #include "unicode/unistr.h"
  30 #include "unicode/utf.h"
  31 #include "unicode/utf16.h"
  32 #include "uelement.h"
  33 #include "ustr_imp.h"
  34 #include "umutex.h"
  35 #include "uassert.h"
  36
  37 #if 0
  38
  39 #include <iostream>
  40 using namespace std;
  41
  42 //DEBUGGING
  43 void
  44 print(const UnicodeString& s,
  45       const char *name)
  46 {
  47   UChar c;
  48   cout << name << ":|";
  49   for(int i = 0; i < s.length(); ++i) {
  50     c = s[i];
  51     if(c>= 0x007E || c < 0x0020)
  52       cout << "[0x" << hex << s[i] << "]";
  53     else
  54       cout << (char) s[i];
  55   }
  56   cout << '|' << endl;
  57 }
  58
  59 void
  60 print(const UChar *s,
  61       int32_t len,
  62       const char *name)
  63 {
  64   UChar c;
  65   cout << name << ":|";
  66   for(int i = 0; i < len; ++i) {
  67     c = s[i];
  68     if(c>= 0x007E || c < 0x0020)
  69       cout << "[0x" << hex << s[i] << "]";
  70     else
  71       cout << (char) s[i];
  72   }
  73   cout << '|' << endl;
  74 }
  75 // END DEBUGGING
  76 #endif
  77
  78 // Local function definitions for now
  79
  80 // need to copy areas that may overlap
  81 static
  82 inline void
  83 us_arrayCopy(const UChar *src, int32_t srcStart,
  84          UChar *dst, int32_t dstStart, int32_t count)
  85 {
  86   if(count>0) {
  87     uprv_memmove(dst+dstStart, src+srcStart, (size_t)count*sizeof(*src));
  88   }
  89 }
  90
  91 // u_unescapeAt() callback to get a UChar from a UnicodeString
  92 U_CDECL_BEGIN
  93 static UChar U_CALLCONV
  94 UnicodeString_charAt(int32_t offset, void *context) {
  95     return ((icu::UnicodeString*) context)->charAt(offset);
  96 }
  97 U_CDECL_END
  98
  99 U_NAMESPACE_BEGIN
 100
 101 /* The Replaceable virtual destructor can't be defined in the header
 102    due to how AIX works with multiple definitions of virtual functions.
 103 */
 104 Replaceable::~Replaceable() {}
 105
 106 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
 107
 108 UnicodeString U_EXPORT2
 109 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
 110     return
 111         UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
 112             append(s1).
 113                 append(s2);
 114 }
 115
 116 //========================================
 117 // Reference Counting functions, put at top of file so that optimizing compilers
 118 //                               have a chance to automatically inline.
 119 //========================================
 120
 121 void
 122 UnicodeString::addRef() {
 123   umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
 124 }
 125
 126 int32_t
 127 UnicodeString::removeRef() {
 128   return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
 129 }
 130
 131 int32_t
 132 UnicodeString::refCount() const {
 133   return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1));
 134 }
 135
 136 void
 137 UnicodeString::releaseArray() {
 138   if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) {
 139     uprv_free((int32_t *)fUnion.fFields.fArray - 1);
 140   }
 141 }
 142
 143
 144
 145 //========================================
 146 // Constructors
 147 //========================================
 148
 149 // The default constructor is inline in unistr.h.
 150
 151 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
 152   fUnion.fFields.fLengthAndFlags = 0;
 153   if(count <= 0 || (uint32_t)c > 0x10ffff) {
 154     // just allocate and do not do anything else
 155     allocate(capacity);
 156   } else if(c <= 0xffff) {
 157     int32_t length = count;
 158     if(capacity < length) {
 159       capacity = length;
 160     }
 161     if(allocate(capacity)) {
 162       UChar *array = getArrayStart();
 163       UChar unit = (UChar)c;
 164       for(int32_t i = 0; i < length; ++i) {
 165         array[i] = unit;
 166       }
 167       setLength(length);
 168     }
 169   } else {  // supplementary code point, write surrogate pairs
 170     if(count > (INT32_MAX / 2)) {
 171       // We would get more than 2G UChars.
 172       allocate(capacity);
 173       return;
 174     }
 175     int32_t length = count * 2;
 176     if(capacity < length) {
 177       capacity = length;
 178     }
 179     if(allocate(capacity)) {
 180       UChar *array = getArrayStart();
 181       UChar lead = U16_LEAD(c);
 182       UChar trail = U16_TRAIL(c);
 183       for(int32_t i = 0; i < length; i += 2) {
 184         array[i] = lead;
 185         array[i + 1] = trail;
 186       }
 187       setLength(length);
 188     }
 189   }
 190 }
 191
 192 UnicodeString::UnicodeString(UChar ch) {
 193   fUnion.fFields.fLengthAndFlags = kLength1 | kShortString;
 194   fUnion.fStackFields.fBuffer[0] = ch;
 195 }
 196
 197 UnicodeString::UnicodeString(UChar32 ch) {
 198   fUnion.fFields.fLengthAndFlags = kShortString;
 199   int32_t i = 0;
 200   UBool isError = FALSE;
 201   U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError);
 202   // We test isError so that the compiler does not complain that we don't.
 203   // If isError then i==0 which is what we want anyway.
 204   if(!isError) {
 205     setShortLength(i);
 206   }
 207 }
 208
 209 UnicodeString::UnicodeString(const UChar *text) {
 210   fUnion.fFields.fLengthAndFlags = kShortString;
 211   doAppend(text, 0, -1);
 212 }
 213
 214 UnicodeString::UnicodeString(const UChar *text,
 215                              int32_t textLength) {
 216   fUnion.fFields.fLengthAndFlags = kShortString;
 217   doAppend(text, 0, textLength);
 218 }
 219
 220 UnicodeString::UnicodeString(UBool isTerminated,
 221                              const UChar *text,
 222                              int32_t textLength) {
 223   fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
 224   if(text == NULL) {
 225     // treat as an empty string, do not alias
 226     setToEmpty();
 227   } else if(textLength < -1 ||
 228             (textLength == -1 && !isTerminated) ||
 229             (textLength >= 0 && isTerminated && text[textLength] != 0)
 230   ) {
 231     setToBogus();
 232   } else {
 233     if(textLength == -1) {
 234       // text is terminated, or else it would have failed the above test
 235       textLength = u_strlen(text);
 236     }
 237     setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
 238   }
 239 }
 240
 241 UnicodeString::UnicodeString(UChar *buff,
 242                              int32_t buffLength,
 243                              int32_t buffCapacity) {
 244   fUnion.fFields.fLengthAndFlags = kWritableAlias;
 245   if(buff == NULL) {
 246     // treat as an empty string, do not alias
 247     setToEmpty();
 248   } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
 249     setToBogus();
 250   } else {
 251     if(buffLength == -1) {
 252       // fLength = u_strlen(buff); but do not look beyond buffCapacity
 253       const UChar *p = buff, *limit = buff + buffCapacity;
 254       while(p != limit && *p != 0) {
 255         ++p;
 256       }
 257       buffLength = (int32_t)(p - buff);
 258     }
 259     setArray(buff, buffLength, buffCapacity);
 260   }
 261 }
 262
 263 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
 264   fUnion.fFields.fLengthAndFlags = kShortString;
 265   if(src==NULL) {
 266     // treat as an empty string
 267   } else {
 268     if(length<0) {
 269       length=(int32_t)uprv_strlen(src);
 270     }
 271     if(cloneArrayIfNeeded(length, length, FALSE)) {
 272       u_charsToUChars(src, getArrayStart(), length);
 273       setLength(length);
 274     } else {
 275       setToBogus();
 276     }
 277   }
 278 }
 279
 280 #if U_CHARSET_IS_UTF8
 281
 282 UnicodeString::UnicodeString(const char *codepageData) {
 283   fUnion.fFields.fLengthAndFlags = kShortString;
 284   if(codepageData != 0) {
 285     setToUTF8(codepageData);
 286   }
 287 }
 288
 289 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
 290   fUnion.fFields.fLengthAndFlags = kShortString;
 291   // if there's nothing to convert, do nothing
 292   if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
 293     return;
 294   }
 295   if(dataLength == -1) {
 296     dataLength = (int32_t)uprv_strlen(codepageData);
 297   }
 298   setToUTF8(StringPiece(codepageData, dataLength));
 299 }
 300
 301 // else see unistr_cnv.cpp
 302 #endif
 303
 304 UnicodeString::UnicodeString(const UnicodeString& that) {
 305   fUnion.fFields.fLengthAndFlags = kShortString;
 306   copyFrom(that);
 307 }
 308
 309 #if U_HAVE_RVALUE_REFERENCES
 310 UnicodeString::UnicodeString(UnicodeString &&src) U_NOEXCEPT {
 311   fUnion.fFields.fLengthAndFlags = kShortString;
 312   moveFrom(src);
 313 }
 314 #endif
 315
 316 UnicodeString::UnicodeString(const UnicodeString& that,
 317                              int32_t srcStart) {
 318   fUnion.fFields.fLengthAndFlags = kShortString;
 319   setTo(that, srcStart);
 320 }
 321
 322 UnicodeString::UnicodeString(const UnicodeString& that,
 323                              int32_t srcStart,
 324                              int32_t srcLength) {
 325   fUnion.fFields.fLengthAndFlags = kShortString;
 326   setTo(that, srcStart, srcLength);
 327 }
 328
 329 // Replaceable base class clone() default implementation, does not clone
 330 Replaceable *
 331 Replaceable::clone() const {
 332   return NULL;
 333 }
 334
 335 // UnicodeString overrides clone() with a real implementation
 336 Replaceable *
 337 UnicodeString::clone() const {
 338   return new UnicodeString(*this);
 339 }
 340
 341 //========================================
 342 // array allocation
 343 //========================================
 344
 345 namespace {
 346
 347 const int32_t kGrowSize = 128;
 348
 349 // The number of bytes for one int32_t reference counter and capacity UChars
 350 // must fit into a 32-bit size_t (at least when on a 32-bit platform).
 351 // We also add one for the NUL terminator, to avoid reallocation in getTerminatedBuffer(),
 352 // and round up to a multiple of 16 bytes.
 353 // This means that capacity must be at most (0xfffffff0 - 4) / 2 - 1 = 0x7ffffff5.
 354 // (With more complicated checks we could go up to 0x7ffffffd without rounding up,
 355 // but that does not seem worth it.)
 356 const int32_t kMaxCapacity = 0x7ffffff5;
 357
 358 int32_t getGrowCapacity(int32_t newLength) {
 359   int32_t growSize = (newLength >> 2) + kGrowSize;
 360   if(growSize <= (kMaxCapacity - newLength)) {
 361     return newLength + growSize;
 362   } else {
 363     return kMaxCapacity;
 364   }
 365 }
 366
 367 }  // namespace
 368
 369 UBool
 370 UnicodeString::allocate(int32_t capacity) {
 371   if(capacity <= US_STACKBUF_SIZE) {
 372     fUnion.fFields.fLengthAndFlags = kShortString;
 373     return TRUE;
 374   }
 375   if(capacity <= kMaxCapacity) {
 376     ++capacity;  // for the NUL
 377     // Switch to size_t which is unsigned so that we can allocate up to 4GB.
 378     // Reference counter + UChars.
 379     size_t numBytes = sizeof(int32_t) + (size_t)capacity * U_SIZEOF_UCHAR;
 380     // Round up to a multiple of 16.
 381     numBytes = (numBytes + 15) & ~15;
 382     int32_t *array = (int32_t *) uprv_malloc(numBytes);
 383     if(array != NULL) {
 384       // set initial refCount and point behind the refCount
 385       *array++ = 1;
 386       numBytes -= sizeof(int32_t);
 387
 388       // have fArray point to the first UChar
 389       fUnion.fFields.fArray = (UChar *)array;
 390       fUnion.fFields.fCapacity = (int32_t)(numBytes / U_SIZEOF_UCHAR);
 391       fUnion.fFields.fLengthAndFlags = kLongString;
 392       return TRUE;
 393     }
 394   }
 395   fUnion.fFields.fLengthAndFlags = kIsBogus;
 396   fUnion.fFields.fArray = 0;
 397   fUnion.fFields.fCapacity = 0;
 398   return FALSE;
 399 }
 400
 401 //========================================
 402 // Destructor
 403 //========================================
 404
 405 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
 406 static u_atomic_int32_t finalLengthCounts[0x400];  // UnicodeString::kMaxShortLength+1
 407 static u_atomic_int32_t beyondCount(0);
 408
 409 U_CAPI void unistr_printLengths() {
 410   int32_t i;
 411   for(i = 0; i <= 59; ++i) {
 412     printf("%2d,  %9d\n", i, (int32_t)finalLengthCounts[i]);
 413   }
 414   int32_t beyond = beyondCount;
 415   for(; i < UPRV_LENGTHOF(finalLengthCounts); ++i) {
 416     beyond += finalLengthCounts[i];
 417   }
 418   printf(">59, %9d\n", beyond);
 419 }
 420 #endif
 421
 422 UnicodeString::~UnicodeString()
 423 {
 424 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
 425   // Count lengths of strings at the end of their lifetime.
 426   // Useful for discussion of a desirable stack buffer size.
 427   // Count the contents length, not the optional NUL terminator nor further capacity.
 428   // Ignore open-buffer strings and strings which alias external storage.
 429   if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kReadonlyAlias|kWritableAlias)) == 0) {
 430     if(hasShortLength()) {
 431       umtx_atomic_inc(finalLengthCounts + getShortLength());
 432     } else {
 433       umtx_atomic_inc(&beyondCount);
 434     }
 435   }
 436 #endif
 437
 438   releaseArray();
 439 }
 440
 441 //========================================
 442 // Factory methods
 443 //========================================
 444
 445 UnicodeString UnicodeString::fromUTF8(StringPiece utf8) {
 446   UnicodeString result;
 447   result.setToUTF8(utf8);
 448   return result;
 449 }
 450
 451 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
 452   UnicodeString result;
 453   int32_t capacity;
 454   // Most UTF-32 strings will be BMP-only and result in a same-length
 455   // UTF-16 string. We overestimate the capacity just slightly,
 456   // just in case there are a few supplementary characters.
 457   if(length <= US_STACKBUF_SIZE) {
 458     capacity = US_STACKBUF_SIZE;
 459   } else {
 460     capacity = length + (length >> 4) + 4;
 461   }
 462   do {
 463     UChar *utf16 = result.getBuffer(capacity);
 464     int32_t length16;
 465     UErrorCode errorCode = U_ZERO_ERROR;
 466     u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
 467         utf32, length,
 468         0xfffd,  // Substitution character.
 469         NULL,    // Don't care about number of substitutions.
 470         &errorCode);
 471     result.releaseBuffer(length16);
 472     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
 473       capacity = length16 + 1;  // +1 for the terminating NUL.
 474       continue;
 475     } else if(U_FAILURE(errorCode)) {
 476       result.setToBogus();
 477     }
 478     break;
 479   } while(TRUE);
 480   return result;
 481 }
 482
 483 //========================================
 484 // Assignment
 485 //========================================
 486
 487 UnicodeString &
 488 UnicodeString::operator=(const UnicodeString &src) {
 489   return copyFrom(src);
 490 }
 491
 492 UnicodeString &
 493 UnicodeString::fastCopyFrom(const UnicodeString &src) {
 494   return copyFrom(src, TRUE);
 495 }
 496
 497 UnicodeString &
 498 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
 499   // if assigning to ourselves, do nothing
 500   if(this == &src) {
 501     return *this;
 502   }
 503
 504   // is the right side bogus?
 505   if(src.isBogus()) {
 506     setToBogus();
 507     return *this;
 508   }
 509
 510   // delete the current contents
 511   releaseArray();
 512
 513   if(src.isEmpty()) {
 514     // empty string - use the stack buffer
 515     setToEmpty();
 516     return *this;
 517   }
 518
 519   // fLength>0 and not an "open" src.getBuffer(minCapacity)
 520   fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
 521   switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) {
 522   case kShortString:
 523     // short string using the stack buffer, do the same
 524     uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
 525                 getShortLength() * U_SIZEOF_UCHAR);
 526     break;
 527   case kLongString:
 528     // src uses a refCounted string buffer, use that buffer with refCount
 529     // src is const, use a cast - we don't actually change it
 530     ((UnicodeString &)src).addRef();
 531     // copy all fields, share the reference-counted buffer
 532     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
 533     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
 534     if(!hasShortLength()) {
 535       fUnion.fFields.fLength = src.fUnion.fFields.fLength;
 536     }
 537     break;
 538   case kReadonlyAlias:
 539     if(fastCopy) {
 540       // src is a readonly alias, do the same
 541       // -> maintain the readonly alias as such
 542       fUnion.fFields.fArray = src.fUnion.fFields.fArray;
 543       fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
 544       if(!hasShortLength()) {
 545         fUnion.fFields.fLength = src.fUnion.fFields.fLength;
 546       }
 547       break;
 548     }
 549     // else if(!fastCopy) fall through to case kWritableAlias
 550     // -> allocate a new buffer and copy the contents
 551     U_FALLTHROUGH;
 552   case kWritableAlias: {
 553     // src is a writable alias; we make a copy of that instead
 554     int32_t srcLength = src.length();
 555     if(allocate(srcLength)) {
 556       u_memcpy(getArrayStart(), src.getArrayStart(), srcLength);
 557       setLength(srcLength);
 558       break;
 559     }
 560     // if there is not enough memory, then fall through to setting to bogus
 561     U_FALLTHROUGH;
 562   }
 563   default:
 564     // if src is bogus, set ourselves to bogus
 565     // do not call setToBogus() here because fArray and flags are not consistent here
 566     fUnion.fFields.fLengthAndFlags = kIsBogus;
 567     fUnion.fFields.fArray = 0;
 568     fUnion.fFields.fCapacity = 0;
 569     break;
 570   }
 571
 572   return *this;
 573 }
 574
 575 UnicodeString &UnicodeString::moveFrom(UnicodeString &src) U_NOEXCEPT {
 576   // No explicit check for self move assignment, consistent with standard library.
 577   // Self move assignment causes no crash nor leak but might make the object bogus.
 578   releaseArray();
 579   copyFieldsFrom(src, TRUE);
 580   return *this;
 581 }
 582
 583 // Same as moveFrom() except without memory management.
 584 void UnicodeString::copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) U_NOEXCEPT {
 585   int16_t lengthAndFlags = fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
 586   if(lengthAndFlags & kUsingStackBuffer) {
 587     // Short string using the stack buffer, copy the contents.
 588     // Check for self assignment to prevent "overlap in memcpy" warnings,
 589     // although it should be harmless to copy a buffer to itself exactly.
 590     if(this != &src) {
 591       uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
 592                   getShortLength() * U_SIZEOF_UCHAR);
 593     }
 594   } else {
 595     // In all other cases, copy all fields.
 596     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
 597     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
 598     if(!hasShortLength()) {
 599       fUnion.fFields.fLength = src.fUnion.fFields.fLength;
 600     }
 601     if(setSrcToBogus) {
 602       // Set src to bogus without releasing any memory.
 603       src.fUnion.fFields.fLengthAndFlags = kIsBogus;
 604       src.fUnion.fFields.fArray = NULL;
 605       src.fUnion.fFields.fCapacity = 0;
 606     }
 607   }
 608 }
 609
 610 void UnicodeString::swap(UnicodeString &other) U_NOEXCEPT {
 611   UnicodeString temp;  // Empty short string: Known not to need releaseArray().
 612   // Copy fields without resetting source values in between.
 613   temp.copyFieldsFrom(*this, FALSE);
 614   this->copyFieldsFrom(other, FALSE);
 615   other.copyFieldsFrom(temp, FALSE);
 616   // Set temp to an empty string so that other's memory is not released twice.
 617   temp.fUnion.fFields.fLengthAndFlags = kShortString;
 618 }
 619
 620 //========================================
 621 // Miscellaneous operations
 622 //========================================
 623
 624 UnicodeString UnicodeString::unescape() const {
 625     UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
 626     if (result.isBogus()) {
 627         return result;
 628     }
 629     const UChar *array = getBuffer();
 630     int32_t len = length();
 631     int32_t prev = 0;
 632     for (int32_t i=0;;) {
 633         if (i == len) {
 634             result.append(array, prev, len - prev);
 635             break;
 636         }
 637         if (array[i++] == 0x5C /*'\\'*/) {
 638             result.append(array, prev, (i - 1) - prev);
 639             UChar32 c = unescapeAt(i); // advances i
 640             if (c < 0) {
 641                 result.remove(); // return empty string
 642                 break; // invalid escape sequence
 643             }
 644             result.append(c);
 645             prev = i;
 646         }
 647     }
 648     return result;
 649 }
 650
 651 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
 652     return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
 653 }
 654
 655 //========================================
 656 // Read-only implementation
 657 //========================================
 658 UBool
 659 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
 660   // Requires: this & text not bogus and have same lengths.
 661   // Byte-wise comparison works for equality regardless of endianness.
 662   return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
 663 }
 664
 665 int8_t
 666 UnicodeString::doCompare( int32_t start,
 667               int32_t length,
 668               const UChar *srcChars,
 669               int32_t srcStart,
 670               int32_t srcLength) const
 671 {
 672   // compare illegal string values
 673   if(isBogus()) {
 674     return -1;
 675   }
 676
 677   // pin indices to legal values
 678   pinIndices(start, length);
 679
 680   if(srcChars == NULL) {
 681     // treat const UChar *srcChars==NULL as an empty string
 682     return length == 0 ? 0 : 1;
 683   }
 684
 685   // get the correct pointer
 686   const UChar *chars = getArrayStart();
 687
 688   chars += start;
 689   srcChars += srcStart;
 690
 691   int32_t minLength;
 692   int8_t lengthResult;
 693
 694   // get the srcLength if necessary
 695   if(srcLength < 0) {
 696     srcLength = u_strlen(srcChars + srcStart);
 697   }
 698
 699   // are we comparing different lengths?
 700   if(length != srcLength) {
 701     if(length < srcLength) {
 702       minLength = length;
 703       lengthResult = -1;
 704     } else {
 705       minLength = srcLength;
 706       lengthResult = 1;
 707     }
 708   } else {
 709     minLength = length;
 710     lengthResult = 0;
 711   }
 712
 713   /*
 714    * note that uprv_memcmp() returns an int but we return an int8_t;
 715    * we need to take care not to truncate the result -
 716    * one way to do this is to right-shift the value to
 717    * move the sign bit into the lower 8 bits and making sure that this
 718    * does not become 0 itself
 719    */
 720
 721   if(minLength > 0 && chars != srcChars) {
 722     int32_t result;
 723
 724 #   if U_IS_BIG_ENDIAN
 725       // big-endian: byte comparison works
 726       result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
 727       if(result != 0) {
 728         return (int8_t)(result >> 15 | 1);
 729       }
 730 #   else
 731       // little-endian: compare UChar units
 732       do {
 733         result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
 734         if(result != 0) {
 735           return (int8_t)(result >> 15 | 1);
 736         }
 737       } while(--minLength > 0);
 738 #   endif
 739   }
 740   return lengthResult;
 741 }
 742
 743 /* String compare in code point order - doCompare() compares in code unit order. */
 744 int8_t
 745 UnicodeString::doCompareCodePointOrder(int32_t start,
 746                                        int32_t length,
 747                                        const UChar *srcChars,
 748                                        int32_t srcStart,
 749                                        int32_t srcLength) const
 750 {
 751   // compare illegal string values
 752   // treat const UChar *srcChars==NULL as an empty string
 753   if(isBogus()) {
 754     return -1;
 755   }
 756
 757   // pin indices to legal values
 758   pinIndices(start, length);
 759
 760   if(srcChars == NULL) {
 761     srcStart = srcLength = 0;
 762   }
 763
 764   int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
 765   /* translate the 32-bit result into an 8-bit one */
 766   if(diff!=0) {
 767     return (int8_t)(diff >> 15 | 1);
 768   } else {
 769     return 0;
 770   }
 771 }
 772
 773 int32_t
 774 UnicodeString::getLength() const {
 775     return length();
 776 }
 777
 778 UChar
 779 UnicodeString::getCharAt(int32_t offset) const {
 780   return charAt(offset);
 781 }
 782
 783 UChar32
 784 UnicodeString::getChar32At(int32_t offset) const {
 785   return char32At(offset);
 786 }
 787
 788 UChar32
 789 UnicodeString::char32At(int32_t offset) const
 790 {
 791   int32_t len = length();
 792   if((uint32_t)offset < (uint32_t)len) {
 793     const UChar *array = getArrayStart();
 794     UChar32 c;
 795     U16_GET(array, 0, offset, len, c);
 796     return c;
 797   } else {
 798     return kInvalidUChar;
 799   }
 800 }
 801
 802 int32_t
 803 UnicodeString::getChar32Start(int32_t offset) const {
 804   if((uint32_t)offset < (uint32_t)length()) {
 805     const UChar *array = getArrayStart();
 806     U16_SET_CP_START(array, 0, offset);
 807     return offset;
 808   } else {
 809     return 0;
 810   }
 811 }
 812
 813 int32_t
 814 UnicodeString::getChar32Limit(int32_t offset) const {
 815   int32_t len = length();
 816   if((uint32_t)offset < (uint32_t)len) {
 817     const UChar *array = getArrayStart();
 818     U16_SET_CP_LIMIT(array, 0, offset, len);
 819     return offset;
 820   } else {
 821     return len;
 822   }
 823 }
 824
 825 int32_t
 826 UnicodeString::countChar32(int32_t start, int32_t length) const {
 827   pinIndices(start, length);
 828   // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
 829   return u_countChar32(getArrayStart()+start, length);
 830 }
 831
 832 UBool
 833 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
 834   pinIndices(start, length);
 835   // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
 836   return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
 837 }
 838
 839 int32_t
 840 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
 841   // pin index
 842   int32_t len = length();
 843   if(index<0) {
 844     index=0;
 845   } else if(index>len) {
 846     index=len;
 847   }
 848
 849   const UChar *array = getArrayStart();
 850   if(delta>0) {
 851     U16_FWD_N(array, index, len, delta);
 852   } else {
 853     U16_BACK_N(array, 0, index, -delta);
 854   }
 855
 856   return index;
 857 }
 858
 859 void
 860 UnicodeString::doExtract(int32_t start,
 861              int32_t length,
 862              UChar *dst,
 863              int32_t dstStart) const
 864 {
 865   // pin indices to legal values
 866   pinIndices(start, length);
 867
 868   // do not copy anything if we alias dst itself
 869   const UChar *array = getArrayStart();
 870   if(array + start != dst + dstStart) {
 871     us_arrayCopy(array, start, dst, dstStart, length);
 872   }
 873 }
 874
 875 int32_t
 876 UnicodeString::extract(UChar *dest, int32_t destCapacity,
 877                        UErrorCode &errorCode) const {
 878   int32_t len = length();
 879   if(U_SUCCESS(errorCode)) {
 880     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
 881       errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 882     } else {
 883       const UChar *array = getArrayStart();
 884       if(len>0 && len<=destCapacity && array!=dest) {
 885         u_memcpy(dest, array, len);
 886       }
 887       return u_terminateUChars(dest, destCapacity, len, &errorCode);
 888     }
 889   }
 890
 891   return len;
 892 }
 893
 894 int32_t
 895 UnicodeString::extract(int32_t start,
 896                        int32_t length,
 897                        char *target,
 898                        int32_t targetCapacity,
 899                        enum EInvariant) const
 900 {
 901   // if the arguments are illegal, then do nothing
 902   if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
 903     return 0;
 904   }
 905
 906   // pin the indices to legal values
 907   pinIndices(start, length);
 908
 909   if(length <= targetCapacity) {
 910     u_UCharsToChars(getArrayStart() + start, target, length);
 911   }
 912   UErrorCode status = U_ZERO_ERROR;
 913   return u_terminateChars(target, targetCapacity, length, &status);
 914 }
 915
 916 UnicodeString
 917 UnicodeString::tempSubString(int32_t start, int32_t len) const {
 918   pinIndices(start, len);
 919   const UChar *array = getBuffer();  // not getArrayStart() to check kIsBogus & kOpenGetBuffer
 920   if(array==NULL) {
 921     array=fUnion.fStackFields.fBuffer;  // anything not NULL because that would make an empty string
 922     len=-2;  // bogus result string
 923   }
 924   return UnicodeString(FALSE, array + start, len);
 925 }
 926
 927 int32_t
 928 UnicodeString::toUTF8(int32_t start, int32_t len,
 929                       char *target, int32_t capacity) const {
 930   pinIndices(start, len);
 931   int32_t length8;
 932   UErrorCode errorCode = U_ZERO_ERROR;
 933   u_strToUTF8WithSub(target, capacity, &length8,
 934                      getBuffer() + start, len,
 935                      0xFFFD,  // Standard substitution character.
 936                      NULL,    // Don't care about number of substitutions.
 937                      &errorCode);
 938   return length8;
 939 }
 940
 941 #if U_CHARSET_IS_UTF8
 942
 943 int32_t
 944 UnicodeString::extract(int32_t start, int32_t len,
 945                        char *target, uint32_t dstSize) const {
 946   // if the arguments are illegal, then do nothing
 947   if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
 948     return 0;
 949   }
 950   return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
 951 }
 952
 953 // else see unistr_cnv.cpp
 954 #endif
 955
 956 void
 957 UnicodeString::extractBetween(int32_t start,
 958                   int32_t limit,
 959                   UnicodeString& target) const {
 960   pinIndex(start);
 961   pinIndex(limit);
 962   doExtract(start, limit - start, target);
 963 }
 964
 965 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
 966 // as many bytes as the source has UChars.
 967 // The "worst cases" are writing systems like Indic, Thai and CJK with
 968 // 3:1 bytes:UChars.
 969 void
 970 UnicodeString::toUTF8(ByteSink &sink) const {
 971   int32_t length16 = length();
 972   if(length16 != 0) {
 973     char stackBuffer[1024];
 974     int32_t capacity = (int32_t)sizeof(stackBuffer);
 975     UBool utf8IsOwned = FALSE;
 976     char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
 977                                       3*length16,
 978                                       stackBuffer, capacity,
 979                                       &capacity);
 980     int32_t length8 = 0;
 981     UErrorCode errorCode = U_ZERO_ERROR;
 982     u_strToUTF8WithSub(utf8, capacity, &length8,
 983                        getBuffer(), length16,
 984                        0xFFFD,  // Standard substitution character.
 985                        NULL,    // Don't care about number of substitutions.
 986                        &errorCode);
 987     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
 988       utf8 = (char *)uprv_malloc(length8);
 989       if(utf8 != NULL) {
 990         utf8IsOwned = TRUE;
 991         errorCode = U_ZERO_ERROR;
 992         u_strToUTF8WithSub(utf8, length8, &length8,
 993                            getBuffer(), length16,
 994                            0xFFFD,  // Standard substitution character.
 995                            NULL,    // Don't care about number of substitutions.
 996                            &errorCode);
 997       } else {
 998         errorCode = U_MEMORY_ALLOCATION_ERROR;
 999       }
1000     }
1001     if(U_SUCCESS(errorCode)) {
1002       sink.Append(utf8, length8);
1003       sink.Flush();
1004     }
1005     if(utf8IsOwned) {
1006       uprv_free(utf8);
1007     }
1008   }
1009 }
1010
1011 int32_t
1012 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
1013   int32_t length32=0;
1014   if(U_SUCCESS(errorCode)) {
1015     // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
1016     u_strToUTF32WithSub(utf32, capacity, &length32,
1017         getBuffer(), length(),
1018         0xfffd,  // Substitution character.
1019         NULL,    // Don't care about number of substitutions.
1020         &errorCode);
1021   }
1022   return length32;
1023 }
1024
1025 int32_t
1026 UnicodeString::indexOf(const UChar *srcChars,
1027                int32_t srcStart,
1028                int32_t srcLength,
1029                int32_t start,
1030                int32_t length) const
1031 {
1032   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1033     return -1;
1034   }
1035
1036   // UnicodeString does not find empty substrings
1037   if(srcLength < 0 && srcChars[srcStart] == 0) {
1038     return -1;
1039   }
1040
1041   // get the indices within bounds
1042   pinIndices(start, length);
1043
1044   // find the first occurrence of the substring
1045   const UChar *array = getArrayStart();
1046   const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
1047   if(match == NULL) {
1048     return -1;
1049   } else {
1050     return (int32_t)(match - array);
1051   }
1052 }
1053
1054 int32_t
1055 UnicodeString::doIndexOf(UChar c,
1056              int32_t start,
1057              int32_t length) const
1058 {
1059   // pin indices
1060   pinIndices(start, length);
1061
1062   // find the first occurrence of c
1063   const UChar *array = getArrayStart();
1064   const UChar *match = u_memchr(array + start, c, length);
1065   if(match == NULL) {
1066     return -1;
1067   } else {
1068     return (int32_t)(match - array);
1069   }
1070 }
1071
1072 int32_t
1073 UnicodeString::doIndexOf(UChar32 c,
1074                          int32_t start,
1075                          int32_t length) const {
1076   // pin indices
1077   pinIndices(start, length);
1078
1079   // find the first occurrence of c
1080   const UChar *array = getArrayStart();
1081   const UChar *match = u_memchr32(array + start, c, length);
1082   if(match == NULL) {
1083     return -1;
1084   } else {
1085     return (int32_t)(match - array);
1086   }
1087 }
1088
1089 int32_t
1090 UnicodeString::lastIndexOf(const UChar *srcChars,
1091                int32_t srcStart,
1092                int32_t srcLength,
1093                int32_t start,
1094                int32_t length) const
1095 {
1096   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1097     return -1;
1098   }
1099
1100   // UnicodeString does not find empty substrings
1101   if(srcLength < 0 && srcChars[srcStart] == 0) {
1102     return -1;
1103   }
1104
1105   // get the indices within bounds
1106   pinIndices(start, length);
1107
1108   // find the last occurrence of the substring
1109   const UChar *array = getArrayStart();
1110   const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
1111   if(match == NULL) {
1112     return -1;
1113   } else {
1114     return (int32_t)(match - array);
1115   }
1116 }
1117
1118 int32_t
1119 UnicodeString::doLastIndexOf(UChar c,
1120                  int32_t start,
1121                  int32_t length) const
1122 {
1123   if(isBogus()) {
1124     return -1;
1125   }
1126
1127   // pin indices
1128   pinIndices(start, length);
1129
1130   // find the last occurrence of c
1131   const UChar *array = getArrayStart();
1132   const UChar *match = u_memrchr(array + start, c, length);
1133   if(match == NULL) {
1134     return -1;
1135   } else {
1136     return (int32_t)(match - array);
1137   }
1138 }
1139
1140 int32_t
1141 UnicodeString::doLastIndexOf(UChar32 c,
1142                              int32_t start,
1143                              int32_t length) const {
1144   // pin indices
1145   pinIndices(start, length);
1146
1147   // find the last occurrence of c
1148   const UChar *array = getArrayStart();
1149   const UChar *match = u_memrchr32(array + start, c, length);
1150   if(match == NULL) {
1151     return -1;
1152   } else {
1153     return (int32_t)(match - array);
1154   }
1155 }
1156
1157 //========================================
1158 // Write implementation
1159 //========================================
1160
1161 UnicodeString&
1162 UnicodeString::findAndReplace(int32_t start,
1163                   int32_t length,
1164                   const UnicodeString& oldText,
1165                   int32_t oldStart,
1166                   int32_t oldLength,
1167                   const UnicodeString& newText,
1168                   int32_t newStart,
1169                   int32_t newLength)
1170 {
1171   if(isBogus() || oldText.isBogus() || newText.isBogus()) {
1172     return *this;
1173   }
1174
1175   pinIndices(start, length);
1176   oldText.pinIndices(oldStart, oldLength);
1177   newText.pinIndices(newStart, newLength);
1178
1179   if(oldLength == 0) {
1180     return *this;
1181   }
1182
1183   while(length > 0 && length >= oldLength) {
1184     int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1185     if(pos < 0) {
1186       // no more oldText's here: done
1187       break;
1188     } else {
1189       // we found oldText, replace it by newText and go beyond it
1190       replace(pos, oldLength, newText, newStart, newLength);
1191       length -= pos + oldLength - start;
1192       start = pos + newLength;
1193     }
1194   }
1195
1196   return *this;
1197 }
1198
1199
1200 void
1201 UnicodeString::setToBogus()
1202 {
1203   releaseArray();
1204
1205   fUnion.fFields.fLengthAndFlags = kIsBogus;
1206   fUnion.fFields.fArray = 0;
1207   fUnion.fFields.fCapacity = 0;
1208 }
1209
1210 // turn a bogus string into an empty one
1211 void
1212 UnicodeString::unBogus() {
1213   if(fUnion.fFields.fLengthAndFlags & kIsBogus) {
1214     setToEmpty();
1215   }
1216 }
1217
// Returns a pointer to the string contents with a NUL code unit at index length().
//
// Returns 0 if the string is not writable (see isWritable()), or NULL if the
// buffer had to be cloned to make room for the NUL and that failed.
//
// The NUL is written in place when there is spare capacity and the buffer is
// neither a read-only alias nor shared; otherwise the array is cloned with
// room for one more code unit.
const UChar *
UnicodeString::getTerminatedBuffer() {
  if(!isWritable()) {
    return 0;
  }
  UChar *array = getArrayStart();
  int32_t len = length();
  if(len < getCapacity()) {
    if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) {
      // If len<capacity on a read-only alias, then array[len] is
      // either the original NUL (if constructed with (TRUE, s, length))
      // or one of the original string contents characters (if later truncated),
      // therefore we can assume that array[len] is initialized memory.
      if(array[len] == 0) {
        return array;
      }
      // Otherwise fall through to the clone below: a read-only buffer is never written to.
    } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) {
      // kRefCounted: Do not write the NUL if the buffer is shared.
      // That is mostly safe, except when the length of one copy was modified
      // without copy-on-write, e.g., via truncate(newLength) or remove(void).
      // Then the NUL would be written into the middle of another copy's string.

      // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
      // Do not test if there is a NUL already because it might be uninitialized memory.
      // (That would be safe, but tools like valgrind & Purify would complain.)
      array[len] = 0;
      return array;
    }
  }
  // No room, or the buffer must not be modified in place:
  // get an array with capacity for len+1 (guarding the +1 against overflow).
  if(len<INT32_MAX && cloneArrayIfNeeded(len+1)) {
    array = getArrayStart();
    array[len] = 0;
    return array;
  } else {
    return NULL;
  }
}
1255
1256 // setTo() analogous to the readonly-aliasing constructor with the same signature
1257 UnicodeString &
1258 UnicodeString::setTo(UBool isTerminated,
1259                      const UChar *text,
1260                      int32_t textLength)
1261 {
1262   if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1263     // do not modify a string that has an "open" getBuffer(minCapacity)
1264     return *this;
1265   }
1266
1267   if(text == NULL) {
1268     // treat as an empty string, do not alias
1269     releaseArray();
1270     setToEmpty();
1271     return *this;
1272   }
1273
1274   if( textLength < -1 ||
1275       (textLength == -1 && !isTerminated) ||
1276       (textLength >= 0 && isTerminated && text[textLength] != 0)
1277   ) {
1278     setToBogus();
1279     return *this;
1280   }
1281
1282   releaseArray();
1283
1284   if(textLength == -1) {
1285     // text is terminated, or else it would have failed the above test
1286     textLength = u_strlen(text);
1287   }
1288   fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
1289   setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
1290   return *this;
1291 }
1292
1293 // setTo() analogous to the writable-aliasing constructor with the same signature
1294 UnicodeString &
1295 UnicodeString::setTo(UChar *buffer,
1296                      int32_t buffLength,
1297                      int32_t buffCapacity) {
1298   if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1299     // do not modify a string that has an "open" getBuffer(minCapacity)
1300     return *this;
1301   }
1302
1303   if(buffer == NULL) {
1304     // treat as an empty string, do not alias
1305     releaseArray();
1306     setToEmpty();
1307     return *this;
1308   }
1309
1310   if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1311     setToBogus();
1312     return *this;
1313   } else if(buffLength == -1) {
1314     // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1315     const UChar *p = buffer, *limit = buffer + buffCapacity;
1316     while(p != limit && *p != 0) {
1317       ++p;
1318     }
1319     buffLength = (int32_t)(p - buffer);
1320   }
1321
1322   releaseArray();
1323
1324   fUnion.fFields.fLengthAndFlags = kWritableAlias;
1325   setArray(buffer, buffLength, buffCapacity);
1326   return *this;
1327 }
1328
1329 UnicodeString &UnicodeString::setToUTF8(StringPiece utf8) {
1330   unBogus();
1331   int32_t length = utf8.length();
1332   int32_t capacity;
1333   // The UTF-16 string will be at most as long as the UTF-8 string.
1334   if(length <= US_STACKBUF_SIZE) {
1335     capacity = US_STACKBUF_SIZE;
1336   } else {
1337     capacity = length + 1;  // +1 for the terminating NUL.
1338   }
1339   UChar *utf16 = getBuffer(capacity);
1340   int32_t length16;
1341   UErrorCode errorCode = U_ZERO_ERROR;
1342   u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1343       utf8.data(), length,
1344       0xfffd,  // Substitution character.
1345       NULL,    // Don't care about number of substitutions.
1346       &errorCode);
1347   releaseBuffer(length16);
1348   if(U_FAILURE(errorCode)) {
1349     setToBogus();
1350   }
1351   return *this;
1352 }
1353
1354 UnicodeString&
1355 UnicodeString::setCharAt(int32_t offset,
1356              UChar c)
1357 {
1358   int32_t len = length();
1359   if(cloneArrayIfNeeded() && len > 0) {
1360     if(offset < 0) {
1361       offset = 0;
1362     } else if(offset >= len) {
1363       offset = len - 1;
1364     }
1365
1366     getArrayStart()[offset] = c;
1367   }
1368   return *this;
1369 }
1370
1371 UnicodeString&
1372 UnicodeString::replace(int32_t start,
1373                int32_t _length,
1374                UChar32 srcChar) {
1375   UChar buffer[U16_MAX_LENGTH];
1376   int32_t count = 0;
1377   UBool isError = FALSE;
1378   U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1379   // We test isError so that the compiler does not complain that we don't.
1380   // If isError (srcChar is not a valid code point) then count==0 which means
1381   // we remove the source segment rather than replacing it with srcChar.
1382   return doReplace(start, _length, buffer, 0, isError ? 0 : count);
1383 }
1384
1385 UnicodeString&
1386 UnicodeString::append(UChar32 srcChar) {
1387   UChar buffer[U16_MAX_LENGTH];
1388   int32_t _length = 0;
1389   UBool isError = FALSE;
1390   U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1391   // We test isError so that the compiler does not complain that we don't.
1392   // If isError then _length==0 which turns the doAppend() into a no-op anyway.
1393   return isError ? *this : doAppend(buffer, 0, _length);
1394 }
1395
1396 UnicodeString&
1397 UnicodeString::doReplace( int32_t start,
1398               int32_t length,
1399               const UnicodeString& src,
1400               int32_t srcStart,
1401               int32_t srcLength)
1402 {
1403   // pin the indices to legal values
1404   src.pinIndices(srcStart, srcLength);
1405
1406   // get the characters from src
1407   // and replace the range in ourselves with them
1408   return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1409 }
1410
// Core replace implementation: replaces the range [start, start+length) of this
// string with srcLength code units from srcChars, starting at srcStart.
//
// - Does nothing if the string is not writable (see isWritable()).
// - srcChars == 0 is treated as an empty source; srcLength < 0 means that
//   srcChars + srcStart is NUL-terminated.
// - Pure removals at the start or end of a read-only alias only adjust the
//   array pointer/length/capacity and never copy.
// - A replace at the very end is forwarded to doAppend().
// - If the resulting length would overflow int32_t, the string is set to bogus.
// - If the array cannot be cloned/grown, returns without replacing
//   (cloneArrayIfNeeded() reports the failure by returning FALSE).
//
// @return *this
UnicodeString&
UnicodeString::doReplace(int32_t start,
             int32_t length,
             const UChar *srcChars,
             int32_t srcStart,
             int32_t srcLength)
{
  if(!isWritable()) {
    return *this;
  }

  int32_t oldLength = this->length();

  // optimize (read-only alias).remove(0, start) and .remove(start, end)
  if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) {
    if(start == 0) {
      // remove prefix by adjusting the array pointer
      pinIndex(length);
      fUnion.fFields.fArray += length;
      fUnion.fFields.fCapacity -= length;
      setLength(oldLength - length);
      return *this;
    } else {
      pinIndex(start);
      if(length >= (oldLength - start)) {
        // remove suffix by reducing the length (like truncate())
        setLength(start);
        fUnion.fFields.fCapacity = start;  // not NUL-terminated any more
        return *this;
      }
      // a removal from the middle falls through to the general code below
    }
  }

  // replacing at the end is an append; doAppend() is optimized for that case
  if(start == oldLength) {
    return doAppend(srcChars, srcStart, srcLength);
  }

  if(srcChars == 0) {
    srcStart = srcLength = 0;
  } else if(srcLength < 0) {
    // get the srcLength if necessary
    srcLength = u_strlen(srcChars + srcStart);
  }

  // pin the indices to legal values
  pinIndices(start, length);

  // Calculate the size of the string after the replace.
  // Avoid int32_t overflow.
  int32_t newLength = oldLength - length;
  if(srcLength > (INT32_MAX - newLength)) {
    setToBogus();
    return *this;
  }
  newLength += srcLength;

  // cloneArrayIfNeeded(doCopyArray=FALSE) may change fArray but will not copy the current contents;
  // therefore we need to keep the current fArray
  UChar oldStackBuffer[US_STACKBUF_SIZE];
  UChar *oldArray;
  if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
    // copy the stack buffer contents because it will be overwritten with
    // fUnion.fFields values
    u_memcpy(oldStackBuffer, fUnion.fStackFields.fBuffer, oldLength);
    oldArray = oldStackBuffer;
  } else {
    oldArray = getArrayStart();
  }

  // clone our array and allocate a bigger array if needed
  // (bufferToDelete receives the old heap buffer, if any, so that we free it ourselves below)
  int32_t *bufferToDelete = 0;
  if(!cloneArrayIfNeeded(newLength, getGrowCapacity(newLength),
                         FALSE, &bufferToDelete)
  ) {
    return *this;
  }

  // now do the replace

  UChar *newArray = getArrayStart();
  if(newArray != oldArray) {
    // if fArray changed, then we need to copy everything except what will change
    us_arrayCopy(oldArray, 0, newArray, 0, start);
    us_arrayCopy(oldArray, start + length,
                 newArray, start + srcLength,
                 oldLength - (start + length));
  } else if(length != srcLength) {
    // fArray did not change; copy only the portion that isn't changing, leaving a hole
    us_arrayCopy(oldArray, start + length,
                 newArray, start + srcLength,
                 oldLength - (start + length));
  }

  // now fill in the hole with the new string
  us_arrayCopy(srcChars, srcStart, newArray, start, srcLength);

  setLength(newLength);

  // delayed delete in case srcChars == fArray when we started, and
  // to keep oldArray alive for the above operations
  if (bufferToDelete) {
    uprv_free(bufferToDelete);
  }

  return *this;
}
1517
1518 // Versions of doReplace() only for append() variants.
1519 // doReplace() and doAppend() optimize for different cases.
1520
1521 UnicodeString&
1522 UnicodeString::doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength) {
1523   if(srcLength == 0) {
1524     return *this;
1525   }
1526
1527   // pin the indices to legal values
1528   src.pinIndices(srcStart, srcLength);
1529   return doAppend(src.getArrayStart(), srcStart, srcLength);
1530 }
1531
1532 UnicodeString&
1533 UnicodeString::doAppend(const UChar *srcChars, int32_t srcStart, int32_t srcLength) {
1534   if(!isWritable() || srcLength == 0 || srcChars == NULL) {
1535     return *this;
1536   }
1537
1538   if(srcLength < 0) {
1539     // get the srcLength if necessary
1540     if((srcLength = u_strlen(srcChars + srcStart)) == 0) {
1541       return *this;
1542     }
1543   }
1544
1545   int32_t oldLength = length();
1546   int32_t newLength = oldLength + srcLength;
1547   // optimize append() onto a large-enough, owned string
1548   if((newLength <= getCapacity() && isBufferWritable()) ||
1549       cloneArrayIfNeeded(newLength, getGrowCapacity(newLength))) {
1550     UChar *newArray = getArrayStart();
1551     // Do not copy characters when
1552     //   UChar *buffer=str.getAppendBuffer(...);
1553     // is followed by
1554     //   str.append(buffer, length);
1555     // or
1556     //   str.appendString(buffer, length)
1557     // or similar.
1558     if(srcChars + srcStart != newArray + oldLength) {
1559       us_arrayCopy(srcChars, srcStart, newArray, oldLength, srcLength);
1560     }
1561     setLength(newLength);
1562   }
1563   return *this;
1564 }
1565
1566 /**
1567  * Replaceable API
1568  */
1569 void
1570 UnicodeString::handleReplaceBetween(int32_t start,
1571                                     int32_t limit,
1572                                     const UnicodeString& text) {
1573     replaceBetween(start, limit, text);
1574 }
1575
1576 /**
1577  * Replaceable API
1578  */
1579 void
1580 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1581     if (limit <= start) {
1582         return; // Nothing to do; avoid bogus malloc call
1583     }
1584     UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1585     // Check to make sure text is not null.
1586     if (text != NULL) {
1587             extractBetween(start, limit, text, 0);
1588             insert(dest, text, 0, limit - start);
1589             uprv_free(text);
1590     }
1591 }
1592
1593 /**
1594  * Replaceable API
1595  *
1596  * NOTE: This is for the Replaceable class.  There is no rep.cpp,
1597  * so we implement this function here.
1598  */
1599 UBool Replaceable::hasMetaData() const {
1600     return TRUE;
1601 }
1602
1603 /**
1604  * Replaceable API
1605  */
1606 UBool UnicodeString::hasMetaData() const {
1607     return FALSE;
1608 }
1609
// Reverses the order of the code points in the range [start, start+length).
//
// Works in two passes: first the code units are reversed in place, which also
// reverses the two halves of every surrogate pair; then, if any lead surrogate
// was seen, every (trail, lead) sequence in the range is swapped back.
//
// Does nothing if the (pinned) range has fewer than two code units or if the
// array cannot be made writable.
//
// @return *this
UnicodeString&
UnicodeString::doReverse(int32_t start, int32_t length) {
  // check the cheap length test first so that trivial calls do not clone the array
  if(length <= 1 || !cloneArrayIfNeeded()) {
    return *this;
  }

  // pin the indices to legal values
  pinIndices(start, length);
  if(length <= 1) {  // pinIndices() might have shrunk the length
    return *this;
  }

  UChar *left = getArrayStart() + start;
  UChar *right = left + length - 1;  // -1 for inclusive boundary (length>=2)
  UChar swap;
  UBool hasSupplementary = FALSE;

  // Before the loop we know left<right because length>=2.
  // Swap the outermost code units and move inwards,
  // noting whether either one is a lead surrogate.
  do {
    hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
    hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
    *right-- = swap;
  } while(left < right);
  // Make sure to test the middle code unit of an odd-length string.
  // Redundant if the length is even.
  hasSupplementary |= (UBool)U16_IS_LEAD(*left);

  /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
  if(hasSupplementary) {
    UChar swap2;

    left = getArrayStart() + start;
    right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
    while(left < right) {
      // after the reversal, a former (lead, trail) pair appears as (trail, lead)
      if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
        *left++ = swap2;
        *left++ = swap;
      } else {
        ++left;
      }
    }
  }

  return *this;
}
1655
1656 UBool
1657 UnicodeString::padLeading(int32_t targetLength,
1658                           UChar padChar)
1659 {
1660   int32_t oldLength = length();
1661   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1662     return FALSE;
1663   } else {
1664     // move contents up by padding width
1665     UChar *array = getArrayStart();
1666     int32_t start = targetLength - oldLength;
1667     us_arrayCopy(array, 0, array, start, oldLength);
1668
1669     // fill in padding character
1670     while(--start >= 0) {
1671       array[start] = padChar;
1672     }
1673     setLength(targetLength);
1674     return TRUE;
1675   }
1676 }
1677
1678 UBool
1679 UnicodeString::padTrailing(int32_t targetLength,
1680                            UChar padChar)
1681 {
1682   int32_t oldLength = length();
1683   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1684     return FALSE;
1685   } else {
1686     // fill in padding character
1687     UChar *array = getArrayStart();
1688     int32_t length = targetLength;
1689     while(--length >= oldLength) {
1690       array[length] = padChar;
1691     }
1692     setLength(targetLength);
1693     return TRUE;
1694   }
1695 }
1696
1697 //========================================
1698 // Hashing
1699 //========================================
1700 int32_t
1701 UnicodeString::doHashCode() const
1702 {
1703     /* Delegate hash computation to uhash.  This makes UnicodeString
1704      * hashing consistent with UChar* hashing.  */
1705     int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
1706     if (hashCode == kInvalidHashCode) {
1707         hashCode = kEmptyHashCode;
1708     }
1709     return hashCode;
1710 }
1711
1712 //========================================
1713 // External Buffer
1714 //========================================
1715
1716 UChar *
1717 UnicodeString::getBuffer(int32_t minCapacity) {
1718   if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1719     fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer;
1720     setZeroLength();
1721     return getArrayStart();
1722   } else {
1723     return 0;
1724   }
1725 }
1726
1727 void
1728 UnicodeString::releaseBuffer(int32_t newLength) {
1729   if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) {
1730     // set the new fLength
1731     int32_t capacity=getCapacity();
1732     if(newLength==-1) {
1733       // the new length is the string length, capped by fCapacity
1734       const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1735       while(p<limit && *p!=0) {
1736         ++p;
1737       }
1738       newLength=(int32_t)(p-array);
1739     } else if(newLength>capacity) {
1740       newLength=capacity;
1741     }
1742     setLength(newLength);
1743     fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer;
1744   }
1745 }
1746
1747 //========================================
1748 // Miscellaneous
1749 //========================================
// Makes sure that this string has its own writable array with a capacity of at
// least newCapacity, allocating a new array if necessary.
//
// @param newCapacity     required capacity; -1 means the current capacity
// @param growCapacity    preferred capacity for a new allocation; a negative
//                        value means "same as newCapacity". If allocating
//                        growCapacity fails, newCapacity is tried instead.
// @param doCopyArray     if TRUE, the old contents are copied into a new array;
//                        if FALSE, a new array starts with length zero
// @param pBufferToDelete if not 0 and the old array was ref-counted and this
//                        was its last reference, the old allocation is stored
//                        here for the caller to free, instead of being freed
// @param forceClone      if TRUE, a new array is allocated unconditionally
// @return TRUE if the string is writable with the requested capacity;
//         FALSE if the string is not writable (see isWritable()) or if memory
//         could not be allocated, in which case the string is set to bogus
UBool
UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
                                  int32_t growCapacity,
                                  UBool doCopyArray,
                                  int32_t **pBufferToDelete,
                                  UBool forceClone) {
  // default parameters need to be static, therefore
  // the defaults are -1 to have convenience defaults
  if(newCapacity == -1) {
    newCapacity = getCapacity();
  }

  // while a getBuffer(minCapacity) is "open",
  // prevent any modifications of the string by returning FALSE here
  // if the string is bogus, then only an assignment or similar can revive it
  if(!isWritable()) {
    return FALSE;
  }

  /*
   * We need to make a copy of the array if
   * the buffer is read-only, or
   * the buffer is refCounted (shared), and refCount>1, or
   * the buffer is too small.
   * Return FALSE if memory could not be allocated.
   */
  if(forceClone ||
     fUnion.fFields.fLengthAndFlags & kBufferIsReadonly ||
     (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) ||
     newCapacity > getCapacity()
  ) {
    // check growCapacity for default value and use of the stack buffer
    if(growCapacity < 0) {
      growCapacity = newCapacity;
    } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
      growCapacity = US_STACKBUF_SIZE;
    }

    // save old values
    UChar oldStackBuffer[US_STACKBUF_SIZE];
    UChar *oldArray;
    int32_t oldLength = length();
    int16_t flags = fUnion.fFields.fLengthAndFlags;

    if(flags&kUsingStackBuffer) {
      U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutually exclusive */
      if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
        // copy the stack buffer contents because it will be overwritten with
        // fUnion.fFields values
        us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength);
        oldArray = oldStackBuffer;
      } else {
        oldArray = NULL; // no need to copy from the stack buffer to itself
      }
    } else {
      oldArray = fUnion.fFields.fArray;
      U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
    }

    // allocate a new array
    // (try the preferred growCapacity first, then fall back to the minimum newCapacity)
    if(allocate(growCapacity) ||
       (newCapacity < growCapacity && allocate(newCapacity))
    ) {
      if(doCopyArray) {
        // copy the contents
        // do not copy more than what fits - it may be smaller than before
        int32_t minLength = oldLength;
        newCapacity = getCapacity();
        if(newCapacity < minLength) {
          minLength = newCapacity;
        }
        if(oldArray != NULL) {
          us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
        }
        setLength(minLength);
      } else {
        setZeroLength();
      }

      // release the old array
      if(flags & kRefCounted) {
        // the array is refCounted; decrement and release if 0
        // (the reference counter is stored immediately before the array)
        u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1);
        if(umtx_atomic_dec(pRefCount) == 0) {
          if(pBufferToDelete == 0) {
              // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
              // is defined as volatile. (Volatile has useful non-standard behavior
              //   with this compiler.)
            uprv_free((void *)pRefCount);
          } else {
            // the caller requested to delete it themselves
            *pBufferToDelete = (int32_t *)pRefCount;
          }
        }
      }
    } else {
      // not enough memory for growCapacity and not even for the smaller newCapacity
      // reset the old values for setToBogus() to release the array
      if(!(flags&kUsingStackBuffer)) {
        fUnion.fFields.fArray = oldArray;
      }
      fUnion.fFields.fLengthAndFlags = flags;
      setToBogus();
      return FALSE;
    }
  }
  return TRUE;
}
1858
1859 // UnicodeStringAppendable ------------------------------------------------- ***
1860
1861 UnicodeStringAppendable::~UnicodeStringAppendable() {}
1862
1863 UBool
1864 UnicodeStringAppendable::appendCodeUnit(UChar c) {
1865   return str.doAppend(&c, 0, 1).isWritable();
1866 }
1867
1868 UBool
1869 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1870   UChar buffer[U16_MAX_LENGTH];
1871   int32_t cLength = 0;
1872   UBool isError = FALSE;
1873   U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1874   return !isError && str.doAppend(buffer, 0, cLength).isWritable();
1875 }
1876
1877 UBool
1878 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
1879   return str.doAppend(s, 0, length).isWritable();
1880 }
1881
1882 UBool
1883 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1884   return str.cloneArrayIfNeeded(str.length() + appendCapacity);
1885 }
1886
1887 UChar *
1888 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1889                                          int32_t desiredCapacityHint,
1890                                          UChar *scratch, int32_t scratchCapacity,
1891                                          int32_t *resultCapacity) {
1892   if(minCapacity < 1 || scratchCapacity < minCapacity) {
1893     *resultCapacity = 0;
1894     return NULL;
1895   }
1896   int32_t oldLength = str.length();
1897   if(minCapacity <= (kMaxCapacity - oldLength) &&
1898       desiredCapacityHint <= (kMaxCapacity - oldLength) &&
1899       str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1900     *resultCapacity = str.getCapacity() - oldLength;
1901     return str.getArrayStart() + oldLength;
1902   }
1903   *resultCapacity = scratchCapacity;
1904   return scratch;
1905 }
1906
1907 U_NAMESPACE_END
1908
1909 U_NAMESPACE_USE
1910
1911 U_CAPI int32_t U_EXPORT2
1912 uhash_hashUnicodeString(const UElement key) {
1913     const UnicodeString *str = (const UnicodeString*) key.pointer;
1914     return (str == NULL) ? 0 : str->hashCode();
1915 }
1916
1917 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1918 // does not depend on hashtable code.
1919 U_CAPI UBool U_EXPORT2
1920 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1921     const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
1922     const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
1923     if (str1 == str2) {
1924         return TRUE;
1925     }
1926     if (str1 == NULL || str2 == NULL) {
1927         return FALSE;
1928     }
1929     return *str1 == *str2;
1930 }
1931
1932 #ifdef U_STATIC_IMPLEMENTATION
1933 /*
1934 This should never be called. It is defined here to make sure that the
1935 virtual vector deleting destructor is defined within unistr.cpp.
1936 The vector deleting destructor is already a part of UObject,
1937 but defining it here makes sure that it is included with this object file.
1938 This makes sure that static library dependencies are kept to a minimum.
1939 */
1940 static void uprv_UnicodeStringDummy(void) {
1941     delete [] (new UnicodeString[2]);
1942 }
1943 #endif