2 ******************************************************************************
3 * Copyright (C) 1999-2013, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ******************************************************************************
9 * Modification History:
11 * Date Name Description
12 * 09/25/98 stephen Creation.
13 * 04/20/99 stephen Overhauled per 4/16 code review.
14 * 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX
15 * 11/18/99 aliu Added handleReplaceBetween() to make inherit from
17 * 06/25/01 grhoten Removed the dependency on iostream
18 ******************************************************************************
21 #include "unicode/utypes.h"
22 #include "unicode/appendable.h"
23 #include "unicode/putil.h"
26 #include "unicode/ustring.h"
27 #include "unicode/unistr.h"
28 #include "unicode/utf.h"
29 #include "unicode/utf16.h"
// Debug-print helpers (fragments: parts of both bodies are not visible in this excerpt).
// Visible logic: walk the code units and, for every unit c that is >= 0x007E
// or < 0x0020, write it to cout in hexadecimal wrapped in [0x...].
// NOTE(review): c is declared on an elided line - presumably c = s[i]; confirm.
42 print(const UnicodeString& s,
47 for(int i = 0; i < s.length(); ++i) {
49 if(c>= 0x007E || c < 0x0020)
50 cout << "[0x" << hex << s[i] << "]";
// Second helper (signature not visible): same escaping rule, bounded by an explicit len.
64 for(int i = 0; i < len; ++i) {
66 if(c>= 0x007E || c < 0x0020)
67 cout << "[0x" << hex << s[i] << "]";
76 // Local function definitions for now
78 // need to copy areas that may overlap
// Copies count UChars from src[srcStart..] to dst[dstStart..].
//   src, srcStart - source array and first source index
//   dst, dstStart - destination array and first destination index
//   count         - number of UChars to move
// The byte count passed to uprv_memmove() is count * sizeof(UChar).
// NOTE(review): any guard on count is on an elided line; confirm non-positive counts are skipped.
81 us_arrayCopy(const UChar *src, int32_t srcStart,
82 UChar *dst, int32_t dstStart, int32_t count)
85 uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
89 // u_unescapeAt() callback to get a UChar from a UnicodeString
//   offset  - index of the code unit to read
//   context - opaque pointer that is cast back to icu::UnicodeString*
// Returns whatever charAt(offset) returns on that string.
91 static UChar U_CALLCONV
92 UnicodeString_charAt(int32_t offset, void *context) {
93 return ((icu::UnicodeString*) context)->charAt(offset);
// Out-of-line, empty definition of the Replaceable destructor.
// NOTE(review): the closing delimiter of the block comment below is not visible in this excerpt.
99 /* The Replaceable virtual destructor can't be defined in the header
100 due to how AIX works with multiple definitions of virtual functions.
102 Replaceable::~Replaceable() {}
// Invokes the RTTI implementation macro for UnicodeString.
// NOTE(review): the macro is defined elsewhere; its expansion is not visible here.
104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
// Concatenation operator for two UnicodeStrings.
// Starts from a temporary constructed with capacity s1.length()+s2.length()+1
// and a fill count of 0, so the capacity is reserved but nothing is filled in.
// The expression continues on elided lines (note the trailing '.') -
// presumably chained appends of s1 and s2; confirm.
106 UnicodeString U_EXPORT2
107 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
109 UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
114 //========================================
115 // Reference Counting functions, put at top of file so that optimizing compilers
116 // have a chance to automatically inline.
117 //========================================
// Atomically increments the reference counter, which is the 32-bit word
// stored immediately before the first UChar of fUnion.fFields.fArray.
120 UnicodeString::addRef() {
121 umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
// Atomically decrements the reference counter located just before fArray
// and returns the value produced by umtx_atomic_dec().
125 UnicodeString::removeRef() {
126 return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
// Returns the current reference count, read with umtx_loadAcquire() from
// the 32-bit word just before fArray.
130 UnicodeString::refCount() const {
131 return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1));
// Drops this string's reference to a ref-counted buffer.
// Only acts when kRefCounted is set; if removeRef() reports 0 the allocation
// is freed. The freed pointer is fArray - 1 (in int32_t units) because the
// counter word precedes the characters.
135 UnicodeString::releaseArray() {
136 if((fFlags & kRefCounted) && removeRef() == 0) {
137 uprv_free((int32_t *)fUnion.fFields.fArray - 1);
143 //========================================
145 //========================================
147 // The default constructor is inline in unistr.h.
// Constructs a string with at least the given capacity, optionally filled
// with count copies of code point c.
//   capacity - requested capacity in UChars (raised to the fill length if smaller)
//   c        - code point to fill with; values above 0x10ffff take the "no fill" path
//   count    - number of copies of c; <= 0 takes the "no fill" path
149 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count)
153 if(count <= 0 || (uint32_t)c > 0x10ffff) {
154 // just allocate and do not do anything else
157 // count > 0, allocate and fill the new string with count c's
// NOTE(review): count * unitCount is an unchecked 32-bit multiplication;
// confirm callers cannot pass a count large enough to overflow.
158 int32_t unitCount = U16_LENGTH(c), length = count * unitCount;
159 if(capacity < length) {
162 if(allocate(capacity)) {
163 UChar *array = getArrayStart();
166 // fill the new string with c
168 // fill with length UChars
170 array[i++] = (UChar)c;
173 // get the code units for c
174 UChar units[U16_MAX_LENGTH];
175 U16_APPEND_UNSAFE(units, i, c);
177 // now it must be i==unitCount
180 // for UTF-16, unitCount can only be 1 or 2
181 // 1 is handled above
// copy the code units of c repeatedly until the array is filled
184 while(unitIdx < unitCount) {
185 array[i++]=units[unitIdx++];
// Constructs a string from a single UTF-16 code unit.
// The visible code stores ch as unit 0 of the in-object stack buffer;
// the member initializers (length, flags) are on elided lines.
194 UnicodeString::UnicodeString(UChar ch)
198 fUnion.fStackBuffer[0] = ch;
// Constructs a string from a single code point.
// ch is encoded into the stack buffer with U16_APPEND; the resulting unit
// count i becomes the short length (0 when ch is not encodable, per the
// comment below).
201 UnicodeString::UnicodeString(UChar32 ch)
206 UBool isError = FALSE;
207 U16_APPEND(fUnion.fStackBuffer, i, US_STACKBUF_SIZE, ch, isError);
208 // We test isError so that the compiler does not complain that we don't.
209 // If isError then i==0 which is what we want anyway.
211 fShortLength = (int8_t)i;
// Constructs a string by copying text.
// Implemented as a replace of the empty range [0, 0) with text; the source
// length of -1 asks doReplace() to measure text with u_strlen().
215 UnicodeString::UnicodeString(const UChar *text)
219 doReplace(0, 0, text, 0, -1);
// Constructs a string by copying textLength UChars from text.
// Implemented as a replace of the empty range [0, 0) with the given units.
222 UnicodeString::UnicodeString(const UChar *text,
227 doReplace(0, 0, text, 0, textLength);
// Read-only aliasing constructor: the string refers to the caller's text
// instead of copying it (fFlags starts as kReadonlyAlias).
//   isTerminated - caller's claim that text[textLength] is a NUL
//   text         - characters to alias
//   textLength   - number of UChars, or -1 to measure a terminated text
230 UnicodeString::UnicodeString(UBool isTerminated,
234 fFlags(kReadonlyAlias)
237 // treat as an empty string, do not alias
// Argument validation: reject lengths below -1, an unknown length on
// unterminated text, and a termination claim that the buffer contradicts.
// (The handling of a rejected argument is on elided lines.)
239 } else if(textLength < -1 ||
240 (textLength == -1 && !isTerminated) ||
241 (textLength >= 0 && isTerminated && text[textLength] != 0)
245 if(textLength == -1) {
246 // text is terminated, or else it would have failed the above test
247 textLength = u_strlen(text);
// capacity includes the NUL only when the text is known to be terminated
249 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
// Writable aliasing constructor: the string uses the caller's buffer
// (fFlags starts as kWritableAlias).
//   buff         - caller-owned buffer
//   buffLength   - number of valid UChars, or -1 to search for a NUL
//   buffCapacity - total size of buff in UChars
253 UnicodeString::UnicodeString(UChar *buff,
255 int32_t buffCapacity)
257 fFlags(kWritableAlias)
260 // treat as an empty string, do not alias
// reject inconsistent length/capacity combinations (handling is on elided lines)
262 } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
265 if(buffLength == -1) {
266 // fLength = u_strlen(buff); but do not look beyond buffCapacity
267 const UChar *p = buff, *limit = buff + buffCapacity;
268 while(p != limit && *p != 0) {
271 buffLength = (int32_t)(p - buff);
273 setArray(buff, buffLength, buffCapacity);
// Constructs a string from char data, converting with u_charsToUChars().
//   src    - source characters
//   length - number of chars; uprv_strlen(src) is used when the length must
//            be measured (the condition guarding that call is elided)
// The unnamed EInvariant parameter only selects this overload.
277 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant)
282 // treat as an empty string
285 length=(int32_t)uprv_strlen(src);
// make sure there is room for length units before converting in place
287 if(cloneArrayIfNeeded(length, length, FALSE)) {
288 u_charsToUChars(src, getArrayStart(), length);
296 #if U_CHARSET_IS_UTF8
// Constructs a string from char data via setToUTF8()
// (this overload is compiled only when U_CHARSET_IS_UTF8 is set).
// A null pointer leaves the string as initialized (kShortString).
298 UnicodeString::UnicodeString(const char *codepageData)
300 fFlags(kShortString) {
301 if(codepageData != 0) {
302 setToUTF8(codepageData);
// Constructs a string from dataLength chars via setToUTF8()
// (compiled only when U_CHARSET_IS_UTF8 is set).
//   dataLength - number of chars, or -1 to measure with uprv_strlen()
306 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength)
308 fFlags(kShortString) {
309 // if there's nothing to convert, do nothing
310 if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
313 if(dataLength == -1) {
314 dataLength = (int32_t)uprv_strlen(codepageData);
316 setToUTF8(StringPiece(codepageData, dataLength));
319 // else see unistr_cnv.cpp
// Copy constructor. Its initializer list and body are not visible in this excerpt.
322 UnicodeString::UnicodeString(const UnicodeString& that)
// Constructs a string from the part of `that` selected by srcStart,
// by delegating to setTo(that, srcStart).
330 UnicodeString::UnicodeString(const UnicodeString& that,
336 setTo(that, srcStart);
// Constructs a string from the range of `that` selected by srcStart and
// srcLength, by delegating to setTo(that, srcStart, srcLength).
339 UnicodeString::UnicodeString(const UnicodeString& that,
346 setTo(that, srcStart, srcLength);
349 // Replaceable base class clone() default implementation, does not clone
// NOTE(review): the return statement is elided - presumably returns NULL; confirm.
351 Replaceable::clone() const {
355 // UnicodeString overrides clone() with a real implementation
// Returns a heap-allocated copy made with the copy constructor.
357 UnicodeString::clone() const {
358 return new UnicodeString(*this);
361 //========================================
363 //========================================
// Sets up storage for at least `capacity` UChars.
// Capacities that fit in US_STACKBUF_SIZE use the in-object stack buffer;
// larger ones get a heap block whose first int32_t is the reference counter.
// Callers in this file test the result as a boolean (see if(allocate(...))).
366 UnicodeString::allocate(int32_t capacity) {
367 if(capacity <= US_STACKBUF_SIZE) {
368 fFlags = kShortString;
370 // count bytes for the refCounter and the string capacity, and
371 // round up to a multiple of 16; then divide by 4 and allocate int32_t's
372 // to be safely aligned for the refCount
373 // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer()
// NOTE(review): the size arithmetic assumes capacity is far below INT32_MAX; confirm callers bound it.
374 int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2);
375 int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words );
377 // set initial refCount and point behind the refCount
380 // have fArray point to the first UChar
381 fUnion.fFields.fArray = (UChar *)array;
// usable capacity = every allocated word except the counter, expressed in UChars
382 fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR));
383 fFlags = kLongString;
// failure path (condition elided - presumably a failed uprv_malloc): clear the fields
386 fUnion.fFields.fArray = 0;
387 fUnion.fFields.fCapacity = 0;
395 //========================================
397 //========================================
// Destructor. Body not visible in this excerpt -
// TODO confirm it releases the buffer through releaseArray().
398 UnicodeString::~UnicodeString()
403 //========================================
405 //========================================
// Factory: builds a UnicodeString from UTF-8 by calling setToUTF8() on a
// default-constructed result (the return statement is on an elided line).
407 UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
408 UnicodeString result;
409 result.setToUTF8(utf8);
// Factory: builds a UnicodeString from `length` UTF-32 code points,
// substituting U+FFFD where u_strFromUTF32WithSub() needs a substitute.
// The conversion writes directly into result's buffer; on
// U_BUFFER_OVERFLOW_ERROR the capacity is recomputed from the reported
// length (the retry itself is on elided lines - presumably a loop).
413 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
414 UnicodeString result;
416 // Most UTF-32 strings will be BMP-only and result in a same-length
417 // UTF-16 string. We overestimate the capacity just slightly,
418 // just in case there are a few supplementary characters.
419 if(length <= US_STACKBUF_SIZE) {
420 capacity = US_STACKBUF_SIZE;
422 capacity = length + (length >> 4) + 4;
425 UChar *utf16 = result.getBuffer(capacity);
427 UErrorCode errorCode = U_ZERO_ERROR;
428 u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
430 0xfffd, // Substitution character.
431 NULL, // Don't care about number of substitutions.
// close the buffer opened by getBuffer(), recording the converted length
433 result.releaseBuffer(length16);
434 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
435 capacity = length16 + 1; // +1 for the terminating NUL.
437 } else if(U_FAILURE(errorCode)) {
445 //========================================
447 //========================================
// Copy assignment: delegates to copyFrom(src), leaving fastCopy at its default.
450 UnicodeString::operator=(const UnicodeString &src) {
451 return copyFrom(src);
// Assignment variant that delegates to copyFrom() with fastCopy = TRUE.
455 UnicodeString::fastCopyFrom(const UnicodeString &src) {
456 return copyFrom(src, TRUE);
// Shared implementation of assignment.
//   src      - string to copy from
//   fastCopy - per the comment further down, when FALSE a read-only alias
//              source is copied into a new buffer instead of being re-aliased
// The copy strategy depends on how src stores its characters: stack buffer
// (memcpy), ref-counted heap buffer (share and addRef), read-only alias
// (share the pointer), or writable alias (allocate and copy).
460 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
461 // if assigning to ourselves, do nothing
// NOTE(review): `this == 0` can only be true after undefined behaviour
// (a member call through a null pointer); compilers may remove the test.
462 if(this == 0 || this == &src) {
466 // is the right side bogus?
// NOTE(review): `&src == 0` has the same problem - a reference cannot
// legally be null, so this half of the test may be optimized away.
467 if(&src == 0 || src.isBogus()) {
472 // delete the current contents
476 // empty string - use the stack buffer
481 // we always copy the length
482 int32_t srcLength = src.length();
483 setLength(srcLength);
485 // fLength>0 and not an "open" src.getBuffer(minCapacity)
488 // short string using the stack buffer, do the same
489 fFlags = kShortString;
490 uprv_memcpy(fUnion.fStackBuffer, src.fUnion.fStackBuffer, srcLength * U_SIZEOF_UCHAR);
493 // src uses a refCounted string buffer, use that buffer with refCount
494 // src is const, use a cast - we don't really change it
495 ((UnicodeString &)src).addRef();
496 // copy all fields, share the reference-counted buffer
497 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
498 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
503 // src is a readonly alias, do the same
504 // -> maintain the readonly alias as such
505 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
506 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
510 // else if(!fastCopy) fall through to case kWritableAlias
511 // -> allocate a new buffer and copy the contents
513 // src is a writable alias; we make a copy of that instead
514 if(allocate(srcLength)) {
515 uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR);
518 // if there is not enough memory, then fall through to setting to bogus
520 // if src is bogus, set ourselves to bogus
521 // do not call setToBogus() here because fArray and fFlags are not consistent here
523 fUnion.fFields.fArray = 0;
524 fUnion.fFields.fCapacity = 0;
532 //========================================
533 // Miscellaneous operations
534 //========================================
// Returns a copy of this string with backslash escapes resolved.
// Literal runs are appended in bulk; at each backslash (0x5C) the run so far
// is flushed and unescapeAt() decodes the escape, advancing i past it.
// An invalid escape empties the result and stops the scan.
// (prev, the loop exit test and the append of c are on elided lines.)
536 UnicodeString UnicodeString::unescape() const {
537 UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
538 const UChar *array = getBuffer();
539 int32_t len = length();
541 for (int32_t i=0;;) {
// flush the trailing literal run [prev, len)
543 result.append(array, prev, len - prev);
546 if (array[i++] == 0x5C /*'\\'*/) {
// flush the literal run that ends just before the backslash
547 result.append(array, prev, (i - 1) - prev);
548 UChar32 c = unescapeAt(i); // advances i
550 result.remove(); // return empty string
551 break; // invalid escape sequence
// Decodes one escape sequence starting at `offset` by calling u_unescapeAt()
// with UnicodeString_charAt as the character source and this string as context.
//   offset - in/out index, passed by address so u_unescapeAt() can advance it
// Returns the value produced by u_unescapeAt().
560 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
561 return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
564 //========================================
565 // Read-only implementation
566 //========================================
// Equality helper: compares the first len UChars of both strings with a
// single memcmp over len * U_SIZEOF_UCHAR bytes.
568 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
569 // Requires: this & text not bogus and have same lengths.
570 // Byte-wise comparison works for equality regardless of endianness.
571 return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
// Compares the range [start, start+length) of this string with
// srcChars[srcStart ..] in code unit order (see the comment on
// doCompareCodePointOrder() below this function).
// The visible return statements yield an int8_t: 0 or 1 for the NULL-source
// case, and (result >> 15 | 1) - i.e. -1 or +1 - when units differ.
575 UnicodeString::doCompare( int32_t start,
577 const UChar *srcChars,
579 int32_t srcLength) const
581 // compare illegal string values
586 // pin indices to legal values
587 pinIndices(start, length);
589 if(srcChars == NULL) {
590 // treat const UChar *srcChars==NULL as an empty string
591 return length == 0 ? 0 : 1;
594 // get the correct pointer
595 const UChar *chars = getArrayStart();
598 srcChars += srcStart;
603 // get the srcLength if necessary
// NOTE(review): srcChars was already advanced by srcStart above, so
// srcChars + srcStart appears to apply the offset twice when the length
// has to be measured - confirm against the elided lines.
605 srcLength = u_strlen(srcChars + srcStart);
608 // are we comparing different lengths?
609 if(length != srcLength) {
610 if(length < srcLength) {
614 minLength = srcLength;
623 * note that uprv_memcmp() returns an int but we return an int8_t;
624 * we need to take care not to truncate the result -
625 * one way to do this is to right-shift the value to
626 * move the sign bit into the lower 8 bits and making sure that this
627 * does not become 0 itself
// identical pointers need no unit-by-unit comparison
630 if(minLength > 0 && chars != srcChars) {
634 // big-endian: byte comparison works
635 result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
637 return (int8_t)(result >> 15 | 1);
640 // little-endian: compare UChar units
642 result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
644 return (int8_t)(result >> 15 | 1);
646 } while(--minLength > 0);
652 /* String compare in code point order - doCompare() compares in code unit order. */
// Delegates the comparison to uprv_strCompare() and narrows its 32-bit
// result to an int8_t; the visible non-equal return is (diff >> 15 | 1).
654 UnicodeString::doCompareCodePointOrder(int32_t start,
656 const UChar *srcChars,
658 int32_t srcLength) const
660 // compare illegal string values
661 // treat const UChar *srcChars==NULL as an empty string
666 // pin indices to legal values
667 pinIndices(start, length);
// a NULL source compares like an empty string
669 if(srcChars == NULL) {
670 srcStart = srcLength = 0;
673 int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
674 /* translate the 32-bit result into an 8-bit one */
676 return (int8_t)(diff >> 15 | 1);
// Accessor whose body is not visible in this excerpt - presumably forwards
// to length(), like the getCharAt()/getChar32At() wrappers that follow; confirm.
683 UnicodeString::getLength() const {
// Thin wrapper: returns charAt(offset).
688 UnicodeString::getCharAt(int32_t offset) const {
689 return charAt(offset);
// Thin wrapper: returns char32At(offset).
693 UnicodeString::getChar32At(int32_t offset) const {
694 return char32At(offset);
// Returns the code point that contains the code unit at `offset`,
// read with U16_GET over [0, len). Out-of-range offsets yield kInvalidUChar.
698 UnicodeString::char32At(int32_t offset) const
700 int32_t len = length();
// the unsigned comparison rejects negative offsets and offsets >= len in one test
701 if((uint32_t)offset < (uint32_t)len) {
702 const UChar *array = getArrayStart();
704 U16_GET(array, 0, offset, len, c);
707 return kInvalidUChar;
// Adjusts `offset` with U16_SET_CP_START (lower bound 0) when it lies inside
// the string. The return statements are on elided lines.
712 UnicodeString::getChar32Start(int32_t offset) const {
// one unsigned test covers both offset < 0 and offset >= length()
713 if((uint32_t)offset < (uint32_t)length()) {
714 const UChar *array = getArrayStart();
715 U16_SET_CP_START(array, 0, offset);
// Adjusts `offset` with U16_SET_CP_LIMIT (bounds 0 and len) when it lies
// inside the string. The return statements are on elided lines.
723 UnicodeString::getChar32Limit(int32_t offset) const {
724 int32_t len = length();
// one unsigned test covers both offset < 0 and offset >= len
725 if((uint32_t)offset < (uint32_t)len) {
726 const UChar *array = getArrayStart();
727 U16_SET_CP_LIMIT(array, 0, offset, len);
// Returns u_countChar32() for the range [start, start+length) after the
// indices have been pinned to the string bounds.
735 UnicodeString::countChar32(int32_t start, int32_t length) const {
736 pinIndices(start, length);
737 // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
738 return u_countChar32(getArrayStart()+start, length);
// Returns u_strHasMoreChar32Than() for the pinned range [start, start+length)
// and the threshold `number`.
742 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
743 pinIndices(start, length);
744 // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
745 return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
// Moves `index` by `delta` code points: forward with U16_FWD_N (limit len)
// or backward with U16_BACK_N (limit 0, count -delta).
// The index is range-checked first (only the index > len branch is visible);
// the sign test on delta and the return are on elided lines.
749 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
751 int32_t len = length();
754 } else if(index>len) {
758 const UChar *array = getArrayStart();
760 U16_FWD_N(array, index, len, delta);
762 U16_BACK_N(array, 0, index, -delta);
// Copies the pinned range [start, start+length) of this string into
// dst starting at dstStart, using the overlap-safe us_arrayCopy().
769 UnicodeString::doExtract(int32_t start,
772 int32_t dstStart) const
774 // pin indices to legal values
775 pinIndices(start, length);
777 // do not copy anything if we alias dst itself
778 const UChar *array = getArrayStart();
779 if(array + start != dst + dstStart) {
780 us_arrayCopy(array, start, dst, dstStart, length);
// Copies the whole string into dest and lets u_terminateUChars() handle
// termination and the error code.
//   dest, destCapacity - output buffer and its size in UChars
//   errorCode          - in/out; nothing is done if it already signals failure
// Sets U_ILLEGAL_ARGUMENT_ERROR for a bogus string, a negative capacity,
// or a null dest with a positive capacity.
785 UnicodeString::extract(UChar *dest, int32_t destCapacity,
786 UErrorCode &errorCode) const {
787 int32_t len = length();
788 if(U_SUCCESS(errorCode)) {
789 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
790 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
792 const UChar *array = getArrayStart();
// copy only when the contents fit and dest is not our own buffer
793 if(len>0 && len<=destCapacity && array!=dest) {
794 uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR);
796 return u_terminateUChars(dest, destCapacity, len, &errorCode);
// Extracts the pinned range [start, start+length) into a char buffer using
// u_UCharsToChars(), then returns the result of u_terminateChars().
// The conversion runs only when the range fits into targetCapacity.
// The status passed to u_terminateChars() is local and not reported to the caller.
804 UnicodeString::extract(int32_t start,
807 int32_t targetCapacity,
808 enum EInvariant) const
810 // if the arguments are illegal, then do nothing
811 if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
815 // pin the indices to legal values
816 pinIndices(start, length);
818 if(length <= targetCapacity) {
819 u_UCharsToChars(getArrayStart() + start, target, length);
821 UErrorCode status = U_ZERO_ERROR;
822 return u_terminateChars(target, targetCapacity, length, &status);
// Returns a read-only alias (isTerminated = FALSE) onto the pinned range
// [start, start+len) of this string's buffer; no characters are copied.
// When the buffer cannot be used (condition elided - presumably getBuffer()
// returned NULL), len is forced to -2 so that the aliasing constructor
// rejects it and the result is bogus, as the comment below states.
826 UnicodeString::tempSubString(int32_t start, int32_t len) const {
827 pinIndices(start, len);
828 const UChar *array = getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer
830 array=fUnion.fStackBuffer; // anything not NULL because that would make an empty string
831 len=-2; // bogus result string
833 return UnicodeString(FALSE, array + start, len);
// Converts the pinned range [start, start+len) to UTF-8 in `target`
// (size `capacity`) with u_strToUTF8WithSub(), substituting U+FFFD.
// The error code is local; the return statement is on an elided line
// (presumably length8 - confirm).
837 UnicodeString::toUTF8(int32_t start, int32_t len,
838 char *target, int32_t capacity) const {
839 pinIndices(start, len);
841 UErrorCode errorCode = U_ZERO_ERROR;
842 u_strToUTF8WithSub(target, capacity, &length8,
843 getBuffer() + start, len,
844 0xFFFD, // Standard substitution character.
845 NULL, // Don't care about number of substitutions.
850 #if U_CHARSET_IS_UTF8
// UTF-8 build of extract(): forwards to toUTF8().
// dstSize is unsigned, so it is clamped to 0x7fffffff before being passed
// as toUTF8()'s int32_t capacity.
853 UnicodeString::extract(int32_t start, int32_t len,
854 char *target, uint32_t dstSize) const {
855 // if the arguments are illegal, then do nothing
856 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
859 return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
862 // else see unistr_cnv.cpp
// Extracts the range [start, limit) into target by converting the limit
// into a length for doExtract(). (Any pinning of start/limit is on elided lines.)
866 UnicodeString::extractBetween(int32_t start,
868 UnicodeString& target) const {
871 doExtract(start, limit - start, target);
874 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
875 // as many bytes as the source has UChars.
876 // The "worst cases" are writing systems like Indic, Thai and CJK with
// Converts the whole string to UTF-8 and appends it to `sink`.
// First attempt: convert into a buffer obtained from sink.GetAppendBuffer(),
// which is given a 1024-byte stack buffer as scratch space.
// On U_BUFFER_OVERFLOW_ERROR: allocate exactly length8 bytes and convert again;
// if that allocation fails the error becomes U_MEMORY_ALLOCATION_ERROR.
// The sink receives data only when the conversion succeeded.
// NOTE(review): utf8IsOwned suggests the heap buffer is freed on an elided line; confirm.
879 UnicodeString::toUTF8(ByteSink &sink) const {
880 int32_t length16 = length();
882 char stackBuffer[1024];
883 int32_t capacity = (int32_t)sizeof(stackBuffer);
884 UBool utf8IsOwned = FALSE;
885 char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
887 stackBuffer, capacity,
890 UErrorCode errorCode = U_ZERO_ERROR;
891 u_strToUTF8WithSub(utf8, capacity, &length8,
892 getBuffer(), length16,
893 0xFFFD, // Standard substitution character.
894 NULL, // Don't care about number of substitutions.
896 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
// length8 now holds the size reported by the failed first attempt
897 utf8 = (char *)uprv_malloc(length8);
900 errorCode = U_ZERO_ERROR;
901 u_strToUTF8WithSub(utf8, length8, &length8,
902 getBuffer(), length16,
903 0xFFFD, // Standard substitution character.
904 NULL, // Don't care about number of substitutions.
907 errorCode = U_MEMORY_ALLOCATION_ERROR;
910 if(U_SUCCESS(errorCode)) {
911 sink.Append(utf8, length8);
// Converts the whole string to UTF-32 in `utf32` (size `capacity`) with
// u_strToUTF32WithSub(), substituting U+FFFD.
//   errorCode - in/out; the conversion is skipped if it already signals failure
// The return statement is on an elided line (presumably length32 - confirm).
921 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
923 if(U_SUCCESS(errorCode)) {
924 // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
925 u_strToUTF32WithSub(utf32, capacity, &length32,
926 getBuffer(), length(),
927 0xfffd, // Substitution character.
928 NULL, // Don't care about number of substitutions.
// Finds the first occurrence of srcChars[srcStart ..] (srcLength units, or
// NUL-terminated when srcLength < 0) inside the pinned range
// [start, start+length) of this string, using u_strFindFirst().
// On a match the result is the index relative to the start of this string.
// (The not-found return values are on elided lines.)
935 UnicodeString::indexOf(const UChar *srcChars,
939 int32_t length) const
// reject a bogus string, a null or empty pattern, and a negative srcStart
941 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
945 // UnicodeString does not find empty substrings
946 if(srcLength < 0 && srcChars[srcStart] == 0) {
950 // get the indices within bounds
951 pinIndices(start, length);
953 // find the first occurrence of the substring
954 const UChar *array = getArrayStart();
955 const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
959 return (int32_t)(match - array);
// Finds the first occurrence of code unit c in the pinned range
// [start, start+length) with u_memchr(); a match is returned as an index
// relative to the start of this string. (The not-found return is elided.)
964 UnicodeString::doIndexOf(UChar c,
966 int32_t length) const
969 pinIndices(start, length);
971 // find the first occurrence of c
972 const UChar *array = getArrayStart();
973 const UChar *match = u_memchr(array + start, c, length);
977 return (int32_t)(match - array);
// Finds the first occurrence of code point c in the pinned range
// [start, start+length) with u_memchr32(); a match is returned as an index
// relative to the start of this string. (The not-found return is elided.)
982 UnicodeString::doIndexOf(UChar32 c,
984 int32_t length) const {
986 pinIndices(start, length);
988 // find the first occurrence of c
989 const UChar *array = getArrayStart();
990 const UChar *match = u_memchr32(array + start, c, length);
994 return (int32_t)(match - array);
// Finds the last occurrence of srcChars[srcStart ..] (srcLength units, or
// NUL-terminated when srcLength < 0) inside the pinned range
// [start, start+length) of this string, using u_strFindLast().
// On a match the result is the index relative to the start of this string.
// (The not-found return values are on elided lines.)
999 UnicodeString::lastIndexOf(const UChar *srcChars,
1003 int32_t length) const
// reject a bogus string, a null or empty pattern, and a negative srcStart
1005 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1009 // UnicodeString does not find empty substrings
1010 if(srcLength < 0 && srcChars[srcStart] == 0) {
1014 // get the indices within bounds
1015 pinIndices(start, length);
1017 // find the last occurrence of the substring
1018 const UChar *array = getArrayStart();
1019 const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
1023 return (int32_t)(match - array);
// Finds the last occurrence of code unit c in the pinned range
// [start, start+length) with u_memrchr(); a match is returned as an index
// relative to the start of this string. (Early exits and the not-found
// return are on elided lines.)
1028 UnicodeString::doLastIndexOf(UChar c,
1030 int32_t length) const
1037 pinIndices(start, length);
1039 // find the last occurrence of c
1040 const UChar *array = getArrayStart();
1041 const UChar *match = u_memrchr(array + start, c, length);
1045 return (int32_t)(match - array);
// Finds the last occurrence of code point c in the pinned range
// [start, start+length) with u_memrchr32(); a match is returned as an index
// relative to the start of this string. (The not-found return is elided.)
1050 UnicodeString::doLastIndexOf(UChar32 c,
1052 int32_t length) const {
1054 pinIndices(start, length);
1056 // find the last occurrence of c
1057 const UChar *array = getArrayStart();
1058 const UChar *match = u_memrchr32(array + start, c, length);
1062 return (int32_t)(match - array);
1066 //========================================
1067 // Write implementation
1068 //========================================
// Replaces every occurrence of oldText[oldStart, oldStart+oldLength) inside
// the range [start, start+length) of this string with
// newText[newStart, newStart+newLength).
// Nothing is replaced when any of the three strings is bogus or when the
// pinned old text is empty. (Return statements are on elided lines.)
1071 UnicodeString::findAndReplace(int32_t start,
1073 const UnicodeString& oldText,
1076 const UnicodeString& newText,
1080 if(isBogus() || oldText.isBogus() || newText.isBogus()) {
1084 pinIndices(start, length);
1085 oldText.pinIndices(oldStart, oldLength);
1086 newText.pinIndices(newStart, newLength);
1088 if(oldLength == 0) {
// keep searching while the remaining range can still hold oldText
1092 while(length > 0 && length >= oldLength) {
1093 int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1095 // no more oldText's here: done
1098 // we found oldText, replace it by newText and go beyond it
1099 replace(pos, oldLength, newText, newStart, newLength);
// shrink the remaining range by everything up to the end of the match,
// then resume right after the inserted replacement
1100 length -= pos + oldLength - start;
1101 start = pos + newLength;
// Puts the string into the bogus state. The visible statements clear the
// array pointer and the capacity; releasing the old buffer and setting the
// flag are on elided lines - confirm.
1110 UnicodeString::setToBogus()
1115 fUnion.fFields.fArray = 0;
1116 fUnion.fFields.fCapacity = 0;
1120 // turn a bogus string into an empty one
// Acts only when the kIsBogus flag is set; the reset itself is on an elided line.
1122 UnicodeString::unBogus() {
1123 if(fFlags & kIsBogus) {
// Returns the string's buffer with a NUL guaranteed at index length().
// Three situations are distinguished when there is spare capacity:
//   - read-only buffer: usable as-is only if array[len] is already 0
//   - unshared writable buffer: the NUL can be written in place
//   - otherwise (and when there is no spare capacity): the array is cloned
//     with room for len+1 units via cloneArrayIfNeeded().
// (The writes of the NUL and the return statements are on elided lines.)
1129 UnicodeString::getTerminatedBuffer() {
1133 UChar *array = getArrayStart();
1134 int32_t len = length();
1135 if(len < getCapacity()) {
1136 if(fFlags & kBufferIsReadonly) {
1137 // If len<capacity on a read-only alias, then array[len] is
1138 // either the original NUL (if constructed with (TRUE, s, length))
1139 // or one of the original string contents characters (if later truncated),
1140 // therefore we can assume that array[len] is initialized memory.
1141 if(array[len] == 0) {
1144 } else if(((fFlags & kRefCounted) == 0 || refCount() == 1)) {
1145 // kRefCounted: Do not write the NUL if the buffer is shared.
1146 // That is mostly safe, except when the length of one copy was modified
1147 // without copy-on-write, e.g., via truncate(newLength) or remove(void).
1148 // Then the NUL would be written into the middle of another copy's string.
1150 // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
1151 // Do not test if there is a NUL already because it might be uninitialized memory.
1152 // (That would be safe, but tools like valgrind & Purify would complain.)
1157 if(cloneArrayIfNeeded(len+1)) {
// the clone may have moved the characters, so re-read the array pointer
1158 array = getArrayStart();
1166 // setTo() analogous to the readonly-aliasing constructor with the same signature
// Re-points this string at caller-owned text as a read-only alias.
// Uses the same validation as the constructor: lengths below -1, an unknown
// length on unterminated text, or a false termination claim are rejected
// (the handling is on elided lines).
1168 UnicodeString::setTo(UBool isTerminated,
1172 if(fFlags & kOpenGetBuffer) {
1173 // do not modify a string that has an "open" getBuffer(minCapacity)
1178 // treat as an empty string, do not alias
1184 if( textLength < -1 ||
1185 (textLength == -1 && !isTerminated) ||
1186 (textLength >= 0 && isTerminated && text[textLength] != 0)
1194 if(textLength == -1) {
1195 // text is terminated, or else it would have failed the above test
1196 textLength = u_strlen(text);
// capacity includes the NUL only when the text is known to be terminated
1198 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
1200 fFlags = kReadonlyAlias;
1204 // setTo() analogous to the writable-aliasing constructor with the same signature
// Re-points this string at a caller-owned, writable buffer.
//   buffer       - buffer to alias; NULL is treated as an empty string
//   buffLength   - number of valid UChars, or -1 to search for a NUL
//   buffCapacity - total size of buffer in UChars
1206 UnicodeString::setTo(UChar *buffer,
1208 int32_t buffCapacity) {
1209 if(fFlags & kOpenGetBuffer) {
1210 // do not modify a string that has an "open" getBuffer(minCapacity)
1214 if(buffer == NULL) {
1215 // treat as an empty string, do not alias
// reject inconsistent length/capacity combinations (handling is on elided lines)
1221 if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1224 } else if(buffLength == -1) {
1225 // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1226 const UChar *p = buffer, *limit = buffer + buffCapacity;
1227 while(p != limit && *p != 0) {
1230 buffLength = (int32_t)(p - buffer);
1235 setArray(buffer, buffLength, buffCapacity);
1236 fFlags = kWritableAlias;
// Replaces the contents with the UTF-16 conversion of `utf8`, substituting
// U+FFFD where u_strFromUTF8WithSub() needs a substitute.
// The conversion writes straight into this string's buffer, opened with
// getBuffer(capacity) and closed with releaseBuffer(length16).
// (The failure handling and the return are on elided lines.)
1240 UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) {
1242 int32_t length = utf8.length();
1244 // The UTF-16 string will be at most as long as the UTF-8 string.
1245 if(length <= US_STACKBUF_SIZE) {
1246 capacity = US_STACKBUF_SIZE;
1248 capacity = length + 1; // +1 for the terminating NUL.
1250 UChar *utf16 = getBuffer(capacity);
1252 UErrorCode errorCode = U_ZERO_ERROR;
1253 u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1254 utf8.data(), length,
1255 0xfffd, // Substitution character.
1256 NULL, // Don't care about number of substitutions.
1258 releaseBuffer(length16);
1259 if(U_FAILURE(errorCode)) {
// Overwrites the code unit at `offset` with c.
// The write happens only after cloneArrayIfNeeded() succeeded and the string
// is non-empty. The offset is range-adjusted first (only the offset >= len
// branch is visible; the adjustments themselves are on elided lines).
1266 UnicodeString::setCharAt(int32_t offset,
1269 int32_t len = length();
1270 if(cloneArrayIfNeeded() && len > 0) {
1273 } else if(offset >= len) {
1277 getArrayStart()[offset] = c;
// Replaces the range (start, _length) with the single code point srcChar.
// srcChar is first encoded into a local UTF-16 buffer; the encoded units are
// then passed to doReplace().
1283 UnicodeString::replace(int32_t start,
1286 UChar buffer[U16_MAX_LENGTH];
1288 UBool isError = FALSE;
1289 U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1290 // We test isError so that the compiler does not complain that we don't.
1291 // If isError (srcChar is not a valid code point) then count==0 which means
1292 // we remove the source segment rather than replacing it with srcChar.
1293 return doReplace(start, _length, buffer, 0, isError ? 0 : count);
// Appends the code point srcChar.
// srcChar is encoded into a local UTF-16 buffer and inserted at length()
// through doReplace(); an unencodable srcChar leaves the string unchanged.
// Returns a reference to this string.
1297 UnicodeString::append(UChar32 srcChar) {
1298 UChar buffer[U16_MAX_LENGTH];
1299 int32_t _length = 0;
1300 UBool isError = FALSE;
1301 U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1302 // We test isError so that the compiler does not complain that we don't.
1303 // If isError then _length==0 which turns the doReplace() into a no-op anyway.
1304 return isError ? *this : doReplace(length(), 0, buffer, 0, _length);
// Replaces the range (start, length) of this string with a range of src.
// A non-bogus src is pinned and forwarded to the UChar* overload; a bogus
// src is forwarded as an empty source (null pointer, zero length), so the
// target range is simply removed.
1308 UnicodeString::doReplace( int32_t start,
1310 const UnicodeString& src,
1314 if(!src.isBogus()) {
1315 // pin the indices to legal values
1316 src.pinIndices(srcStart, srcLength);
1318 // get the characters from src
1319 // and replace the range in ourselves with them
1320 return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1323 return doReplace(start, length, 0, 0, 0);
// Core edit primitive: replaces the range (start, length) of this string
// with srcChars[srcStart, srcStart+srcLength).
// Visible stages, in order:
//   1. Read-only alias + pure removal: adjust pointer/length instead of copying.
//   2. Normalize the source (measure with u_strlen() when srcLength < 0).
//   3. Append fast path: start >= oldLength and the buffer is writable and
//      large enough - copy the new units behind the old contents.
//   4. General path: remember the old array (copying the stack buffer aside if
//      it is about to be overwritten), obtain a suitable array with
//      cloneArrayIfNeeded(), move the unchanged tail, fill the hole, set the
//      length, and only then free the old heap buffer.
// (Several conditions and all return statements are on elided lines.)
1328 UnicodeString::doReplace(int32_t start,
1330 const UChar *srcChars,
1338 int32_t oldLength = this->length();
1340 // optimize (read-only alias).remove(0, start) and .remove(start, end)
1341 if((fFlags&kBufferIsReadonly) && srcLength == 0) {
1343 // remove prefix by adjusting the array pointer
1345 fUnion.fFields.fArray += length;
1346 fUnion.fFields.fCapacity -= length;
1347 setLength(oldLength - length);
1351 if(length >= (oldLength - start)) {
1352 // remove suffix by reducing the length (like truncate())
1354 fUnion.fFields.fCapacity = start; // not NUL-terminated any more
1361 srcStart = srcLength = 0;
1362 } else if(srcLength < 0) {
1363 // get the srcLength if necessary
1364 srcLength = u_strlen(srcChars + srcStart);
1367 // calculate the size of the string after the replace
1370 // optimize append() onto a large-enough, owned string
1371 if(start >= oldLength) {
1372 if(srcLength == 0) {
1375 newLength = oldLength + srcLength;
1376 if(newLength <= getCapacity() && isBufferWritable()) {
1377 UChar *oldArray = getArrayStart();
1378 // Do not copy characters when
1379 // UChar *buffer=str.getAppendBuffer(...);
1381 // str.append(buffer, length);
1383 // str.appendString(buffer, length)
1385 if(srcChars + srcStart != oldArray + start || start > oldLength) {
1386 us_arrayCopy(srcChars, srcStart, oldArray, oldLength, srcLength);
1388 setLength(newLength);
1391 // pin the indices to legal values
1396 // pin the indices to legal values
1397 pinIndices(start, length);
1399 newLength = oldLength - length + srcLength;
1402 // the following may change fArray but will not copy the current contents;
1403 // therefore we need to keep the current fArray
1404 UChar oldStackBuffer[US_STACKBUF_SIZE];
1406 if((fFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
1407 // copy the stack buffer contents because it will be overwritten with
1408 // fUnion.fFields values
1409 u_memcpy(oldStackBuffer, fUnion.fStackBuffer, oldLength);
1410 oldArray = oldStackBuffer;
1412 oldArray = getArrayStart();
1415 // clone our array and allocate a bigger array if needed
1416 int32_t *bufferToDelete = 0;
// requested growth capacity: newLength plus a quarter of it plus kGrowSize
1417 if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize,
1418 FALSE, &bufferToDelete)
1423 // now do the replace
1425 UChar *newArray = getArrayStart();
1426 if(newArray != oldArray) {
1427 // if fArray changed, then we need to copy everything except what will change
1428 us_arrayCopy(oldArray, 0, newArray, 0, start);
1429 us_arrayCopy(oldArray, start + length,
1430 newArray, start + srcLength,
1431 oldLength - (start + length));
1432 } else if(length != srcLength) {
1433 // fArray did not change; copy only the portion that isn't changing, leaving a hole
1434 us_arrayCopy(oldArray, start + length,
1435 newArray, start + srcLength,
1436 oldLength - (start + length));
1439 // now fill in the hole with the new string
1440 us_arrayCopy(srcChars, srcStart, newArray, start, srcLength);
1442 setLength(newLength);
1444 // delayed delete in case srcChars == fArray when we started, and
1445 // to keep oldArray alive for the above operations
1446 if (bufferToDelete) {
1447 uprv_free(bufferToDelete);
// Forwards to replaceBetween(start, limit, text) with the same arguments.
// NOTE(review): presumably the override of the Replaceable hook of the same name; confirm in the header.
1457 UnicodeString::handleReplaceBetween(int32_t start,
1459 const UnicodeString& text) {
1460 replaceBetween(start, limit, text);
// Copies the range [start, limit) of this string and inserts the copy at
// index dest. The range is first extracted into a temporary heap buffer so
// that the insert does not read from the array it is modifying.
// NOTE(review): the null check and the uprv_free() of `text` are on elided lines; confirm.
1467 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1468 if (limit <= start) {
1469 return; // Nothing to do; avoid bogus malloc call
1471 UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1472 // Check to make sure text is not null.
1474 extractBetween(start, limit, text, 0);
1475 insert(dest, text, 0, limit - start);
// Default Replaceable::hasMetaData(); the return statement is not visible in
// this excerpt. The two lines below are the remains of a block comment whose
// delimiters are on elided lines.
1483 * NOTE: This is for the Replaceable class. There is no rep.cpp,
1484 * so we implement this function here.
1486 UBool Replaceable::hasMetaData() const {
// UnicodeString's hasMetaData(); the return statement is not visible in this excerpt.
1493 UBool UnicodeString::hasMetaData() const {
// Reverses the code units in the pinned range [start, start+length) in place.
// Pass 1 swaps units from both ends toward the middle and records whether any
// lead surrogate was seen. Pass 2 runs only in that case: it looks for a
// trail surrogate followed by a lead surrogate (a pair that pass 1 turned
// around) and puts the two units back in order (the swap is on elided lines).
1498 UnicodeString::doReverse(int32_t start, int32_t length) {
// nothing to do for fewer than two units, or if the buffer cannot be made writable
1499 if(length <= 1 || !cloneArrayIfNeeded()) {
1503 // pin the indices to legal values
1504 pinIndices(start, length);
1505 if(length <= 1) { // pinIndices() might have shrunk the length
1509 UChar *left = getArrayStart() + start;
1510 UChar *right = left + length - 1; // -1 for inclusive boundary (length>=2)
1512 UBool hasSupplementary = FALSE;
1514 // Before the loop we know left<right because length>=2.
1516 hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
1517 hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
1519 } while(left < right);
1520 // Make sure to test the middle code unit of an odd-length string.
1521 // Redundant if the length is even.
1522 hasSupplementary |= (UBool)U16_IS_LEAD(*left);
1524 /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1525 if(hasSupplementary) {
1528 left = getArrayStart() + start;
1529 right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
1530 while(left < right) {
1531 if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
// Left-pads this string with padChar up to targetLength code units.
// The early-exit branch is taken when the string is already at least
// targetLength long or when cloneArrayIfNeeded(targetLength) fails.
// NOTE(review): the values returned on either path are not visible in this excerpt.
1544 UnicodeString::padLeading(int32_t targetLength,
1547 int32_t oldLength = length();
1548 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1551 // move contents up by padding width
1552 UChar *array = getArrayStart();
// start is the number of padding code units, which is also the new index
// of the old first code unit.
1553 int32_t start = targetLength - oldLength;
// Source and destination lie in the same array and may overlap;
// us_arrayCopy is built on uprv_memmove, so the overlap is handled.
1554 us_arrayCopy(array, 0, array, start, oldLength);
1556 // fill in padding character
// Counts start down to 0, writing padChar into indices start-1 .. 0.
1557 while(--start >= 0) {
1558 array[start] = padChar;
1560 setLength(targetLength);
// Right-pads this string with padChar up to targetLength code units.
// The early-exit branch is taken when the string is already at least
// targetLength long or when cloneArrayIfNeeded(targetLength) fails.
// NOTE(review): the values returned on either path are not visible in this excerpt.
1566 UnicodeString::padTrailing(int32_t targetLength,
1569 int32_t oldLength = length();
1570 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1573 // fill in padding character
1574 UChar *array = getArrayStart();
// Note: this local named length shadows the length() accessor from here on.
// It counts down so that indices targetLength-1 .. oldLength receive padChar;
// the existing contents below oldLength are left untouched.
1575 int32_t length = targetLength;
1576 while(--length >= oldLength) {
1577 array[length] = padChar;
1579 setLength(targetLength);
1584 //========================================
1586 //========================================
// Computes a hash over this string's code units by calling
// ustr_hashUCharsN() on the array start and length().
1588 UnicodeString::doHashCode() const
1590 /* Delegate hash computation to uhash. This makes UnicodeString
1591 * hashing consistent with UChar* hashing. */
1592 int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
// A computed value equal to kInvalidHashCode is remapped to kEmptyHashCode,
// so the local hashCode is never left equal to kInvalidHashCode.
1593 if (hashCode == kInvalidHashCode) {
1594 hashCode = kEmptyHashCode;
1599 //========================================
1601 //========================================
// Hands out the internal array for direct access and marks the string as
// having an open buffer.
// minCapacity must be >= -1, and cloneArrayIfNeeded(minCapacity) must
// succeed; only then is kOpenGetBuffer set in fFlags and getArrayStart()
// returned.
// NOTE(review): the failure path is not visible in this excerpt.
1604 UnicodeString::getBuffer(int32_t minCapacity) {
1605 if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1606 fFlags|=kOpenGetBuffer;
1608 return getArrayStart();
// Closes a buffer that is currently marked open and records the string's
// new length. Acts only when kOpenGetBuffer is set in fFlags and
// newLength >= -1; otherwise nothing inside the guarded block runs.
1615 UnicodeString::releaseBuffer(int32_t newLength) {
1616 if(fFlags&kOpenGetBuffer && newLength>=-1) {
1617 // set the new fLength
1618 int32_t capacity=getCapacity();
// NOTE(review): the condition that selects the scan below is not visible in
// this excerpt; presumably it is taken for newLength == -1 -- confirm.
1620 // the new length is the string length, capped by fCapacity
// Scan for the first 0 code unit without reading past array+capacity; the
// offset where the scan stops becomes the new length.
1621 const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1622 while(p<limit && *p!=0) {
1625 newLength=(int32_t)(p-array);
// An explicit newLength larger than the capacity gets special handling.
// NOTE(review): that handling is not visible in this excerpt.
1626 } else if(newLength>capacity) {
1629 setLength(newLength);
// Clear the open-buffer mark.
1630 fFlags&=~kOpenGetBuffer;
1634 //========================================
1636 //========================================
// Makes sure this string has a buffer it may modify, with room for at least
// newCapacity code units, allocating a new array when one of the conditions
// listed further down holds.
//   newCapacity     - required capacity; -1 selects the current getCapacity().
//   growCapacity    - preferred capacity that is tried first; a negative
//                     value selects newCapacity.
//   doCopyArray     - when set (and there is an old array to copy from), the
//                     old contents are copied into the new array.
//   pBufferToDelete - when non-null and the old ref-counted block drops to
//                     zero references, receives that block instead of it
//                     being freed here.
1638 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
1639 int32_t growCapacity,
1641 int32_t **pBufferToDelete,
1643 // default parameters need to be static, therefore
1644 // the defaults are -1 to have convenience defaults
1645 if(newCapacity == -1) {
1646 newCapacity = getCapacity();
1649 // while a getBuffer(minCapacity) is "open",
1650 // prevent any modifications of the string by returning FALSE here
1651 // if the string is bogus, then only an assignment or similar can revive it
1657 * We need to make a copy of the array if
1658 * the buffer is read-only, or
1659 * the buffer is refCounted (shared), and refCount>1, or
1660 * the buffer is too small.
1661 * Return FALSE if memory could not be allocated.
1664 fFlags & kBufferIsReadonly ||
1665 (fFlags & kRefCounted && refCount() > 1) ||
1666 newCapacity > getCapacity()
1668 // check growCapacity for default value and use of the stack buffer
1669 if(growCapacity < 0) {
1670 growCapacity = newCapacity;
// A requirement that fits into US_STACKBUF_SIZE is not grown beyond it.
1671 } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
1672 growCapacity = US_STACKBUF_SIZE;
// Scratch space for preserving the stack buffer contents (see below).
1676 UChar oldStackBuffer[US_STACKBUF_SIZE];
// Snapshot of fFlags taken before allocate() is called; the checks on the
// old buffer further down use this copy.
1678 uint8_t flags = fFlags;
1680 if(flags&kUsingStackBuffer) {
1681 U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutually exclusive */
1682 if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
1683 // copy the stack buffer contents because it will be overwritten with
1684 // fUnion.fFields values
1685 us_arrayCopy(fUnion.fStackBuffer, 0, oldStackBuffer, 0, fShortLength);
1686 oldArray = oldStackBuffer;
1688 oldArray = 0; // no need to copy from stack buffer to itself
1691 oldArray = fUnion.fFields.fArray;
1692 U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
1695 // allocate a new array
// Try the preferred size first; fall back to the smaller required size
// only if that fails and the two sizes actually differ.
1696 if(allocate(growCapacity) ||
1697 (newCapacity < growCapacity && allocate(newCapacity))
1699 if(doCopyArray && oldArray != 0) {
1700 // copy the contents
1701 // do not copy more than what fits - it may be smaller than before
1702 int32_t minLength = length();
1703 newCapacity = getCapacity();
1704 if(newCapacity < minLength) {
1705 minLength = newCapacity;
1706 setLength(minLength);
1708 us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
1713 // release the old array
1714 if(flags & kRefCounted) {
1715 // the array is refCounted; decrement and release if 0
// The reference counter is addressed as the slot immediately before the
// start of the old array.
1716 u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1);
1717 if(umtx_atomic_dec(pRefCount) == 0) {
1718 if(pBufferToDelete == 0) {
1719 // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
1720 // is defined as volatile. (Volatile has useful non-standard behavior
1721 // with this compiler.)
1722 uprv_free((void *)pRefCount);
1724 // the caller requested to free the old block itself
1725 *pBufferToDelete = (int32_t *)pRefCount;
1730 // not enough memory for growCapacity and not even for the smaller newCapacity
1731 // reset the old values for setToBogus() to release the array
1732 if(!(flags&kUsingStackBuffer)) {
1733 fUnion.fFields.fArray = oldArray;
1743 // UnicodeStringAppendable ------------------------------------------------- ***
// Out-of-line destructor with an empty body.
1745 UnicodeStringAppendable::~UnicodeStringAppendable() {}
// Appends the single code unit c to str: doReplace() is called at position
// str.length() with a replaced length of 0 and a one-element source (&c).
// Returns what isWritable() reports on the string doReplace() returns.
1748 UnicodeStringAppendable::appendCodeUnit(UChar c) {
1749 return str.doReplace(str.length(), 0, &c, 0, 1).isWritable();
// Appends the code point c to str after encoding it with U16_APPEND into a
// local buffer of U16_MAX_LENGTH code units.
// If the macro sets isError, nothing is appended and the result is false;
// otherwise the cLength encoded units are appended at str.length() and the
// result is what isWritable() reports on the string doReplace() returns.
1753 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1754 UChar buffer[U16_MAX_LENGTH];
1755 int32_t cLength = 0;
1756 UBool isError = FALSE;
1757 U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1758 return !isError && str.doReplace(str.length(), 0, buffer, 0, cLength).isWritable();
// Appends `length` code units starting at s to str: doReplace() is called at
// position str.length() with a replaced length of 0.
// Returns what isWritable() reports on the string doReplace() returns.
1762 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
1763 return str.doReplace(str.length(), 0, s, 0, length).isWritable();
// Asks str for room for appendCapacity more code units beyond its current
// length by requesting a capacity of str.length() + appendCapacity from
// cloneArrayIfNeeded(), and returns that call's result.
1767 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1768 return str.cloneArrayIfNeeded(str.length() + appendCapacity);
// Offers the caller writable space for code units to be appended.
//   minCapacity         - minimum space required; must be >= 1.
//   desiredCapacityHint - preferred space; passed on as the grow capacity.
//   scratch / scratchCapacity - caller-provided fallback buffer; its
//                         capacity must be at least minCapacity.
//   resultCapacity      - out: capacity of the space that is handed back.
// NOTE(review): the pointers returned on the rejection and fallback paths
// are not visible in this excerpt.
1772 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1773 int32_t desiredCapacityHint,
1774 UChar *scratch, int32_t scratchCapacity,
1775 int32_t *resultCapacity) {
// Reject unusable requests and report a capacity of 0.
1776 if(minCapacity < 1 || scratchCapacity < minCapacity) {
1777 *resultCapacity = 0;
1780 int32_t oldLength = str.length();
// Preferred path: make room inside str itself and hand out the area that
// starts right after the current contents.
1781 if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1782 *resultCapacity = str.getCapacity() - oldLength;
1783 return str.getArrayStart() + oldLength;
// str could not provide the room: report the scratch buffer's capacity.
1785 *resultCapacity = scratchCapacity;
// Hashes a UElement key whose pointer member refers to a UnicodeString:
// yields 0 for a NULL pointer, otherwise the string's hashCode().
1793 U_CAPI int32_t U_EXPORT2
1794 uhash_hashUnicodeString(const UElement key) {
1795 const UnicodeString *str = (const UnicodeString*) key.pointer;
1796 return (str == NULL) ? 0 : str->hashCode();
1799 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1800 // does not depend on hashtable code.
// Compares two UElement keys whose pointer members refer to UnicodeString
// objects.
1801 U_CAPI UBool U_EXPORT2
1802 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1803 const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
1804 const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
// A NULL on either side is handled before either pointer is dereferenced.
// NOTE(review): the result of this branch is not visible in this excerpt.
1808 if (str1 == NULL || str2 == NULL) {
// Both pointers are non-NULL here: compare the strings with operator==.
1811 return *str1 == *str2;
1814 #ifdef U_STATIC_IMPLEMENTATION
1816 This should never be called. It is defined here to make sure that the
1817 virtual vector deleting destructor is defined within unistr.cpp.
1818 The vector deleting destructor is already a part of UObject,
1819 but defining it here makes sure that it is included with this object file.
1820 This makes sure that static library dependencies are kept to a minimum.
1822 static void uprv_UnicodeStringDummy(void) {
1823 delete [] (new UnicodeString[2]);