2 *******************************************************************************
4 * Copyright (C) 2001-2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ustrcase.cpp
10 * tab size: 8 (not used)
13 * created on: 2002feb20
14 * created by: Markus W. Scherer
16 * Implementation file for string casing C API functions.
17 * Uses functions from uchar.c for basic functionality that requires access
18 * to the Unicode Character Database (uprops.dat).
21 #include "unicode/utypes.h"
22 #include "unicode/brkiter.h"
23 #include "unicode/ustring.h"
24 #include "unicode/ucasemap.h"
25 #include "unicode/ubrk.h"
26 #include "unicode/utf.h"
27 #include "unicode/utf16.h"
32 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
36 /* string casing ------------------------------------------------------------ */
38 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
40 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
41 int32_t result, const UChar *s) {
45 /* decode the result */
47 /* (not) original code point */
50 } else if(result<=UCASE_MAX_STRING_LENGTH) {
58 if(destIndex<destCapacity) {
59 /* append the result */
63 U16_APPEND(dest, destIndex, destCapacity, c, isError);
65 /* overflow, nothing written */
66 destIndex+=U16_LENGTH(c);
70 if((destIndex+length)<=destCapacity) {
72 dest[destIndex++]=*s++;
83 destIndex+=U16_LENGTH(c);
91 static UChar32 U_CALLCONV
92 utf16_caseContextIterator(void *context, int8_t dir) {
93 UCaseContext *csc=(UCaseContext *)context;
97 /* reset for backward iteration */
98 csc->index=csc->cpStart;
101 /* reset for forward iteration */
102 csc->index=csc->cpLimit;
105 /* continue current iteration direction */
110 if(csc->start<csc->index) {
111 U16_PREV((const UChar *)csc->p, csc->start, csc->index, c);
115 if(csc->index<csc->limit) {
116 U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c);
124 * Case-maps [srcStart..srcLimit[ but takes
125 * context [0..srcLength[ into account.
128 _caseMap(const UCaseMap *csm, UCaseMapFull *map,
129 UChar *dest, int32_t destCapacity,
130 const UChar *src, UCaseContext *csc,
131 int32_t srcStart, int32_t srcLimit,
132 UErrorCode *pErrorCode) {
135 int32_t srcIndex, destIndex;
138 locCache=csm->locCache;
140 /* case mapping loop */
143 while(srcIndex<srcLimit) {
144 csc->cpStart=srcIndex;
145 U16_NEXT(src, srcIndex, srcLimit, c);
146 csc->cpLimit=srcIndex;
147 c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache);
148 if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
149 /* fast path version of appendResult() for BMP results */
150 dest[destIndex++]=(UChar)c2;
152 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
156 if(destIndex>destCapacity) {
157 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
162 #if !UCONFIG_NO_BREAK_ITERATION
164 U_CFUNC int32_t U_CALLCONV
165 ustrcase_internalToTitle(const UCaseMap *csm,
166 UChar *dest, int32_t destCapacity,
167 const UChar *src, int32_t srcLength,
168 UErrorCode *pErrorCode) {
171 int32_t prev, titleStart, titleLimit, idx, destIndex, length;
174 if(U_FAILURE(*pErrorCode)) {
178 // Use the C++ abstract base class to minimize dependencies.
179 // TODO: Change UCaseMap.iter to store a BreakIterator directly.
180 BreakIterator *bi=reinterpret_cast<BreakIterator *>(csm->iter);
182 /* set up local variables */
183 int32_t locCache=csm->locCache;
184 UCaseContext csc=UCASECONTEXT_INITIALIZER;
191 /* titlecasing loop */
192 while(prev<srcLength) {
193 /* find next index where to titlecase */
200 if(idx==UBRK_DONE || idx>srcLength) {
205 * Unicode 4 & 5 section 3.13 Default Case Operations:
207 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
208 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
209 * cased character F. If F exists, map F to default_title(F); then map each
210 * subsequent character C to default_lower(C).
212 * In this implementation, segment [prev..index[ into 3 parts:
213 * a) uncased characters (copy as-is) [prev..titleStart[
214 * b) first case letter (titlecase) [titleStart..titleLimit[
215 * c) subsequent characters (lowercase) [titleLimit..index[
218 /* find and copy uncased characters [prev..titleStart[ */
219 titleStart=titleLimit=prev;
220 U16_NEXT(src, titleLimit, idx, c);
221 if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
222 /* Adjust the titlecasing index (titleStart) to the next cased character. */
224 titleStart=titleLimit;
225 if(titleLimit==idx) {
227 * only uncased characters in [prev..index[
228 * stop with titleStart==titleLimit==index
232 U16_NEXT(src, titleLimit, idx, c);
233 if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
234 break; /* cased letter at [titleStart..titleLimit[ */
237 length=titleStart-prev;
239 if((destIndex+length)<=destCapacity) {
240 uprv_memcpy(dest+destIndex, src+prev, length*U_SIZEOF_UCHAR);
246 if(titleStart<titleLimit) {
247 /* titlecase c which is from [titleStart..titleLimit[ */
248 csc.cpStart=titleStart;
249 csc.cpLimit=titleLimit;
250 c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, &csc, &s, csm->locale, &locCache);
251 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
253 /* Special case Dutch IJ titlecasing */
254 if ( titleStart+1 < idx &&
255 ucase_getCaseLocale(csm->locale,&locCache) == UCASE_LOC_DUTCH &&
256 ( src[titleStart] == (UChar32) 0x0049 || src[titleStart] == (UChar32) 0x0069 ) &&
257 ( src[titleStart+1] == (UChar32) 0x004A || src[titleStart+1] == (UChar32) 0x006A )) {
259 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
263 /* lowercase [titleLimit..index[ */
265 if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
266 /* Normal operation: Lowercase the rest of the word. */
269 csm, ucase_toFullLower,
270 dest+destIndex, destCapacity-destIndex,
275 /* Optionally just copy the rest of the word unchanged. */
276 length=idx-titleLimit;
277 if((destIndex+length)<=destCapacity) {
278 uprv_memcpy(dest+destIndex, src+titleLimit, length*U_SIZEOF_UCHAR);
289 if(destIndex>destCapacity) {
290 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
295 #endif // !UCONFIG_NO_BREAK_ITERATION
297 /* functions available in the common library (for unistr_case.cpp) */
299 U_CFUNC int32_t U_CALLCONV
300 ustrcase_internalToLower(const UCaseMap *csm,
301 UChar *dest, int32_t destCapacity,
302 const UChar *src, int32_t srcLength,
303 UErrorCode *pErrorCode) {
304 UCaseContext csc=UCASECONTEXT_INITIALIZER;
308 csm, ucase_toFullLower,
310 src, &csc, 0, srcLength,
314 U_CFUNC int32_t U_CALLCONV
315 ustrcase_internalToUpper(const UCaseMap *csm,
316 UChar *dest, int32_t destCapacity,
317 const UChar *src, int32_t srcLength,
318 UErrorCode *pErrorCode) {
319 UCaseContext csc=UCASECONTEXT_INITIALIZER;
323 csm, ucase_toFullUpper,
325 src, &csc, 0, srcLength,
330 ustr_foldCase(const UCaseProps *csp,
331 UChar *dest, int32_t destCapacity,
332 const UChar *src, int32_t srcLength,
334 UErrorCode *pErrorCode) {
335 int32_t srcIndex, destIndex;
340 /* case mapping loop */
341 srcIndex=destIndex=0;
342 while(srcIndex<srcLength) {
343 U16_NEXT(src, srcIndex, srcLength, c);
344 c=ucase_toFullFolding(csp, c, &s, options);
345 if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
346 /* fast path version of appendResult() for BMP results */
347 dest[destIndex++]=(UChar)c2;
349 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
353 if(destIndex>destCapacity) {
354 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
359 U_CFUNC int32_t U_CALLCONV
360 ustrcase_internalFold(const UCaseMap *csm,
361 UChar *dest, int32_t destCapacity,
362 const UChar *src, int32_t srcLength,
363 UErrorCode *pErrorCode) {
364 return ustr_foldCase(csm->csp, dest, destCapacity, src, srcLength, csm->options, pErrorCode);
368 ustrcase_map(const UCaseMap *csm,
369 UChar *dest, int32_t destCapacity,
370 const UChar *src, int32_t srcLength,
371 UStringCaseMapper *stringCaseMapper,
372 UErrorCode *pErrorCode) {
378 /* check argument values */
379 if(U_FAILURE(*pErrorCode)) {
382 if( destCapacity<0 ||
383 (dest==NULL && destCapacity>0) ||
387 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
391 /* get the string length */
393 srcLength=u_strlen(src);
396 /* check for overlapping source and destination */
398 ((src>=dest && src<(dest+destCapacity)) ||
399 (dest>=src && dest<(src+srcLength)))
401 /* overlap: provide a temporary destination buffer and later copy the result */
402 if(destCapacity<=LENGTHOF(buffer)) {
403 /* the stack buffer is large enough */
406 /* allocate a buffer */
407 temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR);
409 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
417 destLength=stringCaseMapper(csm, temp, destCapacity, src, srcLength, pErrorCode);
419 /* copy the result string to the destination buffer */
421 int32_t copyLength= destLength<=destCapacity ? destLength : destCapacity;
423 uprv_memmove(dest, temp, copyLength*U_SIZEOF_UCHAR);
431 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
434 /* public API functions */
436 U_CAPI int32_t U_EXPORT2
437 u_strFoldCase(UChar *dest, int32_t destCapacity,
438 const UChar *src, int32_t srcLength,
440 UErrorCode *pErrorCode) {
441 UCaseMap csm=UCASEMAP_INITIALIZER;
442 csm.csp=ucase_getSingleton();
448 ustrcase_internalFold, pErrorCode);
451 /* case-insensitive string comparisons -------------------------------------- */
454 * This function is a copy of unorm_cmpEquivFold() minus the parts for
455 * canonical equivalence.
456 * Keep the functions in sync, and see there for how this works.
457 * The duplication is for modularization:
458 * It makes caseless (but not canonical caseless) matches independent of
459 * the normalization code.
462 /* stack element for previous-level source/decomposition pointers */
463 struct CmpEquivLevel {
464 const UChar *start, *s, *limit;
466 typedef struct CmpEquivLevel CmpEquivLevel;
468 /* internal function */
470 u_strcmpFold(const UChar *s1, int32_t length1,
471 const UChar *s2, int32_t length2,
473 UErrorCode *pErrorCode) {
474 const UCaseProps *csp;
476 /* current-level start/limit - s1/s2 as current */
477 const UChar *start1, *start2, *limit1, *limit2;
479 /* case folding variables */
483 /* stacks of previous-level start/current/limit */
484 CmpEquivLevel stack1[2], stack2[2];
486 /* case folding buffers, only use current-level start/limit */
487 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
489 /* track which is the current level per string */
490 int32_t level1, level2;
492 /* current code units, and code points for lookups */
493 UChar32 c1, c2, cp1, cp2;
495 /* no argument error checking because this itself is not an API */
498 * assume that at least the option U_COMPARE_IGNORE_CASE is set
499 * otherwise this function would have to behave exactly as uprv_strCompare()
501 csp=ucase_getSingleton();
502 if(U_FAILURE(*pErrorCode)) {
524 /* comparison loop */
527 * here a code unit value of -1 means "get another code unit"
528 * below it will mean "this source is finished"
532 /* get next code unit from string 1, post-increment */
534 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
544 /* reached end of level buffer, pop one level */
547 start1=stack1[level1].start; /*Not uninitialized*/
548 } while(start1==NULL);
549 s1=stack1[level1].s; /*Not uninitialized*/
550 limit1=stack1[level1].limit; /*Not uninitialized*/
555 /* get next code unit from string 2, post-increment */
557 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
567 /* reached end of level buffer, pop one level */
570 start2=stack2[level2].start; /*Not uninitialized*/
571 } while(start2==NULL);
572 s2=stack2[level2].s; /*Not uninitialized*/
573 limit2=stack2[level2].limit; /*Not uninitialized*/
579 * either variable c1, c2 is -1 only if the corresponding string is finished
583 return 0; /* c1==c2==-1 indicating end of strings */
585 c1=c2=-1; /* make us fetch new code units */
588 return -1; /* string 1 ends before string 2 */
590 return 1; /* string 2 ends before string 1 */
592 /* c1!=c2 && c1>=0 && c2>=0 */
594 /* get complete code points for c1, c2 for lookups if either is a surrogate */
596 if(U_IS_SURROGATE(c1)) {
599 if(U_IS_SURROGATE_LEAD(c1)) {
600 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
601 /* advance ++s1; only below if cp1 decomposes/case-folds */
602 cp1=U16_GET_SUPPLEMENTARY(c1, c);
604 } else /* isTrail(c1) */ {
605 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
606 cp1=U16_GET_SUPPLEMENTARY(c, c1);
612 if(U_IS_SURROGATE(c2)) {
615 if(U_IS_SURROGATE_LEAD(c2)) {
616 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
617 /* advance ++s2; only below if cp2 decomposes/case-folds */
618 cp2=U16_GET_SUPPLEMENTARY(c2, c);
620 } else /* isTrail(c2) */ {
621 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
622 cp2=U16_GET_SUPPLEMENTARY(c, c2);
628 * go down one level for each string
629 * continue with the main loop as soon as there is a real change
633 (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0
635 /* cp1 case-folds to the code point "length" or to p[length] */
636 if(U_IS_SURROGATE(c1)) {
637 if(U_IS_SURROGATE_LEAD(c1)) {
638 /* advance beyond source surrogate pair if it case-folds */
640 } else /* isTrail(c1) */ {
642 * we got a supplementary code point when hitting its trail surrogate,
643 * therefore the lead surrogate must have been the same as in the other string;
644 * compare this decomposition with the lead surrogate in the other string
645 * remember that this simulates bulk text replacement:
646 * the decomposition would replace the entire code point
653 /* push current level pointers */
654 stack1[0].start=start1;
656 stack1[0].limit=limit1;
659 /* copy the folding result to fold1[] */
660 if(length<=UCASE_MAX_STRING_LENGTH) {
661 u_memcpy(fold1, p, length);
664 U16_APPEND_UNSAFE(fold1, i, length);
668 /* set next level pointers to case folding */
672 /* get ready to read from decomposition, continue with loop */
678 (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0
680 /* cp2 case-folds to the code point "length" or to p[length] */
681 if(U_IS_SURROGATE(c2)) {
682 if(U_IS_SURROGATE_LEAD(c2)) {
683 /* advance beyond source surrogate pair if it case-folds */
685 } else /* isTrail(c2) */ {
687 * we got a supplementary code point when hitting its trail surrogate,
688 * therefore the lead surrogate must have been the same as in the other string;
689 * compare this decomposition with the lead surrogate in the other string
690 * remember that this simulates bulk text replacement:
691 * the decomposition would replace the entire code point
698 /* push current level pointers */
699 stack2[0].start=start2;
701 stack2[0].limit=limit2;
704 /* copy the folding result to fold2[] */
705 if(length<=UCASE_MAX_STRING_LENGTH) {
706 u_memcpy(fold2, p, length);
709 U16_APPEND_UNSAFE(fold2, i, length);
713 /* set next level pointers to case folding */
717 /* get ready to read from decomposition, continue with loop */
723 * no decomposition/case folding, max level for both sides:
724 * return difference result
726 * code point order comparison must not just return cp1-cp2
727 * because when single surrogates are present then the surrogate pairs
728 * that formed cp1 and cp2 may be from different string indexes
730 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
731 * c1=d800 cp1=10001 c2=dc00 cp2=10000
732 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
734 * therefore, use same fix-up as in ustring.c/uprv_strCompare()
735 * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
736 * so we have slightly different pointer/start/limit comparisons here
739 if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
740 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
742 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
743 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
745 /* part of a surrogate pair, leave >=d800 */
747 /* BMP code point - may be surrogate code point - make <d800 */
752 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
753 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
755 /* part of a surrogate pair, leave >=d800 */
757 /* BMP code point - may be surrogate code point - make <d800 */
766 /* public API functions */
768 U_CAPI int32_t U_EXPORT2
769 u_strCaseCompare(const UChar *s1, int32_t length1,
770 const UChar *s2, int32_t length2,
772 UErrorCode *pErrorCode) {
773 /* argument checking */
774 if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
777 if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
778 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
781 return u_strcmpFold(s1, length1, s2, length2,
782 options|U_COMPARE_IGNORE_CASE,
786 U_CAPI int32_t U_EXPORT2
787 u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) {
788 UErrorCode errorCode=U_ZERO_ERROR;
789 return u_strcmpFold(s1, -1, s2, -1,
790 options|U_COMPARE_IGNORE_CASE,
794 U_CAPI int32_t U_EXPORT2
795 u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) {
796 UErrorCode errorCode=U_ZERO_ERROR;
797 return u_strcmpFold(s1, length, s2, length,
798 options|U_COMPARE_IGNORE_CASE,
802 U_CAPI int32_t U_EXPORT2
803 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) {
804 UErrorCode errorCode=U_ZERO_ERROR;
805 return u_strcmpFold(s1, n, s2, n,
806 options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE),