1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2001-2015, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: ustrcase.cpp
12 * tab size: 8 (not used)
15 * created on: 2002feb20
16 * created by: Markus W. Scherer
18 * Implementation file for string casing C API functions.
19 * Uses functions from uchar.c for basic functionality that requires access
20 * to the Unicode Character Database (uprops.dat).
23 #include "unicode/utypes.h"
24 #include "unicode/brkiter.h"
25 #include "unicode/ustring.h"
26 #include "unicode/ucasemap.h"
27 #include "unicode/ubrk.h"
28 #include "unicode/utf.h"
29 #include "unicode/utf16.h"
37 /* string casing ------------------------------------------------------------ */
39 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
// Writes the decoded mapping result at dest[destIndex]; callers in this file
// assign the return value back to their destIndex.
// result is the encoded value returned by a full case mapping function and
// s is the mapped string that is read when result denotes a string length.
// NOTE(review): this listing omits several lines of the function (local
// declarations and parts of the decode branches); only visible code is described.
41 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
42 int32_t result, const UChar *s) {
46 /* decode the result */
48 /* (not) original code point */
51 } else if(result<=UCASE_MAX_STRING_LENGTH) {
// Refuse to advance the index beyond INT32_MAX; -1 reports that overflow.
58 if(length>(INT32_MAX-destIndex)) {
59 return -1; // integer overflow
62 if(destIndex<destCapacity) {
63 /* append the result */
67 U16_APPEND(dest, destIndex, destCapacity, c, isError);
69 /* overflow, nothing written */
// String result: code units are copied from s only when the whole string
// fits into the remaining capacity.
74 if((destIndex+length)<=destCapacity) {
76 dest[destIndex++]=*s++;
// Appends the single code unit c to dest at destIndex.
// The first branch is taken when there is still room (destIndex<destCapacity);
// when there is no room and destIndex is already INT32_MAX the function
// returns -1 because the index cannot be advanced any further.
// NOTE(review): the store and the regular return statement are not visible
// in this listing.
92 appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) {
93 if(destIndex<destCapacity) {
95 } else if(destIndex==INT32_MAX) {
96 return -1; // integer overflow
// Appends length code units starting at s to dest at destIndex.
// Returns -1 when destIndex+length would exceed INT32_MAX.
// The copy is performed only when the whole run fits into destCapacity.
// NOTE(review): the regular return value is not visible in this listing;
// callers assign the result back to their destIndex.
101 static inline int32_t
102 appendString(UChar *dest, int32_t destIndex, int32_t destCapacity,
103 const UChar *s, int32_t length) {
105 if(length>(INT32_MAX-destIndex)) {
106 return -1; // integer overflow
108 if((destIndex+length)<=destCapacity) {
109 u_memcpy(dest+destIndex, s, length);
// Context iterator callback over UTF-16 text. It is handed to the full case
// mapping functions in this file together with a UCaseContext so that they
// can inspect code points before and after the one being mapped.
// context is the UCaseContext; dir selects between resetting for backward
// iteration, resetting for forward iteration, and continuing in the current
// direction (the tests on dir are not visible in this listing).
116 static UChar32 U_CALLCONV
117 utf16_caseContextIterator(void *context, int8_t dir) {
118 UCaseContext *csc=(UCaseContext *)context;
122 /* reset for backward iteration */
// Backward iteration begins at cpStart, the start of the current code point.
123 csc->index=csc->cpStart;
126 /* reset for forward iteration */
// Forward iteration begins at cpLimit, just past the current code point.
127 csc->index=csc->cpLimit;
130 /* continue current iteration direction */
// Read the previous code point only while index is above the context start.
135 if(csc->start<csc->index) {
136 U16_PREV((const UChar *)csc->p, csc->start, csc->index, c);
// Read the next code point only while index is below the context limit.
140 if(csc->index<csc->limit) {
141 U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c);
149 * Case-maps [srcStart..srcLimit[ but takes
150 * context [0..srcLength[ into account.
// map is the per-code-point full mapping function; callers in this file pass
// ucase_toFullLower or ucase_toFullUpper. csc is the context object that
// utf16_caseContextIterator reads while map runs.
// NOTE(review): some declarations and the return statement are not visible
// in this listing.
153 _caseMap(const UCaseMap *csm, UCaseMapFull *map,
154 UChar *dest, int32_t destCapacity,
155 const UChar *src, UCaseContext *csc,
156 int32_t srcStart, int32_t srcLimit,
157 UErrorCode *pErrorCode) {
160 int32_t srcIndex, destIndex;
163 locCache=csm->locCache;
165 /* case mapping loop */
168 while(srcIndex<srcLimit) {
// Record where the current code point starts and ends so that the context
// iterator can walk away from it in either direction.
169 csc->cpStart=srcIndex;
170 U16_NEXT(src, srcIndex, srcLimit, c);
171 csc->cpLimit=srcIndex;
172 c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache);
// Fast path: a negative c is decoded with ~c, and a value above
// UCASE_MAX_STRING_LENGTH is used as a code point directly. Either one is
// stored as a single code unit when it is at most 0xffff and there is room.
173 if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
174 /* fast path version of appendResult() for BMP results */
175 dest[destIndex++]=(UChar)c2;
177 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
// NOTE(review): the condition guarding this assignment is not visible here;
// presumably it tests for the -1 overflow result of appendResult.
179 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
// An output index beyond the capacity is reported as a buffer overflow.
185 if(destIndex>destCapacity) {
186 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
191 #if !UCONFIG_NO_BREAK_ITERATION
// Titlecases src[0..srcLength[ into dest. Segment boundaries come from the
// break iterator stored in csm->iter. Inside each segment the first cased
// letter is titlecased; the remainder is lowercased unless
// U_TITLECASE_NO_LOWERCASE is set, in which case it is copied unchanged.
// U_BUFFER_OVERFLOW_ERROR is set when destIndex ends up above destCapacity.
// NOTE(review): this listing omits a number of lines (declarations, the
// break iterator calls, returns); only visible statements are described.
193 U_CFUNC int32_t U_CALLCONV
194 ustrcase_internalToTitle(const UCaseMap *csm,
195 UChar *dest, int32_t destCapacity,
196 const UChar *src, int32_t srcLength,
197 UErrorCode *pErrorCode) {
200 int32_t prev, titleStart, titleLimit, idx, destIndex;
203 if(U_FAILURE(*pErrorCode)) {
207 // Use the C++ abstract base class to minimize dependencies.
208 // TODO: Change UCaseMap.iter to store a BreakIterator directly.
209 BreakIterator *bi=reinterpret_cast<BreakIterator *>(csm->iter);
211 /* set up local variables */
212 int32_t locCache=csm->locCache;
213 UCaseContext csc=UCASECONTEXT_INITIALIZER;
220 /* titlecasing loop */
221 while(prev<srcLength) {
222 /* find next index where to titlecase */
// A finished iterator or a boundary beyond the input is handled specially
// (the body of this branch is not visible in this listing).
229 if(idx==UBRK_DONE || idx>srcLength) {
234 * Unicode 4 & 5 section 3.13 Default Case Operations:
236 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
237 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
238 * cased character F. If F exists, map F to default_title(F); then map each
239 * subsequent character C to default_lower(C).
241 * In this implementation, segment [prev..index[ into 3 parts:
242 * a) uncased characters (copy as-is) [prev..titleStart[
243 * b) first case letter (titlecase) [titleStart..titleLimit[
244 * c) subsequent characters (lowercase) [titleLimit..index[
247 /* find and copy uncased characters [prev..titleStart[ */
248 titleStart=titleLimit=prev;
249 U16_NEXT(src, titleLimit, idx, c);
// Unless U_TITLECASE_NO_BREAK_ADJUSTMENT is set, leading uncased characters
// are skipped so that titleStart lands on the first cased character.
250 if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
251 /* Adjust the titlecasing index (titleStart) to the next cased character. */
253 titleStart=titleLimit;
254 if(titleLimit==idx) {
256 * only uncased characters in [prev..index[
257 * stop with titleStart==titleLimit==index
261 U16_NEXT(src, titleLimit, idx, c);
262 if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
263 break; /* cased letter at [titleStart..titleLimit[ */
// Part a) of the segment: the uncased prefix is copied unchanged.
266 destIndex=appendString(dest, destIndex, destCapacity, src+prev, titleStart-prev);
268 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
273 if(titleStart<titleLimit) {
274 /* titlecase c which is from [titleStart..titleLimit[ */
275 csc.cpStart=titleStart;
276 csc.cpLimit=titleLimit;
277 c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, &csc, &s, csm->locale, &locCache);
278 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
280 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
284 /* Special case Dutch IJ titlecasing */
// For the Dutch case locale, when the titlecased letter is I or i and the
// next code unit is J or j, a capital J (0x004A) is emitted for it as well.
285 if (titleStart+1 < idx &&
286 ucase_getCaseLocale(csm->locale,&locCache) == UCASE_LOC_DUTCH &&
287 (src[titleStart] == 0x0049 || src[titleStart] == 0x0069) &&
288 (src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A)) {
289 destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A);
291 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
297 /* lowercase [titleLimit..index[ */
299 if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
300 /* Normal operation: Lowercase the rest of the word. */
// The lowercasing call writes into the unused tail of dest, which is why
// the pointer and the capacity are both offset by destIndex.
303 csm, ucase_toFullLower,
304 dest+destIndex, destCapacity-destIndex,
308 if(U_FAILURE(*pErrorCode)) {
312 /* Optionally just copy the rest of the word unchanged. */
313 destIndex=appendString(dest, destIndex, destCapacity, src+titleLimit, idx-titleLimit);
315 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
// An output index beyond the capacity is reported as a buffer overflow.
326 if(destIndex>destCapacity) {
327 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
332 #endif // !UCONFIG_NO_BREAK_ITERATION
335 namespace GreekUpper {
337 // Data generated by prototype code, see
338 // http://site.icu-project.org/design/case/greek-upper
339 // TODO: Move this data into ucase.icu.
// Lookup table read by getLetterData() as data0370[c - 0x370] for code
// points up to 0x3ff. Each visible entry is an uppercase Greek code point
// OR-ed with HAS_* property flags; toUpper() extracts the code point with
// UPPER_MASK.
// NOTE(review): this listing shows only a subset of the table rows.
340 static const uint16_t data0370[] = {
364 0x0391 | HAS_VOWEL | HAS_ACCENT,
366 0x0395 | HAS_VOWEL | HAS_ACCENT,
367 0x0397 | HAS_VOWEL | HAS_ACCENT,
368 0x0399 | HAS_VOWEL | HAS_ACCENT,
370 0x039F | HAS_VOWEL | HAS_ACCENT,
372 0x03A5 | HAS_VOWEL | HAS_ACCENT,
373 0x03A9 | HAS_VOWEL | HAS_ACCENT,
374 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
400 0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
401 0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
402 0x0391 | HAS_VOWEL | HAS_ACCENT,
403 0x0395 | HAS_VOWEL | HAS_ACCENT,
404 0x0397 | HAS_VOWEL | HAS_ACCENT,
405 0x0399 | HAS_VOWEL | HAS_ACCENT,
406 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
432 0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
433 0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
434 0x039F | HAS_VOWEL | HAS_ACCENT,
435 0x03A5 | HAS_VOWEL | HAS_ACCENT,
436 0x03A9 | HAS_VOWEL | HAS_ACCENT,
442 0x03D2 | HAS_DIALYTIKA,
// Lookup table read by getLetterData() as data1F00[c - 0x1f00] for code
// points from 0x1f00 up to 0x1fff. Each visible entry is an uppercase Greek
// vowel code point OR-ed with HAS_VOWEL and further HAS_* flags
// (HAS_ACCENT, HAS_YPOGEGRAMMENI, HAS_DIALYTIKA).
// NOTE(review): this listing shows only a subset of the table rows, so the
// position of a row in this listing is not its array index.
488 static const uint16_t data1F00[] = {
492 0x0391 | HAS_VOWEL | HAS_ACCENT,
493 0x0391 | HAS_VOWEL | HAS_ACCENT,
494 0x0391 | HAS_VOWEL | HAS_ACCENT,
495 0x0391 | HAS_VOWEL | HAS_ACCENT,
496 0x0391 | HAS_VOWEL | HAS_ACCENT,
497 0x0391 | HAS_VOWEL | HAS_ACCENT,
500 0x0391 | HAS_VOWEL | HAS_ACCENT,
501 0x0391 | HAS_VOWEL | HAS_ACCENT,
502 0x0391 | HAS_VOWEL | HAS_ACCENT,
503 0x0391 | HAS_VOWEL | HAS_ACCENT,
504 0x0391 | HAS_VOWEL | HAS_ACCENT,
505 0x0391 | HAS_VOWEL | HAS_ACCENT,
508 0x0395 | HAS_VOWEL | HAS_ACCENT,
509 0x0395 | HAS_VOWEL | HAS_ACCENT,
510 0x0395 | HAS_VOWEL | HAS_ACCENT,
511 0x0395 | HAS_VOWEL | HAS_ACCENT,
516 0x0395 | HAS_VOWEL | HAS_ACCENT,
517 0x0395 | HAS_VOWEL | HAS_ACCENT,
518 0x0395 | HAS_VOWEL | HAS_ACCENT,
519 0x0395 | HAS_VOWEL | HAS_ACCENT,
524 0x0397 | HAS_VOWEL | HAS_ACCENT,
525 0x0397 | HAS_VOWEL | HAS_ACCENT,
526 0x0397 | HAS_VOWEL | HAS_ACCENT,
527 0x0397 | HAS_VOWEL | HAS_ACCENT,
528 0x0397 | HAS_VOWEL | HAS_ACCENT,
529 0x0397 | HAS_VOWEL | HAS_ACCENT,
532 0x0397 | HAS_VOWEL | HAS_ACCENT,
533 0x0397 | HAS_VOWEL | HAS_ACCENT,
534 0x0397 | HAS_VOWEL | HAS_ACCENT,
535 0x0397 | HAS_VOWEL | HAS_ACCENT,
536 0x0397 | HAS_VOWEL | HAS_ACCENT,
537 0x0397 | HAS_VOWEL | HAS_ACCENT,
540 0x0399 | HAS_VOWEL | HAS_ACCENT,
541 0x0399 | HAS_VOWEL | HAS_ACCENT,
542 0x0399 | HAS_VOWEL | HAS_ACCENT,
543 0x0399 | HAS_VOWEL | HAS_ACCENT,
544 0x0399 | HAS_VOWEL | HAS_ACCENT,
545 0x0399 | HAS_VOWEL | HAS_ACCENT,
548 0x0399 | HAS_VOWEL | HAS_ACCENT,
549 0x0399 | HAS_VOWEL | HAS_ACCENT,
550 0x0399 | HAS_VOWEL | HAS_ACCENT,
551 0x0399 | HAS_VOWEL | HAS_ACCENT,
552 0x0399 | HAS_VOWEL | HAS_ACCENT,
553 0x0399 | HAS_VOWEL | HAS_ACCENT,
556 0x039F | HAS_VOWEL | HAS_ACCENT,
557 0x039F | HAS_VOWEL | HAS_ACCENT,
558 0x039F | HAS_VOWEL | HAS_ACCENT,
559 0x039F | HAS_VOWEL | HAS_ACCENT,
564 0x039F | HAS_VOWEL | HAS_ACCENT,
565 0x039F | HAS_VOWEL | HAS_ACCENT,
566 0x039F | HAS_VOWEL | HAS_ACCENT,
567 0x039F | HAS_VOWEL | HAS_ACCENT,
572 0x03A5 | HAS_VOWEL | HAS_ACCENT,
573 0x03A5 | HAS_VOWEL | HAS_ACCENT,
574 0x03A5 | HAS_VOWEL | HAS_ACCENT,
575 0x03A5 | HAS_VOWEL | HAS_ACCENT,
576 0x03A5 | HAS_VOWEL | HAS_ACCENT,
577 0x03A5 | HAS_VOWEL | HAS_ACCENT,
581 0x03A5 | HAS_VOWEL | HAS_ACCENT,
583 0x03A5 | HAS_VOWEL | HAS_ACCENT,
585 0x03A5 | HAS_VOWEL | HAS_ACCENT,
588 0x03A9 | HAS_VOWEL | HAS_ACCENT,
589 0x03A9 | HAS_VOWEL | HAS_ACCENT,
590 0x03A9 | HAS_VOWEL | HAS_ACCENT,
591 0x03A9 | HAS_VOWEL | HAS_ACCENT,
592 0x03A9 | HAS_VOWEL | HAS_ACCENT,
593 0x03A9 | HAS_VOWEL | HAS_ACCENT,
596 0x03A9 | HAS_VOWEL | HAS_ACCENT,
597 0x03A9 | HAS_VOWEL | HAS_ACCENT,
598 0x03A9 | HAS_VOWEL | HAS_ACCENT,
599 0x03A9 | HAS_VOWEL | HAS_ACCENT,
600 0x03A9 | HAS_VOWEL | HAS_ACCENT,
601 0x03A9 | HAS_VOWEL | HAS_ACCENT,
602 0x0391 | HAS_VOWEL | HAS_ACCENT,
603 0x0391 | HAS_VOWEL | HAS_ACCENT,
604 0x0395 | HAS_VOWEL | HAS_ACCENT,
605 0x0395 | HAS_VOWEL | HAS_ACCENT,
606 0x0397 | HAS_VOWEL | HAS_ACCENT,
607 0x0397 | HAS_VOWEL | HAS_ACCENT,
608 0x0399 | HAS_VOWEL | HAS_ACCENT,
609 0x0399 | HAS_VOWEL | HAS_ACCENT,
610 0x039F | HAS_VOWEL | HAS_ACCENT,
611 0x039F | HAS_VOWEL | HAS_ACCENT,
612 0x03A5 | HAS_VOWEL | HAS_ACCENT,
613 0x03A5 | HAS_VOWEL | HAS_ACCENT,
614 0x03A9 | HAS_VOWEL | HAS_ACCENT,
615 0x03A9 | HAS_VOWEL | HAS_ACCENT,
618 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
619 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
620 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
621 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
622 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
623 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
624 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
625 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
626 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
627 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
628 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
629 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
630 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
631 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
632 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
633 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
634 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
635 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
636 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
637 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
638 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
639 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
640 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
641 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
642 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
643 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
644 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
645 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
646 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
647 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
648 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
649 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
650 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
651 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
652 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
653 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
654 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
655 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
656 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
657 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
658 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
659 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
660 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
661 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
662 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
663 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
664 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
665 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
668 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
669 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
670 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
672 0x0391 | HAS_VOWEL | HAS_ACCENT,
673 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
676 0x0391 | HAS_VOWEL | HAS_ACCENT,
677 0x0391 | HAS_VOWEL | HAS_ACCENT,
678 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
684 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
685 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
686 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
688 0x0397 | HAS_VOWEL | HAS_ACCENT,
689 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
690 0x0395 | HAS_VOWEL | HAS_ACCENT,
691 0x0395 | HAS_VOWEL | HAS_ACCENT,
692 0x0397 | HAS_VOWEL | HAS_ACCENT,
693 0x0397 | HAS_VOWEL | HAS_ACCENT,
694 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
700 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
701 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
704 0x0399 | HAS_VOWEL | HAS_ACCENT,
705 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
708 0x0399 | HAS_VOWEL | HAS_ACCENT,
709 0x0399 | HAS_VOWEL | HAS_ACCENT,
716 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
717 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
720 0x03A5 | HAS_VOWEL | HAS_ACCENT,
721 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
724 0x03A5 | HAS_VOWEL | HAS_ACCENT,
725 0x03A5 | HAS_VOWEL | HAS_ACCENT,
732 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
733 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
734 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
736 0x03A9 | HAS_VOWEL | HAS_ACCENT,
737 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
738 0x039F | HAS_VOWEL | HAS_ACCENT,
739 0x039F | HAS_VOWEL | HAS_ACCENT,
740 0x03A9 | HAS_VOWEL | HAS_ACCENT,
741 0x03A9 | HAS_VOWEL | HAS_ACCENT,
742 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
// Letter data for the single code point 0x2126, which getLetterData() tests
// for separately: uppercase form 0x03A9 flagged as a vowel.
749 static const uint16_t data2126 = 0x03A9 | HAS_VOWEL;
// Returns the Greek letter data word for code point c.
// Code points below 0x370, above 0x2126, or strictly between 0x3ff and
// 0x1f00 take the first branch. Otherwise the value comes from data0370
// (0x370 to 0x3ff) or data1F00 (0x1f00 to 0x1fff), with 0x2126 tested last.
// NOTE(review): the bodies of the first and last branch are not visible in
// this listing; presumably they return 0 and data2126 - confirm.
751 uint32_t getLetterData(UChar32 c) {
752 if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) {
754 } else if (c <= 0x3ff) {
755 return data0370[c - 0x370];
756 } else if (c <= 0x1fff) {
757 return data1F00[c - 0x1f00];
758 } else if (c == 0x2126) {
// Maps a combining mark to the HAS_* flags it contributes to the preceding
// Greek letter. toUpper() treats a result of 0 as not a Greek diacritic.
// NOTE(review): the switch header, the return for the first group of case
// labels, and the default branch are not visible in this listing.
765 uint32_t getDiacriticData(UChar32 c) {
767 case 0x0300: // varia
768 case 0x0301: // tonos = oxia
769 case 0x0342: // perispomeni
770 case 0x0302: // circumflex can look like perispomeni
771 case 0x0303: // tilde can look like perispomeni
772 case 0x0311: // inverted breve can look like perispomeni
774 case 0x0308: // dialytika = diaeresis
775 return HAS_COMBINING_DIALYTIKA;
776 case 0x0344: // dialytika tonos
777 return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
778 case 0x0345: // ypogegrammeni = iota subscript
779 return HAS_YPOGEGRAMMENI;
780 case 0x0304: // macron
781 case 0x0306: // breve
782 case 0x0313: // comma above
783 case 0x0314: // reversed comma above
784 case 0x0343: // koronis
785 return HAS_OTHER_GREEK_DIACRITIC;
// Looks ahead in s from index i (up to length) and reports whether the next
// character that is not case-ignorable is a cased letter.
// Case-ignorable characters are skipped; a cased character yields TRUE; an
// uncased, not case-ignorable character yields FALSE; running out of text
// yields FALSE.
// NOTE(review): the loop header is not visible in this listing.
791 UBool isFollowedByCasedLetter(const UCaseProps *csp, const UChar *s, int32_t i, int32_t length) {
794 U16_NEXT(s, i, length, c);
795 int32_t type = ucase_getTypeOrIgnorable(csp, c);
796 if ((type & UCASE_IGNORABLE) != 0) {
797 // Case-ignorable, continue with the loop.
798 } else if (type != UCASE_NONE) {
799 return TRUE; // Followed by cased letter.
801 return FALSE; // Uncased and not case-ignorable.
804 return FALSE; // Not followed by cased letter.
808 * Greek string uppercasing with a state machine.
809 * Probably simpler than a stateless function that has to figure out complex context-before
810 * for each character.
811 * TODO: Try to re-consolidate one way or another with the non-Greek function.
// Uppercases src[0..srcLength[ into dest with Greek-specific handling of
// accents, dialytika and ypogegrammeni. state holds flags describing the
// previous character and nextState is assembled for the following one.
// U_BUFFER_OVERFLOW_ERROR is set when destIndex ends up above destCapacity.
// NOTE(review): several lines (declarations, some branch headers, returns)
// are not visible in this listing; only visible statements are described.
813 int32_t toUpper(const UCaseMap *csm,
814 UChar *dest, int32_t destCapacity,
815 const UChar *src, int32_t srcLength,
816 UErrorCode *pErrorCode) {
817 int32_t locCache = UCASE_LOC_GREEK;
820 for (int32_t i = 0; i < srcLength;) {
821 int32_t nextIndex = i;
823 U16_NEXT(src, nextIndex, srcLength, c);
824 uint32_t nextState = 0;
825 int32_t type = ucase_getTypeOrIgnorable(csm->csp, c);
// A case-ignorable character carries the AFTER_CASED flag forward unchanged;
// a cased character sets it.
826 if ((type & UCASE_IGNORABLE) != 0) {
827 // c is case-ignorable
828 nextState |= (state & AFTER_CASED);
829 } else if (type != UCASE_NONE) {
831 nextState |= AFTER_CASED;
833 uint32_t data = getLetterData(c);
835 uint32_t upper = data & UPPER_MASK;
836 // Add a dialytika to this iota or ypsilon vowel
837 // if we removed a tonos from the previous vowel,
838 // and that previous vowel did not also have (or gain) a dialytika.
839 // Adding one only to the final vowel in a longer sequence
840 // (which does not occur in normal writing) would require lookahead.
841 // Set the same flag as for preserving an existing dialytika.
842 if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
843 (upper == 0x399 || upper == 0x3A5)) {
844 data |= HAS_DIALYTIKA;
846 int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota.
847 if ((data & HAS_YPOGEGRAMMENI) != 0) {
848 numYpogegrammeni = 1;
850 // Skip combining diacritics after this Greek letter.
// The flags of every following Greek diacritic are merged into data; the
// scan stops at the first code unit for which getDiacriticData returns 0.
851 while (nextIndex < srcLength) {
852 uint32_t diacriticData = getDiacriticData(src[nextIndex]);
853 if (diacriticData != 0) {
854 data |= diacriticData;
855 if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
860 break; // not a Greek diacritic
// Remember a vowel that has an accent but no dialytika, so that the next
// iota or ypsilon can receive a dialytika (see the test on state above).
863 if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
864 nextState |= AFTER_VOWEL_WITH_ACCENT;
866 // Map according to Greek rules.
867 UBool addTonos = FALSE;
// Special case for an accented eta (0x397) without ypogegrammeni that is
// neither preceded nor followed by a cased letter.
868 if (upper == 0x397 &&
869 (data & HAS_ACCENT) != 0 &&
870 numYpogegrammeni == 0 &&
871 (state & AFTER_CASED) == 0 &&
872 !isFollowedByCasedLetter(csm->csp, src, nextIndex, srcLength)) {
873 // Keep disjunctive "or" with (only) a tonos.
874 // We use the same "word boundary" conditions as for the Final_Sigma test.
875 if (i == nextIndex) {
876 upper = 0x389; // Preserve the precomposed form.
880 } else if ((data & HAS_DIALYTIKA) != 0) {
881 // Preserve a vowel with dialytika in precomposed form if it exists.
882 if (upper == 0x399) {
884 data &= ~HAS_EITHER_DIALYTIKA;
885 } else if (upper == 0x3A5) {
887 data &= ~HAS_EITHER_DIALYTIKA;
// Output order: base letter, then combining dialytika (0x308) if still
// flagged, then tonos (0x301) if requested, then one capital iota (0x399)
// per ypogegrammeni. Each step is skipped once destIndex has gone negative.
890 destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper);
891 if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
892 destIndex=appendUChar(dest, destIndex, destCapacity, 0x308); // restore or add a dialytika
894 if (destIndex >= 0 && addTonos) {
895 destIndex=appendUChar(dest, destIndex, destCapacity, 0x301);
897 while (destIndex >= 0 && numYpogegrammeni > 0) {
898 destIndex=appendUChar(dest, destIndex, destCapacity, 0x399);
902 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
// Generic path through ucase_toFullUpper without a context iterator.
// NOTE(review): the branch header is not visible; presumably this handles
// characters for which getLetterData found no Greek data - confirm.
908 c=ucase_toFullUpper(csm->csp, c, NULL, NULL, &s, csm->locale, &locCache);
909 if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
910 /* fast path version of appendResult() for BMP results */
911 dest[destIndex++]=(UChar)c2;
913 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
915 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
// An output index beyond the capacity is reported as a buffer overflow.
924 if(destIndex>destCapacity) {
925 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
930 } // namespace GreekUpper
933 /* functions available in the common library (for unistr_case.cpp) */
// Lowercases src[0..srcLength[ into dest using ucase_toFullLower as the
// per-code-point mapping, with a freshly initialized UCaseContext and the
// whole input (0 to srcLength) as the range to map.
// NOTE(review): the callee name and some arguments are not visible in this
// listing; the argument pattern matches the parameter list of _caseMap.
935 U_CFUNC int32_t U_CALLCONV
936 ustrcase_internalToLower(const UCaseMap *csm,
937 UChar *dest, int32_t destCapacity,
938 const UChar *src, int32_t srcLength,
939 UErrorCode *pErrorCode) {
940 UCaseContext csc=UCASECONTEXT_INITIALIZER;
944 csm, ucase_toFullLower,
946 src, &csc, 0, srcLength,
// Uppercases src[0..srcLength[ into dest.
// When the case locale resolves to UCASE_LOC_GREEK the work is delegated to
// GreekUpper::toUpper; otherwise ucase_toFullUpper is used as the
// per-code-point mapping over the whole input.
// A local copy of csm->locCache is passed to ucase_getCaseLocale because
// csm is const.
// NOTE(review): the callee name of the non-Greek path is not visible in
// this listing; the argument pattern matches the parameter list of _caseMap.
950 U_CFUNC int32_t U_CALLCONV
951 ustrcase_internalToUpper(const UCaseMap *csm,
952 UChar *dest, int32_t destCapacity,
953 const UChar *src, int32_t srcLength,
954 UErrorCode *pErrorCode) {
955 int32_t locCache = csm->locCache;
956 if (ucase_getCaseLocale(csm->locale, &locCache) == UCASE_LOC_GREEK) {
957 return GreekUpper::toUpper(csm, dest, destCapacity, src, srcLength, pErrorCode);
959 UCaseContext csc=UCASECONTEXT_INITIALIZER;
963 csm, ucase_toFullUpper,
965 src, &csc, 0, srcLength,
// Case-folds src[0..srcLength[ into dest, one code point at a time, using
// ucase_toFullFolding with the given options. No context iterator is used.
// U_BUFFER_OVERFLOW_ERROR is set when destIndex ends up above destCapacity.
// NOTE(review): the options parameter line, local declarations and the
// return statement are not visible in this listing.
970 ustr_foldCase(const UCaseProps *csp,
971 UChar *dest, int32_t destCapacity,
972 const UChar *src, int32_t srcLength,
974 UErrorCode *pErrorCode) {
975 int32_t srcIndex, destIndex;
980 /* case mapping loop */
981 srcIndex=destIndex=0;
982 while(srcIndex<srcLength) {
983 U16_NEXT(src, srcIndex, srcLength, c);
984 c=ucase_toFullFolding(csp, c, &s, options);
// Same fast path as in the other mapping loops of this file: a single
// result code unit is stored directly when it fits, everything else goes
// through appendResult.
985 if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
986 /* fast path version of appendResult() for BMP results */
987 dest[destIndex++]=(UChar)c2;
989 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
991 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
997 if(destIndex>destCapacity) {
998 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
// Adapter that gives case folding the same parameter list as the other
// ustrcase_internal functions: it forwards to ustr_foldCase, taking the
// case properties and the folding options from csm.
1003 U_CFUNC int32_t U_CALLCONV
1004 ustrcase_internalFold(const UCaseMap *csm,
1005 UChar *dest, int32_t destCapacity,
1006 const UChar *src, int32_t srcLength,
1007 UErrorCode *pErrorCode) {
1008 return ustr_foldCase(csm->csp, dest, destCapacity, src, srcLength, csm->options, pErrorCode);
// Common driver for the string case mapping functions: validates the
// arguments, resolves the source length, runs stringCaseMapper, and
// terminates the result with u_terminateUChars.
// When src and dest overlap, the mapper writes into a temporary buffer
// (a stack buffer when destCapacity fits, otherwise heap memory) and the
// result is copied to dest afterwards.
// Errors set here: U_ILLEGAL_ARGUMENT_ERROR for bad arguments and
// U_MEMORY_ALLOCATION_ERROR when the temporary buffer cannot be allocated.
// NOTE(review): several lines (declarations, some conditions, cleanup of
// the temporary buffer) are not visible in this listing.
1012 ustrcase_map(const UCaseMap *csm,
1013 UChar *dest, int32_t destCapacity,
1014 const UChar *src, int32_t srcLength,
1015 UStringCaseMapper *stringCaseMapper,
1016 UErrorCode *pErrorCode) {
1022 /* check argument values */
1023 if(U_FAILURE(*pErrorCode)) {
1026 if( destCapacity<0 ||
1027 (dest==NULL && destCapacity>0) ||
1031 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1035 /* get the string length */
1037 srcLength=u_strlen(src);
1040 /* check for overlapping source and destination */
// Overlap means that either buffer starts inside the other one.
1042 ((src>=dest && src<(dest+destCapacity)) ||
1043 (dest>=src && dest<(src+srcLength)))
1045 /* overlap: provide a temporary destination buffer and later copy the result */
1046 if(destCapacity<=UPRV_LENGTHOF(buffer)) {
1047 /* the stack buffer is large enough */
1050 /* allocate a buffer */
// NOTE(review): the byte count is a product of destCapacity and
// U_SIZEOF_UCHAR; confirm it cannot overflow for very large capacities.
1051 temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR);
1053 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1061 destLength=stringCaseMapper(csm, temp, destCapacity, src, srcLength, pErrorCode);
1063 /* copy the result string to the destination buffer */
// destLength may be larger than destCapacity, so the copy is clamped.
1065 int32_t copyLength= destLength<=destCapacity ? destLength : destCapacity;
1067 u_memmove(dest, temp, copyLength);
1075 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
1078 /* public API functions */
// Public entry point for case folding a UTF-16 string.
// Builds a temporary UCaseMap that holds the singleton case properties and
// the caller options, then runs ustrcase_map with ustrcase_internalFold as
// the mapper.
// NOTE(review): the options parameter line and the first arguments of the
// ustrcase_map call are not visible in this listing.
1080 U_CAPI int32_t U_EXPORT2
1081 u_strFoldCase(UChar *dest, int32_t destCapacity,
1082 const UChar *src, int32_t srcLength,
1084 UErrorCode *pErrorCode) {
1085 UCaseMap csm=UCASEMAP_INITIALIZER;
1086 csm.csp=ucase_getSingleton();
1087 csm.options=options;
1088 return ustrcase_map(
1092 ustrcase_internalFold, pErrorCode);
1095 /* case-insensitive string comparisons -------------------------------------- */
1098 * This function is a copy of unorm_cmpEquivFold() minus the parts for
1099 * canonical equivalence.
1100 * Keep the functions in sync, and see there for how this works.
1101 * The duplication is for modularization:
1102 * It makes caseless (but not canonical caseless) matches independent of
1103 * the normalization code.
1106 /* stack element for previous-level source/decomposition pointers */
// _cmpFold saves start and limit (and the current position s) here before it
// switches to reading from a case folding buffer, and restores them when
// that buffer has been consumed.
1107 struct CmpEquivLevel {
1108 const UChar *start, *s, *limit;
1110 typedef struct CmpEquivLevel CmpEquivLevel;
1113 * Internal implementation code comparing string with case fold.
1114 * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch().
1116 * @param s1 input string 1
1117 * @param length1 length of string 1, or -1 (NULL terminated)
1118 * @param s2 input string 2
1119 * @param length2 length of string 2, or -1 (NULL terminated)
1120 * @param options compare options
1121 * @param matchLen1 (output) length of partial prefix match in s1
1122 * @param matchLen2 (output) length of partial prefix match in s2
1123 * @param pErrorCode receives error status
1124 * @return The result of comparison
// NOTE(review): a large part of this function is not visible in this
// listing (setup, loop headers, pointer advances, returns). The comments
// added below describe only the statements that are visible.
1126 static int32_t _cmpFold(
1127 const UChar *s1, int32_t length1,
1128 const UChar *s2, int32_t length2,
1130 int32_t *matchLen1, int32_t *matchLen2,
1131 UErrorCode *pErrorCode) {
1134 const UCaseProps *csp;
1136 /* current-level start/limit - s1/s2 as current */
1137 const UChar *start1, *start2, *limit1, *limit2;
1139 /* points to the original start address */
1140 const UChar *org1, *org2;
1142 /* points to the end of match + 1 */
1143 const UChar *m1, *m2;
1145 /* case folding variables */
1149 /* stacks of previous-level start/current/limit */
1150 CmpEquivLevel stack1[2], stack2[2];
1152 /* case folding buffers, only use current-level start/limit */
1153 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
1155 /* track which is the current level per string */
1156 int32_t level1, level2;
1158 /* current code units, and code points for lookups */
1159 UChar32 c1, c2, cp1, cp2;
1161 /* no argument error checking because this itself is not an API */
1164 * assume that at least the option U_COMPARE_IGNORE_CASE is set
1165 * otherwise this function would have to behave exactly as uprv_strCompare()
1167 csp=ucase_getSingleton();
1168 if(U_FAILURE(*pErrorCode)) {
1174 U_ASSERT(matchLen2 !=NULL);
1196 /* comparison loop */
1199 * here a code unit value of -1 means "get another code unit"
1200 * below it will mean "this source is finished"
1204 /* get next code unit from string 1, post-increment */
// The current level of string 1 ends when s1 reaches limit1, or when a NUL
// is read while limit1 is NULL or _STRNCMP_STYLE is set in options.
1206 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
1216 /* reached end of level buffer, pop one level */
1219 start1=stack1[level1].start; /*Not uninitialized*/
1220 } while(start1==NULL);
1221 s1=stack1[level1].s; /*Not uninitialized*/
1222 limit1=stack1[level1].limit; /*Not uninitialized*/
1227 /* get next code unit from string 2, post-increment */
// Same end-of-level test as for string 1.
1229 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
1239 /* reached end of level buffer, pop one level */
1242 start2=stack2[level2].start; /*Not uninitialized*/
1243 } while(start2==NULL);
1244 s2=stack2[level2].s; /*Not uninitialized*/
1245 limit2=stack2[level2].limit; /*Not uninitialized*/
1251 * either variable c1, c2 is -1 only if the corresponding string is finished
1254 const UChar *next1, *next2;
1257 cmpRes=0; /* c1==c2==-1 indicating end of strings */
1262 * Note: Move the match positions in both strings at the same time
1263 * only when corresponding code point(s) in the original strings
1264 * are fully consumed. For example, when comparing s1="Fust" and
1265 * s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches
1266 * the first code point in the case-folded data. But the second "s"
1267 * has no matching code point in s1, so this implementation returns
1268 * 2 as the prefix match length ("Fu").
1273 } else if(s1==limit1) {
1274 /* Note: This implementation only use a single level of stack.
1275 * If this code needs to be changed to use multiple levels
1276 * of stacks, the code above should check if the current
1277 * code is at the end of all stacks.
1279 U_ASSERT(level1==1);
1281 /* is s1 at the end of the current stack? */
1288 } else if(s2==limit2) {
1289 U_ASSERT(level2==1);
1291 /* is s2 at the end of the current stack? */
1299 c1=c2=-1; /* make us fetch new code units */
1302 cmpRes=-1; /* string 1 ends before string 2 */
1305 cmpRes=1; /* string 2 ends before string 1 */
1308 /* c1!=c2 && c1>=0 && c2>=0 */
1310 /* get complete code points for c1, c2 for lookups if either is a surrogate */
// A lead surrogate is combined with the following trail unit; a trail
// surrogate is combined with the lead unit two positions back (s1 has
// already been advanced past c1).
1312 if(U_IS_SURROGATE(c1)) {
1315 if(U_IS_SURROGATE_LEAD(c1)) {
1316 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
1317 /* advance ++s1; only below if cp1 decomposes/case-folds */
1318 cp1=U16_GET_SUPPLEMENTARY(c1, c);
1320 } else /* isTrail(c1) */ {
1321 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
1322 cp1=U16_GET_SUPPLEMENTARY(c, c1);
1328 if(U_IS_SURROGATE(c2)) {
1331 if(U_IS_SURROGATE_LEAD(c2)) {
1332 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
1333 /* advance ++s2; only below if cp2 decomposes/case-folds */
1334 cp2=U16_GET_SUPPLEMENTARY(c2, c);
1336 } else /* isTrail(c2) */ {
1337 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
1338 cp2=U16_GET_SUPPLEMENTARY(c, c2);
// A non-negative result from ucase_toFullFolding means cp1 has a case
// folding; the folded text is then placed in fold1 and read as a nested
// level for string 1.
1344 * go down one level for each string
1345 * continue with the main loop as soon as there is a real change
1349 (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0
1351 /* cp1 case-folds to the code point "length" or to p[length] */
1352 if(U_IS_SURROGATE(c1)) {
1353 if(U_IS_SURROGATE_LEAD(c1)) {
1354 /* advance beyond source surrogate pair if it case-folds */
1356 } else /* isTrail(c1) */ {
1358 * we got a supplementary code point when hitting its trail surrogate,
1359 * therefore the lead surrogate must have been the same as in the other string;
1360 * compare this decomposition with the lead surrogate in the other string
1361 * remember that this simulates bulk text replacement:
1362 * the decomposition would replace the entire code point
1370 /* push current level pointers */
1371 stack1[0].start=start1;
1373 stack1[0].limit=limit1;
1376 /* copy the folding result to fold1[] */
// A length up to UCASE_MAX_STRING_LENGTH is a string length and p is
// copied; otherwise length is itself the folded code point and is encoded
// into fold1.
1377 if(length<=UCASE_MAX_STRING_LENGTH) {
1378 u_memcpy(fold1, p, length);
1381 U16_APPEND_UNSAFE(fold1, i, length);
1385 /* set next level pointers to case folding */
1387 limit1=fold1+length;
1389 /* get ready to read from decomposition, continue with loop */
// Same handling for string 2, using fold2 and stack2.
1395 (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0
1397 /* cp2 case-folds to the code point "length" or to p[length] */
1398 if(U_IS_SURROGATE(c2)) {
1399 if(U_IS_SURROGATE_LEAD(c2)) {
1400 /* advance beyond source surrogate pair if it case-folds */
1402 } else /* isTrail(c2) */ {
1404 * we got a supplementary code point when hitting its trail surrogate,
1405 * therefore the lead surrogate must have been the same as in the other string;
1406 * compare this decomposition with the lead surrogate in the other string
1407 * remember that this simulates bulk text replacement:
1408 * the decomposition would replace the entire code point
1416 /* push current level pointers */
1417 stack2[0].start=start2;
1419 stack2[0].limit=limit2;
1422 /* copy the folding result to fold2[] */
1423 if(length<=UCASE_MAX_STRING_LENGTH) {
1424 u_memcpy(fold2, p, length);
1427 U16_APPEND_UNSAFE(fold2, i, length);
1431 /* set next level pointers to case folding */
1433 limit2=fold2+length;
1435 /* get ready to read from decomposition, continue with loop */
1441 * no decomposition/case folding, max level for both sides:
1442 * return difference result
1444 * code point order comparison must not just return cp1-cp2
1445 * because when single surrogates are present then the surrogate pairs
1446 * that formed cp1 and cp2 may be from different string indexes
1448 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
1449 * c1=d800 cp1=10001 c2=dc00 cp2=10000
1450 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
1452 * therefore, use same fix-up as in ustring.c/uprv_strCompare()
1453 * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
1454 * so we have slightly different pointer/start/limit comparisons here
// The fix-up applies only when both code units are at or above 0xd800 and
// the caller asked for code point order.
1457 if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
1458 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
1460 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
1461 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
1463 /* part of a surrogate pair, leave >=d800 */
1465 /* BMP code point - may be surrogate code point - make <d800 */
1470 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
1471 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
1473 /* part of a surrogate pair, leave >=d800 */
1475 /* BMP code point - may be surrogate code point - make <d800 */
1491 /* internal function */
// Case-folding comparison without prefix-match reporting: forwards to
// _cmpFold and passes NULL for both match length outputs.
// NOTE(review): the return type line and the options parameter line are not
// visible in this listing.
1493 u_strcmpFold(const UChar *s1, int32_t length1,
1494 const UChar *s2, int32_t length2,
1496 UErrorCode *pErrorCode) {
1497 return _cmpFold(s1, length1, s2, length2, options, NULL, NULL, pErrorCode);
1500 /* public API functions */
// Public case-insensitive comparison with explicit lengths and an error
// code. Does nothing further when pErrorCode is null or already holds a
// failure. Sets U_ILLEGAL_ARGUMENT_ERROR when either string pointer is
// NULL or either length is below -1. Otherwise forwards to u_strcmpFold
// with U_COMPARE_IGNORE_CASE added to the options.
// NOTE(review): the options parameter line and the early return values are
// not visible in this listing.
1502 U_CAPI int32_t U_EXPORT2
1503 u_strCaseCompare(const UChar *s1, int32_t length1,
1504 const UChar *s2, int32_t length2,
1506 UErrorCode *pErrorCode) {
1507 /* argument checking */
1508 if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
1511 if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
1512 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1515 return u_strcmpFold(s1, length1, s2, length2,
1516 options|U_COMPARE_IGNORE_CASE,
// Case-insensitive comparison of two NUL-terminated strings: both lengths
// are passed as -1 and U_COMPARE_IGNORE_CASE is added to the options.
// The error code is a local variable and is not reported to the caller.
1520 U_CAPI int32_t U_EXPORT2
1521 u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) {
1522 UErrorCode errorCode=U_ZERO_ERROR;
1523 return u_strcmpFold(s1, -1, s2, -1,
1524 options|U_COMPARE_IGNORE_CASE,
// Case-insensitive comparison of two buffers that share the same explicit
// length; U_COMPARE_IGNORE_CASE is added to the options.
// The error code is a local variable and is not reported to the caller.
1528 U_CAPI int32_t U_EXPORT2
1529 u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) {
1530 UErrorCode errorCode=U_ZERO_ERROR;
1531 return u_strcmpFold(s1, length, s2, length,
1532 options|U_COMPARE_IGNORE_CASE,
// Case-insensitive comparison of at most n code units of each string.
// _STRNCMP_STYLE is added to the options; in _cmpFold that flag makes a NUL
// code unit end a string even though an explicit length was given.
// The error code is a local variable and is not reported to the caller.
1536 U_CAPI int32_t U_EXPORT2
1537 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) {
1538 UErrorCode errorCode=U_ZERO_ERROR;
1539 return u_strcmpFold(s1, n, s2, n,
1540 options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE),
1544 /* internal API - detect length of shared prefix */
1546 u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1,
1547 const UChar *s2, int32_t length2,
1549 int32_t *matchLen1, int32_t *matchLen2,
1550 UErrorCode *pErrorCode) {
1551 _cmpFold(s1, length1, s2, length2, options,
1552 matchLen1, matchLen2, pErrorCode);