source/common/ustrcase.cpp

   1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 *
   6 *   Copyright (C) 2001-2015, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 *******************************************************************************
  10 *   file name:  ustrcase.cpp
  11 *   encoding:   US-ASCII
  12 *   tab size:   8 (not used)
  13 *   indentation:4
  14 *
  15 *   created on: 2002feb20
  16 *   created by: Markus W. Scherer
  17 *
  18 *   Implementation file for string casing C API functions.
  19 *   Uses functions from uchar.c for basic functionality that requires access
  20 *   to the Unicode Character Database (uprops.dat).
  21 */
  22
  23 #include "unicode/utypes.h"
  24 #include "unicode/brkiter.h"
  25 #include "unicode/ustring.h"
  26 #include "unicode/ucasemap.h"
  27 #include "unicode/ubrk.h"
  28 #include "unicode/utf.h"
  29 #include "unicode/utf16.h"
  30 #include "cmemory.h"
  31 #include "ucase.h"
  32 #include "ustr_imp.h"
  33 #include "uassert.h"
  34
  35 U_NAMESPACE_USE
  36
  37 /* string casing ------------------------------------------------------------ */
  38
  39 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
  40 static inline int32_t
  41 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
  42              int32_t result, const UChar *s) {
  43     UChar32 c;
  44     int32_t length;
  45
  46     /* decode the result */
  47     if(result<0) {
  48         /* (not) original code point */
  49         c=~result;
  50         length=U16_LENGTH(c);
  51     } else if(result<=UCASE_MAX_STRING_LENGTH) {
  52         c=U_SENTINEL;
  53         length=result;
  54     } else {
  55         c=result;
  56         length=U16_LENGTH(c);
  57     }
  58     if(length>(INT32_MAX-destIndex)) {
  59         return -1;  // integer overflow
  60     }
  61
  62     if(destIndex<destCapacity) {
  63         /* append the result */
  64         if(c>=0) {
  65             /* code point */
  66             UBool isError=FALSE;
  67             U16_APPEND(dest, destIndex, destCapacity, c, isError);
  68             if(isError) {
  69                 /* overflow, nothing written */
  70                 destIndex+=length;
  71             }
  72         } else {
  73             /* string */
  74             if((destIndex+length)<=destCapacity) {
  75                 while(length>0) {
  76                     dest[destIndex++]=*s++;
  77                     --length;
  78                 }
  79             } else {
  80                 /* overflow */
  81                 destIndex+=length;
  82             }
  83         }
  84     } else {
  85         /* preflight */
  86         destIndex+=length;
  87     }
  88     return destIndex;
  89 }
  90
  91 static inline int32_t
  92 appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) {
  93     if(destIndex<destCapacity) {
  94         dest[destIndex]=c;
  95     } else if(destIndex==INT32_MAX) {
  96         return -1;  // integer overflow
  97     }
  98     return destIndex+1;
  99 }
 100
 101 static inline int32_t
 102 appendString(UChar *dest, int32_t destIndex, int32_t destCapacity,
 103              const UChar *s, int32_t length) {
 104     if(length>0) {
 105         if(length>(INT32_MAX-destIndex)) {
 106             return -1;  // integer overflow
 107         }
 108         if((destIndex+length)<=destCapacity) {
 109             u_memcpy(dest+destIndex, s, length);
 110         }
 111         destIndex+=length;
 112     }
 113     return destIndex;
 114 }
 115
 116 static UChar32 U_CALLCONV
 117 utf16_caseContextIterator(void *context, int8_t dir) {
 118     UCaseContext *csc=(UCaseContext *)context;
 119     UChar32 c;
 120
 121     if(dir<0) {
 122         /* reset for backward iteration */
 123         csc->index=csc->cpStart;
 124         csc->dir=dir;
 125     } else if(dir>0) {
 126         /* reset for forward iteration */
 127         csc->index=csc->cpLimit;
 128         csc->dir=dir;
 129     } else {
 130         /* continue current iteration direction */
 131         dir=csc->dir;
 132     }
 133
 134     if(dir<0) {
 135         if(csc->start<csc->index) {
 136             U16_PREV((const UChar *)csc->p, csc->start, csc->index, c);
 137             return c;
 138         }
 139     } else {
 140         if(csc->index<csc->limit) {
 141             U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c);
 142             return c;
 143         }
 144     }
 145     return U_SENTINEL;
 146 }
 147
 148 /*
 149  * Case-maps [srcStart..srcLimit[ but takes
 150  * context [0..srcLength[ into account.
 151  */
 152 static int32_t
 153 _caseMap(const UCaseMap *csm, UCaseMapFull *map,
 154          UChar *dest, int32_t destCapacity,
 155          const UChar *src, UCaseContext *csc,
 156          int32_t srcStart, int32_t srcLimit,
 157          UErrorCode *pErrorCode) {
 158     const UChar *s;
 159     UChar32 c, c2 = 0;
 160     int32_t srcIndex, destIndex;
 161     int32_t locCache;
 162
 163     locCache=csm->locCache;
 164
 165     /* case mapping loop */
 166     srcIndex=srcStart;
 167     destIndex=0;
 168     while(srcIndex<srcLimit) {
 169         csc->cpStart=srcIndex;
 170         U16_NEXT(src, srcIndex, srcLimit, c);
 171         csc->cpLimit=srcIndex;
 172         c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache);
 173         if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
 174             /* fast path version of appendResult() for BMP results */
 175             dest[destIndex++]=(UChar)c2;
 176         } else {
 177             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
 178             if(destIndex<0) {
 179                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 180                 return 0;
 181             }
 182         }
 183     }
 184
 185     if(destIndex>destCapacity) {
 186         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 187     }
 188     return destIndex;
 189 }
 190
 191 #if !UCONFIG_NO_BREAK_ITERATION
 192
 193 U_CFUNC int32_t U_CALLCONV
 194 ustrcase_internalToTitle(const UCaseMap *csm,
 195                          UChar *dest, int32_t destCapacity,
 196                          const UChar *src, int32_t srcLength,
 197                          UErrorCode *pErrorCode) {
 198     const UChar *s;
 199     UChar32 c;
 200     int32_t prev, titleStart, titleLimit, idx, destIndex;
 201     UBool isFirstIndex;
 202
 203     if(U_FAILURE(*pErrorCode)) {
 204         return 0;
 205     }
 206
 207     // Use the C++ abstract base class to minimize dependencies.
 208     // TODO: Change UCaseMap.iter to store a BreakIterator directly.
 209     BreakIterator *bi=reinterpret_cast<BreakIterator *>(csm->iter);
 210
 211     /* set up local variables */
 212     int32_t locCache=csm->locCache;
 213     UCaseContext csc=UCASECONTEXT_INITIALIZER;
 214     csc.p=(void *)src;
 215     csc.limit=srcLength;
 216     destIndex=0;
 217     prev=0;
 218     isFirstIndex=TRUE;
 219
 220     /* titlecasing loop */
 221     while(prev<srcLength) {
 222         /* find next index where to titlecase */
 223         if(isFirstIndex) {
 224             isFirstIndex=FALSE;
 225             idx=bi->first();
 226         } else {
 227             idx=bi->next();
 228         }
 229         if(idx==UBRK_DONE || idx>srcLength) {
 230             idx=srcLength;
 231         }
 232
 233         /*
 234          * Unicode 4 & 5 section 3.13 Default Case Operations:
 235          *
 236          * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
 237          * #29, "Text Boundaries." Between each pair of word boundaries, find the first
 238          * cased character F. If F exists, map F to default_title(F); then map each
 239          * subsequent character C to default_lower(C).
 240          *
 241          * In this implementation, segment [prev..index[ into 3 parts:
 242          * a) uncased characters (copy as-is) [prev..titleStart[
 243          * b) first case letter (titlecase)         [titleStart..titleLimit[
 244          * c) subsequent characters (lowercase)                 [titleLimit..index[
 245          */
 246         if(prev<idx) {
 247             /* find and copy uncased characters [prev..titleStart[ */
 248             titleStart=titleLimit=prev;
 249             U16_NEXT(src, titleLimit, idx, c);
 250             if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
 251                 /* Adjust the titlecasing index (titleStart) to the next cased character. */
 252                 for(;;) {
 253                     titleStart=titleLimit;
 254                     if(titleLimit==idx) {
 255                         /*
 256                          * only uncased characters in [prev..index[
 257                          * stop with titleStart==titleLimit==index
 258                          */
 259                         break;
 260                     }
 261                     U16_NEXT(src, titleLimit, idx, c);
 262                     if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
 263                         break; /* cased letter at [titleStart..titleLimit[ */
 264                     }
 265                 }
 266                 destIndex=appendString(dest, destIndex, destCapacity, src+prev, titleStart-prev);
 267                 if(destIndex<0) {
 268                     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 269                     return 0;
 270                 }
 271             }
 272
 273             if(titleStart<titleLimit) {
 274                 /* titlecase c which is from [titleStart..titleLimit[ */
 275                 csc.cpStart=titleStart;
 276                 csc.cpLimit=titleLimit;
 277                 c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, &csc, &s, csm->locale, &locCache);
 278                 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
 279                 if(destIndex<0) {
 280                     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 281                     return 0;
 282                 }
 283
 284                 /* Special case Dutch IJ titlecasing */
 285                 if (titleStart+1 < idx &&
 286                         ucase_getCaseLocale(csm->locale,&locCache) == UCASE_LOC_DUTCH &&
 287                         (src[titleStart] == 0x0049 || src[titleStart] == 0x0069) &&
 288                         (src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A)) {
 289                     destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A);
 290                     if(destIndex<0) {
 291                         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 292                         return 0;
 293                     }
 294                     titleLimit++;
 295                 }
 296
 297                 /* lowercase [titleLimit..index[ */
 298                 if(titleLimit<idx) {
 299                     if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
 300                         /* Normal operation: Lowercase the rest of the word. */
 301                         destIndex+=
 302                             _caseMap(
 303                                 csm, ucase_toFullLower,
 304                                 dest+destIndex, destCapacity-destIndex,
 305                                 src, &csc,
 306                                 titleLimit, idx,
 307                                 pErrorCode);
 308                         if(U_FAILURE(*pErrorCode)) {
 309                             return destIndex;
 310                         }
 311                     } else {
 312                         /* Optionally just copy the rest of the word unchanged. */
 313                         destIndex=appendString(dest, destIndex, destCapacity, src+titleLimit, idx-titleLimit);
 314                         if(destIndex<0) {
 315                             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 316                             return 0;
 317                         }
 318                     }
 319                 }
 320             }
 321         }
 322
 323         prev=idx;
 324     }
 325
 326     if(destIndex>destCapacity) {
 327         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 328     }
 329     return destIndex;
 330 }
 331
 332 #endif  // !UCONFIG_NO_BREAK_ITERATION
 333
 334 U_NAMESPACE_BEGIN
 335 namespace GreekUpper {
 336
 337 // Data generated by prototype code, see
 338 // http://site.icu-project.org/design/case/greek-upper
 339 // TODO: Move this data into ucase.icu.
 340 static const uint16_t data0370[] = {
 341     // U+0370..03FF
 342     0x0370,
 343     0x0370,
 344     0x0372,
 345     0x0372,
 346     0,
 347     0,
 348     0x0376,
 349     0x0376,
 350     0,
 351     0,
 352     0x037A,
 353     0x03FD,
 354     0x03FE,
 355     0x03FF,
 356     0,
 357     0x037F,
 358     0,
 359     0,
 360     0,
 361     0,
 362     0,
 363     0,
 364     0x0391 | HAS_VOWEL | HAS_ACCENT,
 365     0,
 366     0x0395 | HAS_VOWEL | HAS_ACCENT,
 367     0x0397 | HAS_VOWEL | HAS_ACCENT,
 368     0x0399 | HAS_VOWEL | HAS_ACCENT,
 369     0,
 370     0x039F | HAS_VOWEL | HAS_ACCENT,
 371     0,
 372     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 373     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 374     0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
 375     0x0391 | HAS_VOWEL,
 376     0x0392,
 377     0x0393,
 378     0x0394,
 379     0x0395 | HAS_VOWEL,
 380     0x0396,
 381     0x0397 | HAS_VOWEL,
 382     0x0398,
 383     0x0399 | HAS_VOWEL,
 384     0x039A,
 385     0x039B,
 386     0x039C,
 387     0x039D,
 388     0x039E,
 389     0x039F | HAS_VOWEL,
 390     0x03A0,
 391     0x03A1,
 392     0,
 393     0x03A3,
 394     0x03A4,
 395     0x03A5 | HAS_VOWEL,
 396     0x03A6,
 397     0x03A7,
 398     0x03A8,
 399     0x03A9 | HAS_VOWEL,
 400     0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
 401     0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
 402     0x0391 | HAS_VOWEL | HAS_ACCENT,
 403     0x0395 | HAS_VOWEL | HAS_ACCENT,
 404     0x0397 | HAS_VOWEL | HAS_ACCENT,
 405     0x0399 | HAS_VOWEL | HAS_ACCENT,
 406     0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
 407     0x0391 | HAS_VOWEL,
 408     0x0392,
 409     0x0393,
 410     0x0394,
 411     0x0395 | HAS_VOWEL,
 412     0x0396,
 413     0x0397 | HAS_VOWEL,
 414     0x0398,
 415     0x0399 | HAS_VOWEL,
 416     0x039A,
 417     0x039B,
 418     0x039C,
 419     0x039D,
 420     0x039E,
 421     0x039F | HAS_VOWEL,
 422     0x03A0,
 423     0x03A1,
 424     0x03A3,
 425     0x03A3,
 426     0x03A4,
 427     0x03A5 | HAS_VOWEL,
 428     0x03A6,
 429     0x03A7,
 430     0x03A8,
 431     0x03A9 | HAS_VOWEL,
 432     0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
 433     0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
 434     0x039F | HAS_VOWEL | HAS_ACCENT,
 435     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 436     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 437     0x03CF,
 438     0x0392,
 439     0x0398,
 440     0x03D2,
 441     0x03D2 | HAS_ACCENT,
 442     0x03D2 | HAS_DIALYTIKA,
 443     0x03A6,
 444     0x03A0,
 445     0x03CF,
 446     0x03D8,
 447     0x03D8,
 448     0x03DA,
 449     0x03DA,
 450     0x03DC,
 451     0x03DC,
 452     0x03DE,
 453     0x03DE,
 454     0x03E0,
 455     0x03E0,
 456     0,
 457     0,
 458     0,
 459     0,
 460     0,
 461     0,
 462     0,
 463     0,
 464     0,
 465     0,
 466     0,
 467     0,
 468     0,
 469     0,
 470     0x039A,
 471     0x03A1,
 472     0x03F9,
 473     0x037F,
 474     0x03F4,
 475     0x0395 | HAS_VOWEL,
 476     0,
 477     0x03F7,
 478     0x03F7,
 479     0x03F9,
 480     0x03FA,
 481     0x03FA,
 482     0x03FC,
 483     0x03FD,
 484     0x03FE,
 485     0x03FF,
 486 };
 487
 488 static const uint16_t data1F00[] = {
 489     // U+1F00..1FFF
 490     0x0391 | HAS_VOWEL,
 491     0x0391 | HAS_VOWEL,
 492     0x0391 | HAS_VOWEL | HAS_ACCENT,
 493     0x0391 | HAS_VOWEL | HAS_ACCENT,
 494     0x0391 | HAS_VOWEL | HAS_ACCENT,
 495     0x0391 | HAS_VOWEL | HAS_ACCENT,
 496     0x0391 | HAS_VOWEL | HAS_ACCENT,
 497     0x0391 | HAS_VOWEL | HAS_ACCENT,
 498     0x0391 | HAS_VOWEL,
 499     0x0391 | HAS_VOWEL,
 500     0x0391 | HAS_VOWEL | HAS_ACCENT,
 501     0x0391 | HAS_VOWEL | HAS_ACCENT,
 502     0x0391 | HAS_VOWEL | HAS_ACCENT,
 503     0x0391 | HAS_VOWEL | HAS_ACCENT,
 504     0x0391 | HAS_VOWEL | HAS_ACCENT,
 505     0x0391 | HAS_VOWEL | HAS_ACCENT,
 506     0x0395 | HAS_VOWEL,
 507     0x0395 | HAS_VOWEL,
 508     0x0395 | HAS_VOWEL | HAS_ACCENT,
 509     0x0395 | HAS_VOWEL | HAS_ACCENT,
 510     0x0395 | HAS_VOWEL | HAS_ACCENT,
 511     0x0395 | HAS_VOWEL | HAS_ACCENT,
 512     0,
 513     0,
 514     0x0395 | HAS_VOWEL,
 515     0x0395 | HAS_VOWEL,
 516     0x0395 | HAS_VOWEL | HAS_ACCENT,
 517     0x0395 | HAS_VOWEL | HAS_ACCENT,
 518     0x0395 | HAS_VOWEL | HAS_ACCENT,
 519     0x0395 | HAS_VOWEL | HAS_ACCENT,
 520     0,
 521     0,
 522     0x0397 | HAS_VOWEL,
 523     0x0397 | HAS_VOWEL,
 524     0x0397 | HAS_VOWEL | HAS_ACCENT,
 525     0x0397 | HAS_VOWEL | HAS_ACCENT,
 526     0x0397 | HAS_VOWEL | HAS_ACCENT,
 527     0x0397 | HAS_VOWEL | HAS_ACCENT,
 528     0x0397 | HAS_VOWEL | HAS_ACCENT,
 529     0x0397 | HAS_VOWEL | HAS_ACCENT,
 530     0x0397 | HAS_VOWEL,
 531     0x0397 | HAS_VOWEL,
 532     0x0397 | HAS_VOWEL | HAS_ACCENT,
 533     0x0397 | HAS_VOWEL | HAS_ACCENT,
 534     0x0397 | HAS_VOWEL | HAS_ACCENT,
 535     0x0397 | HAS_VOWEL | HAS_ACCENT,
 536     0x0397 | HAS_VOWEL | HAS_ACCENT,
 537     0x0397 | HAS_VOWEL | HAS_ACCENT,
 538     0x0399 | HAS_VOWEL,
 539     0x0399 | HAS_VOWEL,
 540     0x0399 | HAS_VOWEL | HAS_ACCENT,
 541     0x0399 | HAS_VOWEL | HAS_ACCENT,
 542     0x0399 | HAS_VOWEL | HAS_ACCENT,
 543     0x0399 | HAS_VOWEL | HAS_ACCENT,
 544     0x0399 | HAS_VOWEL | HAS_ACCENT,
 545     0x0399 | HAS_VOWEL | HAS_ACCENT,
 546     0x0399 | HAS_VOWEL,
 547     0x0399 | HAS_VOWEL,
 548     0x0399 | HAS_VOWEL | HAS_ACCENT,
 549     0x0399 | HAS_VOWEL | HAS_ACCENT,
 550     0x0399 | HAS_VOWEL | HAS_ACCENT,
 551     0x0399 | HAS_VOWEL | HAS_ACCENT,
 552     0x0399 | HAS_VOWEL | HAS_ACCENT,
 553     0x0399 | HAS_VOWEL | HAS_ACCENT,
 554     0x039F | HAS_VOWEL,
 555     0x039F | HAS_VOWEL,
 556     0x039F | HAS_VOWEL | HAS_ACCENT,
 557     0x039F | HAS_VOWEL | HAS_ACCENT,
 558     0x039F | HAS_VOWEL | HAS_ACCENT,
 559     0x039F | HAS_VOWEL | HAS_ACCENT,
 560     0,
 561     0,
 562     0x039F | HAS_VOWEL,
 563     0x039F | HAS_VOWEL,
 564     0x039F | HAS_VOWEL | HAS_ACCENT,
 565     0x039F | HAS_VOWEL | HAS_ACCENT,
 566     0x039F | HAS_VOWEL | HAS_ACCENT,
 567     0x039F | HAS_VOWEL | HAS_ACCENT,
 568     0,
 569     0,
 570     0x03A5 | HAS_VOWEL,
 571     0x03A5 | HAS_VOWEL,
 572     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 573     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 574     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 575     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 576     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 577     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 578     0,
 579     0x03A5 | HAS_VOWEL,
 580     0,
 581     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 582     0,
 583     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 584     0,
 585     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 586     0x03A9 | HAS_VOWEL,
 587     0x03A9 | HAS_VOWEL,
 588     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 589     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 590     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 591     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 592     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 593     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 594     0x03A9 | HAS_VOWEL,
 595     0x03A9 | HAS_VOWEL,
 596     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 597     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 598     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 599     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 600     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 601     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 602     0x0391 | HAS_VOWEL | HAS_ACCENT,
 603     0x0391 | HAS_VOWEL | HAS_ACCENT,
 604     0x0395 | HAS_VOWEL | HAS_ACCENT,
 605     0x0395 | HAS_VOWEL | HAS_ACCENT,
 606     0x0397 | HAS_VOWEL | HAS_ACCENT,
 607     0x0397 | HAS_VOWEL | HAS_ACCENT,
 608     0x0399 | HAS_VOWEL | HAS_ACCENT,
 609     0x0399 | HAS_VOWEL | HAS_ACCENT,
 610     0x039F | HAS_VOWEL | HAS_ACCENT,
 611     0x039F | HAS_VOWEL | HAS_ACCENT,
 612     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 613     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 614     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 615     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 616     0,
 617     0,
 618     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 619     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 620     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 621     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 622     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 623     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 624     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 625     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 626     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 627     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 628     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 629     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 630     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 631     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 632     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 633     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 634     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 635     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 636     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 637     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 638     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 639     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 640     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 641     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 642     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 643     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 644     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 645     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 646     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 647     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 648     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 649     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 650     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 651     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 652     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 653     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 654     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 655     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 656     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 657     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 658     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 659     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 660     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 661     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 662     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 663     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 664     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 665     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 666     0x0391 | HAS_VOWEL,
 667     0x0391 | HAS_VOWEL,
 668     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 669     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 670     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 671     0,
 672     0x0391 | HAS_VOWEL | HAS_ACCENT,
 673     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 674     0x0391 | HAS_VOWEL,
 675     0x0391 | HAS_VOWEL,
 676     0x0391 | HAS_VOWEL | HAS_ACCENT,
 677     0x0391 | HAS_VOWEL | HAS_ACCENT,
 678     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 679     0,
 680     0x0399 | HAS_VOWEL,
 681     0,
 682     0,
 683     0,
 684     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 685     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 686     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 687     0,
 688     0x0397 | HAS_VOWEL | HAS_ACCENT,
 689     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 690     0x0395 | HAS_VOWEL | HAS_ACCENT,
 691     0x0395 | HAS_VOWEL | HAS_ACCENT,
 692     0x0397 | HAS_VOWEL | HAS_ACCENT,
 693     0x0397 | HAS_VOWEL | HAS_ACCENT,
 694     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 695     0,
 696     0,
 697     0,
 698     0x0399 | HAS_VOWEL,
 699     0x0399 | HAS_VOWEL,
 700     0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
 701     0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
 702     0,
 703     0,
 704     0x0399 | HAS_VOWEL | HAS_ACCENT,
 705     0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
 706     0x0399 | HAS_VOWEL,
 707     0x0399 | HAS_VOWEL,
 708     0x0399 | HAS_VOWEL | HAS_ACCENT,
 709     0x0399 | HAS_VOWEL | HAS_ACCENT,
 710     0,
 711     0,
 712     0,
 713     0,
 714     0x03A5 | HAS_VOWEL,
 715     0x03A5 | HAS_VOWEL,
 716     0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
 717     0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
 718     0x03A1,
 719     0x03A1,
 720     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 721     0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
 722     0x03A5 | HAS_VOWEL,
 723     0x03A5 | HAS_VOWEL,
 724     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 725     0x03A5 | HAS_VOWEL | HAS_ACCENT,
 726     0x03A1,
 727     0,
 728     0,
 729     0,
 730     0,
 731     0,
 732     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 733     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 734     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 735     0,
 736     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 737     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
 738     0x039F | HAS_VOWEL | HAS_ACCENT,
 739     0x039F | HAS_VOWEL | HAS_ACCENT,
 740     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 741     0x03A9 | HAS_VOWEL | HAS_ACCENT,
 742     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
 743     0,
 744     0,
 745     0,
 746 };
 747
 748 // U+2126 Ohm sign
 749 static const uint16_t data2126 = 0x03A9 | HAS_VOWEL;
 750
 751 uint32_t getLetterData(UChar32 c) {
 752     if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) {
 753         return 0;
 754     } else if (c <= 0x3ff) {
 755         return data0370[c - 0x370];
 756     } else if (c <= 0x1fff) {
 757         return data1F00[c - 0x1f00];
 758     } else if (c == 0x2126) {
 759         return data2126;
 760     } else {
 761         return 0;
 762     }
 763 }
 764
 765 uint32_t getDiacriticData(UChar32 c) {
 766     switch (c) {
 767     case 0x0300:  // varia
 768     case 0x0301:  // tonos = oxia
 769     case 0x0342:  // perispomeni
 770     case 0x0302:  // circumflex can look like perispomeni
 771     case 0x0303:  // tilde can look like perispomeni
 772     case 0x0311:  // inverted breve can look like perispomeni
 773         return HAS_ACCENT;
 774     case 0x0308:  // dialytika = diaeresis
 775         return HAS_COMBINING_DIALYTIKA;
 776     case 0x0344:  // dialytika tonos
 777         return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
 778     case 0x0345:  // ypogegrammeni = iota subscript
 779         return HAS_YPOGEGRAMMENI;
 780     case 0x0304:  // macron
 781     case 0x0306:  // breve
 782     case 0x0313:  // comma above
 783     case 0x0314:  // reversed comma above
 784     case 0x0343:  // koronis
 785         return HAS_OTHER_GREEK_DIACRITIC;
 786     default:
 787         return 0;
 788     }
 789 }
 790
 791 UBool isFollowedByCasedLetter(const UCaseProps *csp, const UChar *s, int32_t i, int32_t length) {
 792     while (i < length) {
 793         UChar32 c;
 794         U16_NEXT(s, i, length, c);
 795         int32_t type = ucase_getTypeOrIgnorable(csp, c);
 796         if ((type & UCASE_IGNORABLE) != 0) {
 797             // Case-ignorable, continue with the loop.
 798         } else if (type != UCASE_NONE) {
 799             return TRUE;  // Followed by cased letter.
 800         } else {
 801             return FALSE;  // Uncased and not case-ignorable.
 802         }
 803     }
 804     return FALSE;  // Not followed by cased letter.
 805 }
 806
 807 /**
 808  * Greek string uppercasing with a state machine.
 809  * Probably simpler than a stateless function that has to figure out complex context-before
 810  * for each character.
 811  * TODO: Try to re-consolidate one way or another with the non-Greek function.
 812  */
 813 int32_t toUpper(const UCaseMap *csm,
 814                 UChar *dest, int32_t destCapacity,
 815                 const UChar *src, int32_t srcLength,
 816                 UErrorCode *pErrorCode) {
 817     int32_t locCache = UCASE_LOC_GREEK;
 818     int32_t destIndex=0;
 819     uint32_t state = 0;
 820     for (int32_t i = 0; i < srcLength;) {
 821         int32_t nextIndex = i;
 822         UChar32 c;
 823         U16_NEXT(src, nextIndex, srcLength, c);
 824         uint32_t nextState = 0;
 825         int32_t type = ucase_getTypeOrIgnorable(csm->csp, c);
 826         if ((type & UCASE_IGNORABLE) != 0) {
 827             // c is case-ignorable
 828             nextState |= (state & AFTER_CASED);
 829         } else if (type != UCASE_NONE) {
 830             // c is cased
 831             nextState |= AFTER_CASED;
 832         }
 833         uint32_t data = getLetterData(c);
 834         if (data > 0) {
 835             uint32_t upper = data & UPPER_MASK;
 836             // Add a dialytika to this iota or ypsilon vowel
 837             // if we removed a tonos from the previous vowel,
 838             // and that previous vowel did not also have (or gain) a dialytika.
 839             // Adding one only to the final vowel in a longer sequence
 840             // (which does not occur in normal writing) would require lookahead.
 841             // Set the same flag as for preserving an existing dialytika.
 842             if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
 843                     (upper == 0x399 || upper == 0x3A5)) {
 844                 data |= HAS_DIALYTIKA;
 845             }
 846             int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
 847             if ((data & HAS_YPOGEGRAMMENI) != 0) {
 848                 numYpogegrammeni = 1;
 849             }
 850             // Skip combining diacritics after this Greek letter.
 851             while (nextIndex < srcLength) {
 852                 uint32_t diacriticData = getDiacriticData(src[nextIndex]);
 853                 if (diacriticData != 0) {
 854                     data |= diacriticData;
 855                     if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
 856                         ++numYpogegrammeni;
 857                     }
 858                     ++nextIndex;
 859                 } else {
 860                     break;  // not a Greek diacritic
 861                 }
 862             }
 863             if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
 864                 nextState |= AFTER_VOWEL_WITH_ACCENT;
 865             }
 866             // Map according to Greek rules.
 867             UBool addTonos = FALSE;
 868             if (upper == 0x397 &&
 869                     (data & HAS_ACCENT) != 0 &&
 870                     numYpogegrammeni == 0 &&
 871                     (state & AFTER_CASED) == 0 &&
 872                     !isFollowedByCasedLetter(csm->csp, src, nextIndex, srcLength)) {
 873                 // Keep disjunctive "or" with (only) a tonos.
 874                 // We use the same "word boundary" conditions as for the Final_Sigma test.
 875                 if (i == nextIndex) {
 876                     upper = 0x389;  // Preserve the precomposed form.
 877                 } else {
 878                     addTonos = TRUE;
 879                 }
 880             } else if ((data & HAS_DIALYTIKA) != 0) {
 881                 // Preserve a vowel with dialytika in precomposed form if it exists.
 882                 if (upper == 0x399) {
 883                     upper = 0x3AA;
 884                     data &= ~HAS_EITHER_DIALYTIKA;
 885                 } else if (upper == 0x3A5) {
 886                     upper = 0x3AB;
 887                     data &= ~HAS_EITHER_DIALYTIKA;
 888                 }
 889             }
 890             destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper);
 891             if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
 892                 destIndex=appendUChar(dest, destIndex, destCapacity, 0x308);  // restore or add a dialytika
 893             }
 894             if (destIndex >= 0 && addTonos) {
 895                 destIndex=appendUChar(dest, destIndex, destCapacity, 0x301);
 896             }
 897             while (destIndex >= 0 && numYpogegrammeni > 0) {
 898                 destIndex=appendUChar(dest, destIndex, destCapacity, 0x399);
 899                 --numYpogegrammeni;
 900             }
 901             if(destIndex<0) {
 902                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 903                 return 0;
 904             }
 905         } else {
 906             const UChar *s;
 907             UChar32 c2 = 0;
 908             c=ucase_toFullUpper(csm->csp, c, NULL, NULL, &s, csm->locale, &locCache);
 909             if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
 910                 /* fast path version of appendResult() for BMP results */
 911                 dest[destIndex++]=(UChar)c2;
 912             } else {
 913                 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
 914                 if(destIndex<0) {
 915                     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 916                     return 0;
 917                 }
 918             }
 919         }
 920         i = nextIndex;
 921         state = nextState;
 922     }
 923
 924     if(destIndex>destCapacity) {
 925         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 926     }
 927     return destIndex;
 928 }
 929
 930 }  // namespace GreekUpper
 931 U_NAMESPACE_END
 932
 933 /* functions available in the common library (for unistr_case.cpp) */
 934
 935 U_CFUNC int32_t U_CALLCONV
 936 ustrcase_internalToLower(const UCaseMap *csm,
 937                          UChar *dest, int32_t destCapacity,
 938                          const UChar *src, int32_t srcLength,
 939                          UErrorCode *pErrorCode) {
 940     UCaseContext csc=UCASECONTEXT_INITIALIZER;
 941     csc.p=(void *)src;
 942     csc.limit=srcLength;
 943     return _caseMap(
 944         csm, ucase_toFullLower,
 945         dest, destCapacity,
 946         src, &csc, 0, srcLength,
 947         pErrorCode);
 948 }
 949
 950 U_CFUNC int32_t U_CALLCONV
 951 ustrcase_internalToUpper(const UCaseMap *csm,
 952                          UChar *dest, int32_t destCapacity,
 953                          const UChar *src, int32_t srcLength,
 954                          UErrorCode *pErrorCode) {
 955     int32_t locCache = csm->locCache;
 956     if (ucase_getCaseLocale(csm->locale, &locCache) == UCASE_LOC_GREEK) {
 957         return GreekUpper::toUpper(csm, dest, destCapacity, src, srcLength, pErrorCode);
 958     }
 959     UCaseContext csc=UCASECONTEXT_INITIALIZER;
 960     csc.p=(void *)src;
 961     csc.limit=srcLength;
 962     return _caseMap(
 963         csm, ucase_toFullUpper,
 964         dest, destCapacity,
 965         src, &csc, 0, srcLength,
 966         pErrorCode);
 967 }
 968
 969 static int32_t
 970 ustr_foldCase(const UCaseProps *csp,
 971               UChar *dest, int32_t destCapacity,
 972               const UChar *src, int32_t srcLength,
 973               uint32_t options,
 974               UErrorCode *pErrorCode) {
 975     int32_t srcIndex, destIndex;
 976
 977     const UChar *s;
 978     UChar32 c, c2 = 0;
 979
 980     /* case mapping loop */
 981     srcIndex=destIndex=0;
 982     while(srcIndex<srcLength) {
 983         U16_NEXT(src, srcIndex, srcLength, c);
 984         c=ucase_toFullFolding(csp, c, &s, options);
 985         if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
 986             /* fast path version of appendResult() for BMP results */
 987             dest[destIndex++]=(UChar)c2;
 988         } else {
 989             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
 990             if(destIndex<0) {
 991                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
 992                 return 0;
 993             }
 994         }
 995     }
 996
 997     if(destIndex>destCapacity) {
 998         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 999     }
1000     return destIndex;
1001 }
1002
1003 U_CFUNC int32_t U_CALLCONV
1004 ustrcase_internalFold(const UCaseMap *csm,
1005                       UChar *dest, int32_t destCapacity,
1006                       const UChar *src, int32_t srcLength,
1007                       UErrorCode *pErrorCode) {
1008     return ustr_foldCase(csm->csp, dest, destCapacity, src, srcLength, csm->options, pErrorCode);
1009 }
1010
1011 U_CFUNC int32_t
1012 ustrcase_map(const UCaseMap *csm,
1013              UChar *dest, int32_t destCapacity,
1014              const UChar *src, int32_t srcLength,
1015              UStringCaseMapper *stringCaseMapper,
1016              UErrorCode *pErrorCode) {
1017     UChar buffer[300];
1018     UChar *temp;
1019
1020     int32_t destLength;
1021
1022     /* check argument values */
1023     if(U_FAILURE(*pErrorCode)) {
1024         return 0;
1025     }
1026     if( destCapacity<0 ||
1027         (dest==NULL && destCapacity>0) ||
1028         src==NULL ||
1029         srcLength<-1
1030     ) {
1031         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1032         return 0;
1033     }
1034
1035     /* get the string length */
1036     if(srcLength==-1) {
1037         srcLength=u_strlen(src);
1038     }
1039
1040     /* check for overlapping source and destination */
1041     if( dest!=NULL &&
1042         ((src>=dest && src<(dest+destCapacity)) ||
1043          (dest>=src && dest<(src+srcLength)))
1044     ) {
1045         /* overlap: provide a temporary destination buffer and later copy the result */
1046         if(destCapacity<=UPRV_LENGTHOF(buffer)) {
1047             /* the stack buffer is large enough */
1048             temp=buffer;
1049         } else {
1050             /* allocate a buffer */
1051             temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR);
1052             if(temp==NULL) {
1053                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1054                 return 0;
1055             }
1056         }
1057     } else {
1058         temp=dest;
1059     }
1060
1061     destLength=stringCaseMapper(csm, temp, destCapacity, src, srcLength, pErrorCode);
1062     if(temp!=dest) {
1063         /* copy the result string to the destination buffer */
1064         if(destLength>0) {
1065             int32_t copyLength= destLength<=destCapacity ? destLength : destCapacity;
1066             if(copyLength>0) {
1067                 u_memmove(dest, temp, copyLength);
1068             }
1069         }
1070         if(temp!=buffer) {
1071             uprv_free(temp);
1072         }
1073     }
1074
1075     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
1076 }
1077
1078 /* public API functions */
1079
1080 U_CAPI int32_t U_EXPORT2
1081 u_strFoldCase(UChar *dest, int32_t destCapacity,
1082               const UChar *src, int32_t srcLength,
1083               uint32_t options,
1084               UErrorCode *pErrorCode) {
1085     UCaseMap csm=UCASEMAP_INITIALIZER;
1086     csm.csp=ucase_getSingleton();
1087     csm.options=options;
1088     return ustrcase_map(
1089         &csm,
1090         dest, destCapacity,
1091         src, srcLength,
1092         ustrcase_internalFold, pErrorCode);
1093 }
1094
1095 /* case-insensitive string comparisons -------------------------------------- */
1096
1097 /*
1098  * This function is a copy of unorm_cmpEquivFold() minus the parts for
1099  * canonical equivalence.
1100  * Keep the functions in sync, and see there for how this works.
1101  * The duplication is for modularization:
1102  * It makes caseless (but not canonical caseless) matches independent of
1103  * the normalization code.
1104  */
1105
1106 /* stack element for previous-level source/decomposition pointers */
1107 struct CmpEquivLevel {
1108     const UChar *start, *s, *limit;
1109 };
1110 typedef struct CmpEquivLevel CmpEquivLevel;
1111
1112 /**
1113  * Internal implementation: compares two strings under full case folding.
1114  * Called from u_strcmpFold() and u_caseInsensitivePrefixMatch().
1115  *
1116  * @param s1            input string 1
1117  * @param length1       length of string 1, or -1 (NUL-terminated)
1118  * @param s2            input string 2
1119  * @param length2       length of string 2, or -1 (NUL-terminated)
1120  * @param options       compare options
1121  * @param matchLen1     (output, may be NULL) length of the matching prefix of s1
1122  * @param matchLen2     (output, non-NULL if matchLen1 is) matching prefix length of s2
1123  * @param pErrorCode    in/out error code; if already a failure, returns 0 at once
1124  * @return <0, 0 or >0 as s1 compares before, equal to, or after s2
1125  */
1126 static int32_t _cmpFold(
1127             const UChar *s1, int32_t length1,
1128             const UChar *s2, int32_t length2,
1129             uint32_t options,
1130             int32_t *matchLen1, int32_t *matchLen2,
1131             UErrorCode *pErrorCode) {
1132     int32_t cmpRes = 0;
1133
1134     const UCaseProps *csp;
1135
1136     /* current-level start/limit - s1/s2 as current */
1137     const UChar *start1, *start2, *limit1, *limit2;
1138
1139     /* points to the original start address */
1140     const UChar *org1, *org2;
1141
1142     /* points to the end of match + 1 */
1143     const UChar *m1, *m2;
1144
1145     /* case folding variables */
1146     const UChar *p;
1147     int32_t length;
1148
1149     /* stacks of previous-level start/current/limit */
1150     CmpEquivLevel stack1[2], stack2[2];  /* only element [0] is pushed below */
1151
1152     /* case folding buffers, only use current-level start/limit */
1153     UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
1154
1155     /* track which is the current level per string */
1156     int32_t level1, level2;
1157
1158     /* current code units, and code points for lookups */
1159     UChar32 c1, c2, cp1, cp2;
1160
1161     /* no argument error checking because this itself is not an API */
1162
1163     /*
1164      * assume that at least the option U_COMPARE_IGNORE_CASE is set
1165      * otherwise this function would have to behave exactly as uprv_strCompare()
1166      */
1167     csp=ucase_getSingleton();
1168     if(U_FAILURE(*pErrorCode)) {
1169         return 0;  /* the matchLen outputs are left untouched on this path */
1170     }
1171
1172     /* initialize */
1173     if(matchLen1) {
1174         U_ASSERT(matchLen2 !=NULL);
1175         *matchLen1=0;
1176         *matchLen2=0;
1177     }
1178
1179     start1=m1=org1=s1;
1180     if(length1==-1) {
1181         limit1=NULL;
1182     } else {
1183         limit1=s1+length1;
1184     }
1185
1186     start2=m2=org2=s2;
1187     if(length2==-1) {
1188         limit2=NULL;
1189     } else {
1190         limit2=s2+length2;
1191     }
1192
1193     level1=level2=0;
1194     c1=c2=-1;  /* -1 requests a fetch at the top of the loop */
1195
1196     /* comparison loop */
1197     for(;;) {
1198         /*
1199          * here a code unit value of -1 means "get another code unit"
1200          * below it will mean "this source is finished"
1201          */
1202
1203         if(c1<0) {
1204             /* get next code unit from string 1, post-increment */
1205             for(;;) {
1206                 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
1207                     if(level1==0) {
1208                         c1=-1;
1209                         break;
1210                     }
1211                 } else {
1212                     ++s1;
1213                     break;
1214                 }
1215
1216                 /* reached end of level buffer, pop one level */
1217                 do {
1218                     --level1;
1219                     start1=stack1[level1].start;    /*Not uninitialized*/
1220                 } while(start1==NULL);
1221                 s1=stack1[level1].s;                /*Not uninitialized*/
1222                 limit1=stack1[level1].limit;        /*Not uninitialized*/
1223             }
1224         }
1225
1226         if(c2<0) {
1227             /* get next code unit from string 2, post-increment */
1228             for(;;) {
1229                 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
1230                     if(level2==0) {
1231                         c2=-1;
1232                         break;
1233                     }
1234                 } else {
1235                     ++s2;
1236                     break;
1237                 }
1238
1239                 /* reached end of level buffer, pop one level */
1240                 do {
1241                     --level2;
1242                     start2=stack2[level2].start;    /*Not uninitialized*/
1243                 } while(start2==NULL);
1244                 s2=stack2[level2].s;                /*Not uninitialized*/
1245                 limit2=stack2[level2].limit;        /*Not uninitialized*/
1246             }
1247         }
1248
1249         /*
1250          * compare c1 and c2
1251          * either variable c1, c2 is -1 only if the corresponding string is finished
1252          */
1253         if(c1==c2) {
1254             const UChar *next1, *next2;
1255
1256             if(c1<0) {
1257                 cmpRes=0;   /* c1==c2==-1 indicating end of strings */
1258                 break;
1259             }
1260
1261             /*
1262              * Note: Move the match positions in both strings at the same time
1263              *      only when corresponding code point(s) in the original strings
1264              *      are fully consumed. For example, when comparing s1="Fust" and
1265              *      s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches
1266              *      the first code point in the case-folded data. But the second "s"
1267              *      has no matching code point in s1, so this implementation returns
1268              *      2 as the prefix match length ("Fu").
1269              */
1270             next1=next2=NULL;
1271             if(level1==0) {
1272                 next1=s1;
1273             } else if(s1==limit1) {
1274                 /* Note: This implementation only uses a single level of stack.
1275                  *      If this code needs to be changed to use multiple levels
1276                  *      of stacks, the code above should check if the current
1277                  *      code is at the end of all stacks.
1278                  */
1279                 U_ASSERT(level1==1);
1280
1281                 /* is s1 at the end of the current stack? */
1282                 next1=stack1[0].s;
1283             }
1284
1285             if (next1!=NULL) {
1286                 if(level2==0) {
1287                     next2=s2;
1288                 } else if(s2==limit2) {
1289                     U_ASSERT(level2==1);
1290
1291                     /* is s2 at the end of the current stack? */
1292                     next2=stack2[0].s;
1293                 }
1294                 if(next2!=NULL) {
1295                     m1=next1;
1296                     m2=next2;
1297                 }
1298             }
1299             c1=c2=-1;       /* make us fetch new code units */
1300             continue;
1301         } else if(c1<0) {
1302             cmpRes=-1;      /* string 1 ends before string 2 */
1303             break;
1304         } else if(c2<0) {
1305             cmpRes=1;       /* string 2 ends before string 1 */
1306             break;
1307         }
1308         /* c1!=c2 && c1>=0 && c2>=0 */
1309
1310         /* get complete code points for c1, c2 for lookups if either is a surrogate */
1311         cp1=c1;
1312         if(U_IS_SURROGATE(c1)) {
1313             UChar c;
1314
1315             if(U_IS_SURROGATE_LEAD(c1)) {
1316                 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
1317                     /* advance ++s1; only below if cp1 decomposes/case-folds */
1318                     cp1=U16_GET_SUPPLEMENTARY(c1, c);
1319                 }
1320             } else /* isTrail(c1) */ {
1321                 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
1322                     cp1=U16_GET_SUPPLEMENTARY(c, c1);
1323                 }
1324             }
1325         }
1326
1327         cp2=c2;
1328         if(U_IS_SURROGATE(c2)) {
1329             UChar c;
1330
1331             if(U_IS_SURROGATE_LEAD(c2)) {
1332                 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
1333                     /* advance ++s2; only below if cp2 decomposes/case-folds */
1334                     cp2=U16_GET_SUPPLEMENTARY(c2, c);
1335                 }
1336             } else /* isTrail(c2) */ {
1337                 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
1338                     cp2=U16_GET_SUPPLEMENTARY(c, c2);
1339                 }
1340             }
1341         }
1342
1343         /*
1344          * go down one level for each string
1345          * continue with the main loop as soon as there is a real change
1346          */
1347
1348         if( level1==0 &&
1349             (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0
1350         ) {
1351             /* cp1 case-folds to the code point "length" or to p[length] */
1352             if(U_IS_SURROGATE(c1)) {
1353                 if(U_IS_SURROGATE_LEAD(c1)) {
1354                     /* advance beyond source surrogate pair if it case-folds */
1355                     ++s1;
1356                 } else /* isTrail(c1) */ {
1357                     /*
1358                      * we got a supplementary code point when hitting its trail surrogate,
1359                      * therefore the lead surrogate must have been the same as in the other string;
1360                      * compare this decomposition with the lead surrogate in the other string
1361                      * remember that this simulates bulk text replacement:
1362                      * the decomposition would replace the entire code point
1363                      */
1364                     --s2;
1365                     --m2;
1366                     c2=*(s2-1);
1367                 }
1368             }
1369
1370             /* push current level pointers */
1371             stack1[0].start=start1;
1372             stack1[0].s=s1;
1373             stack1[0].limit=limit1;
1374             ++level1;
1375
1376             /* copy the folding result to fold1[] */
1377             if(length<=UCASE_MAX_STRING_LENGTH) {  /* length is a string length: copy p[0..length) */
1378                 u_memcpy(fold1, p, length);
1379             } else {  /* length is itself a code point: encode it as UTF-16 */
1380                 int32_t i=0;
1381                 U16_APPEND_UNSAFE(fold1, i, length);
1382                 length=i;
1383             }
1384
1385             /* set next level pointers to case folding */
1386             start1=s1=fold1;
1387             limit1=fold1+length;
1388
1389             /* get ready to read from decomposition, continue with loop */
1390             c1=-1;
1391             continue;
1392         }
1393
1394         if( level2==0 &&
1395             (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0
1396         ) {
1397             /* cp2 case-folds to the code point "length" or to p[length] */
1398             if(U_IS_SURROGATE(c2)) {
1399                 if(U_IS_SURROGATE_LEAD(c2)) {
1400                     /* advance beyond source surrogate pair if it case-folds */
1401                     ++s2;
1402                 } else /* isTrail(c2) */ {
1403                     /*
1404                      * we got a supplementary code point when hitting its trail surrogate,
1405                      * therefore the lead surrogate must have been the same as in the other string;
1406                      * compare this decomposition with the lead surrogate in the other string
1407                      * remember that this simulates bulk text replacement:
1408                      * the decomposition would replace the entire code point
1409                      */
1410                     --s1;
1411                     --m2;  /* NOTE(review): the mirrored s1 branch also adjusts m2; confirm m2 (not m1) is intended here */
1412                     c1=*(s1-1);
1413                 }
1414             }
1415
1416             /* push current level pointers */
1417             stack2[0].start=start2;
1418             stack2[0].s=s2;
1419             stack2[0].limit=limit2;
1420             ++level2;
1421
1422             /* copy the folding result to fold2[] */
1423             if(length<=UCASE_MAX_STRING_LENGTH) {  /* length is a string length: copy p[0..length) */
1424                 u_memcpy(fold2, p, length);
1425             } else {  /* length is itself a code point: encode it as UTF-16 */
1426                 int32_t i=0;
1427                 U16_APPEND_UNSAFE(fold2, i, length);
1428                 length=i;
1429             }
1430
1431             /* set next level pointers to case folding */
1432             start2=s2=fold2;
1433             limit2=fold2+length;
1434
1435             /* get ready to read from decomposition, continue with loop */
1436             c2=-1;
1437             continue;
1438         }
1439
1440         /*
1441          * no decomposition/case folding, max level for both sides:
1442          * return difference result
1443          *
1444          * code point order comparison must not just return cp1-cp2
1445          * because when single surrogates are present then the surrogate pairs
1446          * that formed cp1 and cp2 may be from different string indexes
1447          *
1448          * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
1449          * c1=d800 cp1=10001 c2=dc00 cp2=10000
1450          * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
1451          *
1452          * therefore, use same fix-up as in ustring.c/uprv_strCompare()
1453          * except: uprv_strCompare() fetches c=*s while this function fetches c=*s++
1454          * so we have slightly different pointer/start/limit comparisons here
1455          */
1456
1457         if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
1458             /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
1459             if(
1460                 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
1461                 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
1462             ) {
1463                 /* part of a surrogate pair, leave >=d800 */
1464             } else {
1465                 /* BMP code point - may be surrogate code point - make <d800 */
1466                 c1-=0x2800;
1467             }
1468
1469             if(
1470                 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
1471                 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
1472             ) {
1473                 /* part of a surrogate pair, leave >=d800 */
1474             } else {
1475                 /* BMP code point - may be surrogate code point - make <d800 */
1476                 c2-=0x2800;
1477             }
1478         }
1479
1480         cmpRes=c1-c2;
1481         break;
1482     }
1483
1484     if(matchLen1) {
1485         *matchLen1=m1-org1;  /* prefix lengths in code units from the original starts */
1486         *matchLen2=m2-org2;
1487     }
1488     return cmpRes;
1489 }
1490
1491 /* internal function */
1492 U_CFUNC int32_t
1493 u_strcmpFold(const UChar *s1, int32_t length1, const UChar *s2, int32_t length2,
1494              uint32_t options, UErrorCode *pErrorCode) {
1495     /* Plain comparison: no prefix-match lengths are requested from _cmpFold(). */
1496     int32_t *const noMatchLen=NULL;
1497     return _cmpFold(s1, length1, s2, length2, options, noMatchLen, noMatchLen, pErrorCode);
1498 }
1499
1500 /* public API functions */
1501
1502 U_CAPI int32_t U_EXPORT2
1503 u_strCaseCompare(const UChar *s1, int32_t length1,
1504                  const UChar *s2, int32_t length2,
1505                  uint32_t options,
1506                  UErrorCode *pErrorCode) {
1507     /* argument checking */
1508     if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
1509         return 0;
1510     }
1511     if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
1512         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1513         return 0;
1514     }
1515     return u_strcmpFold(s1, length1, s2, length2,
1516                         options|U_COMPARE_IGNORE_CASE,
1517                         pErrorCode);
1518 }
1519
1520 U_CAPI int32_t U_EXPORT2
1521 u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) {
1522     /* Both strings are NUL-terminated (length -1); the status is discarded. */
1523     UErrorCode status=U_ZERO_ERROR;
1524     const uint32_t foldingOptions=options|U_COMPARE_IGNORE_CASE;
1525     return u_strcmpFold(s1, -1, s2, -1, foldingOptions, &status);
1526 }
1527
1528 U_CAPI int32_t U_EXPORT2
1529 u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) {
1530     /* Both strings use the same explicit length; the status is discarded. */
1531     UErrorCode status=U_ZERO_ERROR;
1532     const uint32_t foldingOptions=options|U_COMPARE_IGNORE_CASE;
1533     return u_strcmpFold(s1, length, s2, length, foldingOptions, &status);
1534 }
1535
1536 U_CAPI int32_t U_EXPORT2
1537 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) {
1538     /* _STRNCMP_STYLE makes the comparison also stop at a NUL within the n units. */
1539     UErrorCode status=U_ZERO_ERROR;
1540     const uint32_t foldingOptions=options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE);
1541     return u_strcmpFold(s1, n, s2, n, foldingOptions, &status);
1542 }
1543
1544 /* internal API - detect length of shared prefix */
1545 U_CAPI void
1546 u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1,
1547                              const UChar *s2, int32_t length2,
1548                              uint32_t options,
1549                              int32_t *matchLen1, int32_t *matchLen2,
1550                              UErrorCode *pErrorCode) {
1551     _cmpFold(s1, length1, s2, length2, options,
1552         matchLen1, matchLen2, pErrorCode);
1553 }