source/common/ucnv_u8.c

   1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 **********************************************************************
   5 *   Copyright (C) 2002-2016, International Business Machines
   6 *   Corporation and others.  All Rights Reserved.
   7 **********************************************************************
   8 *   file name:  ucnv_u8.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2002jul01
  14 *   created by: Markus W. Scherer
  15 *
  16 *   UTF-8 converter implementation. Used to be in ucnv_utf.c.
  17 *
  18 *   Also, CESU-8 implementation, see UTR 26.
  19 *   The CESU-8 converter uses all the same functions as the
  20 *   UTF-8 converter, with a branch for converting supplementary code points.
  21 */
  22
  23 #include "unicode/utypes.h"
  24
  25 #if !UCONFIG_NO_CONVERSION
  26
  27 #include "unicode/ucnv.h"
  28 #include "unicode/utf.h"
  29 #include "unicode/utf8.h"
  30 #include "unicode/utf16.h"
  31 #include "ucnv_bld.h"
  32 #include "ucnv_cnv.h"
  33 #include "cmemory.h"
  34
  35 /* Prototypes --------------------------------------------------------------- */
  36
  37 /* Keep these here to make finicky compilers happy */
  38
  39 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
  40                                            UErrorCode *err);
  41 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
  42                                                         UErrorCode *err);
  43
  44
  45 /* UTF-8 -------------------------------------------------------------------- */
  46
  47 /* UTF-8 Conversion DATA
  48  *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
  49  */
  50 /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
  51 #define MAXIMUM_UCS2            0x0000FFFF
  52 #define MAXIMUM_UTF             0x0010FFFF
  53 #define MAXIMUM_UCS4            0x7FFFFFFF
  54 #define HALF_SHIFT              10
  55 #define HALF_BASE               0x0010000
  56 #define HALF_MASK               0x3FF
  57 #define SURROGATE_HIGH_START    0xD800
  58 #define SURROGATE_HIGH_END      0xDBFF
  59 #define SURROGATE_LOW_START     0xDC00
  60 #define SURROGATE_LOW_END       0xDFFF
  61
  62 /* -SURROGATE_LOW_START + HALF_BASE */
  63 #define SURROGATE_LOW_BASE      9216
  64
  65 static const uint32_t offsetsFromUTF8[7] = {0,
  66   (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
  67   (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
  68 };
  69
  70 /* END OF UTF-8 Conversion DATA */
  71
  72 static const int8_t bytesFromUTF8[256] = {
  73   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  74   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  75   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  76   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  77   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  78   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  79   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  80   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
  81 };
  82
  83 /*
  84  * Starting with Unicode 3.0.1:
  85  * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
  86  * byte sequences with more than 4 bytes are illegal in UTF-8,
  87  * which is tested with impossible values for them
  88  */
  89 static const uint32_t
  90 utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
  91
  92 static UBool hasCESU8Data(const UConverter *cnv)
  93 {
  94 #if UCONFIG_ONLY_HTML_CONVERSION
  95     return FALSE;
  96 #else
  97     return (UBool)(cnv->sharedData == &_CESU8Data);
  98 #endif
  99 }
 100
 101 static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
 102                                   UErrorCode * err)
 103 {
 104     UConverter *cnv = args->converter;
 105     const unsigned char *mySource = (unsigned char *) args->source;
 106     UChar *myTarget = args->target;
 107     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 108     const UChar *targetLimit = args->targetLimit;
 109     unsigned char *toUBytes = cnv->toUBytes;
 110     UBool isCESU8 = hasCESU8Data(cnv);
 111     uint32_t ch, ch2 = 0;
 112     int32_t i, inBytes;
 113
 114     /* Restore size of current sequence */
 115     if (cnv->toUnicodeStatus && myTarget < targetLimit)
 116     {
 117         inBytes = cnv->mode;            /* restore # of bytes to consume */
 118         i = cnv->toULength;             /* restore # of bytes consumed */
 119         cnv->toULength = 0;
 120
 121         ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
 122         cnv->toUnicodeStatus = 0;
 123         goto morebytes;
 124     }
 125
 126
 127     while (mySource < sourceLimit && myTarget < targetLimit)
 128     {
 129         ch = *(mySource++);
 130         if (ch < 0x80)        /* Simple case */
 131         {
 132             *(myTarget++) = (UChar) ch;
 133         }
 134         else
 135         {
 136             /* store the first char */
 137             toUBytes[0] = (char)ch;
 138             inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
 139             i = 1;
 140
 141 morebytes:
 142             while (i < inBytes)
 143             {
 144                 if (mySource < sourceLimit)
 145                 {
 146                     toUBytes[i] = (char) (ch2 = *mySource);
 147                     if (!U8_IS_TRAIL(ch2))
 148                     {
 149                         break; /* i < inBytes */
 150                     }
 151                     ch = (ch << 6) + ch2;
 152                     ++mySource;
 153                     i++;
 154                 }
 155                 else
 156                 {
 157                     /* stores a partially calculated target*/
 158                     cnv->toUnicodeStatus = ch;
 159                     cnv->mode = inBytes;
 160                     cnv->toULength = (int8_t) i;
 161                     goto donefornow;
 162                 }
 163             }
 164
 165             /* Remove the accumulated high bits */
 166             ch -= offsetsFromUTF8[inBytes];
 167
 168             /*
 169              * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
 170              * - use only trail bytes after a lead byte (checked above)
 171              * - use the right number of trail bytes for a given lead byte
 172              * - encode a code point <= U+10ffff
 173              * - use the fewest possible number of bytes for their code points
 174              * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
 175              *
 176              * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
 177              * There are no irregular sequences any more.
 178              * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
 179              */
 180             if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
 181                 (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
 182             {
 183                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 184                 if (ch <= MAXIMUM_UCS2)
 185                 {
 186                     /* fits in 16 bits */
 187                     *(myTarget++) = (UChar) ch;
 188                 }
 189                 else
 190                 {
 191                     /* write out the surrogates */
 192                     ch -= HALF_BASE;
 193                     *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
 194                     ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
 195                     if (myTarget < targetLimit)
 196                     {
 197                         *(myTarget++) = (UChar)ch;
 198                     }
 199                     else
 200                     {
 201                         /* Put in overflow buffer (not handled here) */
 202                         cnv->UCharErrorBuffer[0] = (UChar) ch;
 203                         cnv->UCharErrorBufferLength = 1;
 204                         *err = U_BUFFER_OVERFLOW_ERROR;
 205                         break;
 206                     }
 207                 }
 208             }
 209             else
 210             {
 211                 cnv->toULength = (int8_t)i;
 212                 *err = U_ILLEGAL_CHAR_FOUND;
 213                 break;
 214             }
 215         }
 216     }
 217
 218 donefornow:
 219     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 220     {
 221         /* End of target buffer */
 222         *err = U_BUFFER_OVERFLOW_ERROR;
 223     }
 224
 225     args->target = myTarget;
 226     args->source = (const char *) mySource;
 227 }
 228
 229 static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
 230                                                 UErrorCode * err)
 231 {
 232     UConverter *cnv = args->converter;
 233     const unsigned char *mySource = (unsigned char *) args->source;
 234     UChar *myTarget = args->target;
 235     int32_t *myOffsets = args->offsets;
 236     int32_t offsetNum = 0;
 237     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 238     const UChar *targetLimit = args->targetLimit;
 239     unsigned char *toUBytes = cnv->toUBytes;
 240     UBool isCESU8 = hasCESU8Data(cnv);
 241     uint32_t ch, ch2 = 0;
 242     int32_t i, inBytes;
 243
 244     /* Restore size of current sequence */
 245     if (cnv->toUnicodeStatus && myTarget < targetLimit)
 246     {
 247         inBytes = cnv->mode;            /* restore # of bytes to consume */
 248         i = cnv->toULength;             /* restore # of bytes consumed */
 249         cnv->toULength = 0;
 250
 251         ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
 252         cnv->toUnicodeStatus = 0;
 253         goto morebytes;
 254     }
 255
 256     while (mySource < sourceLimit && myTarget < targetLimit)
 257     {
 258         ch = *(mySource++);
 259         if (ch < 0x80)        /* Simple case */
 260         {
 261             *(myTarget++) = (UChar) ch;
 262             *(myOffsets++) = offsetNum++;
 263         }
 264         else
 265         {
 266             toUBytes[0] = (char)ch;
 267             inBytes = bytesFromUTF8[ch];
 268             i = 1;
 269
 270 morebytes:
 271             while (i < inBytes)
 272             {
 273                 if (mySource < sourceLimit)
 274                 {
 275                     toUBytes[i] = (char) (ch2 = *mySource);
 276                     if (!U8_IS_TRAIL(ch2))
 277                     {
 278                         break; /* i < inBytes */
 279                     }
 280                     ch = (ch << 6) + ch2;
 281                     ++mySource;
 282                     i++;
 283                 }
 284                 else
 285                 {
 286                     cnv->toUnicodeStatus = ch;
 287                     cnv->mode = inBytes;
 288                     cnv->toULength = (int8_t)i;
 289                     goto donefornow;
 290                 }
 291             }
 292
 293             /* Remove the accumulated high bits */
 294             ch -= offsetsFromUTF8[inBytes];
 295
 296             /*
 297              * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
 298              * - use only trail bytes after a lead byte (checked above)
 299              * - use the right number of trail bytes for a given lead byte
 300              * - encode a code point <= U+10ffff
 301              * - use the fewest possible number of bytes for their code points
 302              * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
 303              *
 304              * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
 305              * There are no irregular sequences any more.
 306              * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
 307              */
 308             if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
 309                 (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
 310             {
 311                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 312                 if (ch <= MAXIMUM_UCS2)
 313                 {
 314                     /* fits in 16 bits */
 315                     *(myTarget++) = (UChar) ch;
 316                     *(myOffsets++) = offsetNum;
 317                 }
 318                 else
 319                 {
 320                     /* write out the surrogates */
 321                     ch -= HALF_BASE;
 322                     *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
 323                     *(myOffsets++) = offsetNum;
 324                     ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
 325                     if (myTarget < targetLimit)
 326                     {
 327                         *(myTarget++) = (UChar)ch;
 328                         *(myOffsets++) = offsetNum;
 329                     }
 330                     else
 331                     {
 332                         cnv->UCharErrorBuffer[0] = (UChar) ch;
 333                         cnv->UCharErrorBufferLength = 1;
 334                         *err = U_BUFFER_OVERFLOW_ERROR;
 335                     }
 336                 }
 337                 offsetNum += i;
 338             }
 339             else
 340             {
 341                 cnv->toULength = (int8_t)i;
 342                 *err = U_ILLEGAL_CHAR_FOUND;
 343                 break;
 344             }
 345         }
 346     }
 347
 348 donefornow:
 349     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 350     {   /* End of target buffer */
 351         *err = U_BUFFER_OVERFLOW_ERROR;
 352     }
 353
 354     args->target = myTarget;
 355     args->source = (const char *) mySource;
 356     args->offsets = myOffsets;
 357 }
 358
 359 U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
 360                                     UErrorCode * err)
 361 {
 362     UConverter *cnv = args->converter;
 363     const UChar *mySource = args->source;
 364     const UChar *sourceLimit = args->sourceLimit;
 365     uint8_t *myTarget = (uint8_t *) args->target;
 366     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
 367     uint8_t *tempPtr;
 368     UChar32 ch;
 369     uint8_t tempBuf[4];
 370     int32_t indexToWrite;
 371     UBool isNotCESU8 = !hasCESU8Data(cnv);
 372
 373     if (cnv->fromUChar32 && myTarget < targetLimit)
 374     {
 375         ch = cnv->fromUChar32;
 376         cnv->fromUChar32 = 0;
 377         goto lowsurrogate;
 378     }
 379
 380     while (mySource < sourceLimit && myTarget < targetLimit)
 381     {
 382         ch = *(mySource++);
 383
 384         if (ch < 0x80)        /* Single byte */
 385         {
 386             *(myTarget++) = (uint8_t) ch;
 387         }
 388         else if (ch < 0x800)  /* Double byte */
 389         {
 390             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
 391             if (myTarget < targetLimit)
 392             {
 393                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
 394             }
 395             else
 396             {
 397                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
 398                 cnv->charErrorBufferLength = 1;
 399                 *err = U_BUFFER_OVERFLOW_ERROR;
 400             }
 401         }
 402         else {
 403             /* Check for surrogates */
 404             if(U16_IS_SURROGATE(ch) && isNotCESU8) {
 405 lowsurrogate:
 406                 if (mySource < sourceLimit) {
 407                     /* test both code units */
 408                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
 409                         /* convert and consume this supplementary code point */
 410                         ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
 411                         ++mySource;
 412                         /* exit this condition tree */
 413                     }
 414                     else {
 415                         /* this is an unpaired trail or lead code unit */
 416                         /* callback(illegal) */
 417                         cnv->fromUChar32 = ch;
 418                         *err = U_ILLEGAL_CHAR_FOUND;
 419                         break;
 420                     }
 421                 }
 422                 else {
 423                     /* no more input */
 424                     cnv->fromUChar32 = ch;
 425                     break;
 426                 }
 427             }
 428
 429             /* Do we write the buffer directly for speed,
 430             or do we have to be careful about target buffer space? */
 431             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
 432
 433             if (ch <= MAXIMUM_UCS2) {
 434                 indexToWrite = 2;
 435                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
 436             }
 437             else {
 438                 indexToWrite = 3;
 439                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
 440                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
 441             }
 442             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
 443             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
 444
 445             if (tempPtr == myTarget) {
 446                 /* There was enough space to write the codepoint directly. */
 447                 myTarget += (indexToWrite + 1);
 448             }
 449             else {
 450                 /* We might run out of room soon. Write it slowly. */
 451                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
 452                     if (myTarget < targetLimit) {
 453                         *(myTarget++) = *tempPtr;
 454                     }
 455                     else {
 456                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
 457                         *err = U_BUFFER_OVERFLOW_ERROR;
 458                     }
 459                 }
 460             }
 461         }
 462     }
 463
 464     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 465     {
 466         *err = U_BUFFER_OVERFLOW_ERROR;
 467     }
 468
 469     args->target = (char *) myTarget;
 470     args->source = mySource;
 471 }
 472
 473 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
 474                                                   UErrorCode * err)
 475 {
 476     UConverter *cnv = args->converter;
 477     const UChar *mySource = args->source;
 478     int32_t *myOffsets = args->offsets;
 479     const UChar *sourceLimit = args->sourceLimit;
 480     uint8_t *myTarget = (uint8_t *) args->target;
 481     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
 482     uint8_t *tempPtr;
 483     UChar32 ch;
 484     int32_t offsetNum, nextSourceIndex;
 485     int32_t indexToWrite;
 486     uint8_t tempBuf[4];
 487     UBool isNotCESU8 = !hasCESU8Data(cnv);
 488
 489     if (cnv->fromUChar32 && myTarget < targetLimit)
 490     {
 491         ch = cnv->fromUChar32;
 492         cnv->fromUChar32 = 0;
 493         offsetNum = -1;
 494         nextSourceIndex = 0;
 495         goto lowsurrogate;
 496     } else {
 497         offsetNum = 0;
 498     }
 499
 500     while (mySource < sourceLimit && myTarget < targetLimit)
 501     {
 502         ch = *(mySource++);
 503
 504         if (ch < 0x80)        /* Single byte */
 505         {
 506             *(myOffsets++) = offsetNum++;
 507             *(myTarget++) = (char) ch;
 508         }
 509         else if (ch < 0x800)  /* Double byte */
 510         {
 511             *(myOffsets++) = offsetNum;
 512             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
 513             if (myTarget < targetLimit)
 514             {
 515                 *(myOffsets++) = offsetNum++;
 516                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
 517             }
 518             else
 519             {
 520                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
 521                 cnv->charErrorBufferLength = 1;
 522                 *err = U_BUFFER_OVERFLOW_ERROR;
 523             }
 524         }
 525         else
 526         /* Check for surrogates */
 527         {
 528             nextSourceIndex = offsetNum + 1;
 529
 530             if(U16_IS_SURROGATE(ch) && isNotCESU8) {
 531 lowsurrogate:
 532                 if (mySource < sourceLimit) {
 533                     /* test both code units */
 534                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
 535                         /* convert and consume this supplementary code point */
 536                         ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
 537                         ++mySource;
 538                         ++nextSourceIndex;
 539                         /* exit this condition tree */
 540                     }
 541                     else {
 542                         /* this is an unpaired trail or lead code unit */
 543                         /* callback(illegal) */
 544                         cnv->fromUChar32 = ch;
 545                         *err = U_ILLEGAL_CHAR_FOUND;
 546                         break;
 547                     }
 548                 }
 549                 else {
 550                     /* no more input */
 551                     cnv->fromUChar32 = ch;
 552                     break;
 553                 }
 554             }
 555
 556             /* Do we write the buffer directly for speed,
 557             or do we have to be careful about target buffer space? */
 558             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
 559
 560             if (ch <= MAXIMUM_UCS2) {
 561                 indexToWrite = 2;
 562                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
 563             }
 564             else {
 565                 indexToWrite = 3;
 566                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
 567                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
 568             }
 569             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
 570             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
 571
 572             if (tempPtr == myTarget) {
 573                 /* There was enough space to write the codepoint directly. */
 574                 myTarget += (indexToWrite + 1);
 575                 myOffsets[0] = offsetNum;
 576                 myOffsets[1] = offsetNum;
 577                 myOffsets[2] = offsetNum;
 578                 if (indexToWrite >= 3) {
 579                     myOffsets[3] = offsetNum;
 580                 }
 581                 myOffsets += (indexToWrite + 1);
 582             }
 583             else {
 584                 /* We might run out of room soon. Write it slowly. */
 585                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
 586                     if (myTarget < targetLimit)
 587                     {
 588                         *(myOffsets++) = offsetNum;
 589                         *(myTarget++) = *tempPtr;
 590                     }
 591                     else
 592                     {
 593                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
 594                         *err = U_BUFFER_OVERFLOW_ERROR;
 595                     }
 596                 }
 597             }
 598             offsetNum = nextSourceIndex;
 599         }
 600     }
 601
 602     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 603     {
 604         *err = U_BUFFER_OVERFLOW_ERROR;
 605     }
 606
 607     args->target = (char *) myTarget;
 608     args->source = mySource;
 609     args->offsets = myOffsets;
 610 }
 611
 612 static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
 613                                                UErrorCode *err) {
 614     UConverter *cnv;
 615     const uint8_t *sourceInitial;
 616     const uint8_t *source;
 617     uint16_t extraBytesToWrite;
 618     uint8_t myByte;
 619     UChar32 ch;
 620     int8_t i, isLegalSequence;
 621
 622     /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
 623
 624     cnv = args->converter;
 625     sourceInitial = source = (const uint8_t *)args->source;
 626     if (source >= (const uint8_t *)args->sourceLimit)
 627     {
 628         /* no input */
 629         *err = U_INDEX_OUTOFBOUNDS_ERROR;
 630         return 0xffff;
 631     }
 632
 633     myByte = (uint8_t)*(source++);
 634     if (myByte < 0x80)
 635     {
 636         args->source = (const char *)source;
 637         return (UChar32)myByte;
 638     }
 639
 640     extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
 641     if (extraBytesToWrite == 0) {
 642         cnv->toUBytes[0] = myByte;
 643         cnv->toULength = 1;
 644         *err = U_ILLEGAL_CHAR_FOUND;
 645         args->source = (const char *)source;
 646         return 0xffff;
 647     }
 648
 649     /*The byte sequence is longer than the buffer area passed*/
 650     if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
 651     {
 652         /* check if all of the remaining bytes are trail bytes */
 653         cnv->toUBytes[0] = myByte;
 654         i = 1;
 655         *err = U_TRUNCATED_CHAR_FOUND;
 656         while(source < (const uint8_t *)args->sourceLimit) {
 657             if(U8_IS_TRAIL(myByte = *source)) {
 658                 cnv->toUBytes[i++] = myByte;
 659                 ++source;
 660             } else {
 661                 /* error even before we run out of input */
 662                 *err = U_ILLEGAL_CHAR_FOUND;
 663                 break;
 664             }
 665         }
 666         cnv->toULength = i;
 667         args->source = (const char *)source;
 668         return 0xffff;
 669     }
 670
 671     isLegalSequence = 1;
 672     ch = myByte << 6;
 673     switch(extraBytesToWrite)
 674     {
 675       /* note: code falls through cases! (sic)*/
 676     case 6:
 677         ch += (myByte = *source);
 678         ch <<= 6;
 679         if (!U8_IS_TRAIL(myByte))
 680         {
 681             isLegalSequence = 0;
 682             break;
 683         }
 684         ++source;
 685         U_FALLTHROUGH;
 686     case 5:
 687         ch += (myByte = *source);
 688         ch <<= 6;
 689         if (!U8_IS_TRAIL(myByte))
 690         {
 691             isLegalSequence = 0;
 692             break;
 693         }
 694         ++source;
 695         U_FALLTHROUGH;
 696     case 4:
 697         ch += (myByte = *source);
 698         ch <<= 6;
 699         if (!U8_IS_TRAIL(myByte))
 700         {
 701             isLegalSequence = 0;
 702             break;
 703         }
 704         ++source;
 705         U_FALLTHROUGH;
 706     case 3:
 707         ch += (myByte = *source);
 708         ch <<= 6;
 709         if (!U8_IS_TRAIL(myByte))
 710         {
 711             isLegalSequence = 0;
 712             break;
 713         }
 714         ++source;
 715         U_FALLTHROUGH;
 716     case 2:
 717         ch += (myByte = *source);
 718         if (!U8_IS_TRAIL(myByte))
 719         {
 720             isLegalSequence = 0;
 721             break;
 722         }
 723         ++source;
 724     };
 725     ch -= offsetsFromUTF8[extraBytesToWrite];
 726     args->source = (const char *)source;
 727
 728     /*
 729      * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
 730      * - use only trail bytes after a lead byte (checked above)
 731      * - use the right number of trail bytes for a given lead byte
 732      * - encode a code point <= U+10ffff
 733      * - use the fewest possible number of bytes for their code points
 734      * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
 735      *
 736      * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
 737      * There are no irregular sequences any more.
 738      */
 739     if (isLegalSequence &&
 740         (uint32_t)ch <= MAXIMUM_UTF &&
 741         (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
 742         !U_IS_SURROGATE(ch)
 743     ) {
 744         return ch; /* return the code point */
 745     }
 746
 747     for(i = 0; sourceInitial < source; ++i) {
 748         cnv->toUBytes[i] = *sourceInitial++;
 749     }
 750     cnv->toULength = i;
 751     *err = U_ILLEGAL_CHAR_FOUND;
 752     return 0xffff;
 753 }
 754
 755 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
 756
 757 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
 758 static const UChar32
 759 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
 760
 761 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
 762 static const UChar32
 763 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
 764
 765 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
 766 static void
 767 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
 768                   UConverterToUnicodeArgs *pToUArgs,
 769                   UErrorCode *pErrorCode) {
         /*
          * UTF-8 to UTF-8 conversion: copies bytes from pToUArgs->source to
          * pFromUArgs->target, accepting only byte sequences that pass the
          * checks below (shortest form, no surrogates, at most U+10FFFF).
          *
          * The state of a byte sequence that was cut off at the end of an
          * earlier source buffer is read from, and written to,
          * pToUArgs->converter (toUnicodeStatus, toULength, mode, toUBytes).
          *
          * Sets *pErrorCode to
          * - U_ILLEGAL_CHAR_FOUND for an illegal byte sequence; its bytes are
          *   stored in toUBytes, their number in toULength,
          * - U_BUFFER_OVERFLOW_ERROR if input remains and the target is full,
          * - U_USING_DEFAULT_WARNING for edge cases that are left to the
          *   standard (pivoting) conversion.
          *
          * Except for the early U_USING_DEFAULT_WARNING return near the top,
          * pToUArgs->source and pFromUArgs->target are written back with the
          * positions reached.
          */
 770     UConverter *utf8;
 771     const uint8_t *source, *sourceLimit;
 772     uint8_t *target;
 773     int32_t targetCapacity;
         /* budget of bytes that the conversion loop may still consume and produce */
 774     int32_t count;
 775
         /*
          * For the byte sequence currently being assembled:
          * oldToULength - number of its bytes that came from a previous buffer (in toUBytes)
          * toULength    - number of its bytes collected so far
          * toULimit     - total number of bytes announced by its lead byte
          */
 776     int8_t oldToULength, toULength, toULimit;
 777
 778     UChar32 c;
 779     uint8_t b, t1, t2;
 780
 781     /* set up the local pointers */
 782     utf8=pToUArgs->converter;
 783     source=(uint8_t *)pToUArgs->source;
 784     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
 785     target=(uint8_t *)pFromUArgs->target;
 786     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
 787
 788     /* get the converter state from the UTF-8 UConverter */
         /*
          * A non-zero toUnicodeStatus means that a partial sequence is pending:
          * this function stores c there together with toULength and
          * mode (=toULimit) when the input ends inside a sequence.
          */
 789     c=(UChar32)utf8->toUnicodeStatus;
 790     if(c!=0) {
 791         toULength=oldToULength=utf8->toULength;
 792         toULimit=(int8_t)utf8->mode;
 793     } else {
 794         toULength=oldToULength=toULimit=0;
 795     }
 796
 797     count=(int32_t)(sourceLimit-source)+oldToULength;
 798     if(count<toULimit) {
 799         /*
 800          * Not enough input to complete the partial character.
 801          * Jump to moreBytes below - it will not output to target.
 802          */
 803     } else if(targetCapacity<toULimit) {
 804         /*
 805          * Not enough target capacity to output the partial character.
 806          * Let the standard converter handle this.
 807          */
             /* nothing has been consumed or written yet, so the pointers are not written back */
 808         *pErrorCode=U_USING_DEFAULT_WARNING;
 809         return;
 810     } else {
 811         /*
 812          * Use a single counter for source and target, counting the minimum of
 813          * the source length and the target capacity.
 814          * As a result, the source length is checked only once per multi-byte
 815          * character instead of twice.
 816          *
 817          * Make sure that the last byte sequence is complete, or else
 818          * stop just before it.
 819          * (The longest legal byte sequence has 3 trail bytes.)
 820          * Count oldToULength (number of source bytes from a previous buffer)
 821          * into the source length but reduce the source index by toULimit
 822          * while going back over trail bytes in order to not go back into
 823          * the bytes that will be read for finishing a partial
 824          * sequence from the previous buffer.
 825          * Let the standard converter handle edge cases.
 826          */
 827         int32_t i;
 828
 829         if(count>targetCapacity) {
 830             count=targetCapacity;
 831         }
 832
             /* walk backwards from the last counted byte over at most 3 trail bytes */
 833         i=0;
 834         while(i<3 && i<(count-toULimit)) {
 835             b=source[count-oldToULength-i-1];
 836             if(U8_IS_TRAIL(b)) {
 837                 ++i;
 838             } else {
 839                 if(i<U8_COUNT_TRAIL_BYTES(b)) {
 840                     /* stop converting before the lead byte if there are not enough trail bytes for it */
 841                     count-=i+1;
 842                 }
 843                 break;
 844             }
 845         }
 846     }
 847
 848     if(c!=0) {
             /* clear the saved state; the moreBytes code saves it again if the input runs out once more */
 849         utf8->toUnicodeStatus=0;
 850         utf8->toULength=0;
 851         goto moreBytes;
 852         /* See note in ucnv_SBCSFromUTF8() about this goto. */
 853     }
 854
 855     /* conversion loop */
 856     while(count>0) {
 857         b=*source++;
             /* bytes 00..7F have the top bit clear and are therefore non-negative as int8_t */
 858         if((int8_t)b>=0) {
 859             /* convert ASCII */
 860             *target++=b;
 861             --count;
 862             continue;
 863         } else {
                 /*
                  * Fast paths for 2- and 3-byte sequences. Anything that does not
                  * match one of them (including lead bytes above ED) falls through
                  * to the general code after this if/else chain.
                  */
 864             if(b>0xe0) {
                     /* after lead byte ED only trail bytes 80..9F are accepted, which excludes surrogates */
 865                 if( /* handle U+1000..U+D7FF inline */
 866                     (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
 867                                                (b==0xed && (t1 <= 0x9f))) &&
 868                     (t2=source[1]) >= 0x80 && t2 <= 0xbf
 869                 ) {
 870                     source+=2;
 871                     *target++=b;
 872                     *target++=t1;
 873                     *target++=t2;
 874                     count-=3;
 875                     continue;
 876                 }
 877             } else if(b<0xe0) {
                     /* lead bytes below C2 are not accepted here: C0 and C1 could only start non-shortest forms */
 878                 if( /* handle U+0080..U+07FF inline */
 879                     b>=0xc2 &&
 880                     (t1=*source) >= 0x80 && t1 <= 0xbf
 881                 ) {
 882                     ++source;
 883                     *target++=b;
 884                     *target++=t1;
 885                     count-=2;
 886                     continue;
 887                 }
 888             } else if(b==0xe0) {
                     /* after lead byte E0 the first trail byte must be A0..BF, which excludes non-shortest forms */
 889                 if( /* handle U+0800..U+0FFF inline */
 890                     (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
 891                     (t2=source[1]) >= 0x80 && t2 <= 0xbf
 892                 ) {
 893                     source+=2;
 894                     *target++=b;
 895                     *target++=t1;
 896                     *target++=t2;
 897                     count-=3;
 898                     continue;
 899                 }
 900             }
 901
 902             /* handle "complicated" and error cases, and continuing partial characters */
 903             oldToULength=0;
 904             toULength=1;
 905             toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
 906             c=b;
                 /*
                  * The goto near the top of the function enters here, inside the
                  * conversion loop, with c/toULength/toULimit/oldToULength taken
                  * from the saved converter state.
                  */
 907 moreBytes:
                 /* this loop is bounded by sourceLimit, not by count */
 908             while(toULength<toULimit) {
 909                 if(source<sourceLimit) {
 910                     b=*source;
 911                     if(U8_IS_TRAIL(b)) {
 912                         ++source;
 913                         ++toULength;
                             /* accumulate the raw byte values; the per-length offset is subtracted further below */
 914                         c=(c<<6)+b;
 915                     } else {
 916                         break; /* sequence too short, stop with toULength<toULimit */
 917                     }
 918                 } else {
 919                     /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
                         /* go back to the first byte of this sequence that lies in the current buffer */
 920                     source-=(toULength-oldToULength);
 921                     while(oldToULength<toULength) {
 922                         utf8->toUBytes[oldToULength++]=*source++;
 923                     }
 924                     utf8->toUnicodeStatus=c;
 925                     utf8->toULength=toULength;
 926                     utf8->mode=toULimit;
 927                     pToUArgs->source=(char *)source;
 928                     pFromUArgs->target=(char *)target;
 929                     return;
 930                 }
 931             }
 932
                 /*
                  * NOTE(review): utf8_offsets[] and utf8_minLegal[] are defined
                  * outside this function; presumably they hold, per sequence length,
                  * the value to subtract from the accumulated c and the smallest
                  * code point allowed for that length - confirm at their definitions.
                  */
 933             if( toULength==toULimit &&      /* consumed all trail bytes */
 934                 (toULength==3 || toULength==2) &&             /* BMP */
 935                 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
 936                 (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
 937             ) {
 938                 /* legal byte sequence for BMP code point */
 939             } else if(
 940                 toULength==toULimit && toULength==4 &&
 941                 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
 942             ) {
 943                 /* legal byte sequence for supplementary code point */
 944             } else {
 945                 /* error handling: illegal UTF-8 byte sequence */
                     /* append the bytes read from the current buffer to any already in toUBytes */
 946                 source-=(toULength-oldToULength);
 947                 while(oldToULength<toULength) {
 948                     utf8->toUBytes[oldToULength++]=*source++;
 949                 }
 950                 utf8->toULength=toULength;
 951                 pToUArgs->source=(char *)source;
 952                 pFromUArgs->target=(char *)target;
 953                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 954                 return;
 955             }
 956
 957             /* copy the legal byte sequence to the target */
 958             {
 959                 int8_t i;
 960
                     /* first the bytes that were saved from a previous buffer ... */
 961                 for(i=0; i<oldToULength; ++i) {
 962                     *target++=utf8->toUBytes[i];
 963                 }
                     /* ... then the remaining bytes from the current source buffer */
 964                 source-=(toULength-oldToULength);
 965                 for(; i<toULength; ++i) {
 966                     *target++=*source++;
 967                 }
 968                 count-=toULength;
 969             }
 970         }
 971     }
 972
         /*
          * The byte budget is used up but input remains: report a full target,
          * collect a truncated sequence at the end of the input,
          * or leave the rest to the standard conversion.
          */
 973     if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
 974         if(target==(const uint8_t *)pFromUArgs->targetLimit) {
 975             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 976         } else {
 977             b=*source;
 978             toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
 979             if(toULimit>(sourceLimit-source)) {
 980                 /* collect a truncated byte sequence */
 981                 toULength=0;
 982                 c=b;
 983                 for(;;) {
 984                     utf8->toUBytes[toULength++]=b;
 985                     if(++source==sourceLimit) {
 986                         /* partial byte sequence at end of source */
 987                         utf8->toUnicodeStatus=c;
 988                         utf8->toULength=toULength;
 989                         utf8->mode=toULimit;
 990                         break;
 991                     } else if(!U8_IS_TRAIL(b=*source)) {
 992                         /* lead byte in trail byte position */
 993                         utf8->toULength=toULength;
 994                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 995                         break;
 996                     }
 997                     c=(c<<6)+b;
 998                 }
 999             } else {
1000                 /* partial-sequence target overflow: fall back to the pivoting implementation */
1001                 *pErrorCode=U_USING_DEFAULT_WARNING;
1002             }
1003         }
1004     }
1005
1006     /* write back the updated pointers */
1007     pToUArgs->source=(char *)source;
1008     pFromUArgs->target=(char *)target;
1009 }
1010
1011 /* UTF-8 converter data ----------------------------------------------------- */
1012
1013 static const UConverterImpl _UTF8Impl={
         /*
          * Function table for the UTF-8 converter (positional initializer).
          * NOTE(review): the slot names in the comments below are assumed from
          * the member order of struct UConverterImpl, which is declared outside
          * this file - confirm against its declaration.
          */
1014     UCNV_UTF8, /* type */
1015
1016     NULL, /* load */
1017     NULL, /* unload */
1018
1019     NULL, /* open */
1020     NULL, /* close */
1021     NULL, /* reset */
1022
1023     ucnv_toUnicode_UTF8, /* toUnicode */
1024     ucnv_toUnicode_UTF8_OFFSETS_LOGIC, /* toUnicode with offsets */
1025     ucnv_fromUnicode_UTF8, /* fromUnicode */
1026     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, /* fromUnicode with offsets */
1027     ucnv_getNextUChar_UTF8, /* getNextUChar */
1028
1029     NULL, /* getStarters */
1030     NULL, /* getName */
1031     NULL, /* writeSub */
1032     NULL, /* safeClone */
1033     ucnv_getNonSurrogateUnicodeSet, /* getUnicodeSet */
1034
         /* the same UTF-8-to-UTF-8 function is installed in both of the last two slots */
1035     ucnv_UTF8FromUTF8, /* toUTF8 */
1036     ucnv_UTF8FromUTF8 /* fromUTF8 */
1037 };
1038
1039 /* CCSID 1208 refers to UTF-8 for any version of Unicode */
1040 static const UConverterStaticData _UTF8StaticData={
         /*
          * NOTE(review): the field names in the comments below are assumed from
          * the member order of UConverterStaticData, which is declared outside
          * this file - confirm against its declaration.
          */
1041     sizeof(UConverterStaticData), /* size of this structure */
1042     "UTF-8", /* converter name */
1043     1208, UCNV_IBM, UCNV_UTF8, /* codepage (CCSID), platform, conversion type */
1044     1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
         /* EF BF BD is U+FFFD in UTF-8 (3 bytes); presumably the substitution bytes, their length, and two fallback flags */
1045     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1046     0,
1047     0,
1048     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1049 };
1050
1051
1052 const UConverterSharedData _UTF8Data=
         /* non-static shared data object built from the UTF-8 static data and function table above */
1053         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
1054
1055 /* CESU-8 converter data ---------------------------------------------------- */
1056
1057 static const UConverterImpl _CESU8Impl={
         /*
          * Function table for the CESU-8 converter (positional initializer).
          * It uses the same four toUnicode/fromUnicode functions as _UTF8Impl,
          * but sets neither ucnv_getNextUChar_UTF8 nor ucnv_UTF8FromUTF8, and it
          * uses ucnv_getCompleteUnicodeSet instead of ucnv_getNonSurrogateUnicodeSet.
          * NOTE(review): the slot names in the comments below are assumed from
          * the member order of struct UConverterImpl, which is declared outside
          * this file - confirm against its declaration.
          */
1058     UCNV_CESU8, /* type */
1059
1060     NULL, /* load */
1061     NULL, /* unload */
1062
1063     NULL, /* open */
1064     NULL, /* close */
1065     NULL, /* reset */
1066
1067     ucnv_toUnicode_UTF8, /* toUnicode */
1068     ucnv_toUnicode_UTF8_OFFSETS_LOGIC, /* toUnicode with offsets */
1069     ucnv_fromUnicode_UTF8, /* fromUnicode */
1070     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, /* fromUnicode with offsets */
1071     NULL, /* getNextUChar */
1072
1073     NULL, /* getStarters */
1074     NULL, /* getName */
1075     NULL, /* writeSub */
1076     NULL, /* safeClone */
1077     ucnv_getCompleteUnicodeSet, /* getUnicodeSet */
1078
1079     NULL, /* toUTF8 */
1080     NULL /* fromUTF8 */
1081 };
1082
1083 static const UConverterStaticData _CESU8StaticData={
         /*
          * NOTE(review): the field names in the comments below are assumed from
          * the member order of UConverterStaticData, which is declared outside
          * this file - confirm against its declaration.
          */
1084     sizeof(UConverterStaticData), /* size of this structure */
1085     "CESU-8", /* converter name */
1086     9400, /* CCSID for CESU-8 */
1087     UCNV_UNKNOWN, UCNV_CESU8, 1, 3, /* platform, conversion type, min and max bytes per UChar */
         /* EF BF BD is U+FFFD in UTF-8 (3 bytes); presumably the substitution bytes, their length, and two fallback flags */
1088     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1089     0,
1090     0,
1091     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1092 };
1093
1094
1095 const UConverterSharedData _CESU8Data=
         /* non-static shared data object built from the CESU-8 static data and function table above */
1096         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);
1097
1098 #endif