source/common/ucnv_u32.c

   1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 **********************************************************************
   5 *   Copyright (C) 2002-2015, International Business Machines
   6 *   Corporation and others.  All Rights Reserved.
   7 **********************************************************************
   8 *   file name:  ucnv_u32.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2002jul01
  14 *   created by: Markus W. Scherer
  15 *
  16 *   UTF-32 converter implementation. Used to be in ucnv_utf.c.
  17 */
  18
  19 #include "unicode/utypes.h"
  20
  21 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
  22
  23 #include "unicode/ucnv.h"
  24 #include "unicode/utf.h"
  25 #include "ucnv_bld.h"
  26 #include "ucnv_cnv.h"
  27 #include "cmemory.h"
  28
  29 #define MAXIMUM_UCS2            0x0000FFFF /* largest value the toUnicode functions emit as a single UChar */
  30 #define MAXIMUM_UTF             0x0010FFFF /* largest value accepted by the toUnicode/getNextUChar validity checks */
  31 #define HALF_SHIFT              10         /* shift applied to the lead-surrogate offset when a pair is combined */
  32 #define HALF_BASE               0x0010000  /* 0x10000; in the visible code only referenced by the derivation of SURROGATE_LOW_BASE */
  33 #define HALF_MASK               0x3FF      /* NOTE(review): not referenced in the visible part of this file */
  34 #define SURROGATE_HIGH_START    0xD800     /* subtracted from the lead surrogate when a pair is combined */
  35 #define SURROGATE_LOW_START     0xDC00     /* only referenced by the derivation of SURROGATE_LOW_BASE */
  36
  37 /*
     * -SURROGATE_LOW_START + HALF_BASE, i.e. 0x10000 - 0xDC00 = 9216.
     * Added to (lead offset << HALF_SHIFT) + trail so that one addition both
     * removes the trail-surrogate bias and adds the 0x10000 supplementary base.
     */
  38 #define SURROGATE_LOW_BASE      9216
  39
  40 enum {
  41     UCNV_NEED_TO_WRITE_BOM=1 /* fromUnicodeStatus value that makes the fromUnicode functions emit a BOM first */
  42 };
  43
  44 /* UTF-32BE ----------------------------------------------------------------- */
  45
  46 static void
  47 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
  48                                 UErrorCode * err)
  49 {
  50     const unsigned char *mySource = (unsigned char *) args->source;
  51     UChar *myTarget = args->target;
  52     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
  53     const UChar *targetLimit = args->targetLimit;
  54     unsigned char *toUBytes = args->converter->toUBytes;
  55     uint32_t ch, i;
  56
  57     /* Restore state of current sequence */
  58     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
  59         i = args->converter->toULength;       /* restore # of bytes consumed */
  60         args->converter->toULength = 0;
  61
  62         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
  63         args->converter->toUnicodeStatus = 0;
  64         goto morebytes;
  65     }
  66
  67     while (mySource < sourceLimit && myTarget < targetLimit) {
  68         i = 0;
  69         ch = 0;
  70 morebytes:
  71         while (i < sizeof(uint32_t)) {
  72             if (mySource < sourceLimit) {
  73                 ch = (ch << 8) | (uint8_t)(*mySource);
  74                 toUBytes[i++] = (char) *(mySource++);
  75             }
  76             else {
  77                 /* stores a partially calculated target*/
  78                 /* + 1 to make 0 a valid character */
  79                 args->converter->toUnicodeStatus = ch + 1;
  80                 args->converter->toULength = (int8_t) i;
  81                 goto donefornow;
  82             }
  83         }
  84
  85         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
  86             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
  87             if (ch <= MAXIMUM_UCS2)
  88             {
  89                 /* fits in 16 bits */
  90                 *(myTarget++) = (UChar) ch;
  91             }
  92             else {
  93                 /* write out the surrogates */
  94                 *(myTarget++) = U16_LEAD(ch);
  95                 ch = U16_TRAIL(ch);
  96                 if (myTarget < targetLimit) {
  97                     *(myTarget++) = (UChar)ch;
  98                 }
  99                 else {
 100                     /* Put in overflow buffer (not handled here) */
 101                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
 102                     args->converter->UCharErrorBufferLength = 1;
 103                     *err = U_BUFFER_OVERFLOW_ERROR;
 104                     break;
 105                 }
 106             }
 107         }
 108         else {
 109             args->converter->toULength = (int8_t)i;
 110             *err = U_ILLEGAL_CHAR_FOUND;
 111             break;
 112         }
 113     }
 114
 115 donefornow:
 116     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
 117         /* End of target buffer */
 118         *err = U_BUFFER_OVERFLOW_ERROR;
 119     }
 120
 121     args->target = myTarget;
 122     args->source = (const char *) mySource;
 123 }
 124
 125 static void
 126 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
 127                                              UErrorCode * err)
 128 {
 129     const unsigned char *mySource = (unsigned char *) args->source;
 130     UChar *myTarget = args->target;
 131     int32_t *myOffsets = args->offsets;
 132     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 133     const UChar *targetLimit = args->targetLimit;
 134     unsigned char *toUBytes = args->converter->toUBytes;
 135     uint32_t ch, i;
 136     int32_t offsetNum = 0;
 137
 138     /* Restore state of current sequence */
 139     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
 140         i = args->converter->toULength;       /* restore # of bytes consumed */
 141         args->converter->toULength = 0;
 142
 143         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
 144         args->converter->toUnicodeStatus = 0;
 145         goto morebytes;
 146     }
 147
 148     while (mySource < sourceLimit && myTarget < targetLimit) {
 149         i = 0;
 150         ch = 0;
 151 morebytes:
 152         while (i < sizeof(uint32_t)) {
 153             if (mySource < sourceLimit) {
 154                 ch = (ch << 8) | (uint8_t)(*mySource);
 155                 toUBytes[i++] = (char) *(mySource++);
 156             }
 157             else {
 158                 /* stores a partially calculated target*/
 159                 /* + 1 to make 0 a valid character */
 160                 args->converter->toUnicodeStatus = ch + 1;
 161                 args->converter->toULength = (int8_t) i;
 162                 goto donefornow;
 163             }
 164         }
 165
 166         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
 167             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 168             if (ch <= MAXIMUM_UCS2) {
 169                 /* fits in 16 bits */
 170                 *(myTarget++) = (UChar) ch;
 171                 *(myOffsets++) = offsetNum;
 172             }
 173             else {
 174                 /* write out the surrogates */
 175                 *(myTarget++) = U16_LEAD(ch);
 176                 *myOffsets++ = offsetNum;
 177                 ch = U16_TRAIL(ch);
 178                 if (myTarget < targetLimit)
 179                 {
 180                     *(myTarget++) = (UChar)ch;
 181                     *(myOffsets++) = offsetNum;
 182                 }
 183                 else {
 184                     /* Put in overflow buffer (not handled here) */
 185                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
 186                     args->converter->UCharErrorBufferLength = 1;
 187                     *err = U_BUFFER_OVERFLOW_ERROR;
 188                     break;
 189                 }
 190             }
 191         }
 192         else {
 193             args->converter->toULength = (int8_t)i;
 194             *err = U_ILLEGAL_CHAR_FOUND;
 195             break;
 196         }
 197         offsetNum += i;
 198     }
 199
 200 donefornow:
 201     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 202     {
 203         /* End of target buffer */
 204         *err = U_BUFFER_OVERFLOW_ERROR;
 205     }
 206
 207     args->target = myTarget;
 208     args->source = (const char *) mySource;
 209     args->offsets = myOffsets;
 210 }
 211
 212 static void
 213 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
 214                                   UErrorCode * err)
 215 {
 216     const UChar *mySource = args->source;
 217     unsigned char *myTarget;
 218     const UChar *sourceLimit = args->sourceLimit;
 219     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
 220     UChar32 ch, ch2;
 221     unsigned int indexToWrite;
 222     unsigned char temp[sizeof(uint32_t)];
 223
 224     if(mySource >= sourceLimit) {
 225         /* no input, nothing to do */
 226         return;
 227     }
 228
 229     /* write the BOM if necessary */
 230     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
 231         static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
 232         ucnv_fromUWriteBytes(args->converter,
 233                              bom, 4,
 234                              &args->target, args->targetLimit,
 235                              &args->offsets, -1,
 236                              err);
 237         args->converter->fromUnicodeStatus=0;
 238     }
 239
 240     myTarget = (unsigned char *) args->target;
 241     temp[0] = 0;
 242
 243     if (args->converter->fromUChar32) {
 244         ch = args->converter->fromUChar32;
 245         args->converter->fromUChar32 = 0;
 246         goto lowsurogate;
 247     }
 248
 249     while (mySource < sourceLimit && myTarget < targetLimit) {
 250         ch = *(mySource++);
 251
 252         if (U_IS_SURROGATE(ch)) {
 253             if (U_IS_LEAD(ch)) {
 254 lowsurogate:
 255                 if (mySource < sourceLimit) {
 256                     ch2 = *mySource;
 257                     if (U_IS_TRAIL(ch2)) {
 258                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
 259                         mySource++;
 260                     }
 261                     else {
 262                         /* this is an unmatched trail code unit (2nd surrogate) */
 263                         /* callback(illegal) */
 264                         args->converter->fromUChar32 = ch;
 265                         *err = U_ILLEGAL_CHAR_FOUND;
 266                         break;
 267                     }
 268                 }
 269                 else {
 270                     /* ran out of source */
 271                     args->converter->fromUChar32 = ch;
 272                     if (args->flush) {
 273                         /* this is an unmatched trail code unit (2nd surrogate) */
 274                         /* callback(illegal) */
 275                         *err = U_ILLEGAL_CHAR_FOUND;
 276                     }
 277                     break;
 278                 }
 279             }
 280             else {
 281                 /* this is an unmatched trail code unit (2nd surrogate) */
 282                 /* callback(illegal) */
 283                 args->converter->fromUChar32 = ch;
 284                 *err = U_ILLEGAL_CHAR_FOUND;
 285                 break;
 286             }
 287         }
 288
 289         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
 290         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
 291         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
 292         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
 293
 294         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
 295             if (myTarget < targetLimit) {
 296                 *(myTarget++) = temp[indexToWrite];
 297             }
 298             else {
 299                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
 300                 *err = U_BUFFER_OVERFLOW_ERROR;
 301             }
 302         }
 303     }
 304
 305     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
 306         *err = U_BUFFER_OVERFLOW_ERROR;
 307     }
 308
 309     args->target = (char *) myTarget;
 310     args->source = mySource;
 311 }
 312
 313 static void
 314 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
 315                                                UErrorCode * err)
 316 {
 317     const UChar *mySource = args->source;
 318     unsigned char *myTarget;
 319     int32_t *myOffsets;
 320     const UChar *sourceLimit = args->sourceLimit;
 321     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
 322     UChar32 ch, ch2;
 323     int32_t offsetNum = 0;
 324     unsigned int indexToWrite;
 325     unsigned char temp[sizeof(uint32_t)];
 326
 327     if(mySource >= sourceLimit) {
 328         /* no input, nothing to do */
 329         return;
 330     }
 331
 332     /* write the BOM if necessary */
 333     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
 334         static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
 335         ucnv_fromUWriteBytes(args->converter,
 336                              bom, 4,
 337                              &args->target, args->targetLimit,
 338                              &args->offsets, -1,
 339                              err);
 340         args->converter->fromUnicodeStatus=0;
 341     }
 342
 343     myTarget = (unsigned char *) args->target;
 344     myOffsets = args->offsets;
 345     temp[0] = 0;
 346
 347     if (args->converter->fromUChar32) {
 348         ch = args->converter->fromUChar32;
 349         args->converter->fromUChar32 = 0;
 350         goto lowsurogate;
 351     }
 352
 353     while (mySource < sourceLimit && myTarget < targetLimit) {
 354         ch = *(mySource++);
 355
 356         if (U_IS_SURROGATE(ch)) {
 357             if (U_IS_LEAD(ch)) {
 358 lowsurogate:
 359                 if (mySource < sourceLimit) {
 360                     ch2 = *mySource;
 361                     if (U_IS_TRAIL(ch2)) {
 362                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
 363                         mySource++;
 364                     }
 365                     else {
 366                         /* this is an unmatched trail code unit (2nd surrogate) */
 367                         /* callback(illegal) */
 368                         args->converter->fromUChar32 = ch;
 369                         *err = U_ILLEGAL_CHAR_FOUND;
 370                         break;
 371                     }
 372                 }
 373                 else {
 374                     /* ran out of source */
 375                     args->converter->fromUChar32 = ch;
 376                     if (args->flush) {
 377                         /* this is an unmatched trail code unit (2nd surrogate) */
 378                         /* callback(illegal) */
 379                         *err = U_ILLEGAL_CHAR_FOUND;
 380                     }
 381                     break;
 382                 }
 383             }
 384             else {
 385                 /* this is an unmatched trail code unit (2nd surrogate) */
 386                 /* callback(illegal) */
 387                 args->converter->fromUChar32 = ch;
 388                 *err = U_ILLEGAL_CHAR_FOUND;
 389                 break;
 390             }
 391         }
 392
 393         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
 394         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
 395         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
 396         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
 397
 398         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
 399             if (myTarget < targetLimit) {
 400                 *(myTarget++) = temp[indexToWrite];
 401                 *(myOffsets++) = offsetNum;
 402             }
 403             else {
 404                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
 405                 *err = U_BUFFER_OVERFLOW_ERROR;
 406             }
 407         }
 408         offsetNum = offsetNum + 1 + (temp[1] != 0);
 409     }
 410
 411     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
 412         *err = U_BUFFER_OVERFLOW_ERROR;
 413     }
 414
 415     args->target = (char *) myTarget;
 416     args->source = mySource;
 417     args->offsets = myOffsets;
 418 }
 419
 420 static UChar32
 421 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
 422                                    UErrorCode* err)
 423 {
 424     const uint8_t *mySource;
 425     UChar32 myUChar;
 426     int32_t length;
 427
 428     mySource = (const uint8_t *)args->source;
 429     if (mySource >= (const uint8_t *)args->sourceLimit)
 430     {
 431         /* no input */
 432         *err = U_INDEX_OUTOFBOUNDS_ERROR;
 433         return 0xffff;
 434     }
 435
 436     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
 437     if (length < 4)
 438     {
 439         /* got a partial character */
 440         uprv_memcpy(args->converter->toUBytes, mySource, length);
 441         args->converter->toULength = (int8_t)length;
 442         args->source = (const char *)(mySource + length);
 443         *err = U_TRUNCATED_CHAR_FOUND;
 444         return 0xffff;
 445     }
 446
 447     /* Don't even try to do a direct cast because the value may be on an odd address. */
 448     myUChar = ((UChar32)mySource[0] << 24)
 449             | ((UChar32)mySource[1] << 16)
 450             | ((UChar32)mySource[2] << 8)
 451             | ((UChar32)mySource[3]);
 452
 453     args->source = (const char *)(mySource + 4);
 454     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
 455         return myUChar;
 456     }
 457
 458     uprv_memcpy(args->converter->toUBytes, mySource, 4);
 459     args->converter->toULength = 4;
 460
 461     *err = U_ILLEGAL_CHAR_FOUND;
 462     return 0xffff;
 463 }
 464
 465 static const UConverterImpl _UTF32BEImpl = {
     /*
      * Implementation table for the UTF-32BE converter.
      * NOTE(review): initialization is positional and UConverterImpl is
      * declared outside this file, so the meaning of each NULL slot must be
      * confirmed against that declaration.
      */
 466     UCNV_UTF32_BigEndian,
 467
 468     NULL,
 469     NULL,
 470
 471     NULL,
 472     NULL,
 473     NULL,
 474
 475     T_UConverter_toUnicode_UTF32_BE,                /* bytes -> UTF-16 */
 476     T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,   /* bytes -> UTF-16, also fills args->offsets */
 477     T_UConverter_fromUnicode_UTF32_BE,              /* UTF-16 -> bytes */
 478     T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC, /* UTF-16 -> bytes, also fills args->offsets */
 479     T_UConverter_getNextUChar_UTF32_BE,             /* reads a single code point */
 480
 481     NULL,
 482     NULL,
 483     NULL,
 484     NULL,
 485     ucnv_getNonSurrogateUnicodeSet,
 486
 487     NULL,
 488     NULL
 489 };
 490
 491 /* The 1232 CCSID refers to any version of Unicode with any endianness of UTF-32 */
     /*
      * Static description of the UTF-32BE converter.
      * NOTE(review): initialization is positional and UConverterStaticData is
      * declared outside this file; the field meanings noted below are
      * presumed from the values and should be confirmed there.
      */
 492 static const UConverterStaticData _UTF32BEStaticData = {
 493     sizeof(UConverterStaticData),
 494     "UTF-32BE",
 495     1232,
 496     UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4, /* presumably min/max bytes per character, both 4 - confirm */
 497     { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE, /* presumably the 4-byte substitution sequence (00 00 FF FD) and its length - confirm */
 498     0,
 499     0,
 500     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 501 };
 502
     /* Shared data object tying the static description above to the UTF-32BE implementation table. */
 503 const UConverterSharedData _UTF32BEData =
 504         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32BEStaticData, &_UTF32BEImpl);
 505
 506 /* UTF-32LE ---------------------------------------------------------- */
 507
 508 static void
 509 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
 510                                 UErrorCode * err)
 511 {
 512     const unsigned char *mySource = (unsigned char *) args->source;
 513     UChar *myTarget = args->target;
 514     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 515     const UChar *targetLimit = args->targetLimit;
 516     unsigned char *toUBytes = args->converter->toUBytes;
 517     uint32_t ch, i;
 518
 519     /* Restore state of current sequence */
 520     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
 521     {
 522         i = args->converter->toULength;       /* restore # of bytes consumed */
 523         args->converter->toULength = 0;
 524
 525         /* Stores the previously calculated ch from a previous call*/
 526         ch = args->converter->toUnicodeStatus - 1;
 527         args->converter->toUnicodeStatus = 0;
 528         goto morebytes;
 529     }
 530
 531     while (mySource < sourceLimit && myTarget < targetLimit)
 532     {
 533         i = 0;
 534         ch = 0;
 535 morebytes:
 536         while (i < sizeof(uint32_t))
 537         {
 538             if (mySource < sourceLimit)
 539             {
 540                 ch |= ((uint8_t)(*mySource)) << (i * 8);
 541                 toUBytes[i++] = (char) *(mySource++);
 542             }
 543             else
 544             {
 545                 /* stores a partially calculated target*/
 546                 /* + 1 to make 0 a valid character */
 547                 args->converter->toUnicodeStatus = ch + 1;
 548                 args->converter->toULength = (int8_t) i;
 549                 goto donefornow;
 550             }
 551         }
 552
 553         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
 554             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 555             if (ch <= MAXIMUM_UCS2) {
 556                 /* fits in 16 bits */
 557                 *(myTarget++) = (UChar) ch;
 558             }
 559             else {
 560                 /* write out the surrogates */
 561                 *(myTarget++) = U16_LEAD(ch);
 562                 ch = U16_TRAIL(ch);
 563                 if (myTarget < targetLimit) {
 564                     *(myTarget++) = (UChar)ch;
 565                 }
 566                 else {
 567                     /* Put in overflow buffer (not handled here) */
 568                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
 569                     args->converter->UCharErrorBufferLength = 1;
 570                     *err = U_BUFFER_OVERFLOW_ERROR;
 571                     break;
 572                 }
 573             }
 574         }
 575         else {
 576             args->converter->toULength = (int8_t)i;
 577             *err = U_ILLEGAL_CHAR_FOUND;
 578             break;
 579         }
 580     }
 581
 582 donefornow:
 583     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 584     {
 585         /* End of target buffer */
 586         *err = U_BUFFER_OVERFLOW_ERROR;
 587     }
 588
 589     args->target = myTarget;
 590     args->source = (const char *) mySource;
 591 }
 592
 593 static void
 594 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
 595                                              UErrorCode * err)
 596 {
 597     const unsigned char *mySource = (unsigned char *) args->source;
 598     UChar *myTarget = args->target;
 599     int32_t *myOffsets = args->offsets;
 600     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
 601     const UChar *targetLimit = args->targetLimit;
 602     unsigned char *toUBytes = args->converter->toUBytes;
 603     uint32_t ch, i;
 604     int32_t offsetNum = 0;
 605
 606     /* Restore state of current sequence */
 607     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
 608     {
 609         i = args->converter->toULength;       /* restore # of bytes consumed */
 610         args->converter->toULength = 0;
 611
 612         /* Stores the previously calculated ch from a previous call*/
 613         ch = args->converter->toUnicodeStatus - 1;
 614         args->converter->toUnicodeStatus = 0;
 615         goto morebytes;
 616     }
 617
 618     while (mySource < sourceLimit && myTarget < targetLimit)
 619     {
 620         i = 0;
 621         ch = 0;
 622 morebytes:
 623         while (i < sizeof(uint32_t))
 624         {
 625             if (mySource < sourceLimit)
 626             {
 627                 ch |= ((uint8_t)(*mySource)) << (i * 8);
 628                 toUBytes[i++] = (char) *(mySource++);
 629             }
 630             else
 631             {
 632                 /* stores a partially calculated target*/
 633                 /* + 1 to make 0 a valid character */
 634                 args->converter->toUnicodeStatus = ch + 1;
 635                 args->converter->toULength = (int8_t) i;
 636                 goto donefornow;
 637             }
 638         }
 639
 640         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
 641         {
 642             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
 643             if (ch <= MAXIMUM_UCS2)
 644             {
 645                 /* fits in 16 bits */
 646                 *(myTarget++) = (UChar) ch;
 647                 *(myOffsets++) = offsetNum;
 648             }
 649             else {
 650                 /* write out the surrogates */
 651                 *(myTarget++) = U16_LEAD(ch);
 652                 *(myOffsets++) = offsetNum;
 653                 ch = U16_TRAIL(ch);
 654                 if (myTarget < targetLimit)
 655                 {
 656                     *(myTarget++) = (UChar)ch;
 657                     *(myOffsets++) = offsetNum;
 658                 }
 659                 else
 660                 {
 661                     /* Put in overflow buffer (not handled here) */
 662                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
 663                     args->converter->UCharErrorBufferLength = 1;
 664                     *err = U_BUFFER_OVERFLOW_ERROR;
 665                     break;
 666                 }
 667             }
 668         }
 669         else
 670         {
 671             args->converter->toULength = (int8_t)i;
 672             *err = U_ILLEGAL_CHAR_FOUND;
 673             break;
 674         }
 675         offsetNum += i;
 676     }
 677
 678 donefornow:
 679     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 680     {
 681         /* End of target buffer */
 682         *err = U_BUFFER_OVERFLOW_ERROR;
 683     }
 684
 685     args->target = myTarget;
 686     args->source = (const char *) mySource;
 687     args->offsets = myOffsets;
 688 }
 689
 690 static void
 691 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
 692                                   UErrorCode * err)
 693 {
 694     const UChar *mySource = args->source;
 695     unsigned char *myTarget;
 696     const UChar *sourceLimit = args->sourceLimit;
 697     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
 698     UChar32 ch, ch2;
 699     unsigned int indexToWrite;
 700     unsigned char temp[sizeof(uint32_t)];
 701
 702     if(mySource >= sourceLimit) {
 703         /* no input, nothing to do */
 704         return;
 705     }
 706
 707     /* write the BOM if necessary */
 708     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
 709         static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
 710         ucnv_fromUWriteBytes(args->converter,
 711                              bom, 4,
 712                              &args->target, args->targetLimit,
 713                              &args->offsets, -1,
 714                              err);
 715         args->converter->fromUnicodeStatus=0;
 716     }
 717
 718     myTarget = (unsigned char *) args->target;
 719     temp[3] = 0;
 720
 721     if (args->converter->fromUChar32)
 722     {
 723         ch = args->converter->fromUChar32;
 724         args->converter->fromUChar32 = 0;
 725         goto lowsurogate;
 726     }
 727
 728     while (mySource < sourceLimit && myTarget < targetLimit)
 729     {
 730         ch = *(mySource++);
 731
 732         if (U16_IS_SURROGATE(ch)) {
 733             if (U16_IS_LEAD(ch))
 734             {
 735 lowsurogate:
 736                 if (mySource < sourceLimit)
 737                 {
 738                     ch2 = *mySource;
 739                     if (U16_IS_TRAIL(ch2)) {
 740                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
 741                         mySource++;
 742                     }
 743                     else {
 744                         /* this is an unmatched trail code unit (2nd surrogate) */
 745                         /* callback(illegal) */
 746                         args->converter->fromUChar32 = ch;
 747                         *err = U_ILLEGAL_CHAR_FOUND;
 748                         break;
 749                     }
 750                 }
 751                 else {
 752                     /* ran out of source */
 753                     args->converter->fromUChar32 = ch;
 754                     if (args->flush) {
 755                         /* this is an unmatched trail code unit (2nd surrogate) */
 756                         /* callback(illegal) */
 757                         *err = U_ILLEGAL_CHAR_FOUND;
 758                     }
 759                     break;
 760                 }
 761             }
 762             else {
 763                 /* this is an unmatched trail code unit (2nd surrogate) */
 764                 /* callback(illegal) */
 765                 args->converter->fromUChar32 = ch;
 766                 *err = U_ILLEGAL_CHAR_FOUND;
 767                 break;
 768             }
 769         }
 770
 771         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
 772         temp[2] = (uint8_t) (ch >> 16 & 0x1F);
 773         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
 774         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
 775
 776         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
 777         {
 778             if (myTarget < targetLimit)
 779             {
 780                 *(myTarget++) = temp[indexToWrite];
 781             }
 782             else
 783             {
 784                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
 785                 *err = U_BUFFER_OVERFLOW_ERROR;
 786             }
 787         }
 788     }
 789
 790     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 791     {
 792         *err = U_BUFFER_OVERFLOW_ERROR;
 793     }
 794
 795     args->target = (char *) myTarget;
 796     args->source = mySource;
 797 }
 798
 799 static void
 800 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
 801                                                UErrorCode * err)
 802 {
 803     const UChar *mySource = args->source;
 804     unsigned char *myTarget;
 805     int32_t *myOffsets;
 806     const UChar *sourceLimit = args->sourceLimit;
 807     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
 808     UChar32 ch, ch2;
 809     unsigned int indexToWrite;
 810     unsigned char temp[sizeof(uint32_t)];
 811     int32_t offsetNum = 0;
 812
 813     if(mySource >= sourceLimit) {
 814         /* no input, nothing to do */
 815         return;
 816     }
 817
 818     /* write the BOM if necessary */
 819     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
 820         static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
 821         ucnv_fromUWriteBytes(args->converter,
 822                              bom, 4,
 823                              &args->target, args->targetLimit,
 824                              &args->offsets, -1,
 825                              err);
 826         args->converter->fromUnicodeStatus=0;
 827     }
 828
 829     myTarget = (unsigned char *) args->target;
 830     myOffsets = args->offsets;
 831     temp[3] = 0;
 832
 833     if (args->converter->fromUChar32)
 834     {
 835         ch = args->converter->fromUChar32;
 836         args->converter->fromUChar32 = 0;
 837         goto lowsurogate;
 838     }
 839
 840     while (mySource < sourceLimit && myTarget < targetLimit)
 841     {
 842         ch = *(mySource++);
 843
 844         if (U16_IS_SURROGATE(ch)) {
 845             if (U16_IS_LEAD(ch))
 846             {
 847 lowsurogate:
 848                 if (mySource < sourceLimit)
 849                 {
 850                     ch2 = *mySource;
 851                     if (U16_IS_TRAIL(ch2))
 852                     {
 853                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
 854                         mySource++;
 855                     }
 856                     else {
 857                         /* this is an unmatched trail code unit (2nd surrogate) */
 858                         /* callback(illegal) */
 859                         args->converter->fromUChar32 = ch;
 860                         *err = U_ILLEGAL_CHAR_FOUND;
 861                         break;
 862                     }
 863                 }
 864                 else {
 865                     /* ran out of source */
 866                     args->converter->fromUChar32 = ch;
 867                     if (args->flush) {
 868                         /* this is an unmatched trail code unit (2nd surrogate) */
 869                         /* callback(illegal) */
 870                         *err = U_ILLEGAL_CHAR_FOUND;
 871                     }
 872                     break;
 873                 }
 874             }
 875             else {
 876                 /* this is an unmatched trail code unit (2nd surrogate) */
 877                 /* callback(illegal) */
 878                 args->converter->fromUChar32 = ch;
 879                 *err = U_ILLEGAL_CHAR_FOUND;
 880                 break;
 881             }
 882         }
 883
 884         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
 885         temp[2] = (uint8_t) (ch >> 16 & 0x1F);
 886         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
 887         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
 888
 889         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
 890         {
 891             if (myTarget < targetLimit)
 892             {
 893                 *(myTarget++) = temp[indexToWrite];
 894                 *(myOffsets++) = offsetNum;
 895             }
 896             else
 897             {
 898                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
 899                 *err = U_BUFFER_OVERFLOW_ERROR;
 900             }
 901         }
 902         offsetNum = offsetNum + 1 + (temp[2] != 0);
 903     }
 904
 905     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
 906     {
 907         *err = U_BUFFER_OVERFLOW_ERROR;
 908     }
 909
 910     args->target = (char *) myTarget;
 911     args->source = mySource;
 912     args->offsets = myOffsets;
 913 }
 914
 915 static UChar32
 916 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
 917                                    UErrorCode* err)
 918 {
 919     const uint8_t *mySource;
 920     UChar32 myUChar;
 921     int32_t length;
 922
 923     mySource = (const uint8_t *)args->source;
 924     if (mySource >= (const uint8_t *)args->sourceLimit)
 925     {
 926         /* no input */
 927         *err = U_INDEX_OUTOFBOUNDS_ERROR;
 928         return 0xffff;
 929     }
 930
 931     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
 932     if (length < 4)
 933     {
 934         /* got a partial character */
 935         uprv_memcpy(args->converter->toUBytes, mySource, length);
 936         args->converter->toULength = (int8_t)length;
 937         args->source = (const char *)(mySource + length);
 938         *err = U_TRUNCATED_CHAR_FOUND;
 939         return 0xffff;
 940     }
 941
 942     /* Don't even try to do a direct cast because the value may be on an odd address. */
 943     myUChar = ((UChar32)mySource[3] << 24)
 944             | ((UChar32)mySource[2] << 16)
 945             | ((UChar32)mySource[1] << 8)
 946             | ((UChar32)mySource[0]);
 947
 948     args->source = (const char *)(mySource + 4);
 949     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
 950         return myUChar;
 951     }
 952
 953     uprv_memcpy(args->converter->toUBytes, mySource, 4);
 954     args->converter->toULength = 4;
 955
 956     *err = U_ILLEGAL_CHAR_FOUND;
 957     return 0xffff;
 958 }
 959
/*
 * Function table for the fixed little-endian UTF-32 converter.
 * The initializer is positional; slot meanings noted below follow the order
 * of the UConverterImpl declaration (ucnv_cnv.h) - confirm there before
 * inserting or reordering entries.
 */
static const UConverterImpl _UTF32LEImpl = {
    UCNV_UTF32_LittleEndian,    /* converter type */

    NULL,
    NULL,

    /* no open/reset hooks (the BOM-detecting UTF-32 table fills two of these three slots) */
    NULL,
    NULL,
    NULL,

    T_UConverter_toUnicode_UTF32_LE,                    /* bytes -> UTF-16 */
    T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,       /* bytes -> UTF-16, with offsets */
    T_UConverter_fromUnicode_UTF32_LE,                  /* UTF-16 -> bytes */
    T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,     /* UTF-16 -> bytes, with offsets */
    T_UConverter_getNextUChar_UTF32_LE,                 /* single code point reader */

    NULL,
    NULL,
    NULL,
    NULL,
    ucnv_getNonSurrogateUnicodeSet,

    NULL,
    NULL
};
 985
/* The 1234 CCSID refers to any version of Unicode encoded as little-endian UTF-32 */
/*
 * Static description of the UTF-32LE converter. Positional initializer; the
 * field notes below are to be checked against UConverterStaticData (ucnv_bld.h).
 */
static const UConverterStaticData _UTF32LEStaticData = {
    sizeof(UConverterStaticData),
    "UTF-32LE",     /* converter name */
    1234,           /* CCSID */
    UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,    /* presumably min/max bytes per character: always 4 */
    { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,      /* FD FF 00 00 is U+FFFD in little-endian byte order, 4 bytes long */
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
 997
 998
/* Shared-data record that pairs the UTF-32LE static data with its function table. */
const UConverterSharedData _UTF32LEData =
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32LEStaticData, &_UTF32LEImpl);
1001
1002 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
1003
1004 /*
1005  * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
1006  * accordingly.
1007  *
1008  * State values:
1009  * 0    initial state
1010  * 1    saw 00
1011  * 2    saw 00 00
1012  * 3    saw 00 00 FE
1013  * 4    -
1014  * 5    saw FF
1015  * 6    saw FF FE
1016  * 7    saw FF FE 00
1017  * 8    UTF-32BE mode
1018  * 9    UTF-32LE mode
1019  *
1020  * During detection: state&3==number of matching bytes so far.
1021  *
1022  * On output, emit U+FEFF as the first code point.
1023  */
1024
1025 static void
1026 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
1027     if(choice<=UCNV_RESET_TO_UNICODE) {
1028         /* reset toUnicode: state=0 */
1029         cnv->mode=0;
1030     }
1031     if(choice!=UCNV_RESET_TO_UNICODE) {
1032         /* reset fromUnicode: prepare to output the UTF-32PE BOM */
1033         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1034     }
1035 }
1036
1037 static void
1038 _UTF32Open(UConverter *cnv,
1039            UConverterLoadArgs *pArgs,
1040            UErrorCode *pErrorCode) {
1041     _UTF32Reset(cnv, UCNV_RESET_BOTH);
1042 }
1043
/*
 * The two UTF-32 byte order marks back to back: bytes 0..3 are the big-endian
 * BOM (00 00 FE FF), bytes 4..7 the little-endian BOM (FF FE 00 00).
 * The BOM-detection state doubles as the index of the next expected byte, and
 * (state&4) selects the start of the BOM that was being matched.
 */
static const char utf32BOM[8]={ 0, 0, (char)0xfe, (char)0xff,    (char)0xff, (char)0xfe, 0, 0 };
1045
/*
 * toUnicode function of the BOM-detecting UTF-32 converter; it serves both the
 * plain and the with-offsets slot of the function table and produces offsets
 * only when pArgs->offsets is not NULL.
 *
 * The detection state is kept in cnv->mode between calls (see the state table
 * above _UTF32Reset). While the input still matches one of the two BOMs the
 * bytes are only counted, not stored: if the match later fails, the bytes seen
 * so far are re-read from the current buffer or replayed from utf32BOM and
 * handed to the UTF-32BE converter, which is also the default when no BOM is
 * present. Once state 8 (BE) or 9 (LE) is reached, all input is passed to the
 * corresponding fixed-endianness converter.
 *
 * pArgs->source, pArgs->sourceLimit and pArgs->flush are temporarily
 * overwritten while replaying bytes and restored afterwards.
 */
static void
_UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                           UErrorCode *pErrorCode) {
    UConverter *cnv=pArgs->converter;
    const char *source=pArgs->source;
    const char *sourceLimit=pArgs->sourceLimit;
    int32_t *offsets=pArgs->offsets; /* start of the offsets written by this call */

    int32_t state, offsetDelta;
    char b;

    state=cnv->mode;

    /*
     * If we detect a BOM in this buffer, then we must add the BOM size to the
     * offsets because the actual converter function will not see and count the BOM.
     * offsetDelta will have the number of the BOM bytes that are in the current buffer.
     */
    offsetDelta=0;

    while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
        switch(state) {
        case 0:
            /* first byte of the stream decides which BOM, if any, to look for */
            b=*source;
            if(b==0) {
                state=1; /* could be 00 00 FE FF */
            } else if(b==(char)0xff) {
                state=5; /* could be FF FE 00 00 */
            } else {
                state=8; /* default to UTF-32BE */
                continue; /* re-dispatch on the same byte without consuming it */
            }
            ++source;
            break;
        case 1:
        case 2:
        case 3:
        case 5:
        case 6:
        case 7:
            /* in the middle of a possible BOM: utf32BOM[state] is the next expected byte */
            if(*source==utf32BOM[state]) {
                ++state;
                ++source;
                if(state==4) {
                    state=8; /* detect UTF-32BE */
                    offsetDelta=(int32_t)(source-pArgs->source);
                } else if(state==8) {
                    state=9; /* detect UTF-32LE */
                    offsetDelta=(int32_t)(source-pArgs->source);
                }
            } else {
                /* switch to UTF-32BE and pass the previous bytes */
                int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */

                /* reset the source */
                source=pArgs->source;

                if(count==(state&3)) {
                    /* simple: all in the same buffer, just reset source */
                } else {
                    UBool oldFlush=pArgs->flush;

                    /* some of the bytes are from a previous buffer, replay those first */
                    pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
                    pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
                    pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */

                    /* no offsets: bytes from previous buffer, and not enough for output */
                    T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);

                    /* restore real pointers; pArgs->source will be set in case 8/9 */
                    pArgs->sourceLimit=sourceLimit;
                    pArgs->flush=oldFlush;
                }
                state=8;
                continue; /* go straight to case 8 with the (reset) source position */
            }
            break;
        case 8:
            /* call UTF-32BE */
            pArgs->source=source;
            if(offsets==NULL) {
                T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
            } else {
                T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
            }
            source=pArgs->source;
            break;
        case 9:
            /* call UTF-32LE */
            pArgs->source=source;
            if(offsets==NULL) {
                T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
            } else {
                T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
            }
            source=pArgs->source;
            break;
        default:
            break; /* does not occur */
        }
    }

    /* add BOM size to offsets - see comment at offsetDelta declaration */
    if(offsets!=NULL && offsetDelta!=0) {
        /* pArgs->offsets now points past the last offset written during this call */
        int32_t *offsetsLimit=pArgs->offsets;
        while(offsets<offsetsLimit) {
            *offsets++ += offsetDelta;
        }
    }

    pArgs->source=source;

    if(source==sourceLimit && pArgs->flush) {
        /* handle truncated input */
        switch(state) {
        case 0:
            break; /* no input at all, nothing to do */
        case 8:
            T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
            break;
        case 9:
            T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
            break;
        default:
            /* handle 0<state<8: call UTF-32BE with too-short input */
            pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
            pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */

            /* no offsets: not enough for output */
            T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
            pArgs->source=source;
            pArgs->sourceLimit=sourceLimit;
            state=8;
            break;
        }
    }

    /* remember the detection state for the next call */
    cnv->mode=state;
}
1186
1187 static UChar32
1188 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
1189                    UErrorCode *pErrorCode) {
1190     switch(pArgs->converter->mode) {
1191     case 8:
1192         return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
1193     case 9:
1194         return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
1195     default:
1196         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1197     }
1198 }
1199
/*
 * Function table for the BOM-detecting UTF-32 converter.
 * Input goes through the BOM state machine for both toUnicode slots; output
 * uses the converter that matches the platform byte order (U_IS_BIG_ENDIAN).
 * The initializer is positional - confirm slot order against UConverterImpl
 * (ucnv_cnv.h) before inserting or reordering entries.
 */
static const UConverterImpl _UTF32Impl = {
    UCNV_UTF32,     /* converter type */

    NULL,
    NULL,

    _UTF32Open,
    NULL,
    _UTF32Reset,

    _UTF32ToUnicodeWithOffsets,     /* same function without ... */
    _UTF32ToUnicodeWithOffsets,     /* ... and with offsets; it checks pArgs->offsets itself */
#if U_IS_BIG_ENDIAN
    T_UConverter_fromUnicode_UTF32_BE,
    T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
#else
    T_UConverter_fromUnicode_UTF32_LE,
    T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
#endif
    _UTF32GetNextUChar,

    NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
    NULL,
    NULL,
    NULL,
    ucnv_getNonSurrogateUnicodeSet,

    NULL,
    NULL
};
1230
/* The 1236 CCSID refers to any version of Unicode with a BOM-sensitive endianness of UTF-32 */
/*
 * Static description of the BOM-detecting UTF-32 converter. Positional
 * initializer; the field notes below are to be checked against
 * UConverterStaticData (ucnv_bld.h).
 */
static const UConverterStaticData _UTF32StaticData = {
    sizeof(UConverterStaticData),
    "UTF-32",       /* converter name */
    1236,           /* CCSID */
    UCNV_IBM, UCNV_UTF32, 4, 4,     /* presumably min/max bytes per character: always 4 */
    /* U+FFFD spelled in the platform byte order, 4 bytes long */
#if U_IS_BIG_ENDIAN
    { 0, 0, 0xff, 0xfd }, 4,
#else
    { 0xfd, 0xff, 0, 0 }, 4,
#endif
    FALSE, FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1247
/* Shared-data record that pairs the BOM-detecting UTF-32 static data with its function table. */
const UConverterSharedData _UTF32Data =
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32StaticData, &_UTF32Impl);
1250
1251 #endif