source/common/ucnv_u16.c

   1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 **********************************************************************
   5 *   Copyright (C) 2002-2015, International Business Machines
   6 *   Corporation and others.  All Rights Reserved.
   7 **********************************************************************
   8 *   file name:  ucnv_u16.c
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2002jul01
  14 *   created by: Markus W. Scherer
  15 *
  16 *   UTF-16 converter implementation. Used to be in ucnv_utf.c.
  17 */
  18
  19 #include "unicode/utypes.h"
  20
  21 #if !UCONFIG_NO_CONVERSION
  22
  23 #include "unicode/ucnv.h"
  24 #include "ucnv_bld.h"
  25 #include "ucnv_cnv.h"
  26 #include "cmemory.h"
  27
     /*
      * Value kept in UConverter.fromUnicodeStatus by the converters in this file:
      * the reset function stores it for "version 1" converters, and the
      * fromUnicode functions test for it, emit the BOM once, and then clear
      * the status back to 0.
      */
  28 enum {
  29     UCNV_NEED_TO_WRITE_BOM=1
  30 };
  31
  32 /*
  33  * The UTF-16 toUnicode implementation is also used for the Java-specific
  34  * "with BOM" variants of UTF-16BE and UTF-16LE.
  35  *
      * Forward declaration only: the UTF-16BE/LE toUnicode functions below call
      * this function for as long as converter->mode<8. Being static, it has to be
      * defined elsewhere in this same file.
  35  */
  36 static void
  37 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
  38                            UErrorCode *pErrorCode);
  39
  40 /* UTF-16BE ----------------------------------------------------------------- */
  41
     /*
      * _UTF16PEFromUnicodeWithOffsets ("PE" presumably = platform endianness)
      * is an alias for whichever of the BE/LE fromUnicode functions matches
      * U_IS_BIG_ENDIAN. It is not used in the part of the file visible here.
      */
  42 #if U_IS_BIG_ENDIAN
  43 #   define _UTF16PEFromUnicodeWithOffsets   _UTF16BEFromUnicodeWithOffsets
  44 #else
  45 #   define _UTF16PEFromUnicodeWithOffsets   _UTF16LEFromUnicodeWithOffsets
  46 #endif
  47
  48
     /*
      * UTF-16BE fromUnicode implementation (installed for both the plain and the
      * with-offsets slots of the converter function table).
      *
      * Converts the UChars in [pArgs->source, pArgs->sourceLimit) into big-endian
      * byte pairs (high byte first) written at pArgs->target.
      *
      * - Returns immediately, changing nothing, when there is no input.
      * - If cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM, the bytes FE FF are
      *   first handed to ucnv_fromUWriteBytes() and the status is cleared.
      * - If no target space is left after that, sets U_BUFFER_OVERFLOW_ERROR and
      *   returns without consuming input.
      * - A lead surrogate that is the last unit of the input is stored in
      *   cnv->fromUChar32 without raising an error; the next call pairs it with
      *   a trail surrogate found at the start of its input.
      * - An unmatched surrogate sets U_ILLEGAL_CHAR_FOUND and is stored in
      *   cnv->fromUChar32.
      * - Output that does not fully fit into the target is passed through
      *   ucnv_fromUWriteBytes(); U_BUFFER_OVERFLOW_ERROR is set when input is
      *   left over and the target is full.
      * - When pArgs->offsets is not NULL, one offset is written per output byte:
      *   the index of the source UChar (of the lead for a pair), or -1 for a
      *   pair whose lead came from a previous call. The BOM is passed to
      *   ucnv_fromUWriteBytes() with offset -1.
      *
      * On the normal exit path pArgs->source, pArgs->target and pArgs->offsets
      * are updated to the positions reached.
      */
  49 static void
  50 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
  51                                UErrorCode *pErrorCode) {
  52     UConverter *cnv;
  53     const UChar *source;
  54     char *target;
  55     int32_t *offsets;
  56
  57     uint32_t targetCapacity, length, sourceIndex;
  58     UChar c, trail;
  59     char overflow[4];
  60
  61     source=pArgs->source;
  62     length=(int32_t)(pArgs->sourceLimit-source);
         /* NOTE(review): length is uint32_t, so the test below is effectively length==0 */
  63     if(length<=0) {
  64         /* no input, nothing to do */
  65         return;
  66     }
  67
  68     cnv=pArgs->converter;
  69
  70     /* write the BOM if necessary */
  71     if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
  72         static const char bom[]={ (char)0xfe, (char)0xff };
  73         ucnv_fromUWriteBytes(cnv,
  74                              bom, 2,
  75                              &pArgs->target, pArgs->targetLimit,
  76                              &pArgs->offsets, -1,
  77                              pErrorCode);
             /* clear the request so that the BOM is not written again */
  78         cnv->fromUnicodeStatus=0;
  79     }
  80
  81     target=pArgs->target;
  82     if(target >= pArgs->targetLimit) {
  83         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  84         return;
  85     }
  86
  87     targetCapacity=(uint32_t)(pArgs->targetLimit-target);
  88     offsets=pArgs->offsets;
  89     sourceIndex=0;
  90
  91     /* c!=0 indicates in several places outside the main loops that a surrogate was found */
  92
  93     if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
  94         /* the last buffer ended with a lead surrogate, output the surrogate pair */
  95         ++source;
  96         --length;
  97         target[0]=(uint8_t)(c>>8);
  98         target[1]=(uint8_t)c;
  99         target[2]=(uint8_t)(trail>>8);
 100         target[3]=(uint8_t)trail;
 101         target+=4;
 102         targetCapacity-=4;
 103         if(offsets!=NULL) {
                 /* the lead surrogate is not in this buffer, so there is no index for it */
 104             *offsets++=-1;
 105             *offsets++=-1;
 106             *offsets++=-1;
 107             *offsets++=-1;
 108         }
 109         sourceIndex=1;
 110         cnv->fromUChar32=c=0;
 111     }
 112
 113     if(c==0) {
 114         /* copy an even number of bytes for complete UChars */
 115         uint32_t count=2*length;
 116         if(count>targetCapacity) {
 117             count=targetCapacity&~1;
 118         }
 119         /* count is even */
             /*
              * reserve the bytes up front; from here count is a number of UChars
              * and length is the number of UChars left over after the loops
              */
 120         targetCapacity-=count;
 121         count>>=1;
 122         length-=count;
 123
 124         if(offsets==NULL) {
 125             while(count>0) {
 126                 c=*source++;
 127                 if(U16_IS_SINGLE(c)) {
 128                     target[0]=(uint8_t)(c>>8);
 129                     target[1]=(uint8_t)c;
 130                     target+=2;
 131                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
 132                     ++source;
 133                     --count;
 134                     target[0]=(uint8_t)(c>>8);
 135                     target[1]=(uint8_t)c;
 136                     target[2]=(uint8_t)(trail>>8);
 137                     target[3]=(uint8_t)trail;
 138                     target+=4;
 139                 } else {
                         /* leave the loop with the surrogate in c and count>0 */
 140                     break;
 141                 }
 142                 --count;
 143             }
 144         } else {
                 /* same loop as above, additionally writing one offset per output byte */
 145             while(count>0) {
 146                 c=*source++;
 147                 if(U16_IS_SINGLE(c)) {
 148                     target[0]=(uint8_t)(c>>8);
 149                     target[1]=(uint8_t)c;
 150                     target+=2;
 151                     *offsets++=sourceIndex;
 152                     *offsets++=sourceIndex++;
 153                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
 154                     ++source;
 155                     --count;
 156                     target[0]=(uint8_t)(c>>8);
 157                     target[1]=(uint8_t)c;
 158                     target[2]=(uint8_t)(trail>>8);
 159                     target[3]=(uint8_t)trail;
 160                     target+=4;
 161                     *offsets++=sourceIndex;
 162                     *offsets++=sourceIndex;
 163                     *offsets++=sourceIndex;
 164                     *offsets++=sourceIndex;
 165                     sourceIndex+=2;
 166                 } else {
 167                     break;
 168                 }
 169                 --count;
 170             }
 171         }
 172
 173         if(count==0) {
 174             /* done with the loop for complete UChars */
 175             if(length>0 && targetCapacity>0) {
 176                 /*
 177                  * there is more input and some target capacity -
 178                  * it must be targetCapacity==1 because otherwise
 179                  * the above would have copied more;
 180                  * prepare for overflow output
 181                  */
 182                 if(U16_IS_SINGLE(c=*source++)) {
 183                     overflow[0]=(char)(c>>8);
 184                     overflow[1]=(char)c;
 185                     length=2; /* 2 bytes to output */
 186                     c=0;
 187                 /* } else { keep c for surrogate handling, length will be set there */
 188                 }
 189             } else {
 190                 length=0;
 191                 c=0;
 192             }
 193         } else {
 194             /* keep c for surrogate handling, length will be set there */
                 /* give back the capacity reserved for the UChars that the loop did not write */
 195             targetCapacity+=2*count;
 196         }
 197     } else {
 198         length=0; /* from here on, length counts the bytes in overflow[] */
 199     }
 200
 201     if(c!=0) {
 202         /*
 203          * c is a surrogate, and
 204          * - source or target too short
 205          * - or the surrogate is unmatched
 206          */
 207         length=0;
 208         if(U16_IS_SURROGATE_LEAD(c)) {
 209             if(source<pArgs->sourceLimit) {
 210                 if(U16_IS_TRAIL(trail=*source)) {
 211                     /* output the surrogate pair, will overflow (see conditions comment above) */
 212                     ++source;
 213                     overflow[0]=(char)(c>>8);
 214                     overflow[1]=(char)c;
 215                     overflow[2]=(char)(trail>>8);
 216                     overflow[3]=(char)trail;
 217                     length=4; /* 4 bytes to output */
 218                     c=0;
 219                 } else {
 220                     /* unmatched lead surrogate */
 221                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 222                 }
 223             } else {
 224                 /* see if the trail surrogate is in the next buffer */
 225             }
 226         } else {
 227             /* unmatched trail surrogate */
 228             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 229         }
             /*
              * c is now 0 after a completed pair, otherwise it is the surrogate
              * that is still pending or that caused the error
              */
 230         cnv->fromUChar32=c;
 231     }
 232
 233     if(length>0) {
 234         /* output length bytes with overflow (length>targetCapacity>0) */
 235         ucnv_fromUWriteBytes(cnv,
 236                              overflow, length,
 237                              (char **)&target, pArgs->targetLimit,
 238                              &offsets, sourceIndex,
 239                              pErrorCode);
 240         targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
 241     }
 242
 243     if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
 244         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 245     }
 246
 247     /* write back the updated pointers */
 248     pArgs->source=source;
 249     pArgs->target=(char *)target;
 250     pArgs->offsets=offsets;
 251 }
 252
 253 static void
 254 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
 255                              UErrorCode *pErrorCode) {
 256     UConverter *cnv;
 257     const uint8_t *source;
 258     UChar *target;
 259     int32_t *offsets;
 260
 261     uint32_t targetCapacity, length, count, sourceIndex;
 262     UChar c, trail;
 263
 264     if(pArgs->converter->mode<8) {
 265         _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
 266         return;
 267     }
 268
 269     cnv=pArgs->converter;
 270     source=(const uint8_t *)pArgs->source;
 271     length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
 272     if(length<=0 && cnv->toUnicodeStatus==0) {
 273         /* no input, nothing to do */
 274         return;
 275     }
 276
 277     target=pArgs->target;
 278     if(target >= pArgs->targetLimit) {
 279         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 280         return;
 281     }
 282
 283     targetCapacity=(uint32_t)(pArgs->targetLimit-target);
 284     offsets=pArgs->offsets;
 285     sourceIndex=0;
 286     c=0;
 287
 288     /* complete a partial UChar or pair from the last call */
 289     if(cnv->toUnicodeStatus!=0) {
 290         /*
 291          * special case: single byte from a previous buffer,
 292          * where the byte turned out not to belong to a trail surrogate
 293          * and the preceding, unmatched lead surrogate was put into toUBytes[]
 294          * for error handling
 295          */
 296         cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
 297         cnv->toULength=1;
 298         cnv->toUnicodeStatus=0;
 299     }
 300     if((count=cnv->toULength)!=0) {
 301         uint8_t *p=cnv->toUBytes;
 302         do {
 303             p[count++]=*source++;
 304             ++sourceIndex;
 305             --length;
 306             if(count==2) {
 307                 c=((UChar)p[0]<<8)|p[1];
 308                 if(U16_IS_SINGLE(c)) {
 309                     /* output the BMP code point */
 310                     *target++=c;
 311                     if(offsets!=NULL) {
 312                         *offsets++=-1;
 313                     }
 314                     --targetCapacity;
 315                     count=0;
 316                     c=0;
 317                     break;
 318                 } else if(U16_IS_SURROGATE_LEAD(c)) {
 319                     /* continue collecting bytes for the trail surrogate */
 320                     c=0; /* avoid unnecessary surrogate handling below */
 321                 } else {
 322                     /* fall through to error handling for an unmatched trail surrogate */
 323                     break;
 324                 }
 325             } else if(count==4) {
 326                 c=((UChar)p[0]<<8)|p[1];
 327                 trail=((UChar)p[2]<<8)|p[3];
 328                 if(U16_IS_TRAIL(trail)) {
 329                     /* output the surrogate pair */
 330                     *target++=c;
 331                     if(targetCapacity>=2) {
 332                         *target++=trail;
 333                         if(offsets!=NULL) {
 334                             *offsets++=-1;
 335                             *offsets++=-1;
 336                         }
 337                         targetCapacity-=2;
 338                     } else /* targetCapacity==1 */ {
 339                         targetCapacity=0;
 340                         cnv->UCharErrorBuffer[0]=trail;
 341                         cnv->UCharErrorBufferLength=1;
 342                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 343                     }
 344                     count=0;
 345                     c=0;
 346                     break;
 347                 } else {
 348                     /* unmatched lead surrogate, handle here for consistent toUBytes[] */
 349                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 350
 351                     /* back out reading the code unit after it */
 352                     if(((const uint8_t *)pArgs->source-source)>=2) {
 353                         source-=2;
 354                     } else {
 355                         /*
 356                          * if the trail unit's first byte was in a previous buffer, then
 357                          * we need to put it into a special place because toUBytes[] will be
 358                          * used for the lead unit's bytes
 359                          */
 360                         cnv->toUnicodeStatus=0x100|p[2];
 361                         --source;
 362                     }
 363                     cnv->toULength=2;
 364
 365                     /* write back the updated pointers */
 366                     pArgs->source=(const char *)source;
 367                     pArgs->target=target;
 368                     pArgs->offsets=offsets;
 369                     return;
 370                 }
 371             }
 372         } while(length>0);
 373         cnv->toULength=(int8_t)count;
 374     }
 375
 376     /* copy an even number of bytes for complete UChars */
 377     count=2*targetCapacity;
 378     if(count>length) {
 379         count=length&~1;
 380     }
 381     if(c==0 && count>0) {
 382         length-=count;
 383         count>>=1;
 384         targetCapacity-=count;
 385         if(offsets==NULL) {
 386             do {
 387                 c=((UChar)source[0]<<8)|source[1];
 388                 source+=2;
 389                 if(U16_IS_SINGLE(c)) {
 390                     *target++=c;
 391                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
 392                           U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
 393                 ) {
 394                     source+=2;
 395                     --count;
 396                     *target++=c;
 397                     *target++=trail;
 398                 } else {
 399                     break;
 400                 }
 401             } while(--count>0);
 402         } else {
 403             do {
 404                 c=((UChar)source[0]<<8)|source[1];
 405                 source+=2;
 406                 if(U16_IS_SINGLE(c)) {
 407                     *target++=c;
 408                     *offsets++=sourceIndex;
 409                     sourceIndex+=2;
 410                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
 411                           U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
 412                 ) {
 413                     source+=2;
 414                     --count;
 415                     *target++=c;
 416                     *target++=trail;
 417                     *offsets++=sourceIndex;
 418                     *offsets++=sourceIndex;
 419                     sourceIndex+=4;
 420                 } else {
 421                     break;
 422                 }
 423             } while(--count>0);
 424         }
 425
 426         if(count==0) {
 427             /* done with the loop for complete UChars */
 428             c=0;
 429         } else {
 430             /* keep c for surrogate handling, trail will be set there */
 431             length+=2*(count-1); /* one more byte pair was consumed than count decremented */
 432             targetCapacity+=count;
 433         }
 434     }
 435
 436     if(c!=0) {
 437         /*
 438          * c is a surrogate, and
 439          * - source or target too short
 440          * - or the surrogate is unmatched
 441          */
 442         cnv->toUBytes[0]=(uint8_t)(c>>8);
 443         cnv->toUBytes[1]=(uint8_t)c;
 444         cnv->toULength=2;
 445
 446         if(U16_IS_SURROGATE_LEAD(c)) {
 447             if(length>=2) {
 448                 if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) {
 449                     /* output the surrogate pair, will overflow (see conditions comment above) */
 450                     source+=2;
 451                     length-=2;
 452                     *target++=c;
 453                     if(offsets!=NULL) {
 454                         *offsets++=sourceIndex;
 455                     }
 456                     cnv->UCharErrorBuffer[0]=trail;
 457                     cnv->UCharErrorBufferLength=1;
 458                     cnv->toULength=0;
 459                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 460                 } else {
 461                     /* unmatched lead surrogate */
 462                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 463                 }
 464             } else {
 465                 /* see if the trail surrogate is in the next buffer */
 466             }
 467         } else {
 468             /* unmatched trail surrogate */
 469             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 470         }
 471     }
 472
 473     if(U_SUCCESS(*pErrorCode)) {
 474         /* check for a remaining source byte */
 475         if(length>0) {
 476             if(targetCapacity==0) {
 477                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 478             } else {
 479                 /* it must be length==1 because otherwise the above would have copied more */
 480                 cnv->toUBytes[cnv->toULength++]=*source++;
 481             }
 482         }
 483     }
 484
 485     /* write back the updated pointers */
 486     pArgs->source=(const char *)source;
 487     pArgs->target=target;
 488     pArgs->offsets=offsets;
 489 }
 490
 491 static UChar32
 492 _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
 493     const uint8_t *s, *sourceLimit;
 494     UChar32 c;
 495
 496     if(pArgs->converter->mode<8) {
 497         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
 498     }
 499
 500     s=(const uint8_t *)pArgs->source;
 501     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
 502
 503     if(s>=sourceLimit) {
 504         /* no input */
 505         *err=U_INDEX_OUTOFBOUNDS_ERROR;
 506         return 0xffff;
 507     }
 508
 509     if(s+2>sourceLimit) {
 510         /* only one byte: truncated UChar */
 511         pArgs->converter->toUBytes[0]=*s++;
 512         pArgs->converter->toULength=1;
 513         pArgs->source=(const char *)s;
 514         *err = U_TRUNCATED_CHAR_FOUND;
 515         return 0xffff;
 516     }
 517
 518     /* get one UChar */
 519     c=((UChar32)*s<<8)|s[1];
 520     s+=2;
 521
 522     /* check for a surrogate pair */
 523     if(U_IS_SURROGATE(c)) {
 524         if(U16_IS_SURROGATE_LEAD(c)) {
 525             if(s+2<=sourceLimit) {
 526                 UChar trail;
 527
 528                 /* get a second UChar and see if it is a trail surrogate */
 529                 trail=((UChar)*s<<8)|s[1];
 530                 if(U16_IS_TRAIL(trail)) {
 531                     c=U16_GET_SUPPLEMENTARY(c, trail);
 532                     s+=2;
 533                 } else {
 534                     /* unmatched lead surrogate */
 535                     c=-2;
 536                 }
 537             } else {
 538                 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
 539                 uint8_t *bytes=pArgs->converter->toUBytes;
 540                 s-=2;
 541                 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
 542                 do {
 543                     *bytes++=*s++;
 544                 } while(s<sourceLimit);
 545
 546                 c=0xffff;
 547                 *err=U_TRUNCATED_CHAR_FOUND;
 548             }
 549         } else {
 550             /* unmatched trail surrogate */
 551             c=-2;
 552         }
 553
 554         if(c<0) {
 555             /* write the unmatched surrogate */
 556             uint8_t *bytes=pArgs->converter->toUBytes;
 557             pArgs->converter->toULength=2;
 558             *bytes=*(s-2);
 559             bytes[1]=*(s-1);
 560
 561             c=0xffff;
 562             *err=U_ILLEGAL_CHAR_FOUND;
 563         }
 564     }
 565
 566     pArgs->source=(const char *)s;
 567     return c;
 568 }
 569
 570 static void
 571 _UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) {
 572     if(choice<=UCNV_RESET_TO_UNICODE) {
 573         /* reset toUnicode state */
 574         if(UCNV_GET_VERSION(cnv)==0) {
 575             cnv->mode=8; /* no BOM handling */
 576         } else {
 577             cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */
 578         }
 579     }
 580     if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
 581         /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */
 582         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
 583     }
 584 }
 585
 586 static void
 587 _UTF16BEOpen(UConverter *cnv,
 588              UConverterLoadArgs *pArgs,
 589              UErrorCode *pErrorCode) {
 590     if(UCNV_GET_VERSION(cnv)<=1) {
 591         _UTF16BEReset(cnv, UCNV_RESET_BOTH);
 592     } else {
 593         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
 594     }
 595 }
 596
 597 static const char *
 598 _UTF16BEGetName(const UConverter *cnv) {
 599     if(UCNV_GET_VERSION(cnv)==0) {
 600         return "UTF-16BE";
 601     } else {
 602         return "UTF-16BE,version=1";
 603     }
 604 }
 605
     /*
      * Function table for the UTF-16BE converter.
      *
      * The initializer is positional; the slots follow the member order of
      * UConverterImpl, which is declared outside this file (presumably in
      * ucnv_cnv.h - confirm there before adding or moving entries).
      * NULL entries provide no converter-specific function for that slot.
      * The same toUnicode and fromUnicode functions are each installed in two
      * adjacent slots (presumably the plain and the with-offsets variants;
      * both functions handle a NULL offsets pointer).
      */
 606 static const UConverterImpl _UTF16BEImpl={
 607     UCNV_UTF16_BigEndian,
 608
 609     NULL,
 610     NULL,
 611
 612     _UTF16BEOpen,
 613     NULL,
 614     _UTF16BEReset,
 615
 616     _UTF16BEToUnicodeWithOffsets,
 617     _UTF16BEToUnicodeWithOffsets,
 618     _UTF16BEFromUnicodeWithOffsets,
 619     _UTF16BEFromUnicodeWithOffsets,
 620     _UTF16BEGetNextUChar,
 621
 622     NULL,
 623     _UTF16BEGetName,
 624     NULL,
 625     NULL,
 626     ucnv_getNonSurrogateUnicodeSet,
 627
 628     NULL,
 629     NULL
 630 };
 631
     /*
      * Static description of the UTF-16BE converter.
      *
      * The initializer is positional and follows the member order of
      * UConverterStaticData, which is declared outside this file.
      * NOTE(review): the meanings below are presumed from the values and
      * should be confirmed against that declaration:
      * "UTF-16BE" is the converter name, 1200 with UCNV_IBM looks like the
      * code page number and platform, "2, 2" like the minimum and maximum
      * bytes per UChar, and { 0xff, 0xfd, 0, 0 } with length 2 like the
      * substitution character U+FFFD in big-endian byte order.
      */
 632 static const UConverterStaticData _UTF16BEStaticData={
 633     sizeof(UConverterStaticData),
 634     "UTF-16BE",
 635     1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
 636     { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,
 637     0,
 638     0,
 639     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 640 };
 641
 642
     /*
      * Shared data object for UTF-16BE, combining the static description and
      * the function table above. It is the only UTF-16BE definition here with
      * external linkage, so other source files can refer to it by name.
      */
 643 const UConverterSharedData _UTF16BEData=
 644         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16BEStaticData, &_UTF16BEImpl);
 645
 646 /* UTF-16LE ----------------------------------------------------------------- */
 647
     /*
      * UTF-16LE fromUnicode implementation; the little-endian twin of
      * _UTF16BEFromUnicodeWithOffsets() with the two bytes of each code unit
      * swapped (low byte first).
      *
      * Converts the UChars in [pArgs->source, pArgs->sourceLimit) into
      * little-endian byte pairs written at pArgs->target.
      *
      * - Returns immediately, changing nothing, when there is no input.
      * - If cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM, the bytes FF FE are
      *   first handed to ucnv_fromUWriteBytes() and the status is cleared.
      * - If no target space is left after that, sets U_BUFFER_OVERFLOW_ERROR and
      *   returns without consuming input.
      * - A lead surrogate that is the last unit of the input is stored in
      *   cnv->fromUChar32 without raising an error; the next call pairs it with
      *   a trail surrogate found at the start of its input.
      * - An unmatched surrogate sets U_ILLEGAL_CHAR_FOUND and is stored in
      *   cnv->fromUChar32.
      * - Output that does not fully fit into the target is passed through
      *   ucnv_fromUWriteBytes(); U_BUFFER_OVERFLOW_ERROR is set when input is
      *   left over and the target is full.
      * - When pArgs->offsets is not NULL, one offset is written per output byte:
      *   the index of the source UChar (of the lead for a pair), or -1 for a
      *   pair whose lead came from a previous call. The BOM is passed to
      *   ucnv_fromUWriteBytes() with offset -1.
      *
      * On the normal exit path pArgs->source, pArgs->target and pArgs->offsets
      * are updated to the positions reached.
      */
 648 static void
 649 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
 650                                UErrorCode *pErrorCode) {
 651     UConverter *cnv;
 652     const UChar *source;
 653     char *target;
 654     int32_t *offsets;
 655
 656     uint32_t targetCapacity, length, sourceIndex;
 657     UChar c, trail;
 658     char overflow[4];
 659
 660     source=pArgs->source;
 661     length=(int32_t)(pArgs->sourceLimit-source);
         /* NOTE(review): length is uint32_t, so the test below is effectively length==0 */
 662     if(length<=0) {
 663         /* no input, nothing to do */
 664         return;
 665     }
 666
 667     cnv=pArgs->converter;
 668
 669     /* write the BOM if necessary */
 670     if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
 671         static const char bom[]={ (char)0xff, (char)0xfe };
 672         ucnv_fromUWriteBytes(cnv,
 673                              bom, 2,
 674                              &pArgs->target, pArgs->targetLimit,
 675                              &pArgs->offsets, -1,
 676                              pErrorCode);
             /* clear the request so that the BOM is not written again */
 677         cnv->fromUnicodeStatus=0;
 678     }
 679
 680     target=pArgs->target;
 681     if(target >= pArgs->targetLimit) {
 682         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 683         return;
 684     }
 685
 686     targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
 687     offsets=pArgs->offsets;
 688     sourceIndex=0;
 689
 690     /* c!=0 indicates in several places outside the main loops that a surrogate was found */
 691
 692     if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
 693         /* the last buffer ended with a lead surrogate, output the surrogate pair */
 694         ++source;
 695         --length;
 696         target[0]=(uint8_t)c;
 697         target[1]=(uint8_t)(c>>8);
 698         target[2]=(uint8_t)trail;
 699         target[3]=(uint8_t)(trail>>8);
 700         target+=4;
 701         targetCapacity-=4;
 702         if(offsets!=NULL) {
                 /* the lead surrogate is not in this buffer, so there is no index for it */
 703             *offsets++=-1;
 704             *offsets++=-1;
 705             *offsets++=-1;
 706             *offsets++=-1;
 707         }
 708         sourceIndex=1;
 709         cnv->fromUChar32=c=0;
 710     }
 711
 712     if(c==0) {
 713         /* copy an even number of bytes for complete UChars */
 714         uint32_t count=2*length;
 715         if(count>targetCapacity) {
 716             count=targetCapacity&~1;
 717         }
 718         /* count is even */
             /*
              * reserve the bytes up front; from here count is a number of UChars
              * and length is the number of UChars left over after the loops
              */
 719         targetCapacity-=count;
 720         count>>=1;
 721         length-=count;
 722
 723         if(offsets==NULL) {
 724             while(count>0) {
 725                 c=*source++;
 726                 if(U16_IS_SINGLE(c)) {
 727                     target[0]=(uint8_t)c;
 728                     target[1]=(uint8_t)(c>>8);
 729                     target+=2;
 730                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
 731                     ++source;
 732                     --count;
 733                     target[0]=(uint8_t)c;
 734                     target[1]=(uint8_t)(c>>8);
 735                     target[2]=(uint8_t)trail;
 736                     target[3]=(uint8_t)(trail>>8);
 737                     target+=4;
 738                 } else {
                         /* leave the loop with the surrogate in c and count>0 */
 739                     break;
 740                 }
 741                 --count;
 742             }
 743         } else {
                 /* same loop as above, additionally writing one offset per output byte */
 744             while(count>0) {
 745                 c=*source++;
 746                 if(U16_IS_SINGLE(c)) {
 747                     target[0]=(uint8_t)c;
 748                     target[1]=(uint8_t)(c>>8);
 749                     target+=2;
 750                     *offsets++=sourceIndex;
 751                     *offsets++=sourceIndex++;
 752                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
 753                     ++source;
 754                     --count;
 755                     target[0]=(uint8_t)c;
 756                     target[1]=(uint8_t)(c>>8);
 757                     target[2]=(uint8_t)trail;
 758                     target[3]=(uint8_t)(trail>>8);
 759                     target+=4;
 760                     *offsets++=sourceIndex;
 761                     *offsets++=sourceIndex;
 762                     *offsets++=sourceIndex;
 763                     *offsets++=sourceIndex;
 764                     sourceIndex+=2;
 765                 } else {
 766                     break;
 767                 }
 768                 --count;
 769             }
 770         }
 771
 772         if(count==0) {
 773             /* done with the loop for complete UChars */
 774             if(length>0 && targetCapacity>0) {
 775                 /*
 776                  * there is more input and some target capacity -
 777                  * it must be targetCapacity==1 because otherwise
 778                  * the above would have copied more;
 779                  * prepare for overflow output
 780                  */
 781                 if(U16_IS_SINGLE(c=*source++)) {
 782                     overflow[0]=(char)c;
 783                     overflow[1]=(char)(c>>8);
 784                     length=2; /* 2 bytes to output */
 785                     c=0;
 786                 /* } else { keep c for surrogate handling, length will be set there */
 787                 }
 788             } else {
 789                 length=0;
 790                 c=0;
 791             }
 792         } else {
 793             /* keep c for surrogate handling, length will be set there */
                 /* give back the capacity reserved for the UChars that the loop did not write */
 794             targetCapacity+=2*count;
 795         }
 796     } else {
 797         length=0; /* from here on, length counts the bytes in overflow[] */
 798     }
 799
 800     if(c!=0) {
 801         /*
 802          * c is a surrogate, and
 803          * - source or target too short
 804          * - or the surrogate is unmatched
 805          */
 806         length=0;
 807         if(U16_IS_SURROGATE_LEAD(c)) {
 808             if(source<pArgs->sourceLimit) {
 809                 if(U16_IS_TRAIL(trail=*source)) {
 810                     /* output the surrogate pair, will overflow (see conditions comment above) */
 811                     ++source;
 812                     overflow[0]=(char)c;
 813                     overflow[1]=(char)(c>>8);
 814                     overflow[2]=(char)trail;
 815                     overflow[3]=(char)(trail>>8);
 816                     length=4; /* 4 bytes to output */
 817                     c=0;
 818                 } else {
 819                     /* unmatched lead surrogate */
 820                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 821                 }
 822             } else {
 823                 /* see if the trail surrogate is in the next buffer */
 824             }
 825         } else {
 826             /* unmatched trail surrogate */
 827             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 828         }
             /*
              * c is now 0 after a completed pair, otherwise it is the surrogate
              * that is still pending or that caused the error
              */
 829         cnv->fromUChar32=c;
 830     }
 831
 832     if(length>0) {
 833         /* output length bytes with overflow (length>targetCapacity>0) */
 834         ucnv_fromUWriteBytes(cnv,
 835                              overflow, length,
 836                              &target, pArgs->targetLimit,
 837                              &offsets, sourceIndex,
 838                              pErrorCode);
 839         targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
 840     }
 841
 842     if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
 843         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 844     }
 845
 846     /* write back the updated pointers */
 847     pArgs->source=source;
 848     pArgs->target=target;
 849     pArgs->offsets=offsets;
 850 }
 851
 852 static void
 853 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
 854                              UErrorCode *pErrorCode) {
 855     UConverter *cnv;
 856     const uint8_t *source;
 857     UChar *target;
 858     int32_t *offsets;
 859
 860     uint32_t targetCapacity, length, count, sourceIndex;
 861     UChar c, trail;
 862
 863     if(pArgs->converter->mode<8) {
 864         _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
 865         return;
 866     }
 867
 868     cnv=pArgs->converter;
 869     source=(const uint8_t *)pArgs->source;
 870     length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
 871     if(length<=0 && cnv->toUnicodeStatus==0) {
 872         /* no input, nothing to do */
 873         return;
 874     }
 875
 876     target=pArgs->target;
 877     if(target >= pArgs->targetLimit) {
 878         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 879         return;
 880     }
 881
 882     targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
 883     offsets=pArgs->offsets;
 884     sourceIndex=0;
 885     c=0;
 886
 887     /* complete a partial UChar or pair from the last call */
 888     if(cnv->toUnicodeStatus!=0) {
 889         /*
 890          * special case: single byte from a previous buffer,
 891          * where the byte turned out not to belong to a trail surrogate
 892          * and the preceding, unmatched lead surrogate was put into toUBytes[]
 893          * for error handling
 894          */
 895         cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
 896         cnv->toULength=1;
 897         cnv->toUnicodeStatus=0;
 898     }
 899     if((count=cnv->toULength)!=0) {
 900         uint8_t *p=cnv->toUBytes;
 901         do {
 902             p[count++]=*source++;
 903             ++sourceIndex;
 904             --length;
 905             if(count==2) {
 906                 c=((UChar)p[1]<<8)|p[0];
 907                 if(U16_IS_SINGLE(c)) {
 908                     /* output the BMP code point */
 909                     *target++=c;
 910                     if(offsets!=NULL) {
 911                         *offsets++=-1;
 912                     }
 913                     --targetCapacity;
 914                     count=0;
 915                     c=0;
 916                     break;
 917                 } else if(U16_IS_SURROGATE_LEAD(c)) {
 918                     /* continue collecting bytes for the trail surrogate */
 919                     c=0; /* avoid unnecessary surrogate handling below */
 920                 } else {
 921                     /* fall through to error handling for an unmatched trail surrogate */
 922                     break;
 923                 }
 924             } else if(count==4) {
 925                 c=((UChar)p[1]<<8)|p[0];
 926                 trail=((UChar)p[3]<<8)|p[2];
 927                 if(U16_IS_TRAIL(trail)) {
 928                     /* output the surrogate pair */
 929                     *target++=c;
 930                     if(targetCapacity>=2) {
 931                         *target++=trail;
 932                         if(offsets!=NULL) {
 933                             *offsets++=-1;
 934                             *offsets++=-1;
 935                         }
 936                         targetCapacity-=2;
 937                     } else /* targetCapacity==1 */ {
 938                         targetCapacity=0;
 939                         cnv->UCharErrorBuffer[0]=trail;
 940                         cnv->UCharErrorBufferLength=1;
 941                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 942                     }
 943                     count=0;
 944                     c=0;
 945                     break;
 946                 } else {
 947                     /* unmatched lead surrogate, handle here for consistent toUBytes[] */
 948                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
 949
 950                     /* back out reading the code unit after it */
 951                     if(((const uint8_t *)pArgs->source-source)>=2) {
 952                         source-=2;
 953                     } else {
 954                         /*
 955                          * if the trail unit's first byte was in a previous buffer, then
 956                          * we need to put it into a special place because toUBytes[] will be
 957                          * used for the lead unit's bytes
 958                          */
 959                         cnv->toUnicodeStatus=0x100|p[2];
 960                         --source;
 961                     }
 962                     cnv->toULength=2;
 963
 964                     /* write back the updated pointers */
 965                     pArgs->source=(const char *)source;
 966                     pArgs->target=target;
 967                     pArgs->offsets=offsets;
 968                     return;
 969                 }
 970             }
 971         } while(length>0);
 972         cnv->toULength=(int8_t)count;
 973     }
 974
 975     /* copy an even number of bytes for complete UChars */
 976     count=2*targetCapacity;
 977     if(count>length) {
 978         count=length&~1;
 979     }
 980     if(c==0 && count>0) {
 981         length-=count;
 982         count>>=1;
 983         targetCapacity-=count;
 984         if(offsets==NULL) {
 985             do {
 986                 c=((UChar)source[1]<<8)|source[0];
 987                 source+=2;
 988                 if(U16_IS_SINGLE(c)) {
 989                     *target++=c;
 990                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
 991                           U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
 992                 ) {
 993                     source+=2;
 994                     --count;
 995                     *target++=c;
 996                     *target++=trail;
 997                 } else {
 998                     break;
 999                 }
1000             } while(--count>0);
1001         } else {
1002             do {
1003                 c=((UChar)source[1]<<8)|source[0];
1004                 source+=2;
1005                 if(U16_IS_SINGLE(c)) {
1006                     *target++=c;
1007                     *offsets++=sourceIndex;
1008                     sourceIndex+=2;
1009                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
1010                           U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
1011                 ) {
1012                     source+=2;
1013                     --count;
1014                     *target++=c;
1015                     *target++=trail;
1016                     *offsets++=sourceIndex;
1017                     *offsets++=sourceIndex;
1018                     sourceIndex+=4;
1019                 } else {
1020                     break;
1021                 }
1022             } while(--count>0);
1023         }
1024
1025         if(count==0) {
1026             /* done with the loop for complete UChars */
1027             c=0;
1028         } else {
1029             /* keep c for surrogate handling, trail will be set there */
1030             length+=2*(count-1); /* one more byte pair was consumed than count decremented */
1031             targetCapacity+=count;
1032         }
1033     }
1034
1035     if(c!=0) {
1036         /*
1037          * c is a surrogate, and
1038          * - source or target too short
1039          * - or the surrogate is unmatched
1040          */
1041         cnv->toUBytes[0]=(uint8_t)c;
1042         cnv->toUBytes[1]=(uint8_t)(c>>8);
1043         cnv->toULength=2;
1044
1045         if(U16_IS_SURROGATE_LEAD(c)) {
1046             if(length>=2) {
1047                 if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) {
1048                     /* output the surrogate pair, will overflow (see conditions comment above) */
1049                     source+=2;
1050                     length-=2;
1051                     *target++=c;
1052                     if(offsets!=NULL) {
1053                         *offsets++=sourceIndex;
1054                     }
1055                     cnv->UCharErrorBuffer[0]=trail;
1056                     cnv->UCharErrorBufferLength=1;
1057                     cnv->toULength=0;
1058                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1059                 } else {
1060                     /* unmatched lead surrogate */
1061                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1062                 }
1063             } else {
1064                 /* see if the trail surrogate is in the next buffer */
1065             }
1066         } else {
1067             /* unmatched trail surrogate */
1068             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1069         }
1070     }
1071
1072     if(U_SUCCESS(*pErrorCode)) {
1073         /* check for a remaining source byte */
1074         if(length>0) {
1075             if(targetCapacity==0) {
1076                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1077             } else {
1078                 /* it must be length==1 because otherwise the above would have copied more */
1079                 cnv->toUBytes[cnv->toULength++]=*source++;
1080             }
1081         }
1082     }
1083
1084     /* write back the updated pointers */
1085     pArgs->source=(const char *)source;
1086     pArgs->target=target;
1087     pArgs->offsets=offsets;
1088 }
1089
1090 static UChar32
1091 _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
1092     const uint8_t *s, *sourceLimit;
1093     UChar32 c;
1094
1095     if(pArgs->converter->mode<8) {
1096         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1097     }
1098
1099     s=(const uint8_t *)pArgs->source;
1100     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1101
1102     if(s>=sourceLimit) {
1103         /* no input */
1104         *err=U_INDEX_OUTOFBOUNDS_ERROR;
1105         return 0xffff;
1106     }
1107
1108     if(s+2>sourceLimit) {
1109         /* only one byte: truncated UChar */
1110         pArgs->converter->toUBytes[0]=*s++;
1111         pArgs->converter->toULength=1;
1112         pArgs->source=(const char *)s;
1113         *err = U_TRUNCATED_CHAR_FOUND;
1114         return 0xffff;
1115     }
1116
1117     /* get one UChar */
1118     c=((UChar32)s[1]<<8)|*s;
1119     s+=2;
1120
1121     /* check for a surrogate pair */
1122     if(U_IS_SURROGATE(c)) {
1123         if(U16_IS_SURROGATE_LEAD(c)) {
1124             if(s+2<=sourceLimit) {
1125                 UChar trail;
1126
1127                 /* get a second UChar and see if it is a trail surrogate */
1128                 trail=((UChar)s[1]<<8)|*s;
1129                 if(U16_IS_TRAIL(trail)) {
1130                     c=U16_GET_SUPPLEMENTARY(c, trail);
1131                     s+=2;
1132                 } else {
1133                     /* unmatched lead surrogate */
1134                     c=-2;
1135                 }
1136             } else {
1137                 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
1138                 uint8_t *bytes=pArgs->converter->toUBytes;
1139                 s-=2;
1140                 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
1141                 do {
1142                     *bytes++=*s++;
1143                 } while(s<sourceLimit);
1144
1145                 c=0xffff;
1146                 *err=U_TRUNCATED_CHAR_FOUND;
1147             }
1148         } else {
1149             /* unmatched trail surrogate */
1150             c=-2;
1151         }
1152
1153         if(c<0) {
1154             /* write the unmatched surrogate */
1155             uint8_t *bytes=pArgs->converter->toUBytes;
1156             pArgs->converter->toULength=2;
1157             *bytes=*(s-2);
1158             bytes[1]=*(s-1);
1159
1160             c=0xffff;
1161             *err=U_ILLEGAL_CHAR_FOUND;
1162         }
1163     }
1164
1165     pArgs->source=(const char *)s;
1166     return c;
1167 }
1168
1169 static void
1170 _UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) {
1171     if(choice<=UCNV_RESET_TO_UNICODE) {
1172         /* reset toUnicode state */
1173         if(UCNV_GET_VERSION(cnv)==0) {
1174             cnv->mode=8; /* no BOM handling */
1175         } else {
1176             cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */
1177         }
1178     }
1179     if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
1180         /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */
1181         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1182     }
1183 }
1184
1185 static void
1186 _UTF16LEOpen(UConverter *cnv,
1187              UConverterLoadArgs *pArgs,
1188              UErrorCode *pErrorCode) {
1189     if(UCNV_GET_VERSION(cnv)<=1) {
1190         _UTF16LEReset(cnv, UCNV_RESET_BOTH);
1191     } else {
1192         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1193     }
1194 }
1195
1196 static const char *
1197 _UTF16LEGetName(const UConverter *cnv) {
1198     if(UCNV_GET_VERSION(cnv)==0) {
1199         return "UTF-16LE";
1200     } else {
1201         return "UTF-16LE,version=1";
1202     }
1203 }
1204
1205 static const UConverterImpl _UTF16LEImpl={
1206     UCNV_UTF16_LittleEndian,
1207
1208     NULL,
1209     NULL,
1210
1211     _UTF16LEOpen,
1212     NULL,
1213     _UTF16LEReset,
1214
1215     _UTF16LEToUnicodeWithOffsets,
1216     _UTF16LEToUnicodeWithOffsets,
1217     _UTF16LEFromUnicodeWithOffsets,
1218     _UTF16LEFromUnicodeWithOffsets,
1219     _UTF16LEGetNextUChar,
1220
1221     NULL,
1222     _UTF16LEGetName,
1223     NULL,
1224     NULL,
1225     ucnv_getNonSurrogateUnicodeSet,
1226
1227     NULL,
1228     NULL
1229 };
1230
1231
1232 static const UConverterStaticData _UTF16LEStaticData={
1233     sizeof(UConverterStaticData),
1234     "UTF-16LE",
1235     1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
1236     { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE,
1237     0,
1238     0,
1239     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1240 };
1241
1242
1243 const UConverterSharedData _UTF16LEData=
1244         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16LEStaticData, &_UTF16LEImpl);
1245
1246 /* UTF-16 (Detect BOM) ------------------------------------------------------ */
1247
1248 /*
1249  * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
1250  * accordingly.
1251  * This is a simpler version of the UTF-32 converter, with
1252  * fewer states for shorter BOMs.
1253  *
1254  * State values:
1255  * 0    initial state
1256  * 1    saw first byte
1257  * 2..5 -
1258  * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1
1259  * 8    UTF-16BE mode
1260  * 9    UTF-16LE mode
1261  *
1262  * During detection: state==number of initial bytes seen so far.
1263  *
1264  * On output, emit U+FEFF as the first code point.
1265  *
1266  * Variants:
1267  * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.
1268  * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and
1269  *   UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.
1270  */
1271
1272 static void
1273 _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
1274     if(choice<=UCNV_RESET_TO_UNICODE) {
1275         /* reset toUnicode: state=0 */
1276         cnv->mode=0;
1277     }
1278     if(choice!=UCNV_RESET_TO_UNICODE) {
1279         /* reset fromUnicode: prepare to output the UTF-16PE BOM */
1280         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1281     }
1282 }
1283
1284 static const UConverterSharedData _UTF16v2Data;
1285
1286 static void
1287 _UTF16Open(UConverter *cnv,
1288            UConverterLoadArgs *pArgs,
1289            UErrorCode *pErrorCode) {
1290     if(UCNV_GET_VERSION(cnv)<=2) {
1291         if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) {
1292             /*
1293              * Switch implementation, and switch the staticData that's different
1294              * and was copied into the UConverter.
1295              * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
1296              * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
1297              */
1298             cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data;
1299             uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN);
1300         }
1301         _UTF16Reset(cnv, UCNV_RESET_BOTH);
1302     } else {
1303         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1304     }
1305 }
1306
1307 static const char *
1308 _UTF16GetName(const UConverter *cnv) {
1309     if(UCNV_GET_VERSION(cnv)==0) {
1310         return "UTF-16";
1311     } else if(UCNV_GET_VERSION(cnv)==1) {
1312         return "UTF-16,version=1";
1313     } else {
1314         return "UTF-16,version=2";
1315     }
1316 }
1317
1318 const UConverterSharedData _UTF16Data;
1319
1320 #define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData)
1321 #define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData)
1322 #define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data || (cnv)->sharedData==&_UTF16v2Data)
1323
1324 static void
1325 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1326                            UErrorCode *pErrorCode) {
1327     UConverter *cnv=pArgs->converter;
1328     const char *source=pArgs->source;
1329     const char *sourceLimit=pArgs->sourceLimit;
1330     int32_t *offsets=pArgs->offsets;
1331
1332     int32_t state, offsetDelta;
1333     uint8_t b;
1334
1335     state=cnv->mode;
1336
1337     /*
1338      * If we detect a BOM in this buffer, then we must add the BOM size to the
1339      * offsets because the actual converter function will not see and count the BOM.
1340      * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1341      */
1342     offsetDelta=0;
1343
1344     while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1345         switch(state) {
1346         case 0:
1347             cnv->toUBytes[0]=(uint8_t)*source++;
1348             cnv->toULength=1;
1349             state=1;
1350             break;
1351         case 1:
1352             /*
1353              * Only inside this switch case can the state variable
1354              * temporarily take two additional values:
1355              * 6: BOM error, continue with BE
1356              * 7: BOM error, continue with LE
1357              */
1358             b=*source;
1359             if(cnv->toUBytes[0]==0xfe && b==0xff) {
1360                 if(IS_UTF16LE(cnv)) {
1361                     state=7; /* illegal reverse BOM for Java "UnicodeLittle" */
1362                 } else {
1363                     state=8; /* detect UTF-16BE */
1364                 }
1365             } else if(cnv->toUBytes[0]==0xff && b==0xfe) {
1366                 if(IS_UTF16BE(cnv)) {
1367                     state=6; /* illegal reverse BOM for Java "UnicodeBig" */
1368                 } else {
1369                     state=9; /* detect UTF-16LE */
1370                 }
1371             } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) {
1372                 state=6; /* illegal missing BOM for Java "Unicode" */
1373             }
1374             if(state>=8) {
1375                 /* BOM detected, consume it */
1376                 ++source;
1377                 cnv->toULength=0;
1378                 offsetDelta=(int32_t)(source-pArgs->source);
1379             } else if(state<6) {
1380                 /* ok: no BOM, and not a reverse BOM */
1381                 if(source!=pArgs->source) {
1382                     /* reset the source for a correct first offset */
1383                     source=pArgs->source;
1384                     cnv->toULength=0;
1385                 }
1386                 if(IS_UTF16LE(cnv)) {
1387                     /* Make Java "UnicodeLittle" default to LE. */
1388                     state=9;
1389                 } else {
1390                     /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */
1391                     state=8;
1392                 }
1393             } else {
1394                 /*
1395                  * error: missing BOM, or reverse BOM
1396                  * UTF-16,version=1: Java-specific "Unicode" requires a BOM.
1397                  * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.
1398                  * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.
1399                  */
1400                 /* report the non-BOM or reverse BOM as an illegal sequence */
1401                 cnv->toUBytes[1]=b;
1402                 cnv->toULength=2;
1403                 pArgs->source=source+1;
1404                 /* continue with conversion if the callback resets the error */
1405                 /*
1406                  * Make Java "Unicode" default to BE like standard UTF-16.
1407                  * Make Java "UnicodeBig" and "UnicodeLittle" default
1408                  * to their normal endiannesses.
1409                  */
1410                 cnv->mode=state+2;
1411                 *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
1412                 return;
1413             }
1414             /* convert the rest of the stream */
1415             cnv->mode=state;
1416             continue;
1417         case 8:
1418             /* call UTF-16BE */
1419             pArgs->source=source;
1420             _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1421             source=pArgs->source;
1422             break;
1423         case 9:
1424             /* call UTF-16LE */
1425             pArgs->source=source;
1426             _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
1427             source=pArgs->source;
1428             break;
1429         default:
1430             break; /* does not occur */
1431         }
1432     }
1433
1434     /* add BOM size to offsets - see comment at offsetDelta declaration */
1435     if(offsets!=NULL && offsetDelta!=0) {
1436         int32_t *offsetsLimit=pArgs->offsets;
1437         while(offsets<offsetsLimit) {
1438             *offsets++ += offsetDelta;
1439         }
1440     }
1441
1442     pArgs->source=source;
1443
1444     if(source==sourceLimit && pArgs->flush) {
1445         /* handle truncated input */
1446         switch(state) {
1447         case 0:
1448             break; /* no input at all, nothing to do */
1449         case 8:
1450             _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1451             break;
1452         case 9:
1453             _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
1454             break;
1455         default:
1456             /* 0<state<8: framework will report truncation, nothing to do here */
1457             break;
1458         }
1459     }
1460
1461     cnv->mode=state;
1462 }
1463
1464 static UChar32
1465 _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
1466                    UErrorCode *pErrorCode) {
1467     switch(pArgs->converter->mode) {
1468     case 8:
1469         return _UTF16BEGetNextUChar(pArgs, pErrorCode);
1470     case 9:
1471         return _UTF16LEGetNextUChar(pArgs, pErrorCode);
1472     default:
1473         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1474     }
1475 }
1476
1477 static const UConverterImpl _UTF16Impl = {
1478     UCNV_UTF16,
1479
1480     NULL,
1481     NULL,
1482
1483     _UTF16Open,
1484     NULL,
1485     _UTF16Reset,
1486
1487     _UTF16ToUnicodeWithOffsets,
1488     _UTF16ToUnicodeWithOffsets,
1489     _UTF16PEFromUnicodeWithOffsets,
1490     _UTF16PEFromUnicodeWithOffsets,
1491     _UTF16GetNextUChar,
1492
1493     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1494     _UTF16GetName,
1495     NULL,
1496     NULL,
1497     ucnv_getNonSurrogateUnicodeSet,
1498
1499     NULL,
1500     NULL
1501 };
1502
1503 static const UConverterStaticData _UTF16StaticData = {
1504     sizeof(UConverterStaticData),
1505     "UTF-16",
1506     1204, /* CCSID for BOM sensitive UTF-16 */
1507     UCNV_IBM, UCNV_UTF16, 2, 2,
1508 #if U_IS_BIG_ENDIAN
1509     { 0xff, 0xfd, 0, 0 }, 2,
1510 #else
1511     { 0xfd, 0xff, 0, 0 }, 2,
1512 #endif
1513     FALSE, FALSE,
1514     0,
1515     0,
1516     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1517 };
1518
1519 const UConverterSharedData _UTF16Data =
1520         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16StaticData, &_UTF16Impl);
1521
1522 static const UConverterImpl _UTF16v2Impl = {
1523     UCNV_UTF16,
1524
1525     NULL,
1526     NULL,
1527
1528     _UTF16Open,
1529     NULL,
1530     _UTF16Reset,
1531
1532     _UTF16ToUnicodeWithOffsets,
1533     _UTF16ToUnicodeWithOffsets,
1534     _UTF16BEFromUnicodeWithOffsets,
1535     _UTF16BEFromUnicodeWithOffsets,
1536     _UTF16GetNextUChar,
1537
1538     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1539     _UTF16GetName,
1540     NULL,
1541     NULL,
1542     ucnv_getNonSurrogateUnicodeSet,
1543
1544     NULL,
1545     NULL
1546 };
1547
1548 static const UConverterStaticData _UTF16v2StaticData = {
1549     sizeof(UConverterStaticData),
1550     "UTF-16,version=2",
1551     1204, /* CCSID for BOM sensitive UTF-16 */
1552     UCNV_IBM, UCNV_UTF16, 2, 2,
1553     { 0xff, 0xfd, 0, 0 }, 2,
1554     FALSE, FALSE,
1555     0,
1556     0,
1557     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1558 };
1559
1560 static const UConverterSharedData _UTF16v2Data =
1561         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16v2StaticData, &_UTF16v2Impl);
1562
1563 #endif