Imported Upstream version 58.1
[platform/upstream/icu.git] / source / common / ucnv_u16.c
1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*  
4 **********************************************************************
5 *   Copyright (C) 2002-2015, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   file name:  ucnv_u16.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2002jul01
14 *   created by: Markus W. Scherer
15 *
16 *   UTF-16 converter implementation. Used to be in ucnv_utf.c.
17 */
18
19 #include "unicode/utypes.h"
20
21 #if !UCONFIG_NO_CONVERSION
22
23 #include "unicode/ucnv.h"
24 #include "ucnv_bld.h"
25 #include "ucnv_cnv.h"
26 #include "cmemory.h"
27
/*
 * Value stored in UConverter.fromUnicodeStatus by the reset functions in this
 * file to request that the fromUnicode function emit a byte order mark before
 * the first converted code unit.
 */
enum {
    UCNV_NEED_TO_WRITE_BOM=1
};

/*
 * The UTF-16 toUnicode implementation is also used for the Java-specific
 * "with BOM" variants of UTF-16BE and UTF-16LE.
 *
 * Forward declaration: the BE/LE toUnicode functions below call it while
 * converter->mode<8; the definition is expected later in this file.
 */
static void
_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                           UErrorCode *pErrorCode);

/* UTF-16BE ----------------------------------------------------------------- */

/*
 * _UTF16PEFromUnicodeWithOffsets is an alias for the fromUnicode function
 * whose byte order matches U_IS_BIG_ENDIAN (BE when it is nonzero, else LE).
 */
#if U_IS_BIG_ENDIAN
#   define _UTF16PEFromUnicodeWithOffsets   _UTF16BEFromUnicodeWithOffsets
#else
#   define _UTF16PEFromUnicodeWithOffsets   _UTF16LEFromUnicodeWithOffsets
#endif
47
48
/*
 * Convert UTF-16 code units from pArgs->source into big-endian byte pairs in
 * pArgs->target, recording a source index per output byte in pArgs->offsets
 * when that pointer is not NULL.
 *
 * Behavior visible in this function:
 * - Returns immediately when there is no input.
 * - When cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM, the bytes FE FF are
 *   written first through ucnv_fromUWriteBytes() and the status is cleared.
 * - A lead surrogate left over from a previous call is read from
 *   cnv->fromUChar32; a lead surrogate at the end of this input is stored
 *   there so that the next call can complete the pair.
 * - Output that does not fit completely is passed to ucnv_fromUWriteBytes()
 *   from the local overflow[] buffer.
 *
 * @param pArgs      source/target/offsets pointers and the converter; the
 *                   three pointers are written back on the normal exit path
 * @param pErrorCode set to U_BUFFER_OVERFLOW_ERROR when the target is full
 *                   while input remains, or U_ILLEGAL_CHAR_FOUND for an
 *                   unmatched surrogate (which is left in cnv->fromUChar32)
 */
static void
_UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                               UErrorCode *pErrorCode) {
    UConverter *cnv;
    const UChar *source;
    char *target;
    int32_t *offsets;

    uint32_t targetCapacity, length, sourceIndex;
    UChar c, trail;
    char overflow[4];

    source=pArgs->source;
    /*
     * length counts UChars here.
     * NOTE(review): length is uint32_t, so the <=0 test below is only true
     * for length==0 - confirm callers never pass sourceLimit<source.
     */
    length=(int32_t)(pArgs->sourceLimit-source);
    if(length<=0) {
        /* no input, nothing to do */
        return;
    }

    cnv=pArgs->converter;

    /* write the BOM if necessary */
    if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
        static const char bom[]={ (char)0xfe, (char)0xff };
        /* advances pArgs->target and pArgs->offsets directly; BOM offsets are -1 */
        ucnv_fromUWriteBytes(cnv,
                             bom, 2,
                             &pArgs->target, pArgs->targetLimit,
                             &pArgs->offsets, -1,
                             pErrorCode);
        cnv->fromUnicodeStatus=0;
    }

    target=pArgs->target;
    if(target >= pArgs->targetLimit) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        return;
    }

    /* targetCapacity counts bytes */
    targetCapacity=(uint32_t)(pArgs->targetLimit-target);
    offsets=pArgs->offsets;
    sourceIndex=0;

    /* c!=0 indicates in several places outside the main loops that a surrogate was found */

    if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
        /* the last buffer ended with a lead surrogate, output the surrogate pair */
        ++source;
        --length;
        target[0]=(uint8_t)(c>>8);
        target[1]=(uint8_t)c;
        target[2]=(uint8_t)(trail>>8);
        target[3]=(uint8_t)trail;
        target+=4;
        targetCapacity-=4;
        if(offsets!=NULL) {
            /* the pair started in a previous buffer, so all four offsets are -1 */
            *offsets++=-1;
            *offsets++=-1;
            *offsets++=-1;
            *offsets++=-1;
        }
        sourceIndex=1;
        cnv->fromUChar32=c=0;
    }

    if(c==0) {
        /* copy an even number of bytes for complete UChars */
        uint32_t count=2*length;
        if(count>targetCapacity) {
            count=targetCapacity&~1;
        }
        /* count is even */
        targetCapacity-=count;
        /* from here on count is the number of UChars the loops may consume */
        count>>=1;
        length-=count;

        if(offsets==NULL) {
            while(count>0) {
                c=*source++;
                if(U16_IS_SINGLE(c)) {
                    target[0]=(uint8_t)(c>>8);
                    target[1]=(uint8_t)c;
                    target+=2;
                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
                    ++source;
                    --count;
                    target[0]=(uint8_t)(c>>8);
                    target[1]=(uint8_t)c;
                    target[2]=(uint8_t)(trail>>8);
                    target[3]=(uint8_t)trail;
                    target+=4;
                } else {
                    /* surrogate that cannot be completed within count; c stays nonzero */
                    break;
                }
                --count;
            }
        } else {
            /* same loop as above, additionally writing one offset per output byte */
            while(count>0) {
                c=*source++;
                if(U16_IS_SINGLE(c)) {
                    target[0]=(uint8_t)(c>>8);
                    target[1]=(uint8_t)c;
                    target+=2;
                    *offsets++=sourceIndex;
                    *offsets++=sourceIndex++;
                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
                    ++source;
                    --count;
                    target[0]=(uint8_t)(c>>8);
                    target[1]=(uint8_t)c;
                    target[2]=(uint8_t)(trail>>8);
                    target[3]=(uint8_t)trail;
                    target+=4;
                    *offsets++=sourceIndex;
                    *offsets++=sourceIndex;
                    *offsets++=sourceIndex;
                    *offsets++=sourceIndex;
                    sourceIndex+=2;
                } else {
                    break;
                }
                --count;
            }
        }

        if(count==0) {
            /* done with the loop for complete UChars */
            if(length>0 && targetCapacity>0) {
                /*
                 * there is more input and some target capacity -
                 * it must be targetCapacity==1 because otherwise
                 * the above would have copied more;
                 * prepare for overflow output
                 */
                if(U16_IS_SINGLE(c=*source++)) {
                    overflow[0]=(char)(c>>8);
                    overflow[1]=(char)c;
                    length=2; /* 2 bytes to output */
                    c=0;
                /* } else { keep c for surrogate handling, length will be set there */
                }
            } else {
                length=0;
                c=0;
            }
        } else {
            /* keep c for surrogate handling, length will be set there */
            /* give back the bytes reserved for the UChars the loop did not output */
            targetCapacity+=2*count;
        }
    } else {
        length=0; /* from here on, length counts the bytes in overflow[] */
    }
    
    if(c!=0) {
        /*
         * c is a surrogate, and
         * - source or target too short
         * - or the surrogate is unmatched
         */
        length=0;
        if(U16_IS_SURROGATE_LEAD(c)) {
            if(source<pArgs->sourceLimit) {
                if(U16_IS_TRAIL(trail=*source)) {
                    /* output the surrogate pair, will overflow (see conditions comment above) */
                    ++source;
                    overflow[0]=(char)(c>>8);
                    overflow[1]=(char)c;
                    overflow[2]=(char)(trail>>8);
                    overflow[3]=(char)trail;
                    length=4; /* 4 bytes to output */
                    c=0;
                } else {
                    /* unmatched lead surrogate */
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                }
            } else {
                /* see if the trail surrogate is in the next buffer */
            }
        } else {
            /* unmatched trail surrogate */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
        }
        /* remember a pending or offending surrogate (0 if the pair was completed) */
        cnv->fromUChar32=c;
    }

    if(length>0) {
        /* output length bytes with overflow (length>targetCapacity>0) */
        ucnv_fromUWriteBytes(cnv,
                             overflow, length,
                             (char **)&target, pArgs->targetLimit,
                             &offsets, sourceIndex,
                             pErrorCode);
        targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
    }

    if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }

    /* write back the updated pointers */
    pArgs->source=source;
    pArgs->target=(char *)target;
    pArgs->offsets=offsets;
}
252
/*
 * Convert big-endian UTF-16 bytes from pArgs->source into UChars in
 * pArgs->target, recording a source byte index per output UChar in
 * pArgs->offsets when that pointer is not NULL.
 *
 * Behavior visible in this function:
 * - While converter->mode<8 the call is forwarded to
 *   _UTF16ToUnicodeWithOffsets() and nothing else happens here.
 * - Bytes of an incomplete code unit or surrogate pair are carried between
 *   calls in cnv->toUBytes[]/cnv->toULength; one extra byte can be carried in
 *   cnv->toUnicodeStatus (stored as 0x100|byte).
 * - A trail surrogate that does not fit into the target is stored in
 *   cnv->UCharErrorBuffer[0] with U_BUFFER_OVERFLOW_ERROR.
 *
 * @param pArgs      source/target/offsets pointers and the converter; the
 *                   three pointers are written back before a normal return
 * @param pErrorCode set to U_BUFFER_OVERFLOW_ERROR when the target is full
 *                   while input remains, or U_ILLEGAL_CHAR_FOUND for an
 *                   unmatched surrogate (its bytes are left in toUBytes[])
 */
static void
_UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                             UErrorCode *pErrorCode) {
    UConverter *cnv;
    const uint8_t *source;
    UChar *target;
    int32_t *offsets;

    uint32_t targetCapacity, length, count, sourceIndex;
    UChar c, trail;

    if(pArgs->converter->mode<8) {
        _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
        return;
    }

    cnv=pArgs->converter;
    source=(const uint8_t *)pArgs->source;
    /* length counts source bytes; it is uint32_t, so <=0 below means ==0 */
    length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
    if(length<=0 && cnv->toUnicodeStatus==0) {
        /* no input, nothing to do */
        return;
    }

    target=pArgs->target;
    if(target >= pArgs->targetLimit) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        return;
    }

    /* targetCapacity counts UChars */
    targetCapacity=(uint32_t)(pArgs->targetLimit-target);
    offsets=pArgs->offsets;
    sourceIndex=0;
    c=0;

    /* complete a partial UChar or pair from the last call */
    if(cnv->toUnicodeStatus!=0) {
        /*
         * special case: single byte from a previous buffer,
         * where the byte turned out not to belong to a trail surrogate
         * and the preceding, unmatched lead surrogate was put into toUBytes[]
         * for error handling
         */
        cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
        cnv->toULength=1;
        cnv->toUnicodeStatus=0;
    }
    if((count=cnv->toULength)!=0) {
        uint8_t *p=cnv->toUBytes;
        /*
         * NOTE(review): the loop body runs before length is tested, so when
         * this point is reached with length==0 (possible via the
         * toUnicodeStatus!=0 path above) *source is read at sourceLimit and
         * length wraps around - confirm that callers cannot trigger this.
         */
        do {
            p[count++]=*source++;
            ++sourceIndex;
            --length;
            if(count==2) {
                c=((UChar)p[0]<<8)|p[1];
                if(U16_IS_SINGLE(c)) {
                    /* output the BMP code point */
                    *target++=c;
                    if(offsets!=NULL) {
                        *offsets++=-1;
                    }
                    --targetCapacity;
                    count=0;
                    c=0;
                    break;
                } else if(U16_IS_SURROGATE_LEAD(c)) {
                    /* continue collecting bytes for the trail surrogate */
                    c=0; /* avoid unnecessary surrogate handling below */
                } else {
                    /* fall through to error handling for an unmatched trail surrogate */
                    break;
                }
            } else if(count==4) {
                c=((UChar)p[0]<<8)|p[1];
                trail=((UChar)p[2]<<8)|p[3];
                if(U16_IS_TRAIL(trail)) {
                    /* output the surrogate pair */
                    *target++=c;
                    if(targetCapacity>=2) {
                        *target++=trail;
                        if(offsets!=NULL) {
                            *offsets++=-1;
                            *offsets++=-1;
                        }
                        targetCapacity-=2;
                    } else /* targetCapacity==1 */ {
                        targetCapacity=0;
                        cnv->UCharErrorBuffer[0]=trail;
                        cnv->UCharErrorBufferLength=1;
                        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                    }
                    count=0;
                    c=0;
                    break;
                } else {
                    /* unmatched lead surrogate, handle here for consistent toUBytes[] */
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;

                    /* back out reading the code unit after it */
                    /*
                     * NOTE(review): source has only been advanced from
                     * pArgs->source at this point, so pArgs->source-source is
                     * never positive and the else branch is always taken.
                     * The else branch keeps p[2] in toUnicodeStatus and
                     * re-reads p[3] on the next call - confirm whether the
                     * operand order of the subtraction is intended.
                     */
                    if(((const uint8_t *)pArgs->source-source)>=2) {
                        source-=2;
                    } else {
                        /*
                         * if the trail unit's first byte was in a previous buffer, then
                         * we need to put it into a special place because toUBytes[] will be
                         * used for the lead unit's bytes
                         */
                        cnv->toUnicodeStatus=0x100|p[2];
                        --source;
                    }
                    cnv->toULength=2;

                    /* write back the updated pointers */
                    pArgs->source=(const char *)source;
                    pArgs->target=target;
                    pArgs->offsets=offsets;
                    return;
                }
            }
        } while(length>0);
        cnv->toULength=(int8_t)count;
    }

    /* copy an even number of bytes for complete UChars */
    count=2*targetCapacity;
    if(count>length) {
        count=length&~1;
    }
    if(c==0 && count>0) {
        length-=count;
        /* from here on count is the number of UChars the loops may produce */
        count>>=1;
        targetCapacity-=count;
        if(offsets==NULL) {
            do {
                c=((UChar)source[0]<<8)|source[1];
                source+=2;
                if(U16_IS_SINGLE(c)) {
                    *target++=c;
                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
                          U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
                ) {
                    source+=2;
                    --count;
                    *target++=c;
                    *target++=trail;
                } else {
                    /* surrogate that cannot be completed within count; c stays nonzero */
                    break;
                }
            } while(--count>0);
        } else {
            /* same loop as above, additionally writing one offset per output UChar */
            do {
                c=((UChar)source[0]<<8)|source[1];
                source+=2;
                if(U16_IS_SINGLE(c)) {
                    *target++=c;
                    *offsets++=sourceIndex;
                    sourceIndex+=2;
                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
                          U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
                ) {
                    source+=2;
                    --count;
                    *target++=c;
                    *target++=trail;
                    *offsets++=sourceIndex;
                    *offsets++=sourceIndex;
                    sourceIndex+=4;
                } else {
                    break;
                }
            } while(--count>0);
        }

        if(count==0) {
            /* done with the loop for complete UChars */
            c=0;
        } else {
            /* keep c for surrogate handling, trail will be set there */
            length+=2*(count-1); /* one more byte pair was consumed than count decremented */
            targetCapacity+=count;
        }
    }

    if(c!=0) {
        /*
         * c is a surrogate, and
         * - source or target too short
         * - or the surrogate is unmatched
         */
        /* keep the surrogate's bytes for the next call or for error reporting */
        cnv->toUBytes[0]=(uint8_t)(c>>8);
        cnv->toUBytes[1]=(uint8_t)c;
        cnv->toULength=2;

        if(U16_IS_SURROGATE_LEAD(c)) {
            if(length>=2) {
                if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) {
                    /* output the surrogate pair, will overflow (see conditions comment above) */
                    source+=2;
                    length-=2;
                    *target++=c;
                    if(offsets!=NULL) {
                        *offsets++=sourceIndex;
                    }
                    cnv->UCharErrorBuffer[0]=trail;
                    cnv->UCharErrorBufferLength=1;
                    cnv->toULength=0;
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                } else {
                    /* unmatched lead surrogate */
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                }
            } else {
                /* see if the trail surrogate is in the next buffer */
            }
        } else {
            /* unmatched trail surrogate */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
        }
    }

    if(U_SUCCESS(*pErrorCode)) {
        /* check for a remaining source byte */
        if(length>0) {
            if(targetCapacity==0) {
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            } else {
                /* it must be length==1 because otherwise the above would have copied more */
                cnv->toUBytes[cnv->toULength++]=*source++;
            }
        }
    }

    /* write back the updated pointers */
    pArgs->source=(const char *)source;
    pArgs->target=target;
    pArgs->offsets=offsets;
}
490
491 static UChar32
492 _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
493     const uint8_t *s, *sourceLimit;
494     UChar32 c;
495
496     if(pArgs->converter->mode<8) {
497         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
498     }
499
500     s=(const uint8_t *)pArgs->source;
501     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
502
503     if(s>=sourceLimit) {
504         /* no input */
505         *err=U_INDEX_OUTOFBOUNDS_ERROR;
506         return 0xffff;
507     }
508
509     if(s+2>sourceLimit) {
510         /* only one byte: truncated UChar */
511         pArgs->converter->toUBytes[0]=*s++;
512         pArgs->converter->toULength=1;
513         pArgs->source=(const char *)s;
514         *err = U_TRUNCATED_CHAR_FOUND;
515         return 0xffff;
516     }
517
518     /* get one UChar */
519     c=((UChar32)*s<<8)|s[1];
520     s+=2;
521
522     /* check for a surrogate pair */
523     if(U_IS_SURROGATE(c)) {
524         if(U16_IS_SURROGATE_LEAD(c)) {
525             if(s+2<=sourceLimit) {
526                 UChar trail;
527
528                 /* get a second UChar and see if it is a trail surrogate */
529                 trail=((UChar)*s<<8)|s[1];
530                 if(U16_IS_TRAIL(trail)) {
531                     c=U16_GET_SUPPLEMENTARY(c, trail);
532                     s+=2;
533                 } else {
534                     /* unmatched lead surrogate */
535                     c=-2;
536                 }
537             } else {
538                 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
539                 uint8_t *bytes=pArgs->converter->toUBytes;
540                 s-=2;
541                 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
542                 do {
543                     *bytes++=*s++;
544                 } while(s<sourceLimit);
545
546                 c=0xffff;
547                 *err=U_TRUNCATED_CHAR_FOUND;
548             }
549         } else {
550             /* unmatched trail surrogate */
551             c=-2;
552         }
553
554         if(c<0) {
555             /* write the unmatched surrogate */
556             uint8_t *bytes=pArgs->converter->toUBytes;
557             pArgs->converter->toULength=2;
558             *bytes=*(s-2);
559             bytes[1]=*(s-1);
560
561             c=0xffff;
562             *err=U_ILLEGAL_CHAR_FOUND;
563         }
564     }
565
566     pArgs->source=(const char *)s;
567     return c;
568
569
570 static void
571 _UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) {
572     if(choice<=UCNV_RESET_TO_UNICODE) {
573         /* reset toUnicode state */
574         if(UCNV_GET_VERSION(cnv)==0) {
575             cnv->mode=8; /* no BOM handling */
576         } else {
577             cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */
578         }
579     }
580     if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
581         /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */
582         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
583     }
584 }
585
586 static void
587 _UTF16BEOpen(UConverter *cnv,
588              UConverterLoadArgs *pArgs,
589              UErrorCode *pErrorCode) {
590     if(UCNV_GET_VERSION(cnv)<=1) {
591         _UTF16BEReset(cnv, UCNV_RESET_BOTH);
592     } else {
593         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
594     }
595 }
596
597 static const char *
598 _UTF16BEGetName(const UConverter *cnv) {
599     if(UCNV_GET_VERSION(cnv)==0) {
600         return "UTF-16BE";
601     } else {
602         return "UTF-16BE,version=1";
603     }
604 }
605
/*
 * Function table for the UTF-16BE converter.
 *
 * The initializer is positional; the slot meanings are defined by the
 * UConverterImpl declaration, which is not part of this file (presumably in
 * ucnv_cnv.h - confirm there before reordering or inserting entries).
 * The same function is listed twice for toUnicode and twice for fromUnicode;
 * NULL entries leave the corresponding slot unset.
 */
static const UConverterImpl _UTF16BEImpl={
    UCNV_UTF16_BigEndian,

    NULL,
    NULL,

    _UTF16BEOpen,
    NULL,
    _UTF16BEReset,

    _UTF16BEToUnicodeWithOffsets,
    _UTF16BEToUnicodeWithOffsets,
    _UTF16BEFromUnicodeWithOffsets,
    _UTF16BEFromUnicodeWithOffsets,
    _UTF16BEGetNextUChar,

    NULL,
    _UTF16BEGetName,
    NULL,
    NULL,
    ucnv_getNonSurrogateUnicodeSet,

    NULL,
    NULL
};
631
/*
 * Static description of the UTF-16BE converter.
 *
 * The initializer is positional; field meanings come from the
 * UConverterStaticData declaration, which is not visible in this file.
 * NOTE(review): presumably 1200 is the code page number, the two 2s are the
 * minimum/maximum bytes per character, and { 0xff, 0xfd } with length 2 is
 * the substitution character (U+FFFD in big-endian byte order) - confirm
 * against the struct declaration.
 */
static const UConverterStaticData _UTF16BEStaticData={
    sizeof(UConverterStaticData),
    "UTF-16BE",
    1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
    { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
641
642
/*
 * Shared-data object for UTF-16BE with external linkage; it ties together
 * the static description and the function table defined above.
 */
const UConverterSharedData _UTF16BEData=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16BEStaticData, &_UTF16BEImpl);
645
646 /* UTF-16LE ----------------------------------------------------------------- */
647
/*
 * Convert UTF-16 code units from pArgs->source into little-endian byte pairs
 * in pArgs->target, recording a source index per output byte in
 * pArgs->offsets when that pointer is not NULL.
 *
 * Same structure as _UTF16BEFromUnicodeWithOffsets() with the two bytes of
 * every code unit written low byte first:
 * - Returns immediately when there is no input.
 * - When cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM, the bytes FF FE are
 *   written first through ucnv_fromUWriteBytes() and the status is cleared.
 * - A lead surrogate left over from a previous call is read from
 *   cnv->fromUChar32; a lead surrogate at the end of this input is stored
 *   there so that the next call can complete the pair.
 * - Output that does not fit completely is passed to ucnv_fromUWriteBytes()
 *   from the local overflow[] buffer.
 *
 * @param pArgs      source/target/offsets pointers and the converter; the
 *                   three pointers are written back on the normal exit path
 * @param pErrorCode set to U_BUFFER_OVERFLOW_ERROR when the target is full
 *                   while input remains, or U_ILLEGAL_CHAR_FOUND for an
 *                   unmatched surrogate (which is left in cnv->fromUChar32)
 */
static void
_UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                               UErrorCode *pErrorCode) {
    UConverter *cnv;
    const UChar *source;
    char *target;
    int32_t *offsets;

    uint32_t targetCapacity, length, sourceIndex;
    UChar c, trail;
    char overflow[4];

    source=pArgs->source;
    /*
     * length counts UChars here.
     * NOTE(review): length is uint32_t, so the <=0 test below is only true
     * for length==0 - confirm callers never pass sourceLimit<source.
     */
    length=(int32_t)(pArgs->sourceLimit-source);
    if(length<=0) {
        /* no input, nothing to do */
        return;
    }

    cnv=pArgs->converter;

    /* write the BOM if necessary */
    if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
        static const char bom[]={ (char)0xff, (char)0xfe };
        /* advances pArgs->target and pArgs->offsets directly; BOM offsets are -1 */
        ucnv_fromUWriteBytes(cnv,
                             bom, 2,
                             &pArgs->target, pArgs->targetLimit,
                             &pArgs->offsets, -1,
                             pErrorCode);
        cnv->fromUnicodeStatus=0;
    }

    target=pArgs->target;
    if(target >= pArgs->targetLimit) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        return;
    }

    /* targetCapacity counts bytes */
    targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
    offsets=pArgs->offsets;
    sourceIndex=0;

    /* c!=0 indicates in several places outside the main loops that a surrogate was found */

    if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
        /* the last buffer ended with a lead surrogate, output the surrogate pair */
        ++source;
        --length;
        target[0]=(uint8_t)c;
        target[1]=(uint8_t)(c>>8);
        target[2]=(uint8_t)trail;
        target[3]=(uint8_t)(trail>>8);
        target+=4;
        targetCapacity-=4;
        if(offsets!=NULL) {
            /* the pair started in a previous buffer, so all four offsets are -1 */
            *offsets++=-1;
            *offsets++=-1;
            *offsets++=-1;
            *offsets++=-1;
        }
        sourceIndex=1;
        cnv->fromUChar32=c=0;
    }

    if(c==0) {
        /* copy an even number of bytes for complete UChars */
        uint32_t count=2*length;
        if(count>targetCapacity) {
            count=targetCapacity&~1;
        }
        /* count is even */
        targetCapacity-=count;
        /* from here on count is the number of UChars the loops may consume */
        count>>=1;
        length-=count;

        if(offsets==NULL) {
            while(count>0) {
                c=*source++;
                if(U16_IS_SINGLE(c)) {
                    target[0]=(uint8_t)c;
                    target[1]=(uint8_t)(c>>8);
                    target+=2;
                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
                    ++source;
                    --count;
                    target[0]=(uint8_t)c;
                    target[1]=(uint8_t)(c>>8);
                    target[2]=(uint8_t)trail;
                    target[3]=(uint8_t)(trail>>8);
                    target+=4;
                } else {
                    /* surrogate that cannot be completed within count; c stays nonzero */
                    break;
                }
                --count;
            }
        } else {
            /* same loop as above, additionally writing one offset per output byte */
            while(count>0) {
                c=*source++;
                if(U16_IS_SINGLE(c)) {
                    target[0]=(uint8_t)c;
                    target[1]=(uint8_t)(c>>8);
                    target+=2;
                    *offsets++=sourceIndex;
                    *offsets++=sourceIndex++;
                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
                    ++source;
                    --count;
                    target[0]=(uint8_t)c;
                    target[1]=(uint8_t)(c>>8);
                    target[2]=(uint8_t)trail;
                    target[3]=(uint8_t)(trail>>8);
                    target+=4;
                    *offsets++=sourceIndex;
                    *offsets++=sourceIndex;
                    *offsets++=sourceIndex;
                    *offsets++=sourceIndex;
                    sourceIndex+=2;
                } else {
                    break;
                }
                --count;
            }
        }

        if(count==0) {
            /* done with the loop for complete UChars */
            if(length>0 && targetCapacity>0) {
                /*
                 * there is more input and some target capacity -
                 * it must be targetCapacity==1 because otherwise
                 * the above would have copied more;
                 * prepare for overflow output
                 */
                if(U16_IS_SINGLE(c=*source++)) {
                    overflow[0]=(char)c;
                    overflow[1]=(char)(c>>8);
                    length=2; /* 2 bytes to output */
                    c=0;
                /* } else { keep c for surrogate handling, length will be set there */
                }
            } else {
                length=0;
                c=0;
            }
        } else {
            /* keep c for surrogate handling, length will be set there */
            /* give back the bytes reserved for the UChars the loop did not output */
            targetCapacity+=2*count;
        }
    } else {
        length=0; /* from here on, length counts the bytes in overflow[] */
    }
    
    if(c!=0) {
        /*
         * c is a surrogate, and
         * - source or target too short
         * - or the surrogate is unmatched
         */
        length=0;
        if(U16_IS_SURROGATE_LEAD(c)) {
            if(source<pArgs->sourceLimit) {
                if(U16_IS_TRAIL(trail=*source)) {
                    /* output the surrogate pair, will overflow (see conditions comment above) */
                    ++source;
                    overflow[0]=(char)c;
                    overflow[1]=(char)(c>>8);
                    overflow[2]=(char)trail;
                    overflow[3]=(char)(trail>>8);
                    length=4; /* 4 bytes to output */
                    c=0;
                } else {
                    /* unmatched lead surrogate */
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                }
            } else {
                /* see if the trail surrogate is in the next buffer */
            }
        } else {
            /* unmatched trail surrogate */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
        }
        /* remember a pending or offending surrogate (0 if the pair was completed) */
        cnv->fromUChar32=c;
    }

    if(length>0) {
        /* output length bytes with overflow (length>targetCapacity>0) */
        ucnv_fromUWriteBytes(cnv,
                             overflow, length,
                             &target, pArgs->targetLimit,
                             &offsets, sourceIndex,
                             pErrorCode);
        targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
    }

    if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }

    /* write back the updated pointers */
    pArgs->source=source;
    pArgs->target=target;
    pArgs->offsets=offsets;
}
851
/*
 * Convert UTF-16LE bytes from pArgs->source into UChars at pArgs->target.
 *
 * While the converter is still in a BOM-detection state (mode<8) the call is
 * forwarded to _UTF16ToUnicodeWithOffsets(), which calls back into this
 * function once the mode is 9.
 *
 * State carried between calls:
 * - cnv->toUBytes[] / cnv->toULength: bytes of an incomplete code unit or
 *   surrogate pair that were seen at the end of an earlier buffer.
 * - cnv->toUnicodeStatus: 0, or 0x100|byte for a single byte that was read
 *   after an unmatched lead surrogate and has to be re-read on the next call.
 *
 * Offsets (when pArgs->offsets!=NULL) are byte indexes counted from the
 * pArgs->source seen on entry; -1 is written for UChars that were completed
 * from bytes saved by an earlier call.
 *
 * Sets *pErrorCode to U_BUFFER_OVERFLOW_ERROR when the target is full and to
 * U_ILLEGAL_CHAR_FOUND for an unmatched surrogate (its bytes are left in
 * cnv->toUBytes[]).
 * pArgs->source, pArgs->target and pArgs->offsets are written back on every
 * path past the early "nothing to do"/"no room" exits.
 */
static void
_UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                             UErrorCode *pErrorCode) {
    UConverter *cnv;
    const uint8_t *source;
    UChar *target;
    int32_t *offsets;

    uint32_t targetCapacity, length, count, sourceIndex;
    UChar c, trail;

    if(pArgs->converter->mode<8) {
        /* BOM detection is not finished yet */
        _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
        return;
    }

    cnv=pArgs->converter;
    source=(const uint8_t *)pArgs->source;
    length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
    /* length is unsigned, so "<=0" is effectively "==0" here */
    if(length<=0 && cnv->toUnicodeStatus==0) {
        /* no input, nothing to do */
        return;
    }

    target=pArgs->target;
    if(target >= pArgs->targetLimit) {
        /* no room for even one UChar */
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        return;
    }

    targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
    offsets=pArgs->offsets;
    sourceIndex=0;
    c=0; /* c!=0 after a block means: a surrogate still needs handling */

    /* complete a partial UChar or pair from the last call */
    if(cnv->toUnicodeStatus!=0) {
        /*
         * special case: single byte from a previous buffer,
         * where the byte turned out not to belong to a trail surrogate
         * and the preceding, unmatched lead surrogate was put into toUBytes[]
         * for error handling
         */
        cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
        cnv->toULength=1;
        cnv->toUnicodeStatus=0;
    }
    if((count=cnv->toULength)!=0) {
        /* append source bytes to the saved ones until a unit or pair is complete */
        uint8_t *p=cnv->toUBytes;
        do {
            p[count++]=*source++;
            ++sourceIndex;
            --length;
            if(count==2) {
                /* little-endian: low byte first */
                c=((UChar)p[1]<<8)|p[0];
                if(U16_IS_SINGLE(c)) {
                    /* output the BMP code point */
                    *target++=c;
                    if(offsets!=NULL) {
                        /* started in an earlier buffer: no valid offset */
                        *offsets++=-1;
                    }
                    --targetCapacity;
                    count=0;
                    c=0;
                    break;
                } else if(U16_IS_SURROGATE_LEAD(c)) {
                    /* continue collecting bytes for the trail surrogate */
                    c=0; /* avoid unnecessary surrogate handling below */
                } else {
                    /* fall through to error handling for an unmatched trail surrogate */
                    break;
                }
            } else if(count==4) {
                c=((UChar)p[1]<<8)|p[0];
                trail=((UChar)p[3]<<8)|p[2];
                if(U16_IS_TRAIL(trail)) {
                    /* output the surrogate pair */
                    *target++=c;
                    if(targetCapacity>=2) {
                        *target++=trail;
                        if(offsets!=NULL) {
                            *offsets++=-1;
                            *offsets++=-1;
                        }
                        targetCapacity-=2;
                    } else /* targetCapacity==1 */ {
                        /* the trail unit goes to the overflow buffer */
                        targetCapacity=0;
                        cnv->UCharErrorBuffer[0]=trail;
                        cnv->UCharErrorBufferLength=1;
                        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                    }
                    count=0;
                    c=0;
                    break;
                } else {
                    /* unmatched lead surrogate, handle here for consistent toUBytes[] */
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;

                    /* back out reading the code unit after it */
                    /*
                     * NOTE(review): source has only been incremented from
                     * pArgs->source in this loop, so this difference looks
                     * non-positive and the first branch appears unreachable;
                     * the else branch (save one byte, back up one byte) then
                     * handles every case. Confirm whether the operands were
                     * meant to be the other way around.
                     */
                    if(((const uint8_t *)pArgs->source-source)>=2) {
                        source-=2;
                    } else {
                        /*
                         * if the trail unit's first byte was in a previous buffer, then
                         * we need to put it into a special place because toUBytes[] will be
                         * used for the lead unit's bytes
                         */
                        cnv->toUnicodeStatus=0x100|p[2];
                        --source;
                    }
                    /* only the lead surrogate's two bytes are reported */
                    cnv->toULength=2;

                    /* write back the updated pointers */
                    pArgs->source=(const char *)source;
                    pArgs->target=target;
                    pArgs->offsets=offsets;
                    return;
                }
            }
        } while(length>0);
        cnv->toULength=(int8_t)count;
    }

    /* copy an even number of bytes for complete UChars */
    /* count = bytes to process: limited by target room and by whole source units */
    count=2*targetCapacity;
    if(count>length) {
        count=length&~1;
    }
    if(c==0 && count>0) {
        /* pre-charge length/targetCapacity for the whole run; corrected below on early exit */
        length-=count;
        count>>=1; /* from here on count is in UChars */
        targetCapacity-=count;
        if(offsets==NULL) {
            do {
                c=((UChar)source[1]<<8)|source[0];
                source+=2;
                if(U16_IS_SINGLE(c)) {
                    *target++=c;
                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
                          U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
                ) {
                    /* complete pair that fits within the remaining count */
                    source+=2;
                    --count;
                    *target++=c;
                    *target++=trail;
                } else {
                    /* surrogate that cannot be completed here; c is kept for below */
                    break;
                }
            } while(--count>0);
        } else {
            /* same loop, additionally recording the source byte index per UChar */
            do {
                c=((UChar)source[1]<<8)|source[0];
                source+=2;
                if(U16_IS_SINGLE(c)) {
                    *target++=c;
                    *offsets++=sourceIndex;
                    sourceIndex+=2;
                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
                          U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
                ) {
                    source+=2;
                    --count;
                    *target++=c;
                    *target++=trail;
                    /* both units of the pair get the offset of the lead */
                    *offsets++=sourceIndex;
                    *offsets++=sourceIndex;
                    sourceIndex+=4;
                } else {
                    break;
                }
            } while(--count>0);
        }

        if(count==0) {
            /* done with the loop for complete UChars */
            c=0;
        } else {
            /* keep c for surrogate handling, trail will be set there */
            length+=2*(count-1); /* one more byte pair was consumed than count decremented */
            targetCapacity+=count;
        }
    }

    if(c!=0) {
        /*
         * c is a surrogate, and
         * - source or target too short
         * - or the surrogate is unmatched
         */
        /* store c's bytes in stream (little-endian) order */
        cnv->toUBytes[0]=(uint8_t)c;
        cnv->toUBytes[1]=(uint8_t)(c>>8);
        cnv->toULength=2;

        if(U16_IS_SURROGATE_LEAD(c)) {
            if(length>=2) {
                if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) {
                    /* output the surrogate pair, will overflow (see conditions comment above) */
                    source+=2;
                    length-=2;
                    *target++=c;
                    if(offsets!=NULL) {
                        *offsets++=sourceIndex;
                    }
                    cnv->UCharErrorBuffer[0]=trail;
                    cnv->UCharErrorBufferLength=1;
                    cnv->toULength=0;
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                } else {
                    /* unmatched lead surrogate */
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                }
            } else {
                /* see if the trail surrogate is in the next buffer */
            }
        } else {
            /* unmatched trail surrogate */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
        }
    }

    if(U_SUCCESS(*pErrorCode)) {
        /* check for a remaining source byte */
        if(length>0) {
            if(targetCapacity==0) {
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            } else {
                /* it must be length==1 because otherwise the above would have copied more */
                /* save the odd byte; it is completed at the start of the next call */
                cnv->toUBytes[cnv->toULength++]=*source++;
            }
        }
    }

    /* write back the updated pointers */
    pArgs->source=(const char *)source;
    pArgs->target=target;
    pArgs->offsets=offsets;
}
1089
1090 static UChar32
1091 _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
1092     const uint8_t *s, *sourceLimit;
1093     UChar32 c;
1094
1095     if(pArgs->converter->mode<8) {
1096         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1097     }
1098
1099     s=(const uint8_t *)pArgs->source;
1100     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1101
1102     if(s>=sourceLimit) {
1103         /* no input */
1104         *err=U_INDEX_OUTOFBOUNDS_ERROR;
1105         return 0xffff;
1106     }
1107
1108     if(s+2>sourceLimit) {
1109         /* only one byte: truncated UChar */
1110         pArgs->converter->toUBytes[0]=*s++;
1111         pArgs->converter->toULength=1;
1112         pArgs->source=(const char *)s;
1113         *err = U_TRUNCATED_CHAR_FOUND;
1114         return 0xffff;
1115     }
1116
1117     /* get one UChar */
1118     c=((UChar32)s[1]<<8)|*s;
1119     s+=2;
1120
1121     /* check for a surrogate pair */
1122     if(U_IS_SURROGATE(c)) {
1123         if(U16_IS_SURROGATE_LEAD(c)) {
1124             if(s+2<=sourceLimit) {
1125                 UChar trail;
1126
1127                 /* get a second UChar and see if it is a trail surrogate */
1128                 trail=((UChar)s[1]<<8)|*s;
1129                 if(U16_IS_TRAIL(trail)) {
1130                     c=U16_GET_SUPPLEMENTARY(c, trail);
1131                     s+=2;
1132                 } else {
1133                     /* unmatched lead surrogate */
1134                     c=-2;
1135                 }
1136             } else {
1137                 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
1138                 uint8_t *bytes=pArgs->converter->toUBytes;
1139                 s-=2;
1140                 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
1141                 do {
1142                     *bytes++=*s++;
1143                 } while(s<sourceLimit);
1144
1145                 c=0xffff;
1146                 *err=U_TRUNCATED_CHAR_FOUND;
1147             }
1148         } else {
1149             /* unmatched trail surrogate */
1150             c=-2;
1151         }
1152
1153         if(c<0) {
1154             /* write the unmatched surrogate */
1155             uint8_t *bytes=pArgs->converter->toUBytes;
1156             pArgs->converter->toULength=2;
1157             *bytes=*(s-2);
1158             bytes[1]=*(s-1);
1159
1160             c=0xffff;
1161             *err=U_ILLEGAL_CHAR_FOUND;
1162         }
1163     }
1164
1165     pArgs->source=(const char *)s;
1166     return c;
1167
1168
1169 static void
1170 _UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) {
1171     if(choice<=UCNV_RESET_TO_UNICODE) {
1172         /* reset toUnicode state */
1173         if(UCNV_GET_VERSION(cnv)==0) {
1174             cnv->mode=8; /* no BOM handling */
1175         } else {
1176             cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */
1177         }
1178     }
1179     if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
1180         /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */
1181         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1182     }
1183 }
1184
1185 static void
1186 _UTF16LEOpen(UConverter *cnv,
1187              UConverterLoadArgs *pArgs,
1188              UErrorCode *pErrorCode) {
1189     if(UCNV_GET_VERSION(cnv)<=1) {
1190         _UTF16LEReset(cnv, UCNV_RESET_BOTH);
1191     } else {
1192         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1193     }
1194 }
1195
1196 static const char *
1197 _UTF16LEGetName(const UConverter *cnv) {
1198     if(UCNV_GET_VERSION(cnv)==0) {
1199         return "UTF-16LE";
1200     } else {
1201         return "UTF-16LE,version=1";
1202     }
1203 }
1204
/*
 * Function table for the UTF-16LE converter.
 * The initializer is positional; the slot meanings follow the field order of
 * UConverterImpl, which is declared outside this file (presumably in
 * ucnv_cnv.h -- verify there before reordering or inserting entries).
 * NULL means that this converter does not provide the hook.
 */
static const UConverterImpl _UTF16LEImpl={
    UCNV_UTF16_LittleEndian,

    NULL,
    NULL,

    _UTF16LEOpen,
    NULL,
    _UTF16LEReset,

    /* the same function fills both toUnicode slots, likewise for fromUnicode */
    _UTF16LEToUnicodeWithOffsets,
    _UTF16LEToUnicodeWithOffsets,
    _UTF16LEFromUnicodeWithOffsets,
    _UTF16LEFromUnicodeWithOffsets,
    _UTF16LEGetNextUChar,

    NULL,
    _UTF16LEGetName,
    NULL,
    NULL,
    ucnv_getNonSurrogateUnicodeSet,

    NULL,
    NULL
};
1230
1231
/*
 * Static description of the UTF-16LE converter.
 * Positional initializer; field meanings follow UConverterStaticData, which
 * is declared outside this file -- verify there before editing.
 */
static const UConverterStaticData _UTF16LEStaticData={
    sizeof(UConverterStaticData),
    "UTF-16LE",
    /* presumably: CCSID, platform, converter type, min/max bytes per char -- confirm */
    1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
    /* 2-byte sequence FD FF: presumably the substitution character U+FFFD in little-endian order */
    { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1241
1242
/*
 * Shared-data object for UTF-16LE, tying the static description to the
 * function table. It has external linkage; IS_UTF16LE() compares against its address.
 */
const UConverterSharedData _UTF16LEData=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16LEStaticData, &_UTF16LEImpl);
1245
1246 /* UTF-16 (Detect BOM) ------------------------------------------------------ */
1247
1248 /*
1249  * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
1250  * accordingly.
1251  * This is a simpler version of the UTF-32 converter, with
1252  * fewer states for shorter BOMs.
1253  *
1254  * State values:
1255  * 0    initial state
1256  * 1    saw first byte
1257  * 2..5 -
1258  * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1
1259  * 8    UTF-16BE mode
1260  * 9    UTF-16LE mode
1261  *
1262  * During detection: state==number of initial bytes seen so far.
1263  *
1264  * On output, emit U+FEFF as the first code point.
1265  *
1266  * Variants:
1267  * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.
1268  * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and
1269  *   UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.
1270  */
1271
1272 static void
1273 _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
1274     if(choice<=UCNV_RESET_TO_UNICODE) {
1275         /* reset toUnicode: state=0 */
1276         cnv->mode=0;
1277     }
1278     if(choice!=UCNV_RESET_TO_UNICODE) {
1279         /* reset fromUnicode: prepare to output the UTF-16PE BOM */
1280         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1281     }
1282 }
1283
1284 static const UConverterSharedData _UTF16v2Data;
1285
1286 static void
1287 _UTF16Open(UConverter *cnv,
1288            UConverterLoadArgs *pArgs,
1289            UErrorCode *pErrorCode) {
1290     if(UCNV_GET_VERSION(cnv)<=2) {
1291         if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) {
1292             /*
1293              * Switch implementation, and switch the staticData that's different
1294              * and was copied into the UConverter.
1295              * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
1296              * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
1297              */
1298             cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data;
1299             uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN);
1300         }
1301         _UTF16Reset(cnv, UCNV_RESET_BOTH);
1302     } else {
1303         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1304     }
1305 }
1306
1307 static const char *
1308 _UTF16GetName(const UConverter *cnv) {
1309     if(UCNV_GET_VERSION(cnv)==0) {
1310         return "UTF-16";
1311     } else if(UCNV_GET_VERSION(cnv)==1) {
1312         return "UTF-16,version=1";
1313     } else {
1314         return "UTF-16,version=2";
1315     }
1316 }
1317
/*
 * Declared here without an initializer so that the macros below can take its
 * address; the initialized definition follows later in this file.
 */
const UConverterSharedData _UTF16Data;

/*
 * Identify the converter flavor by comparing the shared-data pointer of a
 * converter instance with the address of the corresponding shared-data object.
 * IS_UTF16 covers both the regular and the version-2 UTF-16 data.
 */
#define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData)
#define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData)
#define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data || (cnv)->sharedData==&_UTF16v2Data)
1323
/*
 * toUnicode implementation with BOM detection. It is used by the UTF-16
 * converter and, via forwarding while mode<8, by the Java-specific
 * "with BOM" variants of UTF-16BE and UTF-16LE.
 *
 * cnv->mode holds the detection state (0: nothing seen, 1: first byte saved
 * in toUBytes[0], 8: convert as UTF-16BE, 9: convert as UTF-16LE).
 * Once the state is 8 or 9 the remaining input is handed to the
 * endian-specific toUnicode function.
 *
 * A reverse BOM for the BE/LE variants, or a missing BOM for UTF-16
 * version 1, is reported as U_ILLEGAL_ESCAPE_SEQUENCE with the two offending
 * bytes in toUBytes[]; cnv->mode is then already set so that a later call
 * continues with the default endianness.
 */
static void
_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                           UErrorCode *pErrorCode) {
    UConverter *cnv=pArgs->converter;
    const char *source=pArgs->source;
    const char *sourceLimit=pArgs->sourceLimit;
    /* start of this call's offsets output, needed for the BOM adjustment below */
    int32_t *offsets=pArgs->offsets;

    int32_t state, offsetDelta;
    uint8_t b;

    state=cnv->mode;

    /*
     * If we detect a BOM in this buffer, then we must add the BOM size to the
     * offsets because the actual converter function will not see and count the BOM.
     * offsetDelta will have the number of the BOM bytes that are in the current buffer.
     */
    offsetDelta=0;

    while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
        switch(state) {
        case 0:
            /* remember the first byte; it is judged together with the second one */
            cnv->toUBytes[0]=(uint8_t)*source++;
            cnv->toULength=1;
            state=1;
            break;
        case 1:
            /*
             * Only inside this switch case can the state variable
             * temporarily take two additional values:
             * 6: BOM error, continue with BE
             * 7: BOM error, continue with LE
             */
            /* peek at the second byte without consuming it yet */
            b=*source;
            if(cnv->toUBytes[0]==0xfe && b==0xff) {
                if(IS_UTF16LE(cnv)) {
                    state=7; /* illegal reverse BOM for Java "UnicodeLittle" */
                } else {
                    state=8; /* detect UTF-16BE */
                }
            } else if(cnv->toUBytes[0]==0xff && b==0xfe) {
                if(IS_UTF16BE(cnv)) {
                    state=6; /* illegal reverse BOM for Java "UnicodeBig" */
                } else {
                    state=9; /* detect UTF-16LE */
                }
            } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) {
                state=6; /* illegal missing BOM for Java "Unicode" */
            }
            if(state>=8) {
                /* BOM detected, consume it */
                ++source;
                cnv->toULength=0;
                offsetDelta=(int32_t)(source-pArgs->source);
            } else if(state<6) {
                /* ok: no BOM, and not a reverse BOM */
                if(source!=pArgs->source) {
                    /* reset the source for a correct first offset */
                    source=pArgs->source;
                    cnv->toULength=0;
                }
                /*
                 * otherwise the first byte came from an earlier call and stays
                 * in toUBytes[] (toULength==1) for the endian-specific function
                 */
                if(IS_UTF16LE(cnv)) {
                    /* Make Java "UnicodeLittle" default to LE. */
                    state=9;
                } else {
                    /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */
                    state=8;
                }
            } else {
                /*
                 * error: missing BOM, or reverse BOM
                 * UTF-16,version=1: Java-specific "Unicode" requires a BOM.
                 * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.
                 * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.
                 */
                /* report the non-BOM or reverse BOM as an illegal sequence */
                cnv->toUBytes[1]=b;
                cnv->toULength=2;
                pArgs->source=source+1;
                /* continue with conversion if the callback resets the error */
                /*
                 * Make Java "Unicode" default to BE like standard UTF-16.
                 * Make Java "UnicodeBig" and "UnicodeLittle" default
                 * to their normal endiannesses.
                 */
                /* 6 -> 8 (BE), 7 -> 9 (LE) */
                cnv->mode=state+2;
                *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
                return;
            }
            /* convert the rest of the stream */
            /* mode must be >=8 before the endian-specific function is called, or it would forward back here */
            cnv->mode=state;
            continue;
        case 8:
            /* call UTF-16BE */
            pArgs->source=source;
            _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
            source=pArgs->source;
            break;
        case 9:
            /* call UTF-16LE */
            pArgs->source=source;
            _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
            source=pArgs->source;
            break;
        default:
            break; /* does not occur */
        }
    }

    /* add BOM size to offsets - see comment at offsetDelta declaration */
    if(offsets!=NULL && offsetDelta!=0) {
        /* pArgs->offsets now marks the end of what was written during this call */
        int32_t *offsetsLimit=pArgs->offsets;
        while(offsets<offsetsLimit) {
            *offsets++ += offsetDelta;
        }
    }

    pArgs->source=source;

    if(source==sourceLimit && pArgs->flush) {
        /* handle truncated input */
        switch(state) {
        case 0:
            break; /* no input at all, nothing to do */
        case 8:
            /* one more call with no input left so that the BE function can deal with state it has saved */
            _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
            break;
        case 9:
            /* same for LE */
            _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
            break;
        default:
            /* 0<state<8: framework will report truncation, nothing to do here */
            break;
        }
    }

    cnv->mode=state;
}
1463
1464 static UChar32
1465 _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
1466                    UErrorCode *pErrorCode) {
1467     switch(pArgs->converter->mode) {
1468     case 8:
1469         return _UTF16BEGetNextUChar(pArgs, pErrorCode);
1470     case 9:
1471         return _UTF16LEGetNextUChar(pArgs, pErrorCode);
1472     default:
1473         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
1474     }
1475 }
1476
/*
 * Function table for the BOM-detecting UTF-16 converter (versions 0 and 1).
 * The initializer is positional; the slot meanings follow the field order of
 * UConverterImpl, which is declared outside this file (presumably in
 * ucnv_cnv.h -- verify there before reordering or inserting entries).
 * NULL means that this converter does not provide the hook.
 */
static const UConverterImpl _UTF16Impl = {
    UCNV_UTF16,

    NULL,
    NULL,

    _UTF16Open,
    NULL,
    _UTF16Reset,

    _UTF16ToUnicodeWithOffsets,
    _UTF16ToUnicodeWithOffsets,
    /* platform-endian output: the PE name is #defined to the BE or LE function by U_IS_BIG_ENDIAN */
    _UTF16PEFromUnicodeWithOffsets,
    _UTF16PEFromUnicodeWithOffsets,
    _UTF16GetNextUChar,

    NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
    _UTF16GetName,
    NULL,
    NULL,
    ucnv_getNonSurrogateUnicodeSet,

    NULL,
    NULL
};
1502
/*
 * Static description of the BOM-detecting UTF-16 converter.
 * Positional initializer; field meanings follow UConverterStaticData, which
 * is declared outside this file -- verify there before editing.
 */
static const UConverterStaticData _UTF16StaticData = {
    sizeof(UConverterStaticData),
    "UTF-16",
    1204, /* CCSID for BOM sensitive UTF-16 */
    UCNV_IBM, UCNV_UTF16, 2, 2,
    /* 2-byte sequence in platform byte order: presumably the substitution character U+FFFD -- confirm */
#if U_IS_BIG_ENDIAN
    { 0xff, 0xfd, 0, 0 }, 2,
#else
    { 0xfd, 0xff, 0, 0 }, 2,
#endif
    FALSE, FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1518
/*
 * Shared-data object for UTF-16 (versions 0 and 1), tying the static
 * description to the function table. IS_UTF16() compares against its address.
 */
const UConverterSharedData _UTF16Data =
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16StaticData, &_UTF16Impl);
1521
/*
 * Function table for "UTF-16,version=2". It lists the same functions as
 * _UTF16Impl except that the fromUnicode slots use the big-endian writer
 * instead of the platform-endian one.
 * The initializer is positional; the slot meanings follow the field order of
 * UConverterImpl, which is declared outside this file -- verify there before
 * reordering or inserting entries.
 */
static const UConverterImpl _UTF16v2Impl = {
    UCNV_UTF16,

    NULL,
    NULL,

    _UTF16Open,
    NULL,
    _UTF16Reset,

    _UTF16ToUnicodeWithOffsets,
    _UTF16ToUnicodeWithOffsets,
    /* always big-endian output, regardless of the platform */
    _UTF16BEFromUnicodeWithOffsets,
    _UTF16BEFromUnicodeWithOffsets,
    _UTF16GetNextUChar,

    NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
    _UTF16GetName,
    NULL,
    NULL,
    ucnv_getNonSurrogateUnicodeSet,

    NULL,
    NULL
};
1547
/*
 * Static description of "UTF-16,version=2".
 * Unlike _UTF16StaticData the 2-byte sequence below is not conditional on the
 * platform byte order; _UTF16Open() copies it into cnv->subChars when it
 * switches a converter to version 2.
 * Positional initializer; field meanings follow UConverterStaticData, which
 * is declared outside this file -- verify there before editing.
 */
static const UConverterStaticData _UTF16v2StaticData = {
    sizeof(UConverterStaticData),
    "UTF-16,version=2",
    1204, /* CCSID for BOM sensitive UTF-16 */
    UCNV_IBM, UCNV_UTF16, 2, 2,
    /* FF FD: presumably the substitution character U+FFFD in big-endian order -- confirm */
    { 0xff, 0xfd, 0, 0 }, 2,
    FALSE, FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
1559
/*
 * Shared-data object for "UTF-16,version=2"; file-local, installed into a
 * converter by _UTF16Open() and recognized by IS_UTF16().
 */
static const UConverterSharedData _UTF16v2Data =
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16v2StaticData, &_UTF16v2Impl);
1562
1563 #endif