1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
10 * tab size: 8 (not used)
13 * created on: 2002jul01
14 * created by: Markus W. Scherer
16 * UTF-8 converter implementation. Used to be in ucnv_utf.c.
18 * Also, CESU-8 implementation, see UTR 26.
19 * The CESU-8 converter uses all the same functions as the
20 * UTF-8 converter, with a branch for converting supplementary code points.
23 #include "unicode/utypes.h"
25 #if !UCONFIG_NO_CONVERSION
27 #include "unicode/ucnv.h"
28 #include "unicode/utf.h"
29 #include "unicode/utf8.h"
30 #include "unicode/utf16.h"
35 /* Prototypes --------------------------------------------------------------- */
37 /* Keep these here to make finicky compilers happy */
/*
 * Forward declarations of the two from-Unicode entry points that are
 * defined further down in this file. Only the first parameter line of each
 * prototype is present in this excerpt; the remaining parameter line(s)
 * are not shown here.
 */
39 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
41 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
45 /* UTF-8 -------------------------------------------------------------------- */
47 /* UTF-8 Conversion DATA
48 * for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
50 /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
/* Largest code point that fits in one 16-bit code unit; the to-Unicode
 * functions compare against it to choose between one UChar and a
 * surrogate pair. */
51 #define MAXIMUM_UCS2 0x0000FFFF
/* Largest legal Unicode code point; decoded values above it are rejected. */
52 #define MAXIMUM_UTF 0x0010FFFF
/* Not referenced anywhere in the visible part of this file. */
53 #define MAXIMUM_UCS4 0x7FFFFFFF
/* First supplementary code point (0x10000). */
55 #define HALF_BASE 0x0010000
/* Mask for the low 10 bits of a code point; used to build the trail
 * surrogate. NOTE(review): the surrogate-writing code also uses HALF_SHIFT,
 * whose definition is not part of this excerpt. */
56 #define HALF_MASK 0x3FF
/* Inclusive ranges of lead (high) and trail (low) surrogate code units. */
57 #define SURROGATE_HIGH_START 0xD800
58 #define SURROGATE_HIGH_END 0xDBFF
59 #define SURROGATE_LOW_START 0xDC00
60 #define SURROGATE_LOW_END 0xDFFF
62 /* -SURROGATE_LOW_START + HALF_BASE */
/* i.e. 0x10000 - 0xDC00 = 0x2400 = 9216 */
63 #define SURROGATE_LOW_BASE 9216
/*
 * Per-sequence-length correction values, indexed by the total number of
 * bytes in the sequence (index 0 is unused padding). The decoders
 * accumulate raw bytes with ch = (ch << 6) + byte, which leaves the lead
 * byte's marker bits and each trail byte's 0x80 marker in the sum;
 * subtracting offsetsFromUTF8[length] removes them. Example for length 2:
 * (0xC0 << 6) + 0x80 = 0x3080. The closing line of the initializer is not
 * part of this excerpt.
 */
65 static const uint32_t offsetsFromUTF8[7] = {0,
66 (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
67 (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
70 /* END OF UTF-8 Conversion DATA */
/*
 * Total sequence length implied by a first byte, indexed by the byte value:
 *   0x00..0x7F -> 1 (ASCII)
 *   0x80..0xBF -> 0 (trail bytes, not valid as a first byte)
 *   0xC0..0xDF -> 2
 *   0xE0..0xEF -> 3
 *   0xF0..0xF7 -> 4
 *   0xF8..0xFB -> 5
 *   0xFC..0xFD -> 6
 *   0xFE..0xFF -> 0
 * Lengths 5 and 6 are listed so the bytes can be consumed, but the decoders
 * reject such sequences afterwards via the utf8_minChar32[] range check.
 * The closing line of the initializer is not part of this excerpt.
 */
72 static const int8_t bytesFromUTF8[256] = {
73 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
74 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
75 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
76 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
77 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
78 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
79 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
80 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
84 * Starting with Unicode 3.0.1:
85 * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
86 * byte sequences with more than 4 bytes are illegal in UTF-8,
87 * which is tested with impossible values for them
/*
 * Smallest code point that an N-byte sequence may legally encode, indexed
 * by N. Entries 5 and 6 are 0xffffffff so that no value can satisfy
 * ch <= 0x10ffff and ch >= utf8_minChar32[N] at the same time, which makes
 * every 5- and 6-byte sequence fail validation. The declaration's type line
 * is not part of this excerpt.
 */
90 utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
/*
 * Reports whether the given converter is the CESU-8 converter, by comparing
 * its sharedData pointer with the address of _CESU8Data.
 * NOTE(review): the branch taken when UCONFIG_ONLY_HTML_CONVERSION is set,
 * and the matching #else / #endif lines, are not part of this excerpt.
 */
92 static UBool hasCESU8Data(const UConverter *cnv)
94 #if UCONFIG_ONLY_HTML_CONVERSION
97 return (UBool)(cnv->sharedData == &_CESU8Data);
/*
 * Converts UTF-8 (or CESU-8) bytes from args->source into UTF-16 code units
 * at args->target, without producing offsets.
 *
 * What the visible lines show:
 *  - A sequence left unfinished by a previous call is resumed from the
 *    converter fields toUnicodeStatus (accumulated value), mode (bytes
 *    expected) and toULength (bytes already consumed).
 *  - Bytes below 0x80 are copied straight to the target.
 *  - For other lead bytes the expected length comes from bytesFromUTF8[],
 *    trail bytes are accumulated with ch = (ch << 6) + byte, and
 *    offsetsFromUTF8[] is subtracted afterwards.
 *  - If input ends inside a sequence, the partial value and byte count are
 *    stored back into the converter.
 *  - A decoded value is accepted only if all expected bytes were trail
 *    bytes, it is <= 0x10FFFF, it is not an over-long form, and it passes
 *    the UTF-8 / CESU-8 specific surrogate rule.
 *  - Supplementary code points are written as a surrogate pair; if only the
 *    lead surrogate fits, the trail goes to UCharErrorBuffer and *err is
 *    set to U_BUFFER_OVERFLOW_ERROR.
 *  - Invalid sequences set toULength and U_ILLEGAL_CHAR_FOUND.
 *  - args->target and args->source are updated before returning.
 *
 * NOTE(review): this excerpt omits several original lines, including the
 * err parameter, the declarations of i and inBytes, braces, and jump
 * labels, so exact control flow must be checked against the full file.
 */
101 static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
104 UConverter *cnv = args->converter;
105 const unsigned char *mySource = (unsigned char *) args->source;
106 UChar *myTarget = args->target;
107 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
108 const UChar *targetLimit = args->targetLimit;
109 unsigned char *toUBytes = cnv->toUBytes;
110 UBool isCESU8 = hasCESU8Data(cnv);
111 uint32_t ch, ch2 = 0;
114 /* Restore size of current sequence */
115 if (cnv->toUnicodeStatus && myTarget < targetLimit)
117 inBytes = cnv->mode; /* restore # of bytes to consume */
118 i = cnv->toULength; /* restore # of bytes consumed */
121 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
122 cnv->toUnicodeStatus = 0;
/* Main loop: runs while there is both input left and room in the target. */
127 while (mySource < sourceLimit && myTarget < targetLimit)
130 if (ch < 0x80) /* Simple case */
132 *(myTarget++) = (UChar) ch;
136 /* store the first char */
137 toUBytes[0] = (char)ch;
138 inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
/* Collect trail bytes; each raw byte is also recorded in toUBytes so the
 * original bytes of a bad or partial sequence remain available. */
144 if (mySource < sourceLimit)
146 toUBytes[i] = (char) (ch2 = *mySource);
147 if (!U8_IS_TRAIL(ch2))
149 break; /* i < inBytes */
151 ch = (ch << 6) + ch2;
157 /* stores a partially calculated target*/
158 cnv->toUnicodeStatus = ch;
160 cnv->toULength = (int8_t) i;
165 /* Remove the accumulated high bits */
166 ch -= offsetsFromUTF8[inBytes];
169 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
170 * - use only trail bytes after a lead byte (checked above)
171 * - use the right number of trail bytes for a given lead byte
172 * - encode a code point <= U+10ffff
173 * - use the fewest possible number of bytes for their code points
174 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
176 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
177 * There are no irregular sequences any more.
178 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
/* The last clause differs by converter: CESU-8 accepts any sequence of at
 * most 3 bytes (so encoded surrogates pass), while UTF-8 rejects surrogate
 * code points. */
180 if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
181 (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
183 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
184 if (ch <= MAXIMUM_UCS2)
186 /* fits in 16 bits */
187 *(myTarget++) = (UChar) ch;
191 /* write out the surrogates */
193 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
194 ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
195 if (myTarget < targetLimit)
197 *(myTarget++) = (UChar)ch;
201 /* Put in overflow buffer (not handled here) */
202 cnv->UCharErrorBuffer[0] = (UChar) ch;
203 cnv->UCharErrorBufferLength = 1;
204 *err = U_BUFFER_OVERFLOW_ERROR;
/* Invalid sequence: record how many bytes were collected and report it. */
211 cnv->toULength = (int8_t)i;
212 *err = U_ILLEGAL_CHAR_FOUND;
/* Input remains but the target is full and no other error is pending. */
219 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
221 /* End of target buffer */
222 *err = U_BUFFER_OVERFLOW_ERROR;
/* Publish the advanced pointers back to the caller's argument block. */
225 args->target = myTarget;
226 args->source = (const char *) mySource;
/*
 * Offset-producing variant of ucnv_toUnicode_UTF8: the same decoding and
 * validation steps, plus one entry written to args->offsets for every UChar
 * written to args->target.
 *
 * What the visible lines show:
 *  - offsetNum starts at 0; it is stored and then incremented for each
 *    single-byte character.
 *  - Both halves of a surrogate pair receive the same offsetNum value.
 *  - The trail surrogate gets no offset entry when it has to be parked in
 *    UCharErrorBuffer because the target is full.
 *  - args->target, args->source and args->offsets are updated on exit.
 *
 * NOTE(review): this excerpt omits several original lines (the err
 * parameter, declarations of i and inBytes, braces, labels, and the
 * statements that advance offsetNum after multi-byte sequences); check the
 * full file before relying on exact offset arithmetic.
 */
229 static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
232 UConverter *cnv = args->converter;
233 const unsigned char *mySource = (unsigned char *) args->source;
234 UChar *myTarget = args->target;
235 int32_t *myOffsets = args->offsets;
236 int32_t offsetNum = 0;
237 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
238 const UChar *targetLimit = args->targetLimit;
239 unsigned char *toUBytes = cnv->toUBytes;
240 UBool isCESU8 = hasCESU8Data(cnv);
241 uint32_t ch, ch2 = 0;
244 /* Restore size of current sequence */
245 if (cnv->toUnicodeStatus && myTarget < targetLimit)
247 inBytes = cnv->mode; /* restore # of bytes to consume */
248 i = cnv->toULength; /* restore # of bytes consumed */
251 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
252 cnv->toUnicodeStatus = 0;
/* Main loop: runs while there is both input left and room in the target. */
256 while (mySource < sourceLimit && myTarget < targetLimit)
259 if (ch < 0x80) /* Simple case */
261 *(myTarget++) = (UChar) ch;
262 *(myOffsets++) = offsetNum++;
/* Multi-byte sequence: remember the lead byte and look up the length. */
266 toUBytes[0] = (char)ch;
267 inBytes = bytesFromUTF8[ch];
273 if (mySource < sourceLimit)
275 toUBytes[i] = (char) (ch2 = *mySource);
276 if (!U8_IS_TRAIL(ch2))
278 break; /* i < inBytes */
280 ch = (ch << 6) + ch2;
/* Input ended inside a sequence: save the partial value and byte count. */
286 cnv->toUnicodeStatus = ch;
288 cnv->toULength = (int8_t)i;
293 /* Remove the accumulated high bits */
294 ch -= offsetsFromUTF8[inBytes];
297 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
298 * - use only trail bytes after a lead byte (checked above)
299 * - use the right number of trail bytes for a given lead byte
300 * - encode a code point <= U+10ffff
301 * - use the fewest possible number of bytes for their code points
302 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
304 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
305 * There are no irregular sequences any more.
306 * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
/* CESU-8 accepts any sequence of at most 3 bytes; UTF-8 rejects surrogate
 * code points. */
308 if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
309 (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
311 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
312 if (ch <= MAXIMUM_UCS2)
314 /* fits in 16 bits */
315 *(myTarget++) = (UChar) ch;
316 *(myOffsets++) = offsetNum;
320 /* write out the surrogates */
322 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
323 *(myOffsets++) = offsetNum;
324 ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
325 if (myTarget < targetLimit)
327 *(myTarget++) = (UChar)ch;
328 *(myOffsets++) = offsetNum;
/* No room for the trail surrogate: park it in the overflow buffer. */
332 cnv->UCharErrorBuffer[0] = (UChar) ch;
333 cnv->UCharErrorBufferLength = 1;
334 *err = U_BUFFER_OVERFLOW_ERROR;
/* Invalid sequence: record how many bytes were collected and report it. */
341 cnv->toULength = (int8_t)i;
342 *err = U_ILLEGAL_CHAR_FOUND;
349 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
350 { /* End of target buffer */
351 *err = U_BUFFER_OVERFLOW_ERROR;
/* Publish the advanced pointers back to the caller's argument block. */
354 args->target = myTarget;
355 args->source = (const char *) mySource;
356 args->offsets = myOffsets;
/*
 * Converts UTF-16 code units from args->source into UTF-8 (or CESU-8) bytes
 * at args->target, without producing offsets.
 *
 * What the visible lines show:
 *  - A code unit saved in cnv->fromUChar32 by an earlier call is picked up
 *    again and the field is cleared.
 *  - Values below 0x80 produce one byte; values below 0x800 produce two
 *    bytes, with the second byte going to charErrorBuffer and
 *    U_BUFFER_OVERFLOW_ERROR being set if the target is full.
 *  - For larger values, and only when the converter is not CESU-8, a lead
 *    surrogate followed by a trail surrogate is combined into one
 *    supplementary code point. An unpaired surrogate is stored in
 *    cnv->fromUChar32 and reported as U_ILLEGAL_CHAR_FOUND.
 *  - 3- and 4-byte forms are written directly into the target when at least
 *    4 bytes are free, otherwise into tempBuf and then copied byte by byte,
 *    spilling the remainder into charErrorBuffer with
 *    U_BUFFER_OVERFLOW_ERROR.
 *  - args->target and args->source are updated before returning.
 *
 * NOTE(review): this excerpt omits several original lines (the err
 * parameter, declarations of ch, tempPtr and tempBuf, the statement that
 * reads the next code unit, braces, labels, and the assignments to
 * indexToWrite); check the full file for the exact control flow.
 */
359 U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
362 UConverter *cnv = args->converter;
363 const UChar *mySource = args->source;
364 const UChar *sourceLimit = args->sourceLimit;
365 uint8_t *myTarget = (uint8_t *) args->target;
366 const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
370 int32_t indexToWrite;
371 UBool isNotCESU8 = !hasCESU8Data(cnv);
/* Resume with a code unit left pending by a previous call. */
373 if (cnv->fromUChar32 && myTarget < targetLimit)
375 ch = cnv->fromUChar32;
376 cnv->fromUChar32 = 0;
380 while (mySource < sourceLimit && myTarget < targetLimit)
384 if (ch < 0x80) /* Single byte */
386 *(myTarget++) = (uint8_t) ch;
388 else if (ch < 0x800) /* Double byte */
390 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
391 if (myTarget < targetLimit)
393 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
/* Target is full after the lead byte: park the trail byte. */
397 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
398 cnv->charErrorBufferLength = 1;
399 *err = U_BUFFER_OVERFLOW_ERROR;
403 /* Check for surrogates */
404 if(U16_IS_SURROGATE(ch) && isNotCESU8) {
406 if (mySource < sourceLimit) {
407 /* test both code units */
408 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
409 /* convert and consume this supplementary code point */
410 ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
412 /* exit this condition tree */
415 /* this is an unpaired trail or lead code unit */
416 /* callback(illegal) */
417 cnv->fromUChar32 = ch;
418 *err = U_ILLEGAL_CHAR_FOUND;
/* NOTE(review): presumably the no-more-input case, where the surrogate is
 * saved for the next call; the surrounding else/brace lines are missing
 * from this excerpt, so confirm against the full file. */
424 cnv->fromUChar32 = ch;
429 /* Do we write the buffer directly for speed,
430 or do we have to be careful about target buffer space? */
431 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
/* 3-byte form for BMP values; otherwise the 4-byte form. */
433 if (ch <= MAXIMUM_UCS2) {
435 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
439 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
440 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
/* indexToWrite is the index of the last byte of the sequence. */
442 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
443 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
445 if (tempPtr == myTarget) {
446 /* There was enough space to write the codepoint directly. */
447 myTarget += (indexToWrite + 1);
450 /* We might run out of room soon. Write it slowly. */
451 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
452 if (myTarget < targetLimit) {
453 *(myTarget++) = *tempPtr;
/* Bytes that do not fit are appended to the converter's overflow buffer. */
456 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
457 *err = U_BUFFER_OVERFLOW_ERROR;
/* Input remains but the target is full and no other error is pending. */
464 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
466 *err = U_BUFFER_OVERFLOW_ERROR;
469 args->target = (char *) myTarget;
470 args->source = mySource;
/*
 * Offset-producing variant of ucnv_fromUnicode_UTF8: the same encoding
 * steps, plus one entry written to args->offsets for every byte written to
 * args->target.
 *
 * What the visible lines show:
 *  - Every byte of a multi-byte sequence receives the same offsetNum value.
 *  - offsetNum is incremented after 1- and 2-byte characters; for longer
 *    sequences it is set to nextSourceIndex after the bytes are written.
 *  - nextSourceIndex starts as offsetNum + 1.
 *  - Bytes that overflow into charErrorBuffer get no offset entry.
 *  - args->target, args->source and args->offsets are updated on exit.
 *
 * NOTE(review): this excerpt omits several original lines (the err
 * parameter, declarations of ch, tempPtr and tempBuf, the initial value of
 * offsetNum, the statement that reads the next code unit, any further
 * adjustment of nextSourceIndex for surrogate pairs, braces, labels, and
 * the assignments to indexToWrite); check the full file for exact offsets.
 */
473 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
476 UConverter *cnv = args->converter;
477 const UChar *mySource = args->source;
478 int32_t *myOffsets = args->offsets;
479 const UChar *sourceLimit = args->sourceLimit;
480 uint8_t *myTarget = (uint8_t *) args->target;
481 const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
484 int32_t offsetNum, nextSourceIndex;
485 int32_t indexToWrite;
487 UBool isNotCESU8 = !hasCESU8Data(cnv);
/* Resume with a code unit left pending by a previous call. */
489 if (cnv->fromUChar32 && myTarget < targetLimit)
491 ch = cnv->fromUChar32;
492 cnv->fromUChar32 = 0;
500 while (mySource < sourceLimit && myTarget < targetLimit)
504 if (ch < 0x80) /* Single byte */
506 *(myOffsets++) = offsetNum++;
507 *(myTarget++) = (char) ch;
509 else if (ch < 0x800) /* Double byte */
511 *(myOffsets++) = offsetNum;
512 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
513 if (myTarget < targetLimit)
515 *(myOffsets++) = offsetNum++;
516 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
/* Target is full after the lead byte: park the trail byte. */
520 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
521 cnv->charErrorBufferLength = 1;
522 *err = U_BUFFER_OVERFLOW_ERROR;
526 /* Check for surrogates */
528 nextSourceIndex = offsetNum + 1;
530 if(U16_IS_SURROGATE(ch) && isNotCESU8) {
532 if (mySource < sourceLimit) {
533 /* test both code units */
534 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
535 /* convert and consume this supplementary code point */
536 ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
539 /* exit this condition tree */
542 /* this is an unpaired trail or lead code unit */
543 /* callback(illegal) */
544 cnv->fromUChar32 = ch;
545 *err = U_ILLEGAL_CHAR_FOUND;
/* NOTE(review): presumably the no-more-input case, where the surrogate is
 * saved for the next call; the surrounding else/brace lines are missing
 * from this excerpt, so confirm against the full file. */
551 cnv->fromUChar32 = ch;
556 /* Do we write the buffer directly for speed,
557 or do we have to be careful about target buffer space? */
558 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
/* 3-byte form for BMP values; otherwise the 4-byte form. */
560 if (ch <= MAXIMUM_UCS2) {
562 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
566 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
567 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
/* indexToWrite is the index of the last byte of the sequence. */
569 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
570 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
572 if (tempPtr == myTarget) {
573 /* There was enough space to write the codepoint directly. */
574 myTarget += (indexToWrite + 1);
/* Three offsets are always stored, a fourth only for a 4-byte sequence. */
575 myOffsets[0] = offsetNum;
576 myOffsets[1] = offsetNum;
577 myOffsets[2] = offsetNum;
578 if (indexToWrite >= 3) {
579 myOffsets[3] = offsetNum;
581 myOffsets += (indexToWrite + 1);
584 /* We might run out of room soon. Write it slowly. */
585 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
586 if (myTarget < targetLimit)
588 *(myOffsets++) = offsetNum;
589 *(myTarget++) = *tempPtr;
/* Bytes that do not fit are appended to the converter's overflow buffer. */
593 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
594 *err = U_BUFFER_OVERFLOW_ERROR;
598 offsetNum = nextSourceIndex;
/* Input remains but the target is full and no other error is pending. */
602 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
604 *err = U_BUFFER_OVERFLOW_ERROR;
607 args->target = (char *) myTarget;
608 args->source = mySource;
609 args->offsets = myOffsets;
/*
 * Decodes and returns the next single code point from args->source.
 *
 * What the visible lines show:
 *  - With no input available, *err is set to U_INDEX_OUTOFBOUNDS_ERROR.
 *  - One fast path returns the first byte itself as the code point (the
 *    guarding condition is not part of this excerpt).
 *  - The total sequence length is looked up in bytesFromUTF8[]; a length of
 *    0 stores the byte in toUBytes and sets U_ILLEGAL_CHAR_FOUND.
 *  - If the sequence would run past sourceLimit, the available trail bytes
 *    are collected into toUBytes and *err is U_TRUNCATED_CHAR_FOUND, or
 *    U_ILLEGAL_CHAR_FOUND if a non-trail byte is met first.
 *  - Otherwise trail bytes are accumulated in a fall-through switch,
 *    offsetsFromUTF8[] is subtracted, and the result is range-checked.
 *  - When validation fails, the consumed bytes are copied to toUBytes and
 *    *err is set to U_ILLEGAL_CHAR_FOUND.
 *
 * NOTE(review): this excerpt omits several original lines (the err
 * parameter, declarations of cnv, ch and myByte, return statements on the
 * error paths, case labels, shifts inside the switch, part of the final
 * condition, and braces); check the full file for exact behavior.
 */
612 static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
615 const uint8_t *sourceInitial;
616 const uint8_t *source;
/* Despite its name, this holds the total sequence length taken from
 * bytesFromUTF8[], which is 1 for an ASCII byte. */
617 uint16_t extraBytesToWrite;
620 int8_t i, isLegalSequence;
622 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
624 cnv = args->converter;
625 sourceInitial = source = (const uint8_t *)args->source;
626 if (source >= (const uint8_t *)args->sourceLimit)
629 *err = U_INDEX_OUTOFBOUNDS_ERROR;
633 myByte = (uint8_t)*(source++);
636 args->source = (const char *)source;
637 return (UChar32)myByte;
640 extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
641 if (extraBytesToWrite == 0) {
642 cnv->toUBytes[0] = myByte;
644 *err = U_ILLEGAL_CHAR_FOUND;
645 args->source = (const char *)source;
649 /*The byte sequence is longer than the buffer area passed*/
/* source already points past the lead byte, so the sequence needs
 * extraBytesToWrite - 1 more bytes.
 * NOTE(review): when fewer bytes remain, this expression forms a pointer
 * more than one element past the end of the buffer before comparing it,
 * which is undefined behavior in ISO C. Comparing the remaining length,
 * (args->sourceLimit - (const char *)source) < extraBytesToWrite - 1,
 * would avoid that - confirm against the full function before changing. */
650 if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
652 /* check if all of the remaining bytes are trail bytes */
653 cnv->toUBytes[0] = myByte;
655 *err = U_TRUNCATED_CHAR_FOUND;
656 while(source < (const uint8_t *)args->sourceLimit) {
657 if(U8_IS_TRAIL(myByte = *source)) {
658 cnv->toUBytes[i++] = myByte;
661 /* error even before we run out of input */
662 *err = U_ILLEGAL_CHAR_FOUND;
667 args->source = (const char *)source;
/* Enough input is available: accumulate one trail byte per case. */
673 switch(extraBytesToWrite)
675 /* note: code falls through cases! (sic)*/
677 ch += (myByte = *source);
679 if (!U8_IS_TRAIL(myByte))
687 ch += (myByte = *source);
689 if (!U8_IS_TRAIL(myByte))
697 ch += (myByte = *source);
699 if (!U8_IS_TRAIL(myByte))
707 ch += (myByte = *source);
709 if (!U8_IS_TRAIL(myByte))
717 ch += (myByte = *source);
718 if (!U8_IS_TRAIL(myByte))
/* Strip the lead and trail marker bits accumulated above. */
725 ch -= offsetsFromUTF8[extraBytesToWrite];
726 args->source = (const char *)source;
729 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
730 * - use only trail bytes after a lead byte (checked above)
731 * - use the right number of trail bytes for a given lead byte
732 * - encode a code point <= U+10ffff
733 * - use the fewest possible number of bytes for their code points
734 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
736 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
737 * There are no irregular sequences any more.
739 if (isLegalSequence &&
740 (uint32_t)ch <= MAXIMUM_UTF &&
741 (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
744 return ch; /* return the code point */
/* Validation failed: hand the consumed bytes to the error machinery. */
747 for(i = 0; sourceInitial < source; ++i) {
748 cnv->toUBytes[i] = *sourceInitial++;
751 *err = U_ILLEGAL_CHAR_FOUND;
755 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
757 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
/* Used by ucnv_UTF8FromUTF8() to reject over-long 2- and 3-byte forms.
 * The declaration's type line is not part of this excerpt. */
759 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
761 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
/* NOTE(review): the array is declared with 7 elements but has only 5
 * initializers, so entries 5 and 6 are zero. The visible code indexes it
 * only with 2, 3 and 4; a dimension of 5 would match utf8_minLegal. */
763 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
765 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
/*
 * Reads UTF-8 from pToUArgs->source, validates it, and copies legal byte
 * sequences to pFromUArgs->target.
 *
 * What the visible lines show:
 *  - The pending code point, the number of bytes already consumed, and the
 *    number expected are restored from the to-Unicode converter's
 *    toUnicodeStatus, toULength and mode fields (or zeroed).
 *  - count is limited to the smaller of source length (including bytes kept
 *    from a previous buffer) and target capacity, then trimmed so the loop
 *    does not stop in the middle of a byte sequence.
 *  - Common 2- and 3-byte forms are checked inline; everything else goes
 *    through the general path that accumulates trail bytes.
 *  - A sequence cut off by end of input is saved in utf8->toUBytes,
 *    toUnicodeStatus and toULength, and the pointers are written back.
 *  - An illegal sequence is copied to utf8->toUBytes and reported with
 *    U_ILLEGAL_CHAR_FOUND.
 *  - Two situations set U_USING_DEFAULT_WARNING; per the in-code comments
 *    these hand the work to the standard (pivoting) converter.
 *  - If input remains and the target is exactly full,
 *    U_BUFFER_OVERFLOW_ERROR is set.
 *
 * NOTE(review): this excerpt omits many original lines (the return type,
 * several declarations, labels and gotos, loop headers, the byte-copy
 * statements, and braces); statement order here is delicate, so check the
 * full file before changing anything.
 */
767 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
768 UConverterToUnicodeArgs *pToUArgs,
769 UErrorCode *pErrorCode) {
771 const uint8_t *source, *sourceLimit;
773 int32_t targetCapacity;
776 int8_t oldToULength, toULength, toULimit;
781 /* set up the local pointers */
782 utf8=pToUArgs->converter;
783 source=(uint8_t *)pToUArgs->source;
784 sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
785 target=(uint8_t *)pFromUArgs->target;
786 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
788 /* get the converter state from the UTF-8 UConverter */
789 c=(UChar32)utf8->toUnicodeStatus;
791 toULength=oldToULength=utf8->toULength;
792 toULimit=(int8_t)utf8->mode;
794 toULength=oldToULength=toULimit=0;
/* Bytes available for this call, including those kept from the previous
 * buffer. */
797 count=(int32_t)(sourceLimit-source)+oldToULength;
800 * Not enough input to complete the partial character.
801 * Jump to moreBytes below - it will not output to target.
803 } else if(targetCapacity<toULimit) {
805 * Not enough target capacity to output the partial character.
806 * Let the standard converter handle this.
808 *pErrorCode=U_USING_DEFAULT_WARNING;
812 * Use a single counter for source and target, counting the minimum of
813 * the source length and the target capacity.
814 * As a result, the source length is checked only once per multi-byte
815 * character instead of twice.
817 * Make sure that the last byte sequence is complete, or else
818 * stop just before it.
819 * (The longest legal byte sequence has 3 trail bytes.)
820 * Count oldToULength (number of source bytes from a previous buffer)
821 * into the source length but reduce the source index by toULimit
822 * while going back over trail bytes in order to not go back into
823 * the bytes that will be read for finishing a partial
824 * sequence from the previous buffer.
825 * Let the standard converter handle edge cases.
829 if(count>targetCapacity) {
830 count=targetCapacity;
/* Look back over at most 3 bytes at the end of the counted range. */
834 while(i<3 && i<(count-toULimit)) {
835 b=source[count-oldToULength-i-1];
839 if(i<U8_COUNT_TRAIL_BYTES(b)) {
840 /* stop converting before the lead byte if there are not enough trail bytes for it */
849 utf8->toUnicodeStatus=0;
852 /* See note in ucnv_SBCSFromUTF8() about this goto. */
855 /* conversion loop */
/* The bounds on t1 depend on the lead byte b: below 0xed any trail byte is
 * accepted, while for 0xed the first trail byte must be <= 0x9f, which
 * keeps the result at or below U+D7FF (no surrogates). */
865 if( /* handle U+1000..U+D7FF inline */
866 (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
867 (b==0xed && (t1 <= 0x9f))) &&
868 (t2=source[1]) >= 0x80 && t2 <= 0xbf
878 if( /* handle U+0080..U+07FF inline */
880 (t1=*source) >= 0x80 && t1 <= 0xbf
/* The first trail byte must be >= 0xa0 here, which excludes over-long
 * forms of code points below U+0800. */
889 if( /* handle U+0800..U+0FFF inline */
890 (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
891 (t2=source[1]) >= 0x80 && t2 <= 0xbf
902 /* handle "complicated" and error cases, and continuing partial characters */
905 toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
908 while(toULength<toULimit) {
909 if(source<sourceLimit) {
916 break; /* sequence too short, stop with toULength<toULimit */
919 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
920 source-=(toULength-oldToULength);
921 while(oldToULength<toULength) {
922 utf8->toUBytes[oldToULength++]=*source++;
924 utf8->toUnicodeStatus=c;
925 utf8->toULength=toULength;
927 pToUArgs->source=(char *)source;
928 pFromUArgs->target=(char *)target;
/* Validation: c is adjusted in place by subtracting utf8_offsets[] inside
 * the condition, so evaluation order matters. */
933 if( toULength==toULimit && /* consumed all trail bytes */
934 (toULength==3 || toULength==2) && /* BMP */
935 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
936 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */
938 /* legal byte sequence for BMP code point */
940 toULength==toULimit && toULength==4 &&
941 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
943 /* legal byte sequence for supplementary code point */
945 /* error handling: illegal UTF-8 byte sequence */
/* Rewind to the first byte read in this call, then append the bytes to
 * those already held in toUBytes. */
946 source-=(toULength-oldToULength);
947 while(oldToULength<toULength) {
948 utf8->toUBytes[oldToULength++]=*source++;
950 utf8->toULength=toULength;
951 pToUArgs->source=(char *)source;
952 pFromUArgs->target=(char *)target;
953 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
957 /* copy the legal byte sequence to the target */
/* First the bytes kept from the previous buffer, then those read now. */
961 for(i=0; i<oldToULength; ++i) {
962 *target++=utf8->toUBytes[i];
964 source-=(toULength-oldToULength);
965 for(; i<toULength; ++i) {
/* Tail handling: input remains after the counted region was processed. */
973 if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
974 if(target==(const uint8_t *)pFromUArgs->targetLimit) {
975 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
978 toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
979 if(toULimit>(sourceLimit-source)) {
980 /* collect a truncated byte sequence */
984 utf8->toUBytes[toULength++]=b;
985 if(++source==sourceLimit) {
986 /* partial byte sequence at end of source */
987 utf8->toUnicodeStatus=c;
988 utf8->toULength=toULength;
991 } else if(!U8_IS_TRAIL(b=*source)) {
992 /* lead byte in trail byte position */
993 utf8->toULength=toULength;
994 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1000 /* partial-sequence target overflow: fall back to the pivoting implementation */
1001 *pErrorCode=U_USING_DEFAULT_WARNING;
1006 /* write back the updated pointers */
1007 pToUArgs->source=(char *)source;
1008 pFromUArgs->target=(char *)target;
1011 /* UTF-8 converter data ----------------------------------------------------- */
/*
 * Function table for the UTF-8 converter. The visible entries are the two
 * to-Unicode functions, the two from-Unicode functions, the single
 * code-point reader, and ucnv_getNonSurrogateUnicodeSet. Other entries of
 * the initializer are not part of this excerpt.
 */
1013 static const UConverterImpl _UTF8Impl={
1023 ucnv_toUnicode_UTF8,
1024 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1025 ucnv_fromUnicode_UTF8,
1026 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1027 ucnv_getNextUChar_UTF8,
1033 ucnv_getNonSurrogateUnicodeSet,
1039 /* The 1208 CCSID refers to any version of Unicode of UTF-8 */
/*
 * Static description of the UTF-8 converter.
 * NOTE(review): field names are not visible in this excerpt. The bytes
 * EF BF BD followed by 3 are presumably the substitution sequence (UTF-8
 * for U+FFFD) and its length - confirm against UConverterStaticData.
 */
1040 static const UConverterStaticData _UTF8StaticData={
1041 sizeof(UConverterStaticData),
1043 1208, UCNV_IBM, UCNV_UTF8,
1044 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
1045 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1048 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Shared-data object tying the static description to the function table. */
1052 const UConverterSharedData _UTF8Data=
1053 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
1055 /* CESU-8 converter data ---------------------------------------------------- */
/*
 * Function table for the CESU-8 converter. It reuses the four UTF-8
 * conversion functions, which branch on hasCESU8Data(). Unlike the UTF-8
 * table, the visible entries do not include ucnv_getNextUChar_UTF8, and the
 * set function is ucnv_getCompleteUnicodeSet. Other entries of the
 * initializer are not part of this excerpt.
 */
1057 static const UConverterImpl _CESU8Impl={
1067 ucnv_toUnicode_UTF8,
1068 ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1069 ucnv_fromUnicode_UTF8,
1070 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1077 ucnv_getCompleteUnicodeSet,
/*
 * Static description of the CESU-8 converter.
 * NOTE(review): field names are not visible in this excerpt. The bytes
 * EF BF BD followed by 3 are presumably the substitution sequence and its
 * length - confirm against UConverterStaticData.
 */
1083 static const UConverterStaticData _CESU8StaticData={
1084 sizeof(UConverterStaticData),
1086 9400, /* CCSID for CESU-8 */
1087 UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
1088 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1091 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Shared-data object whose address hasCESU8Data() compares against. */
1095 const UConverterSharedData _CESU8Data=
1096 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);