1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ******************************************************************************
6 * Copyright (C) 2001-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 ******************************************************************************
13 * Modification History:
15 * Date Name Description
16 * 9/10/2001 Ram Creation.
17 ******************************************************************************
20 /*******************************************************************************
22 * u_strTo* and u_strFrom* APIs
23 * WCS functions moved to ustr_wcs.c for better modularization
25 *******************************************************************************
29 #include "unicode/putil.h"
30 #include "unicode/ustring.h"
31 #include "unicode/utf.h"
32 #include "unicode/utf8.h"
33 #include "unicode/utf16.h"
// Converts the UTF-32 string in src to UTF-16 in dest.
// - BMP code points outside the surrogate range become one UChar,
//   0x10000..0x10ffff become a lead/trail surrogate pair.
// - Anything else (a surrogate value or a value outside Unicode) is replaced
//   by subchar; if subchar is negative such input sets U_INVALID_CHAR_FOUND.
// - The resulting length is stored through pDestLength, the substitution count
//   through pNumSubstitutions (when that pointer is not NULL), and the output
//   is terminated with u_terminateUChars().
39 U_CAPI UChar* U_EXPORT2
40 u_strFromUTF32WithSub(UChar *dest,
45 UChar32 subchar, int32_t *pNumSubstitutions,
46 UErrorCode *pErrorCode) {
47 const UChar32 *srcLimit;
52 int32_t numSubstitutions;
// Do nothing when the caller already carries a failure code.
55 if(U_FAILURE(*pErrorCode)){
// Argument checks: src may be NULL only for an empty string, srcLength may not
// be below -1, dest may be NULL only with zero capacity, and subchar must not
// exceed 0x10ffff or be a surrogate (a negative subchar passes these tests).
58 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
59 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
60 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
62 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
// Report zero substitutions up front so the output is defined on early exits.
66 if(pNumSubstitutions != NULL) {
67 *pNumSubstitutions = 0;
// One past the end of the output buffer, or NULL when there is no buffer.
71 destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
76 /* simple loop for conversion of a NUL-terminated BMP string */
// Runs until the terminating 0 or the first code point that is not a plain
// BMP character (surrogate value or anything above 0xffff).
77 while((ch=*src) != 0 &&
78 ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
80 if(pDest < destLimit) {
88 /* "complicated" case, find the end of the remaining string */
89 while(*++srcLimit != 0) {}
// Explicit-length input: the limit is src + srcLength (NULL for a NULL src).
92 srcLimit = (src!=NULL)?(src + srcLength):NULL;
95 /* convert with length */
96 while(src < srcLimit) {
99 /* usually "loops" once; twice only for writing subchar */
100 if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
101 if(pDest < destLimit) {
102 *pDest++ = (UChar)ch;
107 } else if(0x10000 <= ch && ch <= 0x10ffff) {
// Both surrogates are stored only when the whole pair fits into dest.
108 if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
109 *pDest++ = U16_LEAD(ch);
110 *pDest++ = U16_TRAIL(ch);
// Invalid input: continue with subchar in place of ch; a negative subchar
// means the caller asked for an error instead of a substitution.
115 } else if((ch = subchar) < 0) {
116 /* surrogate code point, or not a Unicode code point at all */
117 *pErrorCode = U_INVALID_CHAR_FOUND;
// Add the number of UChars actually written to the running length.
125 reqLength += (int32_t)(pDest - dest);
127 *pDestLength = reqLength;
129 if(pNumSubstitutions != NULL) {
130 *pNumSubstitutions = numSubstitutions;
133 /* Terminate the buffer */
134 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
// Thin wrapper: forwards dest, destCapacity and pDestLength (plus further
// arguments) to u_strFromUTF32WithSub() and returns that function's result.
// NOTE(review): the substitution arguments passed along are not visible in
// this span - confirm them against the full call.
139 U_CAPI UChar* U_EXPORT2
140 u_strFromUTF32(UChar *dest,
141 int32_t destCapacity,
142 int32_t *pDestLength,
145 UErrorCode *pErrorCode) {
146 return u_strFromUTF32WithSub(
147 dest, destCapacity, pDestLength,
// Converts the UTF-16 string in src to UTF-32 in dest.
// - Non-surrogate code units are passed through unchanged.
// - A lead surrogate directly followed by a trail surrogate is combined into
//   one supplementary code point.
// - Any other surrogate is replaced by subchar; if subchar is negative it sets
//   U_INVALID_CHAR_FOUND.
// - The resulting length is stored through pDestLength, the substitution count
//   through pNumSubstitutions (when that pointer is not NULL), and the output
//   is terminated with u_terminateUChar32s().
153 U_CAPI UChar32* U_EXPORT2
154 u_strToUTF32WithSub(UChar32 *dest,
155 int32_t destCapacity,
156 int32_t *pDestLength,
159 UChar32 subchar, int32_t *pNumSubstitutions,
160 UErrorCode *pErrorCode) {
161 const UChar *srcLimit;
167 int32_t numSubstitutions;
// Do nothing when the caller already carries a failure code.
170 if(U_FAILURE(*pErrorCode)){
// Same argument rules as the other converters in this file: NULL src only for
// an empty string, srcLength >= -1, NULL dest only with zero capacity, and a
// subchar that is neither above 0x10ffff nor a surrogate.
173 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
174 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
175 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
177 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
181 if(pNumSubstitutions != NULL) {
182 *pNumSubstitutions = 0;
// One past the end of the output buffer, or NULL when there is no buffer.
186 destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
188 numSubstitutions = 0;
191 /* simple loop for conversion of a NUL-terminated BMP string */
// Stops at the terminating 0 or at the first surrogate code unit.
192 while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
194 if(pDest < destLimit) {
202 /* "complicated" case, find the end of the remaining string */
203 while(*++srcLimit != 0) {}
// Explicit-length input: the limit is src + srcLength (NULL for a NULL src).
206 srcLimit = (src!=NULL)?(src + srcLength):NULL;
209 /* convert with length */
210 while(src < srcLimit) {
212 if(!U16_IS_SURROGATE(ch)) {
213 /* write or count ch below */
// The src < srcLimit test keeps the look-ahead for the trail surrogate inside
// the input.
214 } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
216 ch = U16_GET_SUPPLEMENTARY(ch, ch2);
// Unpaired surrogate: continue with subchar, or fail when subchar is negative.
217 } else if((ch = subchar) < 0) {
218 /* unpaired surrogate */
219 *pErrorCode = U_INVALID_CHAR_FOUND;
224 if(pDest < destLimit) {
// Add the number of UChar32 values actually written to the running length.
231 reqLength += (int32_t)(pDest - dest);
233 *pDestLength = reqLength;
235 if(pNumSubstitutions != NULL) {
236 *pNumSubstitutions = numSubstitutions;
239 /* Terminate the buffer */
240 u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
// Thin wrapper: forwards dest, destCapacity and pDestLength (plus further
// arguments) to u_strToUTF32WithSub() and returns that function's result.
// NOTE(review): the substitution arguments passed along are not visible in
// this span - confirm them against the full call.
245 U_CAPI UChar32* U_EXPORT2
246 u_strToUTF32(UChar32 *dest,
247 int32_t destCapacity,
248 int32_t *pDestLength,
251 UErrorCode *pErrorCode) {
252 return u_strToUTF32WithSub(
253 dest, destCapacity, pDestLength,
// Lookup table indexed by the number of trail bytes (0..3). The decoders below
// compare the decoded value against the entry for its trail-byte count and
// treat a smaller value as illegal.
259 /* for utf8_nextCharSafeBodyTerminated() */
261 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
264 * Version of utf8_nextCharSafeBody() with the following differences:
265 * - checks for NUL termination instead of length
266 * - works with pointers instead of indexes
267 * - always strict (strict==-1)
269 * *ps points to after the lead byte and will be moved to after the last trail byte.
270 * c is the lead byte.
271 * @return the code point, or U_SENTINEL
274 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
// Work on a local copy of the read pointer.
275 const uint8_t *s=*ps;
276 uint8_t trail, illegal=0;
// Number of trail bytes announced by the lead byte.
277 uint8_t count=U8_COUNT_TRAIL_BYTES(c);
279 U8_MASK_LEAD_BYTE((c), count);
280 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
282 /* each branch falls through to the next one */
285 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
// Subtracting 0x80 maps a trail byte 0x80..0xbf to 0..0x3f; every other byte,
// including a terminating NUL (which becomes 0x80), ends up above 0x3f.
289 trail=(uint8_t)(*s++ - 0x80);
291 if(trail>0x3f || c>=0x110) {
292 /* not a trail byte, or code point>0x10ffff (outside Unicode) */
298 trail=(uint8_t)(*s++ - 0x80);
300 /* not a trail byte */
307 trail=(uint8_t)(*s++ - 0x80);
309 /* not a trail byte */
316 /* no default branch to optimize switch() - all values are covered */
319 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
320 /* illegal is also set if count>=4 */
// Reject malformed sequences, values below the minimum for their trail-byte
// count, and surrogate code points.
321 if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
323 /* don't go beyond this sequence */
325 while(count>0 && U8_IS_TRAIL(*s)) {
336 * Version of utf8_nextCharSafeBody() with the following differences:
337 * - works with pointers instead of indexes
338 * - always strict (strict==-1)
340 * *ps points to after the lead byte and will be moved to after the last trail byte.
341 * c is the lead byte.
342 * @return the code point, or U_SENTINEL
345 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
// Work on a local copy of the read pointer.
346 const uint8_t *s=*ps;
347 uint8_t trail, illegal=0;
// Number of trail bytes announced by the lead byte.
348 uint8_t count=U8_COUNT_TRAIL_BYTES(c);
// Decode only when all announced trail bytes lie before limit.
349 if((limit-s)>=count) {
350 U8_MASK_LEAD_BYTE((c), count);
351 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
353 /* each branch falls through to the next one */
356 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
// Append the low six bits of the trail byte to the code point.
361 c=(c<<6)|(trail&0x3f);
// (trail&0xc0)^0x80 is zero only when the top two bits are 10, so any byte
// that is not a trail byte makes illegal non-zero.
363 illegal|=(trail&0xc0)^0x80;
365 /* code point>0x10ffff, outside Unicode */
372 c=(c<<6)|(trail&0x3f);
373 illegal|=(trail&0xc0)^0x80;
377 c=(c<<6)|(trail&0x3f);
378 illegal|=(trail&0xc0)^0x80;
382 /* no default branch to optimize switch() - all values are covered */
385 illegal=1; /* too few bytes left */
388 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
389 /* illegal is also set if count>=4 */
// A sequence that is not already flagged must index inside utf8_minLegal.
390 U_ASSERT(illegal || count<UPRV_LENGTHOF(utf8_minLegal));
// Reject malformed sequences, values below the minimum for their trail-byte
// count, and surrogate code points.
391 if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
393 /* don't go beyond this sequence */
// Unlike the NUL-terminated variant, this loop also stops at limit.
395 while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
// Converts UTF-8 text in src to UTF-16 in dest.
// - Common one-, two- and three-byte sequences are decoded inline; everything
//   else goes through utf8_nextCharSafeBodyTerminated() (NUL-terminated
//   input) or utf8_nextCharSafeBodyPointer() (explicit srcLength).
// - A negative helper result marks an ill-formed sequence: it is counted in
//   numSubstitutions and replaced by subchar; with a negative subchar it sets
//   U_INVALID_CHAR_FOUND instead.
// - Each input mode first writes while there is room in dest and then only
//   counts ("pre-flights") the rest.
// - The resulting length is stored through pDestLength, the substitution count
//   through pNumSubstitutions (when that pointer is not NULL), and dest is
//   terminated with u_terminateUChars().
405 U_CAPI UChar* U_EXPORT2
406 u_strFromUTF8WithSub(UChar *dest,
407 int32_t destCapacity,
408 int32_t *pDestLength,
411 UChar32 subchar, int32_t *pNumSubstitutions,
412 UErrorCode *pErrorCode){
// NOTE(review): the limit is computed before the arguments are validated and
// without the NULL guard that u_strFromUTF8Lenient() uses for the same
// expression - confirm that pointer arithmetic on a NULL dest is acceptable.
414 UChar *pDestLimit = dest+destCapacity;
416 int32_t reqLength = 0;
417 const uint8_t* pSrc = (const uint8_t*) src;
418 uint8_t t1, t2; /* trail bytes */
419 int32_t numSubstitutions;
// Unlike the UTF-32 converters above, a NULL pErrorCode is tolerated here.
422 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
// Argument checks: NULL src only for an empty string, srcLength >= -1, NULL
// dest only with zero capacity, subchar neither above 0x10ffff nor a surrogate.
426 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
427 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
428 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
430 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
434 if(pNumSubstitutions!=NULL) {
435 *pNumSubstitutions=0;
440 * Inline processing of UTF-8 byte sequences:
442 * Byte sequences for the most common characters are handled inline in
443 * the conversion loops. In order to reduce the path lengths for those
444 * characters, the tests are arranged in a kind of binary search.
445 * ASCII (<=0x7f) is checked first, followed by the dividing point
446 * between 2- and 3-byte sequences (0xe0).
447 * The 3-byte branch is tested first to speed up CJK text.
448 * The compiler should combine the subtractions for the two tests for 0xe0.
449 * Each branch then tests for the other end of its range.
454 * Transform a NUL-terminated string.
455 * The code explicitly checks for NULs only in the lead byte position.
456 * A NUL byte in the trail byte position fails the trail byte range check anyway.
// Write loop for NUL-terminated input: ends at the NUL or when dest is full.
458 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
464 if( /* handle U+1000..U+CFFF inline */
// byte - 0x80 <= 0x3f holds exactly for trail bytes 0x80..0xbf.
466 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
467 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
469 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
470 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
474 } else if(ch < 0xe0) {
475 if( /* handle U+0080..U+07FF inline */
477 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
479 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
485 /* function call for "complicated" and error cases */
486 ++pSrc; /* continue after the lead byte */
487 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
// Negative result: count a substitution and continue with subchar; a negative
// subchar turns this into an error.
488 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
489 *pErrorCode = U_INVALID_CHAR_FOUND;
491 } else if(ch<=0xFFFF) {
492 *(pDest++)=(UChar)ch;
494 *(pDest++)=U16_LEAD(ch);
// The trail surrogate is stored only if there is still room in dest.
495 if(pDest<pDestLimit) {
496 *(pDest++)=U16_TRAIL(ch);
505 /* Pre-flight the rest of the string. */
506 while((ch = *pSrc) != 0) {
512 if( /* handle U+1000..U+CFFF inline */
514 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
515 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
521 } else if(ch < 0xe0) {
522 if( /* handle U+0080..U+07FF inline */
524 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
532 /* function call for "complicated" and error cases */
533 ++pSrc; /* continue after the lead byte */
534 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
535 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
536 *pErrorCode = U_INVALID_CHAR_FOUND;
// Count one or two UChars for this code point without writing anything.
539 reqLength += U16_LENGTH(ch);
542 } else /* srcLength >= 0 */ {
543 const uint8_t *pSrcLimit = pSrc + srcLength;
546 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
549 * Each iteration of the inner loop progresses by at most 3 UTF-8
550 * bytes and one UChar, for most characters.
551 * For supplementary code points (4 & 2), which are rare,
552 * there is an additional adjustment.
// Iteration budget for the unchecked loop; srcLength is reused as scratch for
// a third of the remaining source bytes.
554 count = (int32_t)(pDestLimit - pDest);
555 srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
556 if(count > srcLength) {
557 count = srcLength; /* min(remaining dest, remaining src/3) */
561 * Too much overhead if we get near the end of the string,
562 * continue with the next loop.
574 if( /* handle U+1000..U+CFFF inline */
576 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
577 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
579 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
580 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
584 } else if(ch < 0xe0) {
585 if( /* handle U+0080..U+07FF inline */
587 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
589 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
// Lead bytes from 0xf0 up, or a supplementary subchar, can produce two UChars.
595 if(ch >= 0xf0 || subchar > 0xffff) {
597 * We may read up to six bytes and write up to two UChars,
598 * which we didn't account for with computing count,
599 * so we adjust it here.
606 /* function call for "complicated" and error cases */
607 ++pSrc; /* continue after the lead byte */
608 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
609 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
610 *pErrorCode = U_INVALID_CHAR_FOUND;
612 }else if(ch<=0xFFFF){
613 *(pDest++)=(UChar)ch;
// No capacity test before the trail surrogate in this loop.
615 *(pDest++)=U16_LEAD(ch);
616 *(pDest++)=U16_TRAIL(ch);
619 } while(--count > 0);
// Checked loop for the remainder: both limits are tested on every iteration.
622 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
629 if( /* handle U+1000..U+CFFF inline */
// Make sure the trail bytes are inside the source before reading them.
631 ((pSrcLimit - pSrc) >= 3) &&
632 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
633 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
635 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
636 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
640 } else if(ch < 0xe0) {
641 if( /* handle U+0080..U+07FF inline */
643 ((pSrcLimit - pSrc) >= 2) &&
644 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
646 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
652 /* function call for "complicated" and error cases */
653 ++pSrc; /* continue after the lead byte */
654 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
655 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
656 *pErrorCode = U_INVALID_CHAR_FOUND;
658 }else if(ch<=0xFFFF){
659 *(pDest++)=(UChar)ch;
661 *(pDest++)=U16_LEAD(ch);
// The trail surrogate is stored only if there is still room in dest.
662 if(pDest<pDestLimit){
663 *(pDest++)=U16_TRAIL(ch);
671 /* do not fill the dest buffer just count the UChars needed */
672 while(pSrc < pSrcLimit){
679 if( /* handle U+1000..U+CFFF inline */
681 ((pSrcLimit - pSrc) >= 3) &&
682 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
683 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
689 } else if(ch < 0xe0) {
690 if( /* handle U+0080..U+07FF inline */
692 ((pSrcLimit - pSrc) >= 2) &&
693 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
701 /* function call for "complicated" and error cases */
702 ++pSrc; /* continue after the lead byte */
703 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
704 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
705 *pErrorCode = U_INVALID_CHAR_FOUND;
708 reqLength+=U16_LENGTH(ch);
// Add the number of UChars actually written to the pre-flighted count.
713 reqLength+=(int32_t)(pDest - dest);
715 if(pNumSubstitutions!=NULL) {
716 *pNumSubstitutions=numSubstitutions;
720 *pDestLength = reqLength;
723 /* Terminate the buffer */
724 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
// Thin wrapper: forwards dest, destCapacity and pDestLength (plus further
// arguments) to u_strFromUTF8WithSub() and returns that function's result.
// NOTE(review): the substitution arguments passed along are not visible in
// this span - confirm them against the full call.
729 U_CAPI UChar* U_EXPORT2
730 u_strFromUTF8(UChar *dest,
731 int32_t destCapacity,
732 int32_t *pDestLength,
735 UErrorCode *pErrorCode){
736 return u_strFromUTF8WithSub(
737 dest, destCapacity, pDestLength,
// Converts UTF-8 in src to UTF-16 in dest without validating the input.
// - Trail bytes are combined arithmetically; for NUL-terminated input they are
//   only checked for not being NUL, for explicit-length input only for lying
//   inside the source.
// - A trail byte in lead position is handled like a single-byte sequence.
// - With an explicit srcLength the function requires
//   destCapacity >= srcLength and otherwise sets U_BUFFER_OVERFLOW_ERROR,
//   reporting srcLength as the (over-estimated) length.
// - The number of UChars is stored through pDestLength and dest is terminated
//   with u_terminateUChars().
743 U_CAPI UChar * U_EXPORT2
744 u_strFromUTF8Lenient(UChar *dest,
745 int32_t destCapacity,
746 int32_t *pDestLength,
749 UErrorCode *pErrorCode) {
752 int32_t reqLength = 0;
753 uint8_t* pSrc = (uint8_t*) src;
// A NULL pErrorCode is tolerated; an existing failure code stops the call.
756 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
// Argument checks: NULL src only for an empty string, srcLength >= -1, NULL
// dest only with zero capacity. There is no subchar in this function.
760 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
761 (destCapacity<0) || (dest == NULL && destCapacity > 0)
763 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
768 /* Transform a NUL-terminated string. */
// One past the end of the output buffer, or NULL when there is no buffer.
769 UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
770 uint8_t t1, t2, t3; /* trail bytes */
// Write loop: ends at the NUL or when dest is full.
772 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
775 * ASCII, or a trail byte in lead position which is treated like
776 * a single-byte sequence for better character boundary
777 * resynchronization after illegal sequences.
782 } else if(ch < 0xe0) { /* U+0080..U+07FF */
// The only test on the trail byte is that it is not the terminator.
783 if((t1 = pSrc[1]) != 0) {
784 /* 0x3080 = (0xc0 << 6) + 0x80 */
785 *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
789 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
790 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
791 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
792 /* 0x2080 = (0x80 << 6) + 0x80 */
793 *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
797 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
798 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
800 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
801 ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
802 *(pDest++) = U16_LEAD(ch);
// The trail surrogate is stored only if there is still room in dest.
803 if(pDest < pDestLimit) {
804 *(pDest++) = U16_TRAIL(ch);
813 /* truncated character at the end */
// Advance to the terminating NUL.
815 while(*++pSrc != 0) {}
819 /* Pre-flight the rest of the string. */
820 while((ch = *pSrc) != 0) {
823 * ASCII, or a trail byte in lead position which is treated like
824 * a single-byte sequence for better character boundary
825 * resynchronization after illegal sequences.
830 } else if(ch < 0xe0) { /* U+0080..U+07FF */
836 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
837 if(pSrc[1] != 0 && pSrc[2] != 0) {
842 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
843 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
850 /* truncated character at the end */
854 } else /* srcLength >= 0 */ {
855 const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
858 * This function requires that if srcLength is given, then it must be
859 * destCapatity >= srcLength so that we need not check for
860 * destination buffer overflow in the loop.
// Enforce destCapacity >= srcLength up front; the loops below write to dest
// without any capacity test.
862 if(destCapacity < srcLength) {
863 if(pDestLength != NULL) {
864 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
866 *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
// With at least four bytes left, run a loop that needs no per-byte limit test.
870 if((pSrcLimit - pSrc) >= 4) {
871 pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
873 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
878 * ASCII, or a trail byte in lead position which is treated like
879 * a single-byte sequence for better character boundary
880 * resynchronization after illegal sequences.
883 } else if(ch < 0xe0) { /* U+0080..U+07FF */
884 /* 0x3080 = (0xc0 << 6) + 0x80 */
885 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
886 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
887 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
888 /* 0x2080 = (0x80 << 6) + 0x80 */
889 ch = (ch << 12) + (*pSrc++ << 6);
890 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
891 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
892 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
893 ch = (ch << 18) + (*pSrc++ << 12);
895 ch += *pSrc++ - 0x3c82080;
896 *(pDest++) = U16_LEAD(ch);
897 *(pDest++) = U16_TRAIL(ch);
899 } while(pSrc < pSrcLimit);
901 pSrcLimit += 3; /* restore original pSrcLimit */
// Tail loop: fewer than four bytes may remain, so each branch checks how many
// source bytes are left before reading trail bytes.
904 while(pSrc < pSrcLimit) {
908 * ASCII, or a trail byte in lead position which is treated like
909 * a single-byte sequence for better character boundary
910 * resynchronization after illegal sequences.
914 } else if(ch < 0xe0) { /* U+0080..U+07FF */
915 if(pSrc < pSrcLimit) {
916 /* 0x3080 = (0xc0 << 6) + 0x80 */
917 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
920 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
921 if((pSrcLimit - pSrc) >= 2) {
922 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
923 /* 0x2080 = (0x80 << 6) + 0x80 */
924 ch = (ch << 12) + (*pSrc++ << 6);
925 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
929 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
930 if((pSrcLimit - pSrc) >= 3) {
931 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
932 ch = (ch << 18) + (*pSrc++ << 12);
934 ch += *pSrc++ - 0x3c82080;
935 *(pDest++) = U16_LEAD(ch);
936 *(pDest++) = U16_TRAIL(ch);
942 /* truncated character at the end */
// Add the number of UChars actually written to the running length.
948 reqLength+=(int32_t)(pDest - dest);
951 *pDestLength = reqLength;
954 /* Terminate the buffer */
955 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
// Writes the UTF-8 encoding of c at pDest, advancing pDest by one byte per
// byte written. Callers in this file assign the result back to their write
// pointer. The helper takes no buffer limit; its callers compare the
// remaining space against U8_LENGTH(c) before calling.
960 static inline uint8_t *
961 _appendUTF8(uint8_t *pDest, UChar32 c) {
962 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
// Two bytes: 110xxxxx 10xxxxxx.
965 } else if(c<=0x7ff) {
966 *pDest++=(uint8_t)((c>>6)|0xc0);
967 *pDest++=(uint8_t)((c&0x3f)|0x80);
// Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
968 } else if(c<=0xffff) {
969 *pDest++=(uint8_t)((c>>12)|0xe0);
970 *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
971 *pDest++=(uint8_t)(((c)&0x3f)|0x80);
// Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx; the range is not re-checked.
972 } else /* if((uint32_t)(c)<=0x10ffff) */ {
973 *pDest++=(uint8_t)(((c)>>18)|0xf0);
974 *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
975 *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
976 *pDest++=(uint8_t)(((c)&0x3f)|0x80);
// Converts UTF-16 in pSrc to UTF-8 in dest.
// - Code units up to 0x7ff and non-surrogate BMP units are encoded inline as
//   one, two or three bytes.
// - A lead surrogate followed by a trail surrogate is combined into one
//   supplementary code point and written as four bytes.
// - Any other surrogate is replaced by subchar when subchar>=0 and otherwise
//   sets U_INVALID_CHAR_FOUND.
// - Each input mode first writes while the bytes fit into dest and then only
//   counts the bytes that remain.
// - The byte count is stored through pDestLength, the substitution count
//   through pNumSubstitutions (when that pointer is not NULL), and dest is
//   terminated with u_terminateChars().
982 U_CAPI char* U_EXPORT2
983 u_strToUTF8WithSub(char *dest,
984 int32_t destCapacity,
985 int32_t *pDestLength,
988 UChar32 subchar, int32_t *pNumSubstitutions,
989 UErrorCode *pErrorCode){
992 uint8_t *pDest = (uint8_t *)dest;
// One past the end of the output buffer, or NULL when there is no buffer.
993 uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
994 int32_t numSubstitutions;
// A NULL pErrorCode is tolerated; an existing failure code stops the call.
997 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
// Argument checks: NULL source only for an empty string, srcLength >= -1, NULL
// dest only with zero capacity, subchar neither above 0x10ffff nor a surrogate.
1001 if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
1002 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
1003 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1005 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1009 if(pNumSubstitutions!=NULL) {
1010 *pNumSubstitutions=0;
// NUL-terminated input, write loop: every branch checks that the complete
// byte sequence fits before storing it.
1015 while((ch=*pSrc)!=0) {
1018 if(pDest<pDestLimit) {
1019 *pDest++ = (uint8_t)ch;
1024 } else if(ch <= 0x7ff) {
1025 if((pDestLimit - pDest) >= 2) {
1026 *pDest++=(uint8_t)((ch>>6)|0xc0);
1027 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1032 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1033 if((pDestLimit - pDest) >= 3) {
1034 *pDest++=(uint8_t)((ch>>12)|0xe0);
1035 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1036 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1041 } else /* ch is a surrogate */ {
1044 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
1045 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1047 ch=U16_GET_SUPPLEMENTARY(ch, ch2);
// Unpaired surrogate: substitute when a subchar was supplied.
1048 } else if(subchar>=0) {
1052 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1053 *pErrorCode = U_INVALID_CHAR_FOUND;
// Write the (possibly substituted) code point only if all its bytes fit.
1057 length = U8_LENGTH(ch);
1058 if((pDestLimit - pDest) >= length) {
1059 /* convert and append*/
1060 pDest=_appendUTF8(pDest, ch);
// NUL-terminated input, counting loop for whatever was not written above.
1067 while((ch=*pSrc++)!=0) {
1070 } else if(ch<=0x7ff) {
1072 } else if(!U16_IS_SURROGATE(ch)) {
1074 } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1077 } else if(subchar>=0) {
1078 reqLength+=U8_LENGTH(subchar);
1081 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1082 *pErrorCode = U_INVALID_CHAR_FOUND;
// Explicit-length input: the limit is pSrc + srcLength (NULL for NULL input).
1087 const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
1090 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1093 * Each iteration of the inner loop progresses by at most 3 UTF-8
1094 * bytes and one UChar, for most characters.
1095 * For supplementary code points (4 & 2), which are rare,
1096 * there is an additional adjustment.
// Iteration budget for the unchecked loop; srcLength is reused as scratch for
// the number of remaining source units.
1098 count = (int32_t)((pDestLimit - pDest) / 3);
1099 srcLength = (int32_t)(pSrcLimit - pSrc);
1100 if(count > srcLength) {
1101 count = srcLength; /* min(remaining dest/3, remaining src) */
1105 * Too much overhead if we get near the end of the string,
1106 * continue with the next loop.
1113 *pDest++ = (uint8_t)ch;
1114 } else if(ch <= 0x7ff) {
1115 *pDest++=(uint8_t)((ch>>6)|0xc0);
1116 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1117 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1118 *pDest++=(uint8_t)((ch>>12)|0xe0);
1119 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1120 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1121 } else /* ch is a surrogate */ {
1123 * We will read two UChars and probably output four bytes,
1124 * which we didn't account for with computing count,
1125 * so we adjust it here.
// Put the lead surrogate back and leave the unchecked loop so the budget can
// be recomputed.
1128 --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
1129 break; /* recompute count */
1132 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1134 ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1136 /* writing 4 bytes per 2 UChars is ok */
1137 *pDest++=(uint8_t)((ch>>18)|0xf0);
1138 *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
1139 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1140 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1142 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1147 *pErrorCode = U_INVALID_CHAR_FOUND;
1151 /* convert and append*/
1152 pDest=_appendUTF8(pDest, ch);
1155 } while(--count > 0);
// Checked loop for the remainder: each branch tests the space left in dest.
1158 while(pSrc<pSrcLimit) {
1161 if(pDest<pDestLimit) {
1162 *pDest++ = (uint8_t)ch;
1167 } else if(ch <= 0x7ff) {
1168 if((pDestLimit - pDest) >= 2) {
1169 *pDest++=(uint8_t)((ch>>6)|0xc0);
1170 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1175 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1176 if((pDestLimit - pDest) >= 3) {
1177 *pDest++=(uint8_t)((ch>>12)|0xe0);
1178 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1179 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1184 } else /* ch is a surrogate */ {
// The pSrc<pSrcLimit test keeps the look-ahead for the trail surrogate inside
// the input.
1187 if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1189 ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1190 } else if(subchar>=0) {
1194 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1195 *pErrorCode = U_INVALID_CHAR_FOUND;
1199 length = U8_LENGTH(ch);
1200 if((pDestLimit - pDest) >= length) {
1201 /* convert and append*/
1202 pDest=_appendUTF8(pDest, ch);
// Explicit-length input, counting loop for whatever was not written above.
1209 while(pSrc<pSrcLimit) {
1213 } else if(ch<=0x7ff) {
1215 } else if(!U16_IS_SURROGATE(ch)) {
1217 } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1220 } else if(subchar>=0) {
1221 reqLength+=U8_LENGTH(subchar);
1224 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1225 *pErrorCode = U_INVALID_CHAR_FOUND;
// Add the number of bytes actually written to the counted remainder.
1231 reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1233 if(pNumSubstitutions!=NULL) {
1234 *pNumSubstitutions=numSubstitutions;
1238 *pDestLength = reqLength;
1241 /* Terminate the buffer */
1242 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
// Thin wrapper: forwards dest, destCapacity and pDestLength (plus further
// arguments) to u_strToUTF8WithSub() and returns that function's result.
// NOTE(review): the substitution arguments passed along are not visible in
// this span - confirm them against the full call.
1246 U_CAPI char* U_EXPORT2
1247 u_strToUTF8(char *dest,
1248 int32_t destCapacity,
1249 int32_t *pDestLength,
1252 UErrorCode *pErrorCode){
1253 return u_strToUTF8WithSub(
1254 dest, destCapacity, pDestLength,
// Converts Java Modified UTF-8 in src to UTF-16 in dest.
// - One-, two- and three-byte sequences are decoded inline, each to a single
//   UChar (the three-byte case covers U+0000..U+FFFF).
// - For other input, subchar is written after skipping the bad sequence with
//   utf8_nextCharSafeBodyPointer(); U_INVALID_CHAR_FOUND is set on the error
//   path (presumably when subchar is negative - the guarding test is not
//   visible in this span).
// - NUL-terminated ASCII input is handled by a dedicated first loop; if it
//   does not reach the NUL the length is taken with uprv_strlen() and the
//   explicit-length code is used.
// - The resulting length is stored through pDestLength, the substitution count
//   through pNumSubstitutions (when that pointer is not NULL), and dest is
//   terminated with u_terminateUChars().
1260 U_CAPI UChar* U_EXPORT2
1261 u_strFromJavaModifiedUTF8WithSub(
1263 int32_t destCapacity,
1264 int32_t *pDestLength,
1267 UChar32 subchar, int32_t *pNumSubstitutions,
1268 UErrorCode *pErrorCode) {
1269 UChar *pDest = dest;
// NOTE(review): computed before argument validation and without a NULL guard
// on dest - confirm that pointer arithmetic on a NULL dest is acceptable.
1270 UChar *pDestLimit = dest+destCapacity;
1272 int32_t reqLength = 0;
1273 const uint8_t* pSrc = (const uint8_t*) src;
1274 const uint8_t *pSrcLimit;
1276 uint8_t t1, t2; /* trail bytes */
1277 int32_t numSubstitutions;
// Do nothing when the caller already carries a failure code.
1280 if(U_FAILURE(*pErrorCode)){
// Argument checks: NULL src only for an empty string, srcLength >= -1, NULL
// dest only with zero capacity, subchar neither above 0x10ffff nor a surrogate.
1283 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1284 (dest==NULL && destCapacity!=0) || destCapacity<0 ||
1285 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1287 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1291 if(pNumSubstitutions!=NULL) {
1292 *pNumSubstitutions=0;
1298 * Transform a NUL-terminated ASCII string.
1299 * Handle non-ASCII strings with slower code.
// Stops at the NUL, at the first non-ASCII byte, or when dest is full.
1301 while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
// Finish here with the number of UChars written so far.
1306 reqLength=(int32_t)(pDest - dest);
1308 *pDestLength = reqLength;
1311 /* Terminate the buffer */
1312 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
// Otherwise measure the rest and continue with the explicit-length code.
1315 srcLength = uprv_strlen((const char *)pSrc);
1318 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1319 pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
1321 count = (int32_t)(pDestLimit - pDest);
1322 srcLength = (int32_t)(pSrcLimit - pSrc);
// Use the ASCII fast loop only when dest can hold one UChar per remaining
// byte and the next byte is ASCII.
1323 if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
1324 /* fast ASCII loop */
1325 const uint8_t *prevSrc = pSrc;
1327 while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
// Number of bytes consumed by the fast loop.
1331 delta = (int32_t)(pSrc - prevSrc);
1336 * Each iteration of the inner loop progresses by at most 3 UTF-8
1337 * bytes and one UChar.
1340 if(count > srcLength) {
1341 count = srcLength; /* min(remaining dest, remaining src/3) */
1345 * Too much overhead if we get near the end of the string,
1346 * continue with the next loop.
1357 if( /* handle U+0000..U+FFFF inline */
// byte - 0x80 <= 0x3f holds exactly for trail bytes 0x80..0xbf.
1359 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1360 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1362 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1363 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1368 if( /* handle U+0000..U+07FF inline */
1370 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1372 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1379 *pErrorCode = U_INVALID_CHAR_FOUND;
// A supplementary subchar needs two UChars, so it uses up one more unit of
// the iteration budget.
1381 } else if(subchar > 0xffff && --count == 0) {
1383 * We need to write two UChars, adjusted count for that,
1384 * and ran out of space.
1388 /* function call for error cases */
1389 ++pSrc; /* continue after the lead byte */
// Called only to advance pSrc past the bad sequence; the result is discarded.
1390 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1392 if(subchar<=0xFFFF) {
1393 *(pDest++)=(UChar)subchar;
1395 *(pDest++)=U16_LEAD(subchar);
1396 *(pDest++)=U16_TRAIL(subchar);
1400 } while(--count > 0);
// Checked loop for the remainder: both limits are tested on every iteration.
1403 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
1410 if( /* handle U+0000..U+FFFF inline */
// Make sure the trail bytes are inside the source before reading them.
1412 ((pSrcLimit - pSrc) >= 3) &&
1413 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1414 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1416 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1417 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1422 if( /* handle U+0000..U+07FF inline */
1424 ((pSrcLimit - pSrc) >= 2) &&
1425 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1427 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1434 *pErrorCode = U_INVALID_CHAR_FOUND;
1437 /* function call for error cases */
1438 ++pSrc; /* continue after the lead byte */
1439 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1441 if(subchar<=0xFFFF) {
1442 *(pDest++)=(UChar)subchar;
1444 *(pDest++)=U16_LEAD(subchar);
// The trail surrogate is stored only if there is still room in dest.
1445 if(pDest<pDestLimit) {
1446 *(pDest++)=U16_TRAIL(subchar);
1456 /* do not fill the dest buffer just count the UChars needed */
1457 while(pSrc < pSrcLimit){
1464 if( /* handle U+0000..U+FFFF inline */
1466 ((pSrcLimit - pSrc) >= 3) &&
1467 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
1468 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
1475 if( /* handle U+0000..U+07FF inline */
1477 ((pSrcLimit - pSrc) >= 2) &&
1478 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
1487 *pErrorCode = U_INVALID_CHAR_FOUND;
1490 /* function call for error cases */
1491 ++pSrc; /* continue after the lead byte */
1492 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
// NOTE(review): the writing loops emit subchar (one or two UChars) here, while
// this count is taken from ch, which in the visible lines still looks like
// the lead byte - confirm the count is right for a supplementary subchar.
1494 reqLength+=U16_LENGTH(ch);
1499 if(pNumSubstitutions!=NULL) {
1500 *pNumSubstitutions=numSubstitutions;
// Add the number of UChars actually written to the counted remainder.
1503 reqLength+=(int32_t)(pDest - dest);
1505 *pDestLength = reqLength;
1508 /* Terminate the buffer */
1509 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
// Converts UTF-16 in src to Java Modified UTF-8 in dest.
// - Code units 0x01..0x7f are written as one byte.
// - U+0000 and units up to 0x7ff are written as two bytes, so a zero unit
//   becomes 0xc0 0x80 instead of a zero byte.
// - Remaining code units are written as three bytes; no surrogate pairing is
//   visible in this span, so surrogates presumably get three bytes each.
// - NUL-terminated ASCII input is handled by a dedicated first loop; if it
//   does not reach the NUL the length is taken with u_strlen() and the
//   explicit-length code is used.
// - The byte count is stored through pDestLength and dest is terminated with
//   u_terminateChars(). This function has no subchar parameter.
1513 U_CAPI char* U_EXPORT2
1514 u_strToJavaModifiedUTF8(
1516 int32_t destCapacity,
1517 int32_t *pDestLength,
1520 UErrorCode *pErrorCode) {
1521 int32_t reqLength=0;
1523 uint8_t *pDest = (uint8_t *)dest;
// NOTE(review): computed before argument validation and without a NULL guard
// on dest - confirm that pointer arithmetic on a NULL dest is acceptable.
1524 uint8_t *pDestLimit = pDest + destCapacity;
1525 const UChar *pSrcLimit;
// Do nothing when the caller already carries a failure code.
1529 if(U_FAILURE(*pErrorCode)){
// Argument checks: NULL src only for an empty string, srcLength >= -1, NULL
// dest only with zero capacity.
1532 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1533 (dest==NULL && destCapacity!=0) || destCapacity<0
1535 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1540 /* Convert NUL-terminated ASCII, then find the string length. */
// Stops at the NUL, at the first non-ASCII unit, or when dest is full.
1541 while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
1542 *pDest++ = (uint8_t)ch;
// Finish here with the number of bytes written so far.
1546 reqLength=(int32_t)(pDest - (uint8_t *)dest);
1548 *pDestLength = reqLength;
1551 /* Terminate the buffer */
1552 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
// Otherwise measure the rest and continue with the explicit-length code.
1555 srcLength = u_strlen(src);
1558 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1559 pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
1561 count = (int32_t)(pDestLimit - pDest);
1562 srcLength = (int32_t)(pSrcLimit - src);
// Use the ASCII fast loop only when dest can hold one byte per remaining code
// unit and the next unit is ASCII.
1563 if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
1564 /* fast ASCII loop */
1565 const UChar *prevSrc = src;
// A zero unit ends the fast loop because it needs the two-byte form.
1567 while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
1568 *pDest++=(uint8_t)ch;
// Number of code units consumed by the fast loop.
1571 delta = (int32_t)(src - prevSrc);
1576 * Each iteration of the inner loop progresses by at most 3 UTF-8
1577 * bytes and one UChar.
1580 if(count > srcLength) {
1581 count = srcLength; /* min(remaining dest/3, remaining src) */
1585 * Too much overhead if we get near the end of the string,
1586 * continue with the next loop.
1592 if(ch <= 0x7f && ch != 0) {
1593 *pDest++ = (uint8_t)ch;
// Also taken for ch==0, which yields the bytes 0xc0 0x80.
1594 } else if(ch <= 0x7ff) {
1595 *pDest++=(uint8_t)((ch>>6)|0xc0);
1596 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1598 *pDest++=(uint8_t)((ch>>12)|0xe0);
1599 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1600 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1602 } while(--count > 0);
// Checked loop for the remainder: each branch tests the space left in dest.
1605 while(src<pSrcLimit) {
1607 if(ch <= 0x7f && ch != 0) {
1608 if(pDest<pDestLimit) {
1609 *pDest++ = (uint8_t)ch;
1614 } else if(ch <= 0x7ff) {
1615 if((pDestLimit - pDest) >= 2) {
1616 *pDest++=(uint8_t)((ch>>6)|0xc0);
1617 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1623 if((pDestLimit - pDest) >= 3) {
1624 *pDest++=(uint8_t)((ch>>12)|0xe0);
1625 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1626 *pDest++=(uint8_t)((ch&0x3f)|0x80);
// Counting loop for whatever was not written above.
1633 while(src<pSrcLimit) {
1635 if(ch <= 0x7f && ch != 0) {
1637 } else if(ch<=0x7ff) {
// Add the number of bytes actually written to the counted remainder.
1644 reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1646 *pDestLength = reqLength;
1649 /* Terminate the buffer */
1650 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);