Imported Upstream version 58.1
[platform/upstream/icu.git] / source / common / ustrtrns.cpp
1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 *   Copyright (C) 2001-2016, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 ******************************************************************************
10 *
11 * File ustrtrns.cpp
12 *
13 * Modification History:
14 *
15 *   Date        Name        Description
16 *   9/10/2001    Ram    Creation.
17 ******************************************************************************
18 */
19
20 /*******************************************************************************
21  *
22  * u_strTo* and u_strFrom* APIs
23  * WCS functions moved to ustr_wcs.c for better modularization
24  *
25  *******************************************************************************
26  */
27
28
29 #include "unicode/putil.h"
30 #include "unicode/ustring.h"
31 #include "unicode/utf.h"
32 #include "unicode/utf8.h"
33 #include "unicode/utf16.h"
34 #include "cstring.h"
35 #include "cmemory.h"
36 #include "ustr_imp.h"
37 #include "uassert.h"
38
39 U_CAPI UChar* U_EXPORT2 
40 u_strFromUTF32WithSub(UChar *dest,
41                int32_t destCapacity,
42                int32_t *pDestLength,
43                const UChar32 *src,
44                int32_t srcLength,
45                UChar32 subchar, int32_t *pNumSubstitutions,
46                UErrorCode *pErrorCode) {
47     const UChar32 *srcLimit;
48     UChar32 ch;
49     UChar *destLimit;
50     UChar *pDest;
51     int32_t reqLength;
52     int32_t numSubstitutions;
53
54     /* args check */
55     if(U_FAILURE(*pErrorCode)){
56         return NULL;
57     }
58     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
59         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
60         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
61     ) {
62         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
63         return NULL;
64     }
65
66     if(pNumSubstitutions != NULL) {
67         *pNumSubstitutions = 0;
68     }
69
70     pDest = dest;
71     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
72     reqLength = 0;
73     numSubstitutions = 0;
74
75     if(srcLength < 0) {
76         /* simple loop for conversion of a NUL-terminated BMP string */
77         while((ch=*src) != 0 &&
78               ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
79             ++src;
80             if(pDest < destLimit) {
81                 *pDest++ = (UChar)ch;
82             } else {
83                 ++reqLength;
84             }
85         }
86         srcLimit = src;
87         if(ch != 0) {
88             /* "complicated" case, find the end of the remaining string */
89             while(*++srcLimit != 0) {}
90         }
91     } else {
92       srcLimit = (src!=NULL)?(src + srcLength):NULL;
93     }
94
95     /* convert with length */
96     while(src < srcLimit) {
97         ch = *src++;
98         do {
99             /* usually "loops" once; twice only for writing subchar */
100             if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
101                 if(pDest < destLimit) {
102                     *pDest++ = (UChar)ch;
103                 } else {
104                     ++reqLength;
105                 }
106                 break;
107             } else if(0x10000 <= ch && ch <= 0x10ffff) {
108                 if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
109                     *pDest++ = U16_LEAD(ch);
110                     *pDest++ = U16_TRAIL(ch);
111                 } else {
112                     reqLength += 2;
113                 }
114                 break;
115             } else if((ch = subchar) < 0) {
116                 /* surrogate code point, or not a Unicode code point at all */
117                 *pErrorCode = U_INVALID_CHAR_FOUND;
118                 return NULL;
119             } else {
120                 ++numSubstitutions;
121             }
122         } while(TRUE);
123     }
124
125     reqLength += (int32_t)(pDest - dest);
126     if(pDestLength) {
127         *pDestLength = reqLength;
128     }
129     if(pNumSubstitutions != NULL) {
130         *pNumSubstitutions = numSubstitutions;
131     }
132
133     /* Terminate the buffer */
134     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
135     
136     return dest;
137 }
138
139 U_CAPI UChar* U_EXPORT2 
140 u_strFromUTF32(UChar *dest,
141                int32_t destCapacity, 
142                int32_t *pDestLength,
143                const UChar32 *src,
144                int32_t srcLength,
145                UErrorCode *pErrorCode) {
146     return u_strFromUTF32WithSub(
147             dest, destCapacity, pDestLength,
148             src, srcLength,
149             U_SENTINEL, NULL,
150             pErrorCode);
151 }
152
153 U_CAPI UChar32* U_EXPORT2 
154 u_strToUTF32WithSub(UChar32 *dest,
155              int32_t destCapacity,
156              int32_t *pDestLength,
157              const UChar *src,
158              int32_t srcLength,
159              UChar32 subchar, int32_t *pNumSubstitutions,
160              UErrorCode *pErrorCode) {
161     const UChar *srcLimit;
162     UChar32 ch;
163     UChar ch2;
164     UChar32 *destLimit;
165     UChar32 *pDest;
166     int32_t reqLength;
167     int32_t numSubstitutions;
168
169     /* args check */
170     if(U_FAILURE(*pErrorCode)){
171         return NULL;
172     }
173     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
174         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
175         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
176     ) {
177         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
178         return NULL;
179     }
180
181     if(pNumSubstitutions != NULL) {
182         *pNumSubstitutions = 0;
183     }
184
185     pDest = dest;
186     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
187     reqLength = 0;
188     numSubstitutions = 0;
189
190     if(srcLength < 0) {
191         /* simple loop for conversion of a NUL-terminated BMP string */
192         while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
193             ++src;
194             if(pDest < destLimit) {
195                 *pDest++ = ch;
196             } else {
197                 ++reqLength;
198             }
199         }
200         srcLimit = src;
201         if(ch != 0) {
202             /* "complicated" case, find the end of the remaining string */
203             while(*++srcLimit != 0) {}
204         }
205     } else {
206         srcLimit = (src!=NULL)?(src + srcLength):NULL;
207     }
208
209     /* convert with length */
210     while(src < srcLimit) {
211         ch = *src++;
212         if(!U16_IS_SURROGATE(ch)) {
213             /* write or count ch below */
214         } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
215             ++src;
216             ch = U16_GET_SUPPLEMENTARY(ch, ch2);
217         } else if((ch = subchar) < 0) {
218             /* unpaired surrogate */
219             *pErrorCode = U_INVALID_CHAR_FOUND;
220             return NULL;
221         } else {
222             ++numSubstitutions;
223         }
224         if(pDest < destLimit) {
225             *pDest++ = ch;
226         } else {
227             ++reqLength;
228         }
229     }
230
231     reqLength += (int32_t)(pDest - dest);
232     if(pDestLength) {
233         *pDestLength = reqLength;
234     }
235     if(pNumSubstitutions != NULL) {
236         *pNumSubstitutions = numSubstitutions;
237     }
238
239     /* Terminate the buffer */
240     u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
241
242     return dest;
243 }
244
245 U_CAPI UChar32* U_EXPORT2 
246 u_strToUTF32(UChar32 *dest, 
247              int32_t destCapacity,
248              int32_t *pDestLength,
249              const UChar *src, 
250              int32_t srcLength,
251              UErrorCode *pErrorCode) {
252     return u_strToUTF32WithSub(
253             dest, destCapacity, pDestLength,
254             src, srcLength,
255             U_SENTINEL, NULL,
256             pErrorCode);
257 }
258
259 /* for utf8_nextCharSafeBodyTerminated() */
260 static const UChar32
261 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
262
263 /*
264  * Version of utf8_nextCharSafeBody() with the following differences:
265  * - checks for NUL termination instead of length
266  * - works with pointers instead of indexes
267  * - always strict (strict==-1)
268  *
269  * *ps points to after the lead byte and will be moved to after the last trail byte.
270  * c is the lead byte.
271  * @return the code point, or U_SENTINEL
272  */
273 static UChar32
274 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
275     const uint8_t *s=*ps;
276     uint8_t trail, illegal=0;
277     uint8_t count=U8_COUNT_TRAIL_BYTES(c);
278     U_ASSERT(count<6);
279     U8_MASK_LEAD_BYTE((c), count);
280     /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
281     switch(count) {
282     /* each branch falls through to the next one */
283     case 5:
284     case 4:
285         /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
286         illegal=1;
287         break;
288     case 3:
289         trail=(uint8_t)(*s++ - 0x80);
290         c=(c<<6)|trail;
291         if(trail>0x3f || c>=0x110) {
292             /* not a trail byte, or code point>0x10ffff (outside Unicode) */
293             illegal=1;
294             break;
295         }
296         U_FALLTHROUGH;
297     case 2:
298         trail=(uint8_t)(*s++ - 0x80);
299         if(trail>0x3f) {
300             /* not a trail byte */
301             illegal=1;
302             break;
303         }
304         c=(c<<6)|trail;
305         U_FALLTHROUGH;
306     case 1:
307         trail=(uint8_t)(*s++ - 0x80);
308         if(trail>0x3f) {
309             /* not a trail byte */
310             illegal=1;
311         }
312         c=(c<<6)|trail;
313         break;
314     case 0:
315         return U_SENTINEL;
316     /* no default branch to optimize switch()  - all values are covered */
317     }
318
319     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
320     /* illegal is also set if count>=4 */
321     if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
322         /* error handling */
323         /* don't go beyond this sequence */
324         s=*ps;
325         while(count>0 && U8_IS_TRAIL(*s)) {
326             ++s;
327             --count;
328         }
329         c=U_SENTINEL;
330     }
331     *ps=s;
332     return c;
333 }
334
335 /*
336  * Version of utf8_nextCharSafeBody() with the following differences:
337  * - works with pointers instead of indexes
338  * - always strict (strict==-1)
339  *
340  * *ps points to after the lead byte and will be moved to after the last trail byte.
341  * c is the lead byte.
342  * @return the code point, or U_SENTINEL
343  */
344 static UChar32
345 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
346     const uint8_t *s=*ps;
347     uint8_t trail, illegal=0;
348     uint8_t count=U8_COUNT_TRAIL_BYTES(c);
349     if((limit-s)>=count) {
350         U8_MASK_LEAD_BYTE((c), count);
351         /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
352         switch(count) {
353         /* each branch falls through to the next one */
354         case 5:
355         case 4:
356             /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
357             illegal=1;
358             break;
359         case 3:
360             trail=*s++;
361             c=(c<<6)|(trail&0x3f);
362             if(c<0x110) {
363                 illegal|=(trail&0xc0)^0x80;
364             } else {
365                 /* code point>0x10ffff, outside Unicode */
366                 illegal=1;
367                 break;
368             }
369             U_FALLTHROUGH;
370         case 2:
371             trail=*s++;
372             c=(c<<6)|(trail&0x3f);
373             illegal|=(trail&0xc0)^0x80;
374             U_FALLTHROUGH;
375         case 1:
376             trail=*s++;
377             c=(c<<6)|(trail&0x3f);
378             illegal|=(trail&0xc0)^0x80;
379             break;
380         case 0:
381             return U_SENTINEL;
382         /* no default branch to optimize switch()  - all values are covered */
383         }
384     } else {
385         illegal=1; /* too few bytes left */
386     }
387
388     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
389     /* illegal is also set if count>=4 */
390     U_ASSERT(illegal || count<UPRV_LENGTHOF(utf8_minLegal));
391     if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
392         /* error handling */
393         /* don't go beyond this sequence */
394         s=*ps;
395         while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
396             ++s;
397             --count;
398         }
399         c=U_SENTINEL;
400     }
401     *ps=s;
402     return c;
403 }
404
405 U_CAPI UChar* U_EXPORT2
406 u_strFromUTF8WithSub(UChar *dest,
407               int32_t destCapacity,
408               int32_t *pDestLength,
409               const char* src,
410               int32_t srcLength,
411               UChar32 subchar, int32_t *pNumSubstitutions,
412               UErrorCode *pErrorCode){
413     UChar *pDest = dest;
414     UChar *pDestLimit = dest+destCapacity;
415     UChar32 ch;
416     int32_t reqLength = 0;
417     const uint8_t* pSrc = (const uint8_t*) src;
418     uint8_t t1, t2; /* trail bytes */
419     int32_t numSubstitutions;
420
421     /* args check */
422     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
423         return NULL;
424     }
425         
426     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
427         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
428         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
429     ) {
430         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
431         return NULL;
432     }
433
434     if(pNumSubstitutions!=NULL) {
435         *pNumSubstitutions=0;
436     }
437     numSubstitutions=0;
438
439     /*
440      * Inline processing of UTF-8 byte sequences:
441      *
442      * Byte sequences for the most common characters are handled inline in
443      * the conversion loops. In order to reduce the path lengths for those
444      * characters, the tests are arranged in a kind of binary search.
445      * ASCII (<=0x7f) is checked first, followed by the dividing point
446      * between 2- and 3-byte sequences (0xe0).
447      * The 3-byte branch is tested first to speed up CJK text.
448      * The compiler should combine the subtractions for the two tests for 0xe0.
449      * Each branch then tests for the other end of its range.
450      */
451
452     if(srcLength < 0){
453         /*
454          * Transform a NUL-terminated string.
455          * The code explicitly checks for NULs only in the lead byte position.
456          * A NUL byte in the trail byte position fails the trail byte range check anyway.
457          */
458         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
459             if(ch <= 0x7f){
460                 *pDest++=(UChar)ch;
461                 ++pSrc;
462             } else {
463                 if(ch > 0xe0) {
464                     if( /* handle U+1000..U+CFFF inline */
465                         ch <= 0xec &&
466                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
467                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
468                     ) {
469                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
470                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
471                         pSrc += 3;
472                         continue;
473                     }
474                 } else if(ch < 0xe0) {
475                     if( /* handle U+0080..U+07FF inline */
476                         ch >= 0xc2 &&
477                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
478                     ) {
479                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
480                         pSrc += 2;
481                         continue;
482                     }
483                 }
484
485                 /* function call for "complicated" and error cases */
486                 ++pSrc; /* continue after the lead byte */
487                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
488                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
489                     *pErrorCode = U_INVALID_CHAR_FOUND;
490                     return NULL;
491                 } else if(ch<=0xFFFF) {
492                     *(pDest++)=(UChar)ch;
493                 } else {
494                     *(pDest++)=U16_LEAD(ch);
495                     if(pDest<pDestLimit) {
496                         *(pDest++)=U16_TRAIL(ch);
497                     } else {
498                         reqLength++;
499                         break;
500                     }
501                 }
502             }
503         }
504
505         /* Pre-flight the rest of the string. */
506         while((ch = *pSrc) != 0) {
507             if(ch <= 0x7f){
508                 ++reqLength;
509                 ++pSrc;
510             } else {
511                 if(ch > 0xe0) {
512                     if( /* handle U+1000..U+CFFF inline */
513                         ch <= 0xec &&
514                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
515                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
516                     ) {
517                         ++reqLength;
518                         pSrc += 3;
519                         continue;
520                     }
521                 } else if(ch < 0xe0) {
522                     if( /* handle U+0080..U+07FF inline */
523                         ch >= 0xc2 &&
524                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
525                     ) {
526                         ++reqLength;
527                         pSrc += 2;
528                         continue;
529                     }
530                 }
531
532                 /* function call for "complicated" and error cases */
533                 ++pSrc; /* continue after the lead byte */
534                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
535                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
536                     *pErrorCode = U_INVALID_CHAR_FOUND;
537                     return NULL;
538                 }
539                 reqLength += U16_LENGTH(ch);
540             }
541         }
542     } else /* srcLength >= 0 */ {
543         const uint8_t *pSrcLimit = pSrc + srcLength;
544         int32_t count;
545
546         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
547         for(;;) {
548             /*
549              * Each iteration of the inner loop progresses by at most 3 UTF-8
550              * bytes and one UChar, for most characters.
551              * For supplementary code points (4 & 2), which are rare,
552              * there is an additional adjustment.
553              */
554             count = (int32_t)(pDestLimit - pDest);
555             srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
556             if(count > srcLength) {
557                 count = srcLength; /* min(remaining dest, remaining src/3) */
558             }
559             if(count < 3) {
560                 /*
561                  * Too much overhead if we get near the end of the string,
562                  * continue with the next loop.
563                  */
564                 break;
565             }
566
567             do {
568                 ch = *pSrc;
569                 if(ch <= 0x7f){
570                     *pDest++=(UChar)ch;
571                     ++pSrc;
572                 } else {
573                     if(ch > 0xe0) {
574                         if( /* handle U+1000..U+CFFF inline */
575                             ch <= 0xec &&
576                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
577                             (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
578                         ) {
579                             /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
580                             *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
581                             pSrc += 3;
582                             continue;
583                         }
584                     } else if(ch < 0xe0) {
585                         if( /* handle U+0080..U+07FF inline */
586                             ch >= 0xc2 &&
587                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
588                         ) {
589                             *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
590                             pSrc += 2;
591                             continue;
592                         }
593                     }
594
595                     if(ch >= 0xf0 || subchar > 0xffff) {
596                         /*
597                          * We may read up to six bytes and write up to two UChars,
598                          * which we didn't account for with computing count,
599                          * so we adjust it here.
600                          */
601                         if(--count == 0) {
602                             break;
603                         }
604                     }
605
606                     /* function call for "complicated" and error cases */
607                     ++pSrc; /* continue after the lead byte */
608                     ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
609                     if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
610                         *pErrorCode = U_INVALID_CHAR_FOUND;
611                         return NULL;
612                     }else if(ch<=0xFFFF){
613                         *(pDest++)=(UChar)ch;
614                     }else{
615                         *(pDest++)=U16_LEAD(ch);
616                         *(pDest++)=U16_TRAIL(ch);
617                     }
618                 }
619             } while(--count > 0);
620         }
621
622         while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
623             ch = *pSrc;
624             if(ch <= 0x7f){
625                 *pDest++=(UChar)ch;
626                 ++pSrc;
627             } else {
628                 if(ch > 0xe0) {
629                     if( /* handle U+1000..U+CFFF inline */
630                         ch <= 0xec &&
631                         ((pSrcLimit - pSrc) >= 3) &&
632                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
633                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
634                     ) {
635                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
636                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
637                         pSrc += 3;
638                         continue;
639                     }
640                 } else if(ch < 0xe0) {
641                     if( /* handle U+0080..U+07FF inline */
642                         ch >= 0xc2 &&
643                         ((pSrcLimit - pSrc) >= 2) &&
644                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
645                     ) {
646                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
647                         pSrc += 2;
648                         continue;
649                     }
650                 }
651
652                 /* function call for "complicated" and error cases */
653                 ++pSrc; /* continue after the lead byte */
654                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
655                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
656                     *pErrorCode = U_INVALID_CHAR_FOUND;
657                     return NULL;
658                 }else if(ch<=0xFFFF){
659                     *(pDest++)=(UChar)ch;
660                 }else{
661                     *(pDest++)=U16_LEAD(ch);
662                     if(pDest<pDestLimit){
663                         *(pDest++)=U16_TRAIL(ch);
664                     }else{
665                         reqLength++;
666                         break;
667                     }
668                 }
669             }
670         }
671         /* do not fill the dest buffer just count the UChars needed */
672         while(pSrc < pSrcLimit){
673             ch = *pSrc;
674             if(ch <= 0x7f){
675                 reqLength++;
676                 ++pSrc;
677             } else {
678                 if(ch > 0xe0) {
679                     if( /* handle U+1000..U+CFFF inline */
680                         ch <= 0xec &&
681                         ((pSrcLimit - pSrc) >= 3) &&
682                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
683                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
684                     ) {
685                         reqLength++;
686                         pSrc += 3;
687                         continue;
688                     }
689                 } else if(ch < 0xe0) {
690                     if( /* handle U+0080..U+07FF inline */
691                         ch >= 0xc2 &&
692                         ((pSrcLimit - pSrc) >= 2) &&
693                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
694                     ) {
695                         reqLength++;
696                         pSrc += 2;
697                         continue;
698                     }
699                 }
700
701                 /* function call for "complicated" and error cases */
702                 ++pSrc; /* continue after the lead byte */
703                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
704                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
705                     *pErrorCode = U_INVALID_CHAR_FOUND;
706                     return NULL;
707                 }
708                 reqLength+=U16_LENGTH(ch);
709             }
710         }
711     }
712
713     reqLength+=(int32_t)(pDest - dest);
714
715     if(pNumSubstitutions!=NULL) {
716         *pNumSubstitutions=numSubstitutions;
717     }
718
719     if(pDestLength){
720         *pDestLength = reqLength;
721     }
722
723     /* Terminate the buffer */
724     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
725
726     return dest;
727 }
728
729 U_CAPI UChar* U_EXPORT2
730 u_strFromUTF8(UChar *dest,
731               int32_t destCapacity,
732               int32_t *pDestLength,
733               const char* src,
734               int32_t srcLength,
735               UErrorCode *pErrorCode){
736     return u_strFromUTF8WithSub(
737             dest, destCapacity, pDestLength,
738             src, srcLength,
739             U_SENTINEL, NULL,
740             pErrorCode);
741 }
742
743 U_CAPI UChar * U_EXPORT2
744 u_strFromUTF8Lenient(UChar *dest,
745                      int32_t destCapacity,
746                      int32_t *pDestLength,
747                      const char *src,
748                      int32_t srcLength,
749                      UErrorCode *pErrorCode) {
750     UChar *pDest = dest;
751     UChar32 ch;
752     int32_t reqLength = 0;
753     uint8_t* pSrc = (uint8_t*) src;
754
755     /* args check */
756     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
757         return NULL;
758     }
759         
760     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
761         (destCapacity<0) || (dest == NULL && destCapacity > 0)
762     ) {
763         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
764         return NULL;
765     }
766
767     if(srcLength < 0) {
768         /* Transform a NUL-terminated string. */
769         UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
770         uint8_t t1, t2, t3; /* trail bytes */
771
772         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
773             if(ch < 0xc0) {
774                 /*
775                  * ASCII, or a trail byte in lead position which is treated like
776                  * a single-byte sequence for better character boundary
777                  * resynchronization after illegal sequences.
778                  */
779                 *pDest++=(UChar)ch;
780                 ++pSrc;
781                 continue;
782             } else if(ch < 0xe0) { /* U+0080..U+07FF */
783                 if((t1 = pSrc[1]) != 0) {
784                     /* 0x3080 = (0xc0 << 6) + 0x80 */
785                     *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
786                     pSrc += 2;
787                     continue;
788                 }
789             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
790                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
791                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
792                     /* 0x2080 = (0x80 << 6) + 0x80 */
793                     *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
794                     pSrc += 3;
795                     continue;
796                 }
797             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
798                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
799                     pSrc += 4;
800                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
801                     ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
802                     *(pDest++) = U16_LEAD(ch);
803                     if(pDest < pDestLimit) {
804                         *(pDest++) = U16_TRAIL(ch);
805                     } else {
806                         reqLength = 1;
807                         break;
808                     }
809                     continue;
810                 }
811             }
812
813             /* truncated character at the end */
814             *pDest++ = 0xfffd;
815             while(*++pSrc != 0) {}
816             break;
817         }
818
819         /* Pre-flight the rest of the string. */
820         while((ch = *pSrc) != 0) {
821             if(ch < 0xc0) {
822                 /*
823                  * ASCII, or a trail byte in lead position which is treated like
824                  * a single-byte sequence for better character boundary
825                  * resynchronization after illegal sequences.
826                  */
827                 ++reqLength;
828                 ++pSrc;
829                 continue;
830             } else if(ch < 0xe0) { /* U+0080..U+07FF */
831                 if(pSrc[1] != 0) {
832                     ++reqLength;
833                     pSrc += 2;
834                     continue;
835                 }
836             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
837                 if(pSrc[1] != 0 && pSrc[2] != 0) {
838                     ++reqLength;
839                     pSrc += 3;
840                     continue;
841                 }
842             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
843                 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
844                     reqLength += 2;
845                     pSrc += 4;
846                     continue;
847                 }
848             }
849
850             /* truncated character at the end */
851             ++reqLength;
852             break;
853         }
854     } else /* srcLength >= 0 */ {
855       const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
856
857         /*
858          * This function requires that if srcLength is given, then it must be
859          * destCapacity >= srcLength so that we need not check for
860          * destination buffer overflow in the loop.
861          */
862         if(destCapacity < srcLength) {
863             if(pDestLength != NULL) {
864                 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
865             }
866             *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
867             return NULL;
868         }
869
870         if((pSrcLimit - pSrc) >= 4) {
871             pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
872
873             /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
874             do {
875                 ch = *pSrc++;
876                 if(ch < 0xc0) {
877                     /*
878                      * ASCII, or a trail byte in lead position which is treated like
879                      * a single-byte sequence for better character boundary
880                      * resynchronization after illegal sequences.
881                      */
882                     *pDest++=(UChar)ch;
883                 } else if(ch < 0xe0) { /* U+0080..U+07FF */
884                     /* 0x3080 = (0xc0 << 6) + 0x80 */
885                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
886                 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
887                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
888                     /* 0x2080 = (0x80 << 6) + 0x80 */
889                     ch = (ch << 12) + (*pSrc++ << 6);
890                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
891                 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
892                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
893                     ch = (ch << 18) + (*pSrc++ << 12);
894                     ch += *pSrc++ << 6;
895                     ch += *pSrc++ - 0x3c82080;
896                     *(pDest++) = U16_LEAD(ch);
897                     *(pDest++) = U16_TRAIL(ch);
898                 }
899             } while(pSrc < pSrcLimit);
900
901             pSrcLimit += 3; /* restore original pSrcLimit */
902         }
903
904         while(pSrc < pSrcLimit) {
905             ch = *pSrc++;
906             if(ch < 0xc0) {
907                 /*
908                  * ASCII, or a trail byte in lead position which is treated like
909                  * a single-byte sequence for better character boundary
910                  * resynchronization after illegal sequences.
911                  */
912                 *pDest++=(UChar)ch;
913                 continue;
914             } else if(ch < 0xe0) { /* U+0080..U+07FF */
915                 if(pSrc < pSrcLimit) {
916                     /* 0x3080 = (0xc0 << 6) + 0x80 */
917                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
918                     continue;
919                 }
920             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
921                 if((pSrcLimit - pSrc) >= 2) {
922                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
923                     /* 0x2080 = (0x80 << 6) + 0x80 */
924                     ch = (ch << 12) + (*pSrc++ << 6);
925                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
926                     pSrc += 3;
927                     continue;
928                 }
929             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
930                 if((pSrcLimit - pSrc) >= 3) {
931                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
932                     ch = (ch << 18) + (*pSrc++ << 12);
933                     ch += *pSrc++ << 6;
934                     ch += *pSrc++ - 0x3c82080;
935                     *(pDest++) = U16_LEAD(ch);
936                     *(pDest++) = U16_TRAIL(ch);
937                     pSrc += 4;
938                     continue;
939                 }
940             }
941
942             /* truncated character at the end */
943             *pDest++ = 0xfffd;
944             break;
945         }
946     }
947
948     reqLength+=(int32_t)(pDest - dest);
949
950     if(pDestLength){
951         *pDestLength = reqLength;
952     }
953
954     /* Terminate the buffer */
955     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
956
957     return dest;
958 }
959
960 static inline uint8_t *
961 _appendUTF8(uint8_t *pDest, UChar32 c) {
962     /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
963     if((c)<=0x7f) {
964         *pDest++=(uint8_t)c;
965     } else if(c<=0x7ff) {
966         *pDest++=(uint8_t)((c>>6)|0xc0);
967         *pDest++=(uint8_t)((c&0x3f)|0x80);
968     } else if(c<=0xffff) {
969         *pDest++=(uint8_t)((c>>12)|0xe0);
970         *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
971         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
972     } else /* if((uint32_t)(c)<=0x10ffff) */ {
973         *pDest++=(uint8_t)(((c)>>18)|0xf0);
974         *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
975         *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
976         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
977     }
978     return pDest;
979 }
980
981    
982 U_CAPI char* U_EXPORT2 
983 u_strToUTF8WithSub(char *dest,
984             int32_t destCapacity,
985             int32_t *pDestLength,
986             const UChar *pSrc,
987             int32_t srcLength,
988             UChar32 subchar, int32_t *pNumSubstitutions,
989             UErrorCode *pErrorCode){
990     int32_t reqLength=0;
991     uint32_t ch=0,ch2=0;
992     uint8_t *pDest = (uint8_t *)dest;
993     uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
994     int32_t numSubstitutions;
995
996     /* args check */
997     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
998         return NULL;
999     }
1000         
1001     if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
1002         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
1003         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1004     ) {
1005         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1006         return NULL;
1007     }
1008
1009     if(pNumSubstitutions!=NULL) {
1010         *pNumSubstitutions=0;
1011     }
1012     numSubstitutions=0;
1013
1014     if(srcLength==-1) {
1015         while((ch=*pSrc)!=0) {
1016             ++pSrc;
1017             if(ch <= 0x7f) {
1018                 if(pDest<pDestLimit) {
1019                     *pDest++ = (uint8_t)ch;
1020                 } else {
1021                     reqLength = 1;
1022                     break;
1023                 }
1024             } else if(ch <= 0x7ff) {
1025                 if((pDestLimit - pDest) >= 2) {
1026                     *pDest++=(uint8_t)((ch>>6)|0xc0);
1027                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1028                 } else {
1029                     reqLength = 2;
1030                     break;
1031                 }
1032             } else if(ch <= 0xd7ff || ch >= 0xe000) {
1033                 if((pDestLimit - pDest) >= 3) {
1034                     *pDest++=(uint8_t)((ch>>12)|0xe0);
1035                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1036                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1037                 } else {
1038                     reqLength = 3;
1039                     break;
1040                 }
1041             } else /* ch is a surrogate */ {
1042                 int32_t length;
1043
1044                 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
1045                 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 
1046                     ++pSrc;
1047                     ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1048                 } else if(subchar>=0) {
1049                     ch=subchar;
1050                     ++numSubstitutions;
1051                 } else {
1052                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1053                     *pErrorCode = U_INVALID_CHAR_FOUND;
1054                     return NULL;
1055                 }
1056
1057                 length = U8_LENGTH(ch);
1058                 if((pDestLimit - pDest) >= length) {
1059                     /* convert and append*/
1060                     pDest=_appendUTF8(pDest, ch);
1061                 } else {
1062                     reqLength = length;
1063                     break;
1064                 }
1065             }
1066         }
1067         while((ch=*pSrc++)!=0) {
1068             if(ch<=0x7f) {
1069                 ++reqLength;
1070             } else if(ch<=0x7ff) {
1071                 reqLength+=2;
1072             } else if(!U16_IS_SURROGATE(ch)) {
1073                 reqLength+=3;
1074             } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1075                 ++pSrc;
1076                 reqLength+=4;
1077             } else if(subchar>=0) {
1078                 reqLength+=U8_LENGTH(subchar);
1079                 ++numSubstitutions;
1080             } else {
1081                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1082                 *pErrorCode = U_INVALID_CHAR_FOUND;
1083                 return NULL;
1084             }
1085         }
1086     } else {
1087         const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
1088         int32_t count;
1089
1090         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1091         for(;;) {
1092             /*
1093              * Each iteration of the inner loop progresses by at most 3 UTF-8
1094              * bytes and one UChar, for most characters.
1095              * For supplementary code points (4 & 2), which are rare,
1096              * there is an additional adjustment.
1097              */
1098             count = (int32_t)((pDestLimit - pDest) / 3);
1099             srcLength = (int32_t)(pSrcLimit - pSrc);
1100             if(count > srcLength) {
1101                 count = srcLength; /* min(remaining dest/3, remaining src) */
1102             }
1103             if(count < 3) {
1104                 /*
1105                  * Too much overhead if we get near the end of the string,
1106                  * continue with the next loop.
1107                  */
1108                 break;
1109             }
1110             do {
1111                 ch=*pSrc++;
1112                 if(ch <= 0x7f) {
1113                     *pDest++ = (uint8_t)ch;
1114                 } else if(ch <= 0x7ff) {
1115                     *pDest++=(uint8_t)((ch>>6)|0xc0);
1116                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1117                 } else if(ch <= 0xd7ff || ch >= 0xe000) {
1118                     *pDest++=(uint8_t)((ch>>12)|0xe0);
1119                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1120                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1121                 } else /* ch is a surrogate */ {
1122                     /*
1123                      * We will read two UChars and probably output four bytes,
1124                      * which we didn't account for with computing count,
1125                      * so we adjust it here.
1126                      */
1127                     if(--count == 0) {
1128                         --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
1129                         break;  /* recompute count */
1130                     }
1131
1132                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 
1133                         ++pSrc;
1134                         ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1135
1136                         /* writing 4 bytes per 2 UChars is ok */
1137                         *pDest++=(uint8_t)((ch>>18)|0xf0);
1138                         *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
1139                         *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1140                         *pDest++=(uint8_t)((ch&0x3f)|0x80);
1141                     } else  {
1142                         /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1143                         if(subchar>=0) {
1144                             ch=subchar;
1145                             ++numSubstitutions;
1146                         } else {
1147                             *pErrorCode = U_INVALID_CHAR_FOUND;
1148                             return NULL;
1149                         }
1150
1151                         /* convert and append*/
1152                         pDest=_appendUTF8(pDest, ch);
1153                     }
1154                 }
1155             } while(--count > 0);
1156         }
1157
1158         while(pSrc<pSrcLimit) {
1159             ch=*pSrc++;
1160             if(ch <= 0x7f) {
1161                 if(pDest<pDestLimit) {
1162                     *pDest++ = (uint8_t)ch;
1163                 } else {
1164                     reqLength = 1;
1165                     break;
1166                 }
1167             } else if(ch <= 0x7ff) {
1168                 if((pDestLimit - pDest) >= 2) {
1169                     *pDest++=(uint8_t)((ch>>6)|0xc0);
1170                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1171                 } else {
1172                     reqLength = 2;
1173                     break;
1174                 }
1175             } else if(ch <= 0xd7ff || ch >= 0xe000) {
1176                 if((pDestLimit - pDest) >= 3) {
1177                     *pDest++=(uint8_t)((ch>>12)|0xe0);
1178                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1179                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
1180                 } else {
1181                     reqLength = 3;
1182                     break;
1183                 }
1184             } else /* ch is a surrogate */ {
1185                 int32_t length;
1186
1187                 if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) { 
1188                     ++pSrc;
1189                     ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1190                 } else if(subchar>=0) {
1191                     ch=subchar;
1192                     ++numSubstitutions;
1193                 } else {
1194                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1195                     *pErrorCode = U_INVALID_CHAR_FOUND;
1196                     return NULL;
1197                 }
1198
1199                 length = U8_LENGTH(ch);
1200                 if((pDestLimit - pDest) >= length) {
1201                     /* convert and append*/
1202                     pDest=_appendUTF8(pDest, ch);
1203                 } else {
1204                     reqLength = length;
1205                     break;
1206                 }
1207             }
1208         }
1209         while(pSrc<pSrcLimit) {
1210             ch=*pSrc++;
1211             if(ch<=0x7f) {
1212                 ++reqLength;
1213             } else if(ch<=0x7ff) {
1214                 reqLength+=2;
1215             } else if(!U16_IS_SURROGATE(ch)) {
1216                 reqLength+=3;
1217             } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1218                 ++pSrc;
1219                 reqLength+=4;
1220             } else if(subchar>=0) {
1221                 reqLength+=U8_LENGTH(subchar);
1222                 ++numSubstitutions;
1223             } else {
1224                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1225                 *pErrorCode = U_INVALID_CHAR_FOUND;
1226                 return NULL;
1227             }
1228         }
1229     }
1230
1231     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1232
1233     if(pNumSubstitutions!=NULL) {
1234         *pNumSubstitutions=numSubstitutions;
1235     }
1236
1237     if(pDestLength){
1238         *pDestLength = reqLength;
1239     }
1240
1241     /* Terminate the buffer */
1242     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1243     return dest;
1244 }
1245
1246 U_CAPI char* U_EXPORT2 
1247 u_strToUTF8(char *dest,
1248             int32_t destCapacity,
1249             int32_t *pDestLength,
1250             const UChar *pSrc,
1251             int32_t srcLength,
1252             UErrorCode *pErrorCode){
1253     return u_strToUTF8WithSub(
1254             dest, destCapacity, pDestLength,
1255             pSrc, srcLength,
1256             U_SENTINEL, NULL,
1257             pErrorCode);
1258 }
1259
1260 U_CAPI UChar* U_EXPORT2
1261 u_strFromJavaModifiedUTF8WithSub(
1262         UChar *dest,
1263         int32_t destCapacity,
1264         int32_t *pDestLength,
1265         const char *src,
1266         int32_t srcLength,
1267         UChar32 subchar, int32_t *pNumSubstitutions,
1268         UErrorCode *pErrorCode) {
1269     UChar *pDest = dest;
1270     UChar *pDestLimit = dest+destCapacity;
1271     UChar32 ch;
1272     int32_t reqLength = 0;
1273     const uint8_t* pSrc = (const uint8_t*) src;
1274     const uint8_t *pSrcLimit;
1275     int32_t count;
1276     uint8_t t1, t2; /* trail bytes */
1277     int32_t numSubstitutions;
1278
1279     /* args check */
1280     if(U_FAILURE(*pErrorCode)){
1281         return NULL;
1282     }
1283     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1284         (dest==NULL && destCapacity!=0) || destCapacity<0 ||
1285         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1286     ) {
1287         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1288         return NULL;
1289     }
1290
1291     if(pNumSubstitutions!=NULL) {
1292         *pNumSubstitutions=0;
1293     }
1294     numSubstitutions=0;
1295
1296     if(srcLength < 0) {
1297         /*
1298          * Transform a NUL-terminated ASCII string.
1299          * Handle non-ASCII strings with slower code.
1300          */
1301         while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
1302             *pDest++=(UChar)ch;
1303             ++pSrc;
1304         }
1305         if(ch == 0) {
1306             reqLength=(int32_t)(pDest - dest);
1307             if(pDestLength) {
1308                 *pDestLength = reqLength;
1309             }
1310
1311             /* Terminate the buffer */
1312             u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1313             return dest;
1314         }
1315         srcLength = uprv_strlen((const char *)pSrc);
1316     }
1317
1318     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1319     pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
1320     for(;;) {
1321         count = (int32_t)(pDestLimit - pDest);
1322         srcLength = (int32_t)(pSrcLimit - pSrc);
1323         if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
1324             /* fast ASCII loop */
1325             const uint8_t *prevSrc = pSrc;
1326             int32_t delta;
1327             while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
1328                 *pDest++=(UChar)ch;
1329                 ++pSrc;
1330             }
1331             delta = (int32_t)(pSrc - prevSrc);
1332             count -= delta;
1333             srcLength -= delta;
1334         }
1335         /*
1336          * Each iteration of the inner loop progresses by at most 3 UTF-8
1337          * bytes and one UChar.
1338          */
1339         srcLength /= 3;
1340         if(count > srcLength) {
1341             count = srcLength; /* min(remaining dest, remaining src/3) */
1342         }
1343         if(count < 3) {
1344             /*
1345              * Too much overhead if we get near the end of the string,
1346              * continue with the next loop.
1347              */
1348             break;
1349         }
1350         do {
1351             ch = *pSrc;
1352             if(ch <= 0x7f){
1353                 *pDest++=(UChar)ch;
1354                 ++pSrc;
1355             } else {
1356                 if(ch >= 0xe0) {
1357                     if( /* handle U+0000..U+FFFF inline */
1358                         ch <= 0xef &&
1359                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1360                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1361                     ) {
1362                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1363                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1364                         pSrc += 3;
1365                         continue;
1366                     }
1367                 } else {
1368                     if( /* handle U+0000..U+07FF inline */
1369                         ch >= 0xc0 &&
1370                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1371                     ) {
1372                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1373                         pSrc += 2;
1374                         continue;
1375                     }
1376                 }
1377
1378                 if(subchar < 0) {
1379                     *pErrorCode = U_INVALID_CHAR_FOUND;
1380                     return NULL;
1381                 } else if(subchar > 0xffff && --count == 0) {
1382                     /*
1383                      * We need to write two UChars, adjusted count for that,
1384                      * and ran out of space.
1385                      */
1386                     break;
1387                 } else {
1388                     /* function call for error cases */
1389                     ++pSrc; /* continue after the lead byte */
1390                     utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1391                     ++numSubstitutions;
1392                     if(subchar<=0xFFFF) {
1393                         *(pDest++)=(UChar)subchar;
1394                     } else {
1395                         *(pDest++)=U16_LEAD(subchar);
1396                         *(pDest++)=U16_TRAIL(subchar);
1397                     }
1398                 }
1399             }
1400         } while(--count > 0);
1401     }
1402
1403     while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
1404         ch = *pSrc;
1405         if(ch <= 0x7f){
1406             *pDest++=(UChar)ch;
1407             ++pSrc;
1408         } else {
1409             if(ch >= 0xe0) {
1410                 if( /* handle U+0000..U+FFFF inline */
1411                     ch <= 0xef &&
1412                     ((pSrcLimit - pSrc) >= 3) &&
1413                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1414                     (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1415                 ) {
1416                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1417                     *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1418                     pSrc += 3;
1419                     continue;
1420                 }
1421             } else {
1422                 if( /* handle U+0000..U+07FF inline */
1423                     ch >= 0xc0 &&
1424                     ((pSrcLimit - pSrc) >= 2) &&
1425                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1426                 ) {
1427                     *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1428                     pSrc += 2;
1429                     continue;
1430                 }
1431             }
1432
1433             if(subchar < 0) {
1434                 *pErrorCode = U_INVALID_CHAR_FOUND;
1435                 return NULL;
1436             } else {
1437                 /* function call for error cases */
1438                 ++pSrc; /* continue after the lead byte */
1439                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1440                 ++numSubstitutions;
1441                 if(subchar<=0xFFFF) {
1442                     *(pDest++)=(UChar)subchar;
1443                 } else {
1444                     *(pDest++)=U16_LEAD(subchar);
1445                     if(pDest<pDestLimit) {
1446                         *(pDest++)=U16_TRAIL(subchar);
1447                     } else {
1448                         reqLength++;
1449                         break;
1450                     }
1451                 }
1452             }
1453         }
1454     }
1455
1456     /* do not fill the dest buffer just count the UChars needed */
1457     while(pSrc < pSrcLimit){
1458         ch = *pSrc;
1459         if(ch <= 0x7f) {
1460             reqLength++;
1461             ++pSrc;
1462         } else {
1463             if(ch >= 0xe0) {
1464                 if( /* handle U+0000..U+FFFF inline */
1465                     ch <= 0xef &&
1466                     ((pSrcLimit - pSrc) >= 3) &&
1467                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
1468                     (uint8_t)(pSrc[2] - 0x80) <= 0x3f
1469                 ) {
1470                     reqLength++;
1471                     pSrc += 3;
1472                     continue;
1473                 }
1474             } else {
1475                 if( /* handle U+0000..U+07FF inline */
1476                     ch >= 0xc0 &&
1477                     ((pSrcLimit - pSrc) >= 2) &&
1478                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f
1479                 ) {
1480                     reqLength++;
1481                     pSrc += 2;
1482                     continue;
1483                 }
1484             }
1485
1486             if(subchar < 0) {
1487                 *pErrorCode = U_INVALID_CHAR_FOUND;
1488                 return NULL;
1489             } else {
1490                 /* function call for error cases */
1491                 ++pSrc; /* continue after the lead byte */
1492                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1493                 ++numSubstitutions;
1494                 reqLength+=U16_LENGTH(ch);
1495             }
1496         }
1497     }
1498
1499     if(pNumSubstitutions!=NULL) {
1500         *pNumSubstitutions=numSubstitutions;
1501     }
1502
1503     reqLength+=(int32_t)(pDest - dest);
1504     if(pDestLength) {
1505         *pDestLength = reqLength;
1506     }
1507
1508     /* Terminate the buffer */
1509     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1510     return dest;
1511 }
1512
1513 U_CAPI char* U_EXPORT2 
1514 u_strToJavaModifiedUTF8(
1515         char *dest,
1516         int32_t destCapacity,
1517         int32_t *pDestLength,
1518         const UChar *src, 
1519         int32_t srcLength,
1520         UErrorCode *pErrorCode) {
1521     int32_t reqLength=0;
1522     uint32_t ch=0;
1523     uint8_t *pDest = (uint8_t *)dest;
1524     uint8_t *pDestLimit = pDest + destCapacity;
1525     const UChar *pSrcLimit;
1526     int32_t count;
1527
1528     /* args check */
1529     if(U_FAILURE(*pErrorCode)){
1530         return NULL;
1531     }
1532     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1533         (dest==NULL && destCapacity!=0) || destCapacity<0
1534     ) {
1535         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1536         return NULL;
1537     }
1538
1539     if(srcLength==-1) {
1540         /* Convert NUL-terminated ASCII, then find the string length. */
1541         while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
1542             *pDest++ = (uint8_t)ch;
1543             ++src;
1544         }
1545         if(ch == 0) {
1546             reqLength=(int32_t)(pDest - (uint8_t *)dest);
1547             if(pDestLength) {
1548                 *pDestLength = reqLength;
1549             }
1550
1551             /* Terminate the buffer */
1552             u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1553             return dest;
1554         }
1555         srcLength = u_strlen(src);
1556     }
1557
1558     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1559     pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
1560     for(;;) {
1561         count = (int32_t)(pDestLimit - pDest);
1562         srcLength = (int32_t)(pSrcLimit - src);
1563         if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
1564             /* fast ASCII loop */
1565             const UChar *prevSrc = src;
1566             int32_t delta;
1567             while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
1568                 *pDest++=(uint8_t)ch;
1569                 ++src;
1570             }
1571             delta = (int32_t)(src - prevSrc);
1572             count -= delta;
1573             srcLength -= delta;
1574         }
1575         /*
1576          * Each iteration of the inner loop progresses by at most 3 UTF-8
1577          * bytes and one UChar.
1578          */
1579         count /= 3;
1580         if(count > srcLength) {
1581             count = srcLength; /* min(remaining dest/3, remaining src) */
1582         }
1583         if(count < 3) {
1584             /*
1585              * Too much overhead if we get near the end of the string,
1586              * continue with the next loop.
1587              */
1588             break;
1589         }
1590         do {
1591             ch=*src++;
1592             if(ch <= 0x7f && ch != 0) {
1593                 *pDest++ = (uint8_t)ch;
1594             } else if(ch <= 0x7ff) {
1595                 *pDest++=(uint8_t)((ch>>6)|0xc0);
1596                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1597             } else {
1598                 *pDest++=(uint8_t)((ch>>12)|0xe0);
1599                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1600                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1601             }
1602         } while(--count > 0);
1603     }
1604
1605     while(src<pSrcLimit) {
1606         ch=*src++;
1607         if(ch <= 0x7f && ch != 0) {
1608             if(pDest<pDestLimit) {
1609                 *pDest++ = (uint8_t)ch;
1610             } else {
1611                 reqLength = 1;
1612                 break;
1613             }
1614         } else if(ch <= 0x7ff) {
1615             if((pDestLimit - pDest) >= 2) {
1616                 *pDest++=(uint8_t)((ch>>6)|0xc0);
1617                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1618             } else {
1619                 reqLength = 2;
1620                 break;
1621             }
1622         } else {
1623             if((pDestLimit - pDest) >= 3) {
1624                 *pDest++=(uint8_t)((ch>>12)|0xe0);
1625                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1626                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
1627             } else {
1628                 reqLength = 3;
1629                 break;
1630             }
1631         }
1632     }
1633     while(src<pSrcLimit) {
1634         ch=*src++;
1635         if(ch <= 0x7f && ch != 0) {
1636             ++reqLength;
1637         } else if(ch<=0x7ff) {
1638             reqLength+=2;
1639         } else {
1640             reqLength+=3;
1641         }
1642     }
1643
1644     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1645     if(pDestLength){
1646         *pDestLength = reqLength;
1647     }
1648
1649     /* Terminate the buffer */
1650     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1651     return dest;
1652 }