1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2002-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * file name: ucnv_u32.c
10 * tab size: 8 (not used)
13 * created on: 2002jul01
14 * created by: Markus W. Scherer
16 * UTF-32 converter implementation. Used to be in ucnv_utf.c.
19 #include "unicode/utypes.h"
21 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
23 #include "unicode/ucnv.h"
24 #include "unicode/utf.h"
/* Largest value that fits in one 16-bit unit: the toUnicode functions below
 * write a single UChar up to this value and a surrogate pair above it. */
29 #define MAXIMUM_UCS2 0x0000FFFF
/* Largest value accepted by the toUnicode/getNextUChar functions; anything
 * above it is reported as U_ILLEGAL_CHAR_FOUND. */
30 #define MAXIMUM_UTF 0x0010FFFF
/* 0x10000; appears in the derivation of SURROGATE_LOW_BASE below. */
32 #define HALF_BASE 0x0010000
/* Mask for the low 10 bits; not referenced by the code visible in this excerpt. */
33 #define HALF_MASK 0x3FF
/* Subtracted from the first surrogate when a pair is combined in the
 * fromUnicode functions. */
34 #define SURROGATE_HIGH_START 0xD800
/* 0xDC00; appears in the derivation of SURROGATE_LOW_BASE below. */
35 #define SURROGATE_LOW_START 0xDC00
37 /* -SURROGATE_LOW_START + HALF_BASE */
/* i.e. 0x10000 - 0xDC00 = 9216; added together with the second surrogate
 * when a pair is combined in the fromUnicode functions. */
38 #define SURROGATE_LOW_BASE 9216
/* Value of converter->fromUnicodeStatus that makes the fromUnicode functions
 * write a BOM before any other output; set by _UTF32Reset().
 * NOTE(review): the enclosing enum braces are not visible in this excerpt. */
41 UCNV_NEED_TO_WRITE_BOM=1
44 /* UTF-32BE ----------------------------------------------------------------- */
/*
 * UTF-32BE -> UTF-16 conversion without offset tracking.
 *
 * Reads bytes from args->source up to args->sourceLimit, assembling each
 * group of sizeof(uint32_t) bytes most-significant-byte first into ch, and
 * writes the result at args->target (bounded by args->targetLimit).
 *  - A sequence cut short by the end of the input is saved in
 *    converter->toUnicodeStatus (as ch + 1) and converter->toULength, and is
 *    picked up again at the top of a later call.
 *  - A value above MAXIMUM_UTF or in the surrogate range sets
 *    U_ILLEGAL_CHAR_FOUND; its bytes are held in converter->toUBytes.
 *  - Running out of target space sets U_BUFFER_OVERFLOW_ERROR.
 * args->target and args->source are updated to reflect the progress made.
 *
 * NOTE(review): several lines of this function (return type, the err
 * parameter, local declarations, braces and else keywords) are not visible
 * in this excerpt; the comments describe only the statements shown.
 */
47 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
50 const unsigned char *mySource = (unsigned char *) args->source;
51 UChar *myTarget = args->target;
52 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
53 const UChar *targetLimit = args->targetLimit;
54 unsigned char *toUBytes = args->converter->toUBytes;
57 /* Restore state of current sequence */
58 if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
59 i = args->converter->toULength; /* restore # of bytes consumed */
60 args->converter->toULength = 0;
62 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
63 args->converter->toUnicodeStatus = 0;
/* Main loop: runs while there is both input left and room in the target. */
67 while (mySource < sourceLimit && myTarget < targetLimit) {
/* Gather bytes until a full 4-byte unit has been read. */
71 while (i < sizeof(uint32_t)) {
72 if (mySource < sourceLimit) {
/* Big-endian accumulation: earlier bytes end up in the higher bits. */
73 ch = (ch << 8) | (uint8_t)(*mySource);
74 toUBytes[i++] = (char) *(mySource++);
77 /* stores a partially calculated target*/
78 /* + 1 to make 0 a valid character */
79 args->converter->toUnicodeStatus = ch + 1;
80 args->converter->toULength = (int8_t) i;
/* Only values up to MAXIMUM_UTF that are not surrogates are legal. */
85 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
86 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
87 if (ch <= MAXIMUM_UCS2)
90 *(myTarget++) = (UChar) ch;
93 /* write out the surrogates */
94 *(myTarget++) = U16_LEAD(ch);
/* Second unit of the pair: written directly if there is room, otherwise
 * parked in UCharErrorBuffer with U_BUFFER_OVERFLOW_ERROR.
 * NOTE(review): presumably ch has been reduced to the trail surrogate on a
 * line not visible here - confirm. */
96 if (myTarget < targetLimit) {
97 *(myTarget++) = (UChar)ch;
100 /* Put in overflow buffer (not handled here) */
101 args->converter->UCharErrorBuffer[0] = (UChar) ch;
102 args->converter->UCharErrorBufferLength = 1;
103 *err = U_BUFFER_OVERFLOW_ERROR;
/* Illegal value: record how many of its bytes are held in toUBytes. */
109 args->converter->toULength = (int8_t)i;
110 *err = U_ILLEGAL_CHAR_FOUND;
116 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
117 /* End of target buffer */
118 *err = U_BUFFER_OVERFLOW_ERROR;
/* Report progress back to the caller. */
121 args->target = myTarget;
122 args->source = (const char *) mySource;
/*
 * UTF-32BE -> UTF-16 conversion that also fills args->offsets.
 *
 * Same decoding as T_UConverter_toUnicode_UTF32_BE: bytes are accumulated
 * most-significant-byte first, partial sequences are saved in
 * converter->toUnicodeStatus / toULength, illegal values set
 * U_ILLEGAL_CHAR_FOUND and a full target sets U_BUFFER_OVERFLOW_ERROR.
 * In addition, the current offsetNum is stored through myOffsets for every
 * UChar written to the target, and args->offsets is advanced on exit.
 *
 * NOTE(review): the statement that advances offsetNum, along with other
 * lines (return type, err parameter, declarations, braces, else keywords),
 * is not visible in this excerpt.
 */
126 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
129 const unsigned char *mySource = (unsigned char *) args->source;
130 UChar *myTarget = args->target;
131 int32_t *myOffsets = args->offsets;
132 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
133 const UChar *targetLimit = args->targetLimit;
134 unsigned char *toUBytes = args->converter->toUBytes;
136 int32_t offsetNum = 0;
138 /* Restore state of current sequence */
139 if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
140 i = args->converter->toULength; /* restore # of bytes consumed */
141 args->converter->toULength = 0;
143 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
144 args->converter->toUnicodeStatus = 0;
/* Main loop: runs while there is both input left and room in the target. */
148 while (mySource < sourceLimit && myTarget < targetLimit) {
/* Gather bytes until a full 4-byte unit has been read. */
152 while (i < sizeof(uint32_t)) {
153 if (mySource < sourceLimit) {
/* Big-endian accumulation: earlier bytes end up in the higher bits. */
154 ch = (ch << 8) | (uint8_t)(*mySource);
155 toUBytes[i++] = (char) *(mySource++);
158 /* stores a partially calculated target*/
159 /* + 1 to make 0 a valid character */
160 args->converter->toUnicodeStatus = ch + 1;
161 args->converter->toULength = (int8_t) i;
/* Only values up to MAXIMUM_UTF that are not surrogates are legal. */
166 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
167 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
168 if (ch <= MAXIMUM_UCS2) {
169 /* fits in 16 bits */
170 *(myTarget++) = (UChar) ch;
171 *(myOffsets++) = offsetNum;
174 /* write out the surrogates */
175 *(myTarget++) = U16_LEAD(ch);
176 *myOffsets++ = offsetNum;
/* Second unit of the pair gets the same offset as the first; if the target
 * is full it goes to UCharErrorBuffer instead and no offset is stored. */
178 if (myTarget < targetLimit)
180 *(myTarget++) = (UChar)ch;
181 *(myOffsets++) = offsetNum;
184 /* Put in overflow buffer (not handled here) */
185 args->converter->UCharErrorBuffer[0] = (UChar) ch;
186 args->converter->UCharErrorBufferLength = 1;
187 *err = U_BUFFER_OVERFLOW_ERROR;
/* Illegal value: record how many of its bytes are held in toUBytes. */
193 args->converter->toULength = (int8_t)i;
194 *err = U_ILLEGAL_CHAR_FOUND;
201 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
203 /* End of target buffer */
204 *err = U_BUFFER_OVERFLOW_ERROR;
/* Report progress back to the caller. */
207 args->target = myTarget;
208 args->source = (const char *) mySource;
209 args->offsets = myOffsets;
/*
 * UTF-16 -> UTF-32BE conversion without offset tracking.
 *
 * If converter->fromUnicodeStatus is UCNV_NEED_TO_WRITE_BOM, the bytes
 * 00 00 FE FF are first written through ucnv_fromUWriteBytes() and the
 * status is cleared. A code unit left pending in converter->fromUChar32 by
 * an earlier call is picked up again. Surrogate pairs are combined into one
 * code point; a surrogate that cannot be paired is stored in fromUChar32
 * and U_ILLEGAL_CHAR_FOUND is set. Each code point is serialized into
 * temp[] and copied to the target; bytes that do not fit are appended to
 * converter->charErrorBuffer and U_BUFFER_OVERFLOW_ERROR is set.
 * args->target and args->source are updated on exit.
 *
 * NOTE(review): several lines of this function (return type, err
 * parameter, declarations of ch/ch2, the assignment of temp[0], braces,
 * else keywords and some conditions) are not visible in this excerpt.
 */
213 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
216 const UChar *mySource = args->source;
217 unsigned char *myTarget;
218 const UChar *sourceLimit = args->sourceLimit;
219 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
221 unsigned int indexToWrite;
222 unsigned char temp[sizeof(uint32_t)];
224 if(mySource >= sourceLimit) {
225 /* no input, nothing to do */
229 /* write the BOM if necessary */
230 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
/* U+FEFF serialized most-significant byte first. */
231 static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
232 ucnv_fromUWriteBytes(args->converter,
234 &args->target, args->targetLimit,
237 args->converter->fromUnicodeStatus=0;
/* Read the target pointer only now: the BOM write above is handed
 * &args->target and may have moved it. */
240 myTarget = (unsigned char *) args->target;
/* Resume with a code unit carried over from a previous call. */
243 if (args->converter->fromUChar32) {
244 ch = args->converter->fromUChar32;
245 args->converter->fromUChar32 = 0;
249 while (mySource < sourceLimit && myTarget < targetLimit) {
252 if (U_IS_SURROGATE(ch)) {
255 if (mySource < sourceLimit) {
257 if (U_IS_TRAIL(ch2)) {
/* Combine the two surrogates into a single code point. */
258 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
262 /* this is an unmatched trail code unit (2nd surrogate) */
263 /* callback(illegal) */
264 args->converter->fromUChar32 = ch;
265 *err = U_ILLEGAL_CHAR_FOUND;
270 /* ran out of source */
271 args->converter->fromUChar32 = ch;
273 /* this is an unmatched trail code unit (2nd surrogate) */
274 /* callback(illegal) */
275 *err = U_ILLEGAL_CHAR_FOUND;
281 /* this is an unmatched trail code unit (2nd surrogate) */
282 /* callback(illegal) */
283 args->converter->fromUChar32 = ch;
284 *err = U_ILLEGAL_CHAR_FOUND;
289 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
/* Big-endian layout: temp[1] holds bits 16..20, temp[3] the low byte. */
290 temp[1] = (uint8_t) (ch >> 16 & 0x1F);
291 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
292 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
/* Copy all four bytes; whatever does not fit goes to charErrorBuffer. */
294 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
295 if (myTarget < targetLimit) {
296 *(myTarget++) = temp[indexToWrite];
299 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
300 *err = U_BUFFER_OVERFLOW_ERROR;
/* Input remains but the target is full and no other error was raised. */
305 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
306 *err = U_BUFFER_OVERFLOW_ERROR;
/* Report progress back to the caller. */
309 args->target = (char *) myTarget;
310 args->source = mySource;
/*
 * UTF-16 -> UTF-32BE conversion that also fills args->offsets.
 *
 * Same steps as T_UConverter_fromUnicode_UTF32_BE (optional BOM 00 00 FE FF,
 * resume from converter->fromUChar32, surrogate pairing, four bytes per code
 * point with spill-over into converter->charErrorBuffer). In addition, the
 * current offsetNum is stored through myOffsets for every byte written to
 * the target, and offsetNum then advances by 1, or by 2 when temp[1] is
 * non-zero (i.e. the code point is 0x10000 or above).
 *
 * NOTE(review): several lines of this function (return type, err
 * parameter, declarations of ch/ch2/myOffsets, the assignment of temp[0],
 * braces, else keywords and some conditions) are not visible in this excerpt.
 */
314 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
317 const UChar *mySource = args->source;
318 unsigned char *myTarget;
320 const UChar *sourceLimit = args->sourceLimit;
321 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
323 int32_t offsetNum = 0;
324 unsigned int indexToWrite;
325 unsigned char temp[sizeof(uint32_t)];
327 if(mySource >= sourceLimit) {
328 /* no input, nothing to do */
332 /* write the BOM if necessary */
333 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
/* U+FEFF serialized most-significant byte first. */
334 static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
335 ucnv_fromUWriteBytes(args->converter,
337 &args->target, args->targetLimit,
340 args->converter->fromUnicodeStatus=0;
/* Read target/offsets only now, after the BOM write that is handed
 * &args->target. */
343 myTarget = (unsigned char *) args->target;
344 myOffsets = args->offsets;
/* Resume with a code unit carried over from a previous call. */
347 if (args->converter->fromUChar32) {
348 ch = args->converter->fromUChar32;
349 args->converter->fromUChar32 = 0;
353 while (mySource < sourceLimit && myTarget < targetLimit) {
356 if (U_IS_SURROGATE(ch)) {
359 if (mySource < sourceLimit) {
361 if (U_IS_TRAIL(ch2)) {
/* Combine the two surrogates into a single code point. */
362 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
366 /* this is an unmatched trail code unit (2nd surrogate) */
367 /* callback(illegal) */
368 args->converter->fromUChar32 = ch;
369 *err = U_ILLEGAL_CHAR_FOUND;
374 /* ran out of source */
375 args->converter->fromUChar32 = ch;
377 /* this is an unmatched trail code unit (2nd surrogate) */
378 /* callback(illegal) */
379 *err = U_ILLEGAL_CHAR_FOUND;
385 /* this is an unmatched trail code unit (2nd surrogate) */
386 /* callback(illegal) */
387 args->converter->fromUChar32 = ch;
388 *err = U_ILLEGAL_CHAR_FOUND;
393 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
/* Big-endian layout: temp[1] holds bits 16..20, temp[3] the low byte. */
394 temp[1] = (uint8_t) (ch >> 16 & 0x1F);
395 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
396 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
/* Copy all four bytes, tagging each with the same source offset; bytes that
 * do not fit go to charErrorBuffer without an offset. */
398 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
399 if (myTarget < targetLimit) {
400 *(myTarget++) = temp[indexToWrite];
401 *(myOffsets++) = offsetNum;
404 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
405 *err = U_BUFFER_OVERFLOW_ERROR;
/* One source unit consumed, or two when temp[1] != 0 (code point >= 0x10000). */
408 offsetNum = offsetNum + 1 + (temp[1] != 0);
/* Input remains but the target is full and no other error was raised. */
411 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
412 *err = U_BUFFER_OVERFLOW_ERROR;
/* Report progress back to the caller. */
415 args->target = (char *) myTarget;
416 args->source = mySource;
417 args->offsets = myOffsets;
/*
 * Decodes a single code point from UTF-32BE input at args->source.
 *
 *  - No input at all: sets U_INDEX_OUTOFBOUNDS_ERROR.
 *  - Partial character: the available bytes are copied to
 *    converter->toUBytes, toULength is set, the input is consumed and
 *    U_TRUNCATED_CHAR_FOUND is set.
 *  - Otherwise four bytes are combined most-significant-byte first and
 *    args->source advances by 4. A value above MAXIMUM_UTF or in the
 *    surrogate range is copied to toUBytes (toULength = 4) and
 *    U_ILLEGAL_CHAR_FOUND is set.
 *
 * NOTE(review): the return type, err parameter, local declarations, the
 * length test guarding the partial-character path and all return statements
 * are not visible in this excerpt.
 */
421 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
424 const uint8_t *mySource;
428 mySource = (const uint8_t *)args->source;
429 if (mySource >= (const uint8_t *)args->sourceLimit)
432 *err = U_INDEX_OUTOFBOUNDS_ERROR;
/* Number of bytes still available in the input. */
436 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
439 /* got a partial character */
440 uprv_memcpy(args->converter->toUBytes, mySource, length);
441 args->converter->toULength = (int8_t)length;
442 args->source = (const char *)(mySource + length);
443 *err = U_TRUNCATED_CHAR_FOUND;
447 /* Don't even try to do a direct cast because the value may be on an odd address. */
/* NOTE(review): if UChar32 is a signed 32-bit type (the uint32_t cast in the
 * range check below suggests so), "<< 24" of a byte >= 0x80 shifts into the
 * sign bit - confirm this is acceptable on all supported compilers. */
448 myUChar = ((UChar32)mySource[0] << 24)
449 | ((UChar32)mySource[1] << 16)
450 | ((UChar32)mySource[2] << 8)
451 | ((UChar32)mySource[3]);
/* The four bytes are consumed whether or not the value turns out legal. */
453 args->source = (const char *)(mySource + 4);
454 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
/* Illegal value: keep its bytes available for error handling. */
458 uprv_memcpy(args->converter->toUBytes, mySource, 4);
459 args->converter->toULength = 4;
461 *err = U_ILLEGAL_CHAR_FOUND;
/*
 * Implementation table for the UTF-32BE converter: wires up the five
 * UTF-32BE entry points defined in this file plus
 * ucnv_getNonSurrogateUnicodeSet.
 * NOTE(review): some initializer entries and the closing brace are not
 * visible in this excerpt, so slot positions cannot be confirmed here.
 */
465 static const UConverterImpl _UTF32BEImpl = {
466 UCNV_UTF32_BigEndian,
475 T_UConverter_toUnicode_UTF32_BE,
476 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
477 T_UConverter_fromUnicode_UTF32_BE,
478 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
479 T_UConverter_getNextUChar_UTF32_BE,
485 ucnv_getNonSurrogateUnicodeSet,
491 /* The 1232 CCSID refers to any version of Unicode with any endianness of UTF-32 */
/*
 * Static description of the UTF-32BE converter.
 * NOTE(review): field names are not visible in this excerpt. The "4, 4"
 * presumably gives the minimum and maximum bytes per character, and
 * { 0, 0, 0xff, 0xfd } is U+FFFD in big-endian byte order, presumably the
 * substitution character followed by its length - confirm against the
 * UConverterStaticData declaration.
 */
492 static const UConverterStaticData _UTF32BEStaticData = {
493 sizeof(UConverterStaticData),
496 UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
497 { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
500 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Non-static shared-data object for UTF-32BE, built from the static data and
 * the implementation table above. */
503 const UConverterSharedData _UTF32BEData =
504 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32BEStaticData, &_UTF32BEImpl);
506 /* UTF-32LE ---------------------------------------------------------- */
/*
 * UTF-32LE -> UTF-16 conversion without offset tracking.
 *
 * Reads bytes from args->source up to args->sourceLimit, placing byte number
 * i of each 4-byte unit at bit position i * 8 of ch (least significant byte
 * first), and writes the result at args->target (bounded by
 * args->targetLimit).
 *  - A sequence cut short by the end of the input is saved in
 *    converter->toUnicodeStatus (as ch + 1) and converter->toULength, and is
 *    picked up again at the top of a later call.
 *  - A value above MAXIMUM_UTF or in the surrogate range sets
 *    U_ILLEGAL_CHAR_FOUND; its bytes are held in converter->toUBytes.
 *  - Running out of target space sets U_BUFFER_OVERFLOW_ERROR.
 * args->target and args->source are updated to reflect the progress made.
 *
 * NOTE(review): several lines of this function (return type, the err
 * parameter, local declarations, braces and else keywords) are not visible
 * in this excerpt; the comments describe only the statements shown.
 */
509 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
512 const unsigned char *mySource = (unsigned char *) args->source;
513 UChar *myTarget = args->target;
514 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
515 const UChar *targetLimit = args->targetLimit;
516 unsigned char *toUBytes = args->converter->toUBytes;
519 /* Restore state of current sequence */
520 if (args->converter->toUnicodeStatus && myTarget < targetLimit)
522 i = args->converter->toULength; /* restore # of bytes consumed */
523 args->converter->toULength = 0;
525 /* Stores the previously calculated ch from a previous call*/
526 ch = args->converter->toUnicodeStatus - 1;
527 args->converter->toUnicodeStatus = 0;
/* Main loop: runs while there is both input left and room in the target. */
531 while (mySource < sourceLimit && myTarget < targetLimit)
/* Gather bytes until a full 4-byte unit has been read. */
536 while (i < sizeof(uint32_t))
538 if (mySource < sourceLimit)
/* Little-endian accumulation: byte i lands at bits i*8 .. i*8+7.
 * NOTE(review): the uint8_t operand is promoted to int before the shift, so
 * for i == 3 and a byte >= 0x80 the shift reaches the sign bit; casting to
 * uint32_t first would avoid relying on that. */
540 ch |= ((uint8_t)(*mySource)) << (i * 8);
541 toUBytes[i++] = (char) *(mySource++);
545 /* stores a partially calculated target*/
546 /* + 1 to make 0 a valid character */
547 args->converter->toUnicodeStatus = ch + 1;
548 args->converter->toULength = (int8_t) i;
/* Only values up to MAXIMUM_UTF that are not surrogates are legal. */
553 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
554 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
555 if (ch <= MAXIMUM_UCS2) {
556 /* fits in 16 bits */
557 *(myTarget++) = (UChar) ch;
560 /* write out the surrogates */
561 *(myTarget++) = U16_LEAD(ch);
/* Second unit of the pair: written directly if there is room, otherwise
 * parked in UCharErrorBuffer with U_BUFFER_OVERFLOW_ERROR.
 * NOTE(review): presumably ch has been reduced to the trail surrogate on a
 * line not visible here - confirm. */
563 if (myTarget < targetLimit) {
564 *(myTarget++) = (UChar)ch;
567 /* Put in overflow buffer (not handled here) */
568 args->converter->UCharErrorBuffer[0] = (UChar) ch;
569 args->converter->UCharErrorBufferLength = 1;
570 *err = U_BUFFER_OVERFLOW_ERROR;
/* Illegal value: record how many of its bytes are held in toUBytes. */
576 args->converter->toULength = (int8_t)i;
577 *err = U_ILLEGAL_CHAR_FOUND;
583 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
585 /* End of target buffer */
586 *err = U_BUFFER_OVERFLOW_ERROR;
/* Report progress back to the caller. */
589 args->target = myTarget;
590 args->source = (const char *) mySource;
/*
 * UTF-32LE -> UTF-16 conversion that also fills args->offsets.
 *
 * Same decoding as T_UConverter_toUnicode_UTF32_LE: bytes are accumulated
 * least-significant-byte first, partial sequences are saved in
 * converter->toUnicodeStatus / toULength, illegal values set
 * U_ILLEGAL_CHAR_FOUND and a full target sets U_BUFFER_OVERFLOW_ERROR.
 * In addition, the current offsetNum is stored through myOffsets for every
 * UChar written to the target, and args->offsets is advanced on exit.
 *
 * NOTE(review): the statement that advances offsetNum, along with other
 * lines (return type, err parameter, declarations, braces, else keywords),
 * is not visible in this excerpt.
 */
594 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
597 const unsigned char *mySource = (unsigned char *) args->source;
598 UChar *myTarget = args->target;
599 int32_t *myOffsets = args->offsets;
600 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
601 const UChar *targetLimit = args->targetLimit;
602 unsigned char *toUBytes = args->converter->toUBytes;
604 int32_t offsetNum = 0;
606 /* Restore state of current sequence */
607 if (args->converter->toUnicodeStatus && myTarget < targetLimit)
609 i = args->converter->toULength; /* restore # of bytes consumed */
610 args->converter->toULength = 0;
612 /* Stores the previously calculated ch from a previous call*/
613 ch = args->converter->toUnicodeStatus - 1;
614 args->converter->toUnicodeStatus = 0;
/* Main loop: runs while there is both input left and room in the target. */
618 while (mySource < sourceLimit && myTarget < targetLimit)
/* Gather bytes until a full 4-byte unit has been read. */
623 while (i < sizeof(uint32_t))
625 if (mySource < sourceLimit)
/* Little-endian accumulation: byte i lands at bits i*8 .. i*8+7.
 * NOTE(review): the uint8_t operand is promoted to int before the shift, so
 * for i == 3 and a byte >= 0x80 the shift reaches the sign bit; casting to
 * uint32_t first would avoid relying on that. */
627 ch |= ((uint8_t)(*mySource)) << (i * 8);
628 toUBytes[i++] = (char) *(mySource++);
632 /* stores a partially calculated target*/
633 /* + 1 to make 0 a valid character */
634 args->converter->toUnicodeStatus = ch + 1;
635 args->converter->toULength = (int8_t) i;
/* Only values up to MAXIMUM_UTF that are not surrogates are legal. */
640 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
642 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
643 if (ch <= MAXIMUM_UCS2)
645 /* fits in 16 bits */
646 *(myTarget++) = (UChar) ch;
647 *(myOffsets++) = offsetNum;
650 /* write out the surrogates */
651 *(myTarget++) = U16_LEAD(ch);
652 *(myOffsets++) = offsetNum;
/* Second unit of the pair gets the same offset as the first; if the target
 * is full it goes to UCharErrorBuffer instead and no offset is stored. */
654 if (myTarget < targetLimit)
656 *(myTarget++) = (UChar)ch;
657 *(myOffsets++) = offsetNum;
661 /* Put in overflow buffer (not handled here) */
662 args->converter->UCharErrorBuffer[0] = (UChar) ch;
663 args->converter->UCharErrorBufferLength = 1;
664 *err = U_BUFFER_OVERFLOW_ERROR;
/* Illegal value: record how many of its bytes are held in toUBytes. */
671 args->converter->toULength = (int8_t)i;
672 *err = U_ILLEGAL_CHAR_FOUND;
679 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
681 /* End of target buffer */
682 *err = U_BUFFER_OVERFLOW_ERROR;
/* Report progress back to the caller. */
685 args->target = myTarget;
686 args->source = (const char *) mySource;
687 args->offsets = myOffsets;
/*
 * UTF-16 -> UTF-32LE conversion without offset tracking.
 *
 * If converter->fromUnicodeStatus is UCNV_NEED_TO_WRITE_BOM, the bytes
 * FF FE 00 00 are first written through ucnv_fromUWriteBytes() and the
 * status is cleared. A code unit left pending in converter->fromUChar32 by
 * an earlier call is picked up again. Surrogate pairs are combined into one
 * code point; a surrogate that cannot be paired is stored in fromUChar32
 * and U_ILLEGAL_CHAR_FOUND is set. Each code point is serialized into
 * temp[] low byte first and copied to the target; bytes that do not fit are
 * appended to converter->charErrorBuffer and U_BUFFER_OVERFLOW_ERROR is set.
 * args->target and args->source are updated on exit.
 *
 * NOTE(review): several lines of this function (return type, err
 * parameter, declarations of ch/ch2, the assignment of temp[3], braces,
 * else keywords and some conditions) are not visible in this excerpt.
 */
691 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
694 const UChar *mySource = args->source;
695 unsigned char *myTarget;
696 const UChar *sourceLimit = args->sourceLimit;
697 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
699 unsigned int indexToWrite;
700 unsigned char temp[sizeof(uint32_t)];
702 if(mySource >= sourceLimit) {
703 /* no input, nothing to do */
707 /* write the BOM if necessary */
708 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
/* U+FEFF serialized least-significant byte first. */
709 static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
710 ucnv_fromUWriteBytes(args->converter,
712 &args->target, args->targetLimit,
715 args->converter->fromUnicodeStatus=0;
/* Read the target pointer only now: the BOM write above is handed
 * &args->target and may have moved it. */
718 myTarget = (unsigned char *) args->target;
/* Resume with a code unit carried over from a previous call. */
721 if (args->converter->fromUChar32)
723 ch = args->converter->fromUChar32;
724 args->converter->fromUChar32 = 0;
728 while (mySource < sourceLimit && myTarget < targetLimit)
732 if (U16_IS_SURROGATE(ch)) {
736 if (mySource < sourceLimit)
739 if (U16_IS_TRAIL(ch2)) {
/* Combine the two surrogates into a single code point. */
740 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
744 /* this is an unmatched trail code unit (2nd surrogate) */
745 /* callback(illegal) */
746 args->converter->fromUChar32 = ch;
747 *err = U_ILLEGAL_CHAR_FOUND;
752 /* ran out of source */
753 args->converter->fromUChar32 = ch;
755 /* this is an unmatched trail code unit (2nd surrogate) */
756 /* callback(illegal) */
757 *err = U_ILLEGAL_CHAR_FOUND;
763 /* this is an unmatched trail code unit (2nd surrogate) */
764 /* callback(illegal) */
765 args->converter->fromUChar32 = ch;
766 *err = U_ILLEGAL_CHAR_FOUND;
771 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
/* Little-endian layout: temp[0] is the low byte, temp[2] holds bits 16..20. */
772 temp[2] = (uint8_t) (ch >> 16 & 0x1F);
773 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
774 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
/* Copy all four bytes; whatever does not fit goes to charErrorBuffer. */
776 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
778 if (myTarget < targetLimit)
780 *(myTarget++) = temp[indexToWrite];
784 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
785 *err = U_BUFFER_OVERFLOW_ERROR;
/* Input remains but the target is full and no other error was raised. */
790 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
792 *err = U_BUFFER_OVERFLOW_ERROR;
/* Report progress back to the caller. */
795 args->target = (char *) myTarget;
796 args->source = mySource;
/*
 * UTF-16 -> UTF-32LE conversion that also fills args->offsets.
 *
 * Same steps as T_UConverter_fromUnicode_UTF32_LE (optional BOM FF FE 00 00,
 * resume from converter->fromUChar32, surrogate pairing, four bytes per code
 * point with spill-over into converter->charErrorBuffer). In addition, the
 * current offsetNum is stored through myOffsets for every byte written to
 * the target, and offsetNum then advances by 1, or by 2 when temp[2] is
 * non-zero (i.e. the code point is 0x10000 or above).
 *
 * NOTE(review): several lines of this function (return type, err
 * parameter, declarations of ch/ch2/myOffsets, the assignment of temp[3],
 * braces, else keywords and some conditions) are not visible in this excerpt.
 */
800 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
803 const UChar *mySource = args->source;
804 unsigned char *myTarget;
806 const UChar *sourceLimit = args->sourceLimit;
807 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
809 unsigned int indexToWrite;
810 unsigned char temp[sizeof(uint32_t)];
811 int32_t offsetNum = 0;
813 if(mySource >= sourceLimit) {
814 /* no input, nothing to do */
818 /* write the BOM if necessary */
819 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
/* U+FEFF serialized least-significant byte first. */
820 static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
821 ucnv_fromUWriteBytes(args->converter,
823 &args->target, args->targetLimit,
826 args->converter->fromUnicodeStatus=0;
/* Read target/offsets only now, after the BOM write that is handed
 * &args->target. */
829 myTarget = (unsigned char *) args->target;
830 myOffsets = args->offsets;
/* Resume with a code unit carried over from a previous call. */
833 if (args->converter->fromUChar32)
835 ch = args->converter->fromUChar32;
836 args->converter->fromUChar32 = 0;
840 while (mySource < sourceLimit && myTarget < targetLimit)
844 if (U16_IS_SURROGATE(ch)) {
848 if (mySource < sourceLimit)
851 if (U16_IS_TRAIL(ch2))
/* Combine the two surrogates into a single code point. */
853 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
857 /* this is an unmatched trail code unit (2nd surrogate) */
858 /* callback(illegal) */
859 args->converter->fromUChar32 = ch;
860 *err = U_ILLEGAL_CHAR_FOUND;
865 /* ran out of source */
866 args->converter->fromUChar32 = ch;
868 /* this is an unmatched trail code unit (2nd surrogate) */
869 /* callback(illegal) */
870 *err = U_ILLEGAL_CHAR_FOUND;
876 /* this is an unmatched trail code unit (2nd surrogate) */
877 /* callback(illegal) */
878 args->converter->fromUChar32 = ch;
879 *err = U_ILLEGAL_CHAR_FOUND;
884 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
/* Little-endian layout: temp[0] is the low byte, temp[2] holds bits 16..20. */
885 temp[2] = (uint8_t) (ch >> 16 & 0x1F);
886 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
887 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
/* Copy all four bytes, tagging each with the same source offset; bytes that
 * do not fit go to charErrorBuffer without an offset. */
889 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
891 if (myTarget < targetLimit)
893 *(myTarget++) = temp[indexToWrite];
894 *(myOffsets++) = offsetNum;
898 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
899 *err = U_BUFFER_OVERFLOW_ERROR;
/* One source unit consumed, or two when temp[2] != 0 (code point >= 0x10000). */
902 offsetNum = offsetNum + 1 + (temp[2] != 0);
/* Input remains but the target is full and no other error was raised. */
905 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
907 *err = U_BUFFER_OVERFLOW_ERROR;
/* Report progress back to the caller. */
910 args->target = (char *) myTarget;
911 args->source = mySource;
912 args->offsets = myOffsets;
/*
 * Decodes a single code point from UTF-32LE input at args->source.
 *
 *  - No input at all: sets U_INDEX_OUTOFBOUNDS_ERROR.
 *  - Partial character: the available bytes are copied to
 *    converter->toUBytes, toULength is set, the input is consumed and
 *    U_TRUNCATED_CHAR_FOUND is set.
 *  - Otherwise four bytes are combined least-significant-byte first and
 *    args->source advances by 4. A value above MAXIMUM_UTF or in the
 *    surrogate range is copied to toUBytes (toULength = 4) and
 *    U_ILLEGAL_CHAR_FOUND is set.
 *
 * NOTE(review): the return type, err parameter, local declarations, the
 * length test guarding the partial-character path and all return statements
 * are not visible in this excerpt.
 */
916 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
919 const uint8_t *mySource;
923 mySource = (const uint8_t *)args->source;
924 if (mySource >= (const uint8_t *)args->sourceLimit)
927 *err = U_INDEX_OUTOFBOUNDS_ERROR;
/* Number of bytes still available in the input. */
931 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
934 /* got a partial character */
935 uprv_memcpy(args->converter->toUBytes, mySource, length);
936 args->converter->toULength = (int8_t)length;
937 args->source = (const char *)(mySource + length);
938 *err = U_TRUNCATED_CHAR_FOUND;
942 /* Don't even try to do a direct cast because the value may be on an odd address. */
/* NOTE(review): if UChar32 is a signed 32-bit type (the uint32_t cast in the
 * range check below suggests so), "<< 24" of a byte >= 0x80 shifts into the
 * sign bit - confirm this is acceptable on all supported compilers. */
943 myUChar = ((UChar32)mySource[3] << 24)
944 | ((UChar32)mySource[2] << 16)
945 | ((UChar32)mySource[1] << 8)
946 | ((UChar32)mySource[0]);
/* The four bytes are consumed whether or not the value turns out legal. */
948 args->source = (const char *)(mySource + 4);
949 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
/* Illegal value: keep its bytes available for error handling. */
953 uprv_memcpy(args->converter->toUBytes, mySource, 4);
954 args->converter->toULength = 4;
956 *err = U_ILLEGAL_CHAR_FOUND;
/*
 * Implementation table for the UTF-32LE converter: wires up the five
 * UTF-32LE entry points defined in this file plus
 * ucnv_getNonSurrogateUnicodeSet.
 * NOTE(review): some initializer entries and the closing brace are not
 * visible in this excerpt, so slot positions cannot be confirmed here.
 */
960 static const UConverterImpl _UTF32LEImpl = {
961 UCNV_UTF32_LittleEndian,
970 T_UConverter_toUnicode_UTF32_LE,
971 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
972 T_UConverter_fromUnicode_UTF32_LE,
973 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
974 T_UConverter_getNextUChar_UTF32_LE,
980 ucnv_getNonSurrogateUnicodeSet,
986 /* The 1232 CCSID refers to any version of Unicode with any endianness of UTF-32 */
/*
 * Static description of the UTF-32LE converter.
 * NOTE(review): field names are not visible in this excerpt. The "4, 4"
 * presumably gives the minimum and maximum bytes per character, and
 * { 0xfd, 0xff, 0, 0 } is U+FFFD in little-endian byte order, presumably the
 * substitution character followed by its length - confirm against the
 * UConverterStaticData declaration.
 */
987 static const UConverterStaticData _UTF32LEStaticData = {
988 sizeof(UConverterStaticData),
991 UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
992 { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
995 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Non-static shared-data object for UTF-32LE, built from the static data and
 * the implementation table above. */
999 const UConverterSharedData _UTF32LEData =
1000 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32LEStaticData, &_UTF32LEImpl);
1002 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
1005 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
1020 * During detection: state&3==number of matching bytes so far.
1022 * On output, emit U+FEFF as the first code point.
/*
 * Reset hook for the BOM-detecting UTF-32 converter.
 * For choices up to and including UCNV_RESET_TO_UNICODE the toUnicode side
 * is reset; for every choice other than UCNV_RESET_TO_UNICODE the
 * fromUnicode side is armed to write a BOM on the next conversion.
 * NOTE(review): the statement that performs the toUnicode reset, the return
 * type and the braces are not visible in this excerpt.
 */
1026 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
1027 if(choice<=UCNV_RESET_TO_UNICODE) {
1028 /* reset toUnicode: state=0 */
1031 if(choice!=UCNV_RESET_TO_UNICODE) {
1032 /* reset fromUnicode: prepare to output the UTF-32PE BOM */
1033 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
/*
 * Open hook for the BOM-detecting UTF-32 converter: puts both conversion
 * directions into their initial state via _UTF32Reset(). pArgs and
 * pErrorCode are not used by the statement shown.
 */
1038 _UTF32Open(UConverter *cnv,
1039 UConverterLoadArgs *pArgs,
1040 UErrorCode *pErrorCode) {
1041 _UTF32Reset(cnv, UCNV_RESET_BOTH);
/* Bytes 0..3 are the UTF-32BE signature (00 00 FE FF) and bytes 4..7 the
 * UTF-32LE signature (FF FE 00 00). _UTF32ToUnicodeWithOffsets indexes this
 * table with its detection state and uses (state & 4) to pick the half. */
1044 static const char utf32BOM[8]={ 0, 0, (char)0xfe, (char)0xff, (char)0xff, (char)0xfe, 0, 0 };
/*
 * toUnicode entry point for the BOM-detecting "UTF-32" converter.
 *
 * A small state value tracks signature detection: input bytes are compared
 * against utf32BOM[state]. States 8 and 9 stand for UTF-32BE and UTF-32LE;
 * once one of them is reached, the remaining input is handed to the matching
 * T_UConverter_toUnicode_UTF32_* function (both the plain and the
 * _OFFSET_LOGIC variants are called from here). If the input stops matching
 * a signature, the converter falls back to UTF-32BE, and any bytes that were
 * consumed from an earlier buffer are replayed out of utf32BOM first.
 * offsetDelta, the number of BOM bytes in the current buffer, is added
 * afterwards to every offset produced during this call. With flush set and
 * the input exhausted, a partially matched signature is replayed through the
 * UTF-32BE function as too-short input.
 *
 * NOTE(review): the switch/case skeleton, the loading and storing of the
 * state, several conditions and the return type are not visible in this
 * excerpt; the comments describe only the statements shown.
 */
1047 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1048 UErrorCode *pErrorCode) {
1049 UConverter *cnv=pArgs->converter;
1050 const char *source=pArgs->source;
1051 const char *sourceLimit=pArgs->sourceLimit;
/* Remember where this call's offsets start so they can be adjusted later. */
1052 int32_t *offsets=pArgs->offsets;
1054 int32_t state, offsetDelta;
1060 * If we detect a BOM in this buffer, then we must add the BOM size to the
1061 * offsets because the actual converter function will not see and count the BOM.
1062 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1066 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1071 state=1; /* could be 00 00 FE FF */
1072 } else if(b==(char)0xff) {
1073 state=5; /* could be FF FE 00 00 */
1075 state=8; /* default to UTF-32BE */
/* Does the next input byte continue the signature being matched? */
1086 if(*source==utf32BOM[state]) {
1090 state=8; /* detect UTF-32BE */
1091 offsetDelta=(int32_t)(source-pArgs->source);
1092 } else if(state==8) {
1093 state=9; /* detect UTF-32LE */
1094 offsetDelta=(int32_t)(source-pArgs->source);
1097 /* switch to UTF-32BE and pass the previous bytes */
1098 int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */
1100 /* reset the source */
1101 source=pArgs->source;
/* (state&3) is the number of signature bytes matched so far. */
1103 if(count==(state&3)) {
1104 /* simple: all in the same buffer, just reset source */
1106 UBool oldFlush=pArgs->flush;
1108 /* some of the bytes are from a previous buffer, replay those first */
1109 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1110 pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
1111 pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
1113 /* no offsets: bytes from previous buffer, and not enough for output */
1114 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1116 /* restore real pointers; pArgs->source will be set in case 8/9 */
1117 pArgs->sourceLimit=sourceLimit;
1118 pArgs->flush=oldFlush;
/* UTF-32BE: convert the rest of the buffer, then pick up the new position. */
1126 pArgs->source=source;
1128 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1130 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
1132 source=pArgs->source;
/* UTF-32LE: convert the rest of the buffer, then pick up the new position. */
1136 pArgs->source=source;
1138 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1140 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
1142 source=pArgs->source;
1145 break; /* does not occur */
1149 /* add BOM size to offsets - see comment at offsetDelta declaration */
1150 if(offsets!=NULL && offsetDelta!=0) {
/* pArgs->offsets now points just past the last offset written in this call. */
1151 int32_t *offsetsLimit=pArgs->offsets;
1152 while(offsets<offsetsLimit) {
1153 *offsets++ += offsetDelta;
1157 pArgs->source=source;
1159 if(source==sourceLimit && pArgs->flush) {
1160 /* handle truncated input */
1163 break; /* no input at all, nothing to do */
1165 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1168 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1171 /* handle 0<state<8: call UTF-32BE with too-short input */
1172 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1173 pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
1175 /* no offsets: not enough for output */
1176 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
/* Put the caller's real source pointers back after the replay. */
1177 pArgs->source=source;
1178 pArgs->sourceLimit=sourceLimit;
/*
 * getNextUChar entry point for the BOM-detecting UTF-32 converter.
 * Dispatches on pArgs->converter->mode to the UTF-32BE or UTF-32LE
 * single-code-point decoder, and otherwise returns
 * UCNV_GET_NEXT_UCHAR_USE_TO_U.
 * NOTE(review): the case labels are not visible in this excerpt; presumably
 * they are the "detected BE"/"detected LE" states (8 and 9) used by
 * _UTF32ToUnicodeWithOffsets - confirm.
 */
1188 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
1189 UErrorCode *pErrorCode) {
1190 switch(pArgs->converter->mode) {
1192 return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
1194 return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
1196 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
/*
 * Implementation table for the BOM-detecting UTF-32 converter.
 * _UTF32ToUnicodeWithOffsets is listed twice, so the same function serves
 * both consecutive slots.
 * NOTE(review): both the BE and the LE fromUnicode pairs appear below;
 * presumably a preprocessor conditional on lines not visible here selects
 * exactly one pair - confirm. Other initializer entries and the closing
 * brace are not visible either.
 */
1200 static const UConverterImpl _UTF32Impl = {
1210 _UTF32ToUnicodeWithOffsets,
1211 _UTF32ToUnicodeWithOffsets,
1213 T_UConverter_fromUnicode_UTF32_BE,
1214 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
1216 T_UConverter_fromUnicode_UTF32_LE,
1217 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
1221 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1225 ucnv_getNonSurrogateUnicodeSet,
1231 /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianness of UTF-32 */
/*
 * Static description of the BOM-detecting UTF-32 converter.
 * NOTE(review): field names are not visible in this excerpt. Two byte
 * sequences appear below, U+FFFD in big-endian and in little-endian order;
 * presumably a preprocessor conditional on lines not visible here keeps
 * only one of them - confirm against the full file.
 */
1232 static const UConverterStaticData _UTF32StaticData = {
1233 sizeof(UConverterStaticData),
1236 UCNV_IBM, UCNV_UTF32, 4, 4,
1238 { 0, 0, 0xff, 0xfd }, 4,
1240 { 0xfd, 0xff, 0, 0 }, 4,
1245 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Non-static shared-data object for the BOM-detecting UTF-32 converter,
 * built from the static data and the implementation table above. */
1248 const UConverterSharedData _UTF32Data =
1249 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32StaticData, &_UTF32Impl);