2 **********************************************************************
3 * Copyright (C) 2002-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv_u32.c
8 * tab size: 8 (not used)
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
14 * UTF-32 converter implementation. Used to be in ucnv_utf.c.
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_CONVERSION
21 #include "unicode/ucnv.h"
22 #include "unicode/utf.h"
/* Largest code point that the toUnicode functions in this file emit as a single UChar. */
27 #define MAXIMUM_UCS2 0x0000FFFF
/* Largest value accepted by the code point validity checks in this file. */
28 #define MAXIMUM_UTF 0x0010FFFF
/* 0x10000; added back (through SURROGATE_LOW_BASE) when a surrogate pair is combined. */
30 #define HALF_BASE 0x0010000
/* Ten-bit mask; not referenced by any statement visible in this listing. */
31 #define HALF_MASK 0x3FF
/* First lead (high) surrogate; subtracted from the first unit of a pair when combining. */
32 #define SURROGATE_HIGH_START 0xD800
/* First trail (low) surrogate; folded into SURROGATE_LOW_BASE below. */
33 #define SURROGATE_LOW_START 0xDC00
35 /* -SURROGATE_LOW_START + HALF_BASE, i.e. 0x10000 - 0xDC00 = 9216 */
36 #define SURROGATE_LOW_BASE 9216
/*
 * Value of converter->fromUnicodeStatus that makes the fromUnicode functions
 * write a byte order mark before any other output; _UTF32Reset() stores it.
 * NOTE(review): the enclosing enum declaration is not visible in this listing.
 */
39 UCNV_NEED_TO_WRITE_BOM=1
42 /* UTF-32BE ----------------------------------------------------------------- */
/*
 * Converts big-endian UTF-32 bytes in [args->source, args->sourceLimit) to
 * UTF-16 code units stored at args->target (bounded by args->targetLimit).
 *
 * Four input bytes are accumulated into ch, most significant byte first.
 * A value that is <= MAXIMUM_UTF and not a surrogate is written either as one
 * UChar (<= MAXIMUM_UCS2) or as U16_LEAD(ch) followed by a second UChar.
 * Any other value sets *err to U_ILLEGAL_CHAR_FOUND with the offending bytes
 * left in converter->toUBytes and their count in converter->toULength.
 *
 * When the input ends inside a four-byte unit, the partial value is kept in
 * converter->toUnicodeStatus (stored as ch + 1 so that 0 means "no state") and
 * the byte count in converter->toULength; the next call resumes from there.
 * When the target fills up, *err becomes U_BUFFER_OVERFLOW_ERROR; a second
 * UChar of a pair that no longer fits is parked in converter->UCharErrorBuffer.
 *
 * On return args->source and args->target point past what was consumed/written.
 *
 * NOTE(review): this listing omits several lines of the function (return type,
 * the err parameter, braces, else branches, jumps and the statement that
 * updates ch between the two surrogate writes); the comments here describe
 * only the statements that are visible.
 */
45 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
48 const unsigned char *mySource = (unsigned char *) args->source;
49 UChar *myTarget = args->target;
50 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
51 const UChar *targetLimit = args->targetLimit;
/* Raw bytes of the unit being assembled, kept for error reporting. */
52 unsigned char *toUBytes = args->converter->toUBytes;
55 /* Restore state of current sequence */
56 if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
57 i = args->converter->toULength; /* restore # of bytes consumed */
58 args->converter->toULength = 0;
60 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
61 args->converter->toUnicodeStatus = 0;
/* Main loop: continues while there is both input left and room in the target. */
65 while (mySource < sourceLimit && myTarget < targetLimit) {
/* Gather bytes until a full sizeof(uint32_t)-byte unit has been read. */
69 while (i < sizeof(uint32_t)) {
70 if (mySource < sourceLimit) {
/* Big-endian: earlier bytes move up, the new byte becomes the low 8 bits. */
71 ch = (ch << 8) | (uint8_t)(*mySource);
72 toUBytes[i++] = (char) *(mySource++);
75 /* stores a partially calculated target*/
76 /* + 1 to make 0 a valid character */
77 args->converter->toUnicodeStatus = ch + 1;
78 args->converter->toULength = (int8_t) i;
/* Accept only values inside the Unicode range that are not surrogates. */
83 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
84 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
85 if (ch <= MAXIMUM_UCS2)
88 *(myTarget++) = (UChar) ch;
91 /* write out the surrogates */
92 *(myTarget++) = U16_LEAD(ch);
94 if (myTarget < targetLimit) {
95 *(myTarget++) = (UChar)ch;
98 /* Put in overflow buffer (not handled here) */
99 args->converter->UCharErrorBuffer[0] = (UChar) ch;
100 args->converter->UCharErrorBufferLength = 1;
101 *err = U_BUFFER_OVERFLOW_ERROR;
/* Invalid value: report how many bytes of toUBytes belong to it. */
107 args->converter->toULength = (int8_t)i;
108 *err = U_ILLEGAL_CHAR_FOUND;
/* Input remains but the target is full and no other error was raised. */
114 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
115 /* End of target buffer */
116 *err = U_BUFFER_OVERFLOW_ERROR;
/* Publish the updated positions to the caller. */
119 args->target = myTarget;
120 args->source = (const char *) mySource;
/*
 * Offset-recording variant of the big-endian UTF-32 to UTF-16 conversion.
 *
 * The visible statements match T_UConverter_toUnicode_UTF32_BE; in addition,
 * every UChar stored at args->target is accompanied by a store of offsetNum
 * into args->offsets, and args->offsets is advanced on return.
 *
 * Error reporting is the same: U_ILLEGAL_CHAR_FOUND for values above
 * MAXIMUM_UTF or in the surrogate range, U_BUFFER_OVERFLOW_ERROR when the
 * target is exhausted, and a partial unit is saved in
 * converter->toUnicodeStatus (ch + 1) / converter->toULength.
 *
 * NOTE(review): offsetNum starts at 0, but the statement that advances it is
 * not visible in this listing, and neither are the braces, else branches and
 * jumps of the function.
 */
124 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
127 const unsigned char *mySource = (unsigned char *) args->source;
128 UChar *myTarget = args->target;
/* Parallel output array: one entry per UChar written to myTarget. */
129 int32_t *myOffsets = args->offsets;
130 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
131 const UChar *targetLimit = args->targetLimit;
132 unsigned char *toUBytes = args->converter->toUBytes;
134 int32_t offsetNum = 0;
136 /* Restore state of current sequence */
137 if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
138 i = args->converter->toULength; /* restore # of bytes consumed */
139 args->converter->toULength = 0;
141 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
142 args->converter->toUnicodeStatus = 0;
/* Main loop: continues while there is both input left and room in the target. */
146 while (mySource < sourceLimit && myTarget < targetLimit) {
/* Gather bytes until a full sizeof(uint32_t)-byte unit has been read. */
150 while (i < sizeof(uint32_t)) {
151 if (mySource < sourceLimit) {
/* Big-endian: earlier bytes move up, the new byte becomes the low 8 bits. */
152 ch = (ch << 8) | (uint8_t)(*mySource);
153 toUBytes[i++] = (char) *(mySource++);
156 /* stores a partially calculated target*/
157 /* + 1 to make 0 a valid character */
158 args->converter->toUnicodeStatus = ch + 1;
159 args->converter->toULength = (int8_t) i;
/* Accept only values inside the Unicode range that are not surrogates. */
164 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
165 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
166 if (ch <= MAXIMUM_UCS2) {
167 /* fits in 16 bits */
168 *(myTarget++) = (UChar) ch;
169 *(myOffsets++) = offsetNum;
172 /* write out the surrogates */
173 *(myTarget++) = U16_LEAD(ch);
174 *myOffsets++ = offsetNum;
176 if (myTarget < targetLimit)
/* Both halves of a pair report the same source offset. */
178 *(myTarget++) = (UChar)ch;
179 *(myOffsets++) = offsetNum;
182 /* Put in overflow buffer (not handled here) */
183 args->converter->UCharErrorBuffer[0] = (UChar) ch;
184 args->converter->UCharErrorBufferLength = 1;
185 *err = U_BUFFER_OVERFLOW_ERROR;
/* Invalid value: report how many bytes of toUBytes belong to it. */
191 args->converter->toULength = (int8_t)i;
192 *err = U_ILLEGAL_CHAR_FOUND;
/* Input remains but the target is full and no other error was raised. */
199 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
201 /* End of target buffer */
202 *err = U_BUFFER_OVERFLOW_ERROR;
/* Publish the updated positions to the caller. */
205 args->target = myTarget;
206 args->source = (const char *) mySource;
207 args->offsets = myOffsets;
/*
 * Converts UTF-16 code units in [args->source, args->sourceLimit) to
 * big-endian UTF-32 bytes stored at args->target.
 *
 * Steps visible in this listing:
 *  - an empty source range is treated as "nothing to do";
 *  - if converter->fromUnicodeStatus is UCNV_NEED_TO_WRITE_BOM, the bytes
 *    00 00 FE FF are written through ucnv_fromUWriteBytes() and the status is
 *    cleared;
 *  - a code unit saved in converter->fromUChar32 by an earlier call is resumed;
 *  - a surrogate followed by a trail unit is combined into one code point,
 *    otherwise the unit is saved in converter->fromUChar32 and *err is set to
 *    U_ILLEGAL_CHAR_FOUND;
 *  - the code point is serialized into temp[] and copied out byte by byte;
 *    bytes that do not fit are appended to converter->charErrorBuffer and
 *    *err becomes U_BUFFER_OVERFLOW_ERROR.
 *
 * On return args->source and args->target reflect what was consumed/written.
 *
 * NOTE(review): the listing omits the return type, the err parameter, the
 * declarations of ch/ch2, the reads from mySource, the temp[0] store, braces,
 * else branches and jumps. HALF_SHIFT is also not defined in the visible part
 * of the file (presumably 10 - confirm).
 * NOTE(review): the "unmatched trail code unit" wording appears three times;
 * at its first occurrence ch2 has just failed U_IS_TRAIL, so the unit saved in
 * fromUChar32 is the unpaired ch. Confirm the wording against the full source.
 */
211 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
214 const UChar *mySource = args->source;
215 unsigned char *myTarget;
216 const UChar *sourceLimit = args->sourceLimit;
217 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
219 unsigned int indexToWrite;
/* Scratch buffer holding the four output bytes of one code point. */
220 unsigned char temp[sizeof(uint32_t)];
222 if(mySource >= sourceLimit) {
223 /* no input, nothing to do */
227 /* write the BOM if necessary */
228 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
/* U+FEFF serialized most significant byte first. */
229 static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
230 ucnv_fromUWriteBytes(args->converter,
232 &args->target, args->targetLimit,
235 args->converter->fromUnicodeStatus=0;
/* Read the target pointer only now: the BOM write above takes &args->target. */
238 myTarget = (unsigned char *) args->target;
/* Resume with a code unit left pending by a previous call. */
241 if (args->converter->fromUChar32) {
242 ch = args->converter->fromUChar32;
243 args->converter->fromUChar32 = 0;
247 while (mySource < sourceLimit && myTarget < targetLimit) {
/* Surrogate code units must be paired before they can be serialized. */
250 if (U_IS_SURROGATE(ch)) {
253 if (mySource < sourceLimit) {
255 if (U_IS_TRAIL(ch2)) {
/* Combine the pair: (ch - 0xD800) shifted up, plus ch2 - 0xDC00 + 0x10000. */
256 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
260 /* this is an unmatched trail code unit (2nd surrogate) */
261 /* callback(illegal) */
262 args->converter->fromUChar32 = ch;
263 *err = U_ILLEGAL_CHAR_FOUND;
268 /* ran out of source */
269 args->converter->fromUChar32 = ch;
271 /* this is an unmatched trail code unit (2nd surrogate) */
272 /* callback(illegal) */
273 *err = U_ILLEGAL_CHAR_FOUND;
279 /* this is an unmatched trail code unit (2nd surrogate) */
280 /* callback(illegal) */
281 args->converter->fromUChar32 = ch;
282 *err = U_ILLEGAL_CHAR_FOUND;
287 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
/* Big-endian layout: temp[1] takes bits 16..20, temp[3] the lowest byte. */
288 temp[1] = (uint8_t) (ch >> 16 & 0x1F);
289 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
290 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
/* Copy indices 0..3; whatever does not fit is queued in charErrorBuffer. */
292 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
293 if (myTarget < targetLimit) {
294 *(myTarget++) = temp[indexToWrite];
297 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
298 *err = U_BUFFER_OVERFLOW_ERROR;
/* Input remains but the target is full and no other error was raised. */
303 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
304 *err = U_BUFFER_OVERFLOW_ERROR;
/* Publish the updated positions to the caller. */
307 args->target = (char *) myTarget;
308 args->source = mySource;
/*
 * Offset-recording variant of the UTF-16 to big-endian UTF-32 conversion.
 *
 * The visible statements match T_UConverter_fromUnicode_UTF32_BE; in addition,
 * each byte stored at args->target is accompanied by a store of offsetNum into
 * args->offsets. After a code point has been written, offsetNum grows by 1,
 * or by 2 when temp[1] is non-zero, i.e. when bits 16..20 of the code point
 * are set.
 *
 * Errors: U_ILLEGAL_CHAR_FOUND for an unpaired surrogate (the unit is kept in
 * converter->fromUChar32), U_BUFFER_OVERFLOW_ERROR when the target is full
 * (left-over bytes go to converter->charErrorBuffer).
 *
 * NOTE(review): the listing omits the return type, the err parameter, the
 * declarations of ch/ch2/myOffsets, the reads from mySource, the temp[0]
 * store, braces, else branches and jumps. HALF_SHIFT is not defined in the
 * visible part of the file (presumably 10 - confirm).
 */
312 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
315 const UChar *mySource = args->source;
316 unsigned char *myTarget;
318 const UChar *sourceLimit = args->sourceLimit;
319 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
/* Value recorded in args->offsets for the bytes of the current code point. */
321 int32_t offsetNum = 0;
322 unsigned int indexToWrite;
/* Scratch buffer holding the four output bytes of one code point. */
323 unsigned char temp[sizeof(uint32_t)];
325 if(mySource >= sourceLimit) {
326 /* no input, nothing to do */
330 /* write the BOM if necessary */
331 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
/* U+FEFF serialized most significant byte first. */
332 static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
333 ucnv_fromUWriteBytes(args->converter,
335 &args->target, args->targetLimit,
338 args->converter->fromUnicodeStatus=0;
/* Read the pointers only now: the BOM write above takes &args->target. */
341 myTarget = (unsigned char *) args->target;
342 myOffsets = args->offsets;
/* Resume with a code unit left pending by a previous call. */
345 if (args->converter->fromUChar32) {
346 ch = args->converter->fromUChar32;
347 args->converter->fromUChar32 = 0;
351 while (mySource < sourceLimit && myTarget < targetLimit) {
/* Surrogate code units must be paired before they can be serialized. */
354 if (U_IS_SURROGATE(ch)) {
357 if (mySource < sourceLimit) {
359 if (U_IS_TRAIL(ch2)) {
/* Combine the pair: (ch - 0xD800) shifted up, plus ch2 - 0xDC00 + 0x10000. */
360 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
364 /* this is an unmatched trail code unit (2nd surrogate) */
365 /* callback(illegal) */
366 args->converter->fromUChar32 = ch;
367 *err = U_ILLEGAL_CHAR_FOUND;
372 /* ran out of source */
373 args->converter->fromUChar32 = ch;
375 /* this is an unmatched trail code unit (2nd surrogate) */
376 /* callback(illegal) */
377 *err = U_ILLEGAL_CHAR_FOUND;
383 /* this is an unmatched trail code unit (2nd surrogate) */
384 /* callback(illegal) */
385 args->converter->fromUChar32 = ch;
386 *err = U_ILLEGAL_CHAR_FOUND;
391 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
/* Big-endian layout: temp[1] takes bits 16..20, temp[3] the lowest byte. */
392 temp[1] = (uint8_t) (ch >> 16 & 0x1F);
393 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
394 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
/* Copy indices 0..3; whatever does not fit is queued in charErrorBuffer. */
396 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
397 if (myTarget < targetLimit) {
398 *(myTarget++) = temp[indexToWrite];
399 *(myOffsets++) = offsetNum;
402 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
403 *err = U_BUFFER_OVERFLOW_ERROR;
/* Advance by one unit, plus one more when the code point is above 0xFFFF. */
406 offsetNum = offsetNum + 1 + (temp[1] != 0);
/* Input remains but the target is full and no other error was raised. */
409 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
410 *err = U_BUFFER_OVERFLOW_ERROR;
/* Publish the updated positions to the caller. */
413 args->target = (char *) myTarget;
414 args->source = mySource;
415 args->offsets = myOffsets;
/*
 * Reads a single code point from big-endian UTF-32 input at args->source.
 *
 * Visible behaviour:
 *  - with no input left, *err is set to U_INDEX_OUTOFBOUNDS_ERROR;
 *  - on the partial-character path the remaining bytes are copied to
 *    converter->toUBytes, converter->toULength is set to their count, the
 *    source is advanced past them and *err is U_TRUNCATED_CHAR_FOUND;
 *  - otherwise four bytes are combined most significant first and
 *    args->source advances by 4;
 *  - a value above MAXIMUM_UTF or in the surrogate range copies the four bytes
 *    to converter->toUBytes, sets toULength to 4 and *err to
 *    U_ILLEGAL_CHAR_FOUND.
 *
 * NOTE(review): the return type, the err parameter, the declarations of
 * myUChar/length, the condition guarding the partial-character path and all
 * return statements are missing from this listing.
 * NOTE(review): if UChar32 is a signed 32-bit type (the later uint32_t cast of
 * myUChar suggests so), shifting a byte >= 0x80 left by 24 moves a bit into
 * the sign position - confirm and consider an unsigned intermediate.
 */
419 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
422 const uint8_t *mySource;
426 mySource = (const uint8_t *)args->source;
427 if (mySource >= (const uint8_t *)args->sourceLimit)
430 *err = U_INDEX_OUTOFBOUNDS_ERROR;
/* Number of input bytes still available. */
434 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
437 /* got a partial character */
438 uprv_memcpy(args->converter->toUBytes, mySource, length);
439 args->converter->toULength = (int8_t)length;
440 args->source = (const char *)(mySource + length);
441 *err = U_TRUNCATED_CHAR_FOUND;
445 /* Don't even try to do a direct cast because the value may be on an odd address. */
446 myUChar = ((UChar32)mySource[0] << 24)
447 | ((UChar32)mySource[1] << 16)
448 | ((UChar32)mySource[2] << 8)
449 | ((UChar32)mySource[3]);
/* The four bytes are consumed whether or not the value turns out to be valid. */
451 args->source = (const char *)(mySource + 4);
452 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
/* Invalid value: hand the raw bytes to the error machinery. */
456 uprv_memcpy(args->converter->toUBytes, mySource, 4);
457 args->converter->toULength = 4;
459 *err = U_ILLEGAL_CHAR_FOUND;
/*
 * Implementation table for the UTF-32BE converter. The visible entries are the
 * converter type, the five big-endian conversion functions named
 * T_UConverter_*_UTF32_BE* and ucnv_getNonSurrogateUnicodeSet.
 * NOTE(review): the remaining initializer entries and the closing brace are
 * not present in this listing, so the mapping of entries to UConverterImpl
 * fields cannot be checked here.
 */
463 static const UConverterImpl _UTF32BEImpl = {
464 UCNV_UTF32_BigEndian,
473 T_UConverter_toUnicode_UTF32_BE,
474 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
475 T_UConverter_fromUnicode_UTF32_BE,
476 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
477 T_UConverter_getNextUChar_UTF32_BE,
483 ucnv_getNonSurrogateUnicodeSet
486 /* The 1232 CCSID refers to any version of Unicode with any endianness of UTF-32 */
/*
 * Static description of the UTF-32BE converter.
 * The byte sequence 00 00 FF FD with length 4 is U+FFFD serialized big-endian;
 * presumably the substitution character - confirm against the
 * UConverterStaticData field order, which is not visible here.
 * NOTE(review): the name, code page and closing lines of the initializer are
 * missing from this listing.
 */
487 static const UConverterStaticData _UTF32BEStaticData = {
488 sizeof(UConverterStaticData),
491 UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
492 { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
495 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared-data record for UTF-32BE; it points at _UTF32BEStaticData and
 * _UTF32BEImpl. Unlike those two objects it is not declared static.
 * NOTE(review): the trailing initializer entries are not visible in this listing.
 */
498 const UConverterSharedData _UTF32BEData = {
499 sizeof(UConverterSharedData), ~((uint32_t) 0),
500 NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl,
504 /* UTF-32LE ---------------------------------------------------------- */
/*
 * Converts little-endian UTF-32 bytes in [args->source, args->sourceLimit) to
 * UTF-16 code units stored at args->target (bounded by args->targetLimit).
 *
 * Four input bytes are OR-ed into ch, the i-th byte shifted left by i * 8, so
 * the first byte is the least significant. A value that is <= MAXIMUM_UTF and
 * not a surrogate is written as one UChar (<= MAXIMUM_UCS2) or as U16_LEAD(ch)
 * followed by a second UChar; anything else sets *err to
 * U_ILLEGAL_CHAR_FOUND with the byte count in converter->toULength.
 *
 * A partial unit at the end of the input is saved in
 * converter->toUnicodeStatus (as ch + 1) and converter->toULength and resumed
 * by the next call. A full target yields U_BUFFER_OVERFLOW_ERROR; a second
 * UChar of a pair that no longer fits is parked in converter->UCharErrorBuffer.
 *
 * NOTE(review): this listing omits the return type, the err parameter, the
 * declaration and per-unit reset of ch/i, braces, else branches, jumps and the
 * statement that updates ch between the two surrogate writes.
 */
507 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
510 const unsigned char *mySource = (unsigned char *) args->source;
511 UChar *myTarget = args->target;
512 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
513 const UChar *targetLimit = args->targetLimit;
/* Raw bytes of the unit being assembled, kept for error reporting. */
514 unsigned char *toUBytes = args->converter->toUBytes;
517 /* Restore state of current sequence */
518 if (args->converter->toUnicodeStatus && myTarget < targetLimit)
520 i = args->converter->toULength; /* restore # of bytes consumed */
521 args->converter->toULength = 0;
523 /* Stores the previously calculated ch from a previous call*/
524 ch = args->converter->toUnicodeStatus - 1;
525 args->converter->toUnicodeStatus = 0;
/* Main loop: continues while there is both input left and room in the target. */
529 while (mySource < sourceLimit && myTarget < targetLimit)
/* Gather bytes until a full sizeof(uint32_t)-byte unit has been read. */
534 while (i < sizeof(uint32_t))
536 if (mySource < sourceLimit)
/*
 * Little-endian: byte number i lands in bits i*8 .. i*8+7.
 * NOTE(review): the uint8_t operand is promoted to int before the shift; when
 * i is 3 and the byte is 0x80 or larger the result does not fit in a 32-bit
 * int. Consider casting to uint32_t instead of uint8_t.
 */
538 ch |= ((uint8_t)(*mySource)) << (i * 8);
539 toUBytes[i++] = (char) *(mySource++);
543 /* stores a partially calculated target*/
544 /* + 1 to make 0 a valid character */
545 args->converter->toUnicodeStatus = ch + 1;
546 args->converter->toULength = (int8_t) i;
/* Accept only values inside the Unicode range that are not surrogates. */
551 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
552 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
553 if (ch <= MAXIMUM_UCS2) {
554 /* fits in 16 bits */
555 *(myTarget++) = (UChar) ch;
558 /* write out the surrogates */
559 *(myTarget++) = U16_LEAD(ch);
561 if (myTarget < targetLimit) {
562 *(myTarget++) = (UChar)ch;
565 /* Put in overflow buffer (not handled here) */
566 args->converter->UCharErrorBuffer[0] = (UChar) ch;
567 args->converter->UCharErrorBufferLength = 1;
568 *err = U_BUFFER_OVERFLOW_ERROR;
/* Invalid value: report how many bytes of toUBytes belong to it. */
574 args->converter->toULength = (int8_t)i;
575 *err = U_ILLEGAL_CHAR_FOUND;
/* Input remains but the target is full and no other error was raised. */
581 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
583 /* End of target buffer */
584 *err = U_BUFFER_OVERFLOW_ERROR;
/* Publish the updated positions to the caller. */
587 args->target = myTarget;
588 args->source = (const char *) mySource;
/*
 * Offset-recording variant of the little-endian UTF-32 to UTF-16 conversion.
 *
 * The visible statements match T_UConverter_toUnicode_UTF32_LE; in addition,
 * every UChar stored at args->target is accompanied by a store of offsetNum
 * into args->offsets, and args->offsets is advanced on return.
 *
 * Errors: U_ILLEGAL_CHAR_FOUND for values above MAXIMUM_UTF or in the
 * surrogate range, U_BUFFER_OVERFLOW_ERROR when the target is exhausted; a
 * partial unit is saved in converter->toUnicodeStatus (ch + 1) and
 * converter->toULength.
 *
 * NOTE(review): offsetNum starts at 0, but the statement that advances it is
 * not visible in this listing, and neither are the declaration of ch/i, the
 * braces, else branches and jumps of the function.
 */
592 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
595 const unsigned char *mySource = (unsigned char *) args->source;
596 UChar *myTarget = args->target;
/* Parallel output array: one entry per UChar written to myTarget. */
597 int32_t *myOffsets = args->offsets;
598 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
599 const UChar *targetLimit = args->targetLimit;
600 unsigned char *toUBytes = args->converter->toUBytes;
602 int32_t offsetNum = 0;
604 /* Restore state of current sequence */
605 if (args->converter->toUnicodeStatus && myTarget < targetLimit)
607 i = args->converter->toULength; /* restore # of bytes consumed */
608 args->converter->toULength = 0;
610 /* Stores the previously calculated ch from a previous call*/
611 ch = args->converter->toUnicodeStatus - 1;
612 args->converter->toUnicodeStatus = 0;
/* Main loop: continues while there is both input left and room in the target. */
616 while (mySource < sourceLimit && myTarget < targetLimit)
/* Gather bytes until a full sizeof(uint32_t)-byte unit has been read. */
621 while (i < sizeof(uint32_t))
623 if (mySource < sourceLimit)
/*
 * Little-endian: byte number i lands in bits i*8 .. i*8+7.
 * NOTE(review): the uint8_t operand is promoted to int before the shift; when
 * i is 3 and the byte is 0x80 or larger the result does not fit in a 32-bit
 * int. Consider casting to uint32_t instead of uint8_t.
 */
625 ch |= ((uint8_t)(*mySource)) << (i * 8);
626 toUBytes[i++] = (char) *(mySource++);
630 /* stores a partially calculated target*/
631 /* + 1 to make 0 a valid character */
632 args->converter->toUnicodeStatus = ch + 1;
633 args->converter->toULength = (int8_t) i;
/* Accept only values inside the Unicode range that are not surrogates. */
638 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
640 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
641 if (ch <= MAXIMUM_UCS2)
643 /* fits in 16 bits */
644 *(myTarget++) = (UChar) ch;
645 *(myOffsets++) = offsetNum;
648 /* write out the surrogates */
649 *(myTarget++) = U16_LEAD(ch);
650 *(myOffsets++) = offsetNum;
652 if (myTarget < targetLimit)
/* Both halves of a pair report the same source offset. */
654 *(myTarget++) = (UChar)ch;
655 *(myOffsets++) = offsetNum;
659 /* Put in overflow buffer (not handled here) */
660 args->converter->UCharErrorBuffer[0] = (UChar) ch;
661 args->converter->UCharErrorBufferLength = 1;
662 *err = U_BUFFER_OVERFLOW_ERROR;
/* Invalid value: report how many bytes of toUBytes belong to it. */
669 args->converter->toULength = (int8_t)i;
670 *err = U_ILLEGAL_CHAR_FOUND;
/* Input remains but the target is full and no other error was raised. */
677 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
679 /* End of target buffer */
680 *err = U_BUFFER_OVERFLOW_ERROR;
/* Publish the updated positions to the caller. */
683 args->target = myTarget;
684 args->source = (const char *) mySource;
685 args->offsets = myOffsets;
/*
 * Converts UTF-16 code units in [args->source, args->sourceLimit) to
 * little-endian UTF-32 bytes stored at args->target.
 *
 * Steps visible in this listing:
 *  - an empty source range is treated as "nothing to do";
 *  - if converter->fromUnicodeStatus is UCNV_NEED_TO_WRITE_BOM, the bytes
 *    FF FE 00 00 are written through ucnv_fromUWriteBytes() and the status is
 *    cleared;
 *  - a code unit saved in converter->fromUChar32 by an earlier call is resumed;
 *  - a surrogate followed by a trail unit is combined into one code point,
 *    otherwise the unit is saved in converter->fromUChar32 and *err is set to
 *    U_ILLEGAL_CHAR_FOUND;
 *  - the code point is serialized into temp[] least significant byte first and
 *    copied out; bytes that do not fit are appended to
 *    converter->charErrorBuffer and *err becomes U_BUFFER_OVERFLOW_ERROR.
 *
 * NOTE(review): the listing omits the return type, the err parameter, the
 * declarations of ch/ch2, the reads from mySource, the temp[3] store, braces,
 * else branches and jumps. HALF_SHIFT is not defined in the visible part of
 * the file (presumably 10 - confirm).
 */
689 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
692 const UChar *mySource = args->source;
693 unsigned char *myTarget;
694 const UChar *sourceLimit = args->sourceLimit;
695 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
697 unsigned int indexToWrite;
/* Scratch buffer holding the four output bytes of one code point. */
698 unsigned char temp[sizeof(uint32_t)];
700 if(mySource >= sourceLimit) {
701 /* no input, nothing to do */
705 /* write the BOM if necessary */
706 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
/* U+FEFF serialized least significant byte first. */
707 static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
708 ucnv_fromUWriteBytes(args->converter,
710 &args->target, args->targetLimit,
713 args->converter->fromUnicodeStatus=0;
/* Read the target pointer only now: the BOM write above takes &args->target. */
716 myTarget = (unsigned char *) args->target;
/* Resume with a code unit left pending by a previous call. */
719 if (args->converter->fromUChar32)
721 ch = args->converter->fromUChar32;
722 args->converter->fromUChar32 = 0;
726 while (mySource < sourceLimit && myTarget < targetLimit)
/* Surrogate code units must be paired before they can be serialized. */
730 if (U16_IS_SURROGATE(ch)) {
734 if (mySource < sourceLimit)
737 if (U16_IS_TRAIL(ch2)) {
/* Combine the pair: (ch - 0xD800) shifted up, plus ch2 - 0xDC00 + 0x10000. */
738 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
742 /* this is an unmatched trail code unit (2nd surrogate) */
743 /* callback(illegal) */
744 args->converter->fromUChar32 = ch;
745 *err = U_ILLEGAL_CHAR_FOUND;
750 /* ran out of source */
751 args->converter->fromUChar32 = ch;
753 /* this is an unmatched trail code unit (2nd surrogate) */
754 /* callback(illegal) */
755 *err = U_ILLEGAL_CHAR_FOUND;
761 /* this is an unmatched trail code unit (2nd surrogate) */
762 /* callback(illegal) */
763 args->converter->fromUChar32 = ch;
764 *err = U_ILLEGAL_CHAR_FOUND;
769 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
/* Little-endian layout: temp[0] takes the lowest byte, temp[2] bits 16..20. */
770 temp[2] = (uint8_t) (ch >> 16 & 0x1F);
771 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
772 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
/* Copy indices 0..3; whatever does not fit is queued in charErrorBuffer. */
774 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
776 if (myTarget < targetLimit)
778 *(myTarget++) = temp[indexToWrite];
782 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
783 *err = U_BUFFER_OVERFLOW_ERROR;
/* Input remains but the target is full and no other error was raised. */
788 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
790 *err = U_BUFFER_OVERFLOW_ERROR;
/* Publish the updated positions to the caller. */
793 args->target = (char *) myTarget;
794 args->source = mySource;
/*
 * Offset-recording variant of the UTF-16 to little-endian UTF-32 conversion.
 *
 * The visible statements match T_UConverter_fromUnicode_UTF32_LE; in addition,
 * each byte stored at args->target is accompanied by a store of offsetNum into
 * args->offsets. After a code point has been written, offsetNum grows by 1,
 * or by 2 when temp[2] is non-zero, i.e. when bits 16..20 of the code point
 * are set.
 *
 * Errors: U_ILLEGAL_CHAR_FOUND for an unpaired surrogate (the unit is kept in
 * converter->fromUChar32), U_BUFFER_OVERFLOW_ERROR when the target is full
 * (left-over bytes go to converter->charErrorBuffer).
 *
 * NOTE(review): the listing omits the return type, the err parameter, the
 * declarations of ch/ch2/myOffsets, the reads from mySource, the temp[3]
 * store, braces, else branches and jumps. HALF_SHIFT is not defined in the
 * visible part of the file (presumably 10 - confirm).
 */
798 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
801 const UChar *mySource = args->source;
802 unsigned char *myTarget;
804 const UChar *sourceLimit = args->sourceLimit;
805 const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
807 unsigned int indexToWrite;
/* Scratch buffer holding the four output bytes of one code point. */
808 unsigned char temp[sizeof(uint32_t)];
/* Value recorded in args->offsets for the bytes of the current code point. */
809 int32_t offsetNum = 0;
811 if(mySource >= sourceLimit) {
812 /* no input, nothing to do */
816 /* write the BOM if necessary */
817 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
/* U+FEFF serialized least significant byte first. */
818 static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
819 ucnv_fromUWriteBytes(args->converter,
821 &args->target, args->targetLimit,
824 args->converter->fromUnicodeStatus=0;
/* Read the pointers only now: the BOM write above takes &args->target. */
827 myTarget = (unsigned char *) args->target;
828 myOffsets = args->offsets;
/* Resume with a code unit left pending by a previous call. */
831 if (args->converter->fromUChar32)
833 ch = args->converter->fromUChar32;
834 args->converter->fromUChar32 = 0;
838 while (mySource < sourceLimit && myTarget < targetLimit)
/* Surrogate code units must be paired before they can be serialized. */
842 if (U16_IS_SURROGATE(ch)) {
846 if (mySource < sourceLimit)
849 if (U16_IS_TRAIL(ch2))
/* Combine the pair: (ch - 0xD800) shifted up, plus ch2 - 0xDC00 + 0x10000. */
851 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
855 /* this is an unmatched trail code unit (2nd surrogate) */
856 /* callback(illegal) */
857 args->converter->fromUChar32 = ch;
858 *err = U_ILLEGAL_CHAR_FOUND;
863 /* ran out of source */
864 args->converter->fromUChar32 = ch;
866 /* this is an unmatched trail code unit (2nd surrogate) */
867 /* callback(illegal) */
868 *err = U_ILLEGAL_CHAR_FOUND;
874 /* this is an unmatched trail code unit (2nd surrogate) */
875 /* callback(illegal) */
876 args->converter->fromUChar32 = ch;
877 *err = U_ILLEGAL_CHAR_FOUND;
882 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
/* Little-endian layout: temp[0] takes the lowest byte, temp[2] bits 16..20. */
883 temp[2] = (uint8_t) (ch >> 16 & 0x1F);
884 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */
885 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */
/* Copy indices 0..3; whatever does not fit is queued in charErrorBuffer. */
887 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
889 if (myTarget < targetLimit)
891 *(myTarget++) = temp[indexToWrite];
892 *(myOffsets++) = offsetNum;
896 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
897 *err = U_BUFFER_OVERFLOW_ERROR;
/* Advance by one unit, plus one more when the code point is above 0xFFFF. */
900 offsetNum = offsetNum + 1 + (temp[2] != 0);
/* Input remains but the target is full and no other error was raised. */
903 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
905 *err = U_BUFFER_OVERFLOW_ERROR;
/* Publish the updated positions to the caller. */
908 args->target = (char *) myTarget;
909 args->source = mySource;
910 args->offsets = myOffsets;
/*
 * Reads a single code point from little-endian UTF-32 input at args->source.
 *
 * Visible behaviour:
 *  - with no input left, *err is set to U_INDEX_OUTOFBOUNDS_ERROR;
 *  - on the partial-character path the remaining bytes are copied to
 *    converter->toUBytes, converter->toULength is set to their count, the
 *    source is advanced past them and *err is U_TRUNCATED_CHAR_FOUND;
 *  - otherwise four bytes are combined with mySource[3] as the most
 *    significant byte and args->source advances by 4;
 *  - a value above MAXIMUM_UTF or in the surrogate range copies the four bytes
 *    to converter->toUBytes, sets toULength to 4 and *err to
 *    U_ILLEGAL_CHAR_FOUND.
 *
 * NOTE(review): the return type, the err parameter, the declarations of
 * myUChar/length, the condition guarding the partial-character path and all
 * return statements are missing from this listing.
 * NOTE(review): if UChar32 is a signed 32-bit type (the later uint32_t cast of
 * myUChar suggests so), shifting a byte >= 0x80 left by 24 moves a bit into
 * the sign position - confirm and consider an unsigned intermediate.
 */
914 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
917 const uint8_t *mySource;
921 mySource = (const uint8_t *)args->source;
922 if (mySource >= (const uint8_t *)args->sourceLimit)
925 *err = U_INDEX_OUTOFBOUNDS_ERROR;
/* Number of input bytes still available. */
929 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
932 /* got a partial character */
933 uprv_memcpy(args->converter->toUBytes, mySource, length);
934 args->converter->toULength = (int8_t)length;
935 args->source = (const char *)(mySource + length);
936 *err = U_TRUNCATED_CHAR_FOUND;
940 /* Don't even try to do a direct cast because the value may be on an odd address. */
941 myUChar = ((UChar32)mySource[3] << 24)
942 | ((UChar32)mySource[2] << 16)
943 | ((UChar32)mySource[1] << 8)
944 | ((UChar32)mySource[0]);
/* The four bytes are consumed whether or not the value turns out to be valid. */
946 args->source = (const char *)(mySource + 4);
947 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
/* Invalid value: hand the raw bytes to the error machinery. */
951 uprv_memcpy(args->converter->toUBytes, mySource, 4);
952 args->converter->toULength = 4;
954 *err = U_ILLEGAL_CHAR_FOUND;
/*
 * Implementation table for the UTF-32LE converter. The visible entries are the
 * converter type, the five little-endian conversion functions named
 * T_UConverter_*_UTF32_LE* and ucnv_getNonSurrogateUnicodeSet.
 * NOTE(review): the remaining initializer entries and the closing brace are
 * not present in this listing, so the mapping of entries to UConverterImpl
 * fields cannot be checked here.
 */
958 static const UConverterImpl _UTF32LEImpl = {
959 UCNV_UTF32_LittleEndian,
968 T_UConverter_toUnicode_UTF32_LE,
969 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
970 T_UConverter_fromUnicode_UTF32_LE,
971 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
972 T_UConverter_getNextUChar_UTF32_LE,
978 ucnv_getNonSurrogateUnicodeSet
981 /* The 1232 CCSID refers to any version of Unicode with any endianness of UTF-32 */
/*
 * Static description of the UTF-32LE converter.
 * The byte sequence FD FF 00 00 with length 4 is U+FFFD serialized
 * little-endian; presumably the substitution character - confirm against the
 * UConverterStaticData field order, which is not visible here.
 * NOTE(review): the name, code page and closing lines of the initializer are
 * missing from this listing.
 */
982 static const UConverterStaticData _UTF32LEStaticData = {
983 sizeof(UConverterStaticData),
986 UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
987 { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
990 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared-data record for UTF-32LE; it points at _UTF32LEStaticData and
 * _UTF32LEImpl. Unlike those two objects it is not declared static.
 * NOTE(review): the trailing initializer entries are not visible in this listing.
 */
994 const UConverterSharedData _UTF32LEData = {
995 sizeof(UConverterSharedData), ~((uint32_t) 0),
996 NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl,
1000 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
1003 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
1018 * During detection: state&3==number of matching bytes so far.
1020 * On output, emit U+FEFF as the first code point.
/*
 * Resets the BOM-detecting UTF-32 converter according to choice.
 *  - choice <= UCNV_RESET_TO_UNICODE: the toUnicode side is reset (the
 *    statement that performs the reset is not visible in this listing);
 *  - choice != UCNV_RESET_TO_UNICODE: cnv->fromUnicodeStatus is set to
 *    UCNV_NEED_TO_WRITE_BOM so that the next fromUnicode call emits a BOM.
 */
1024 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
1025 if(choice<=UCNV_RESET_TO_UNICODE) {
1026 /* reset toUnicode: state=0 */
1029 if(choice!=UCNV_RESET_TO_UNICODE) {
1030 /* reset fromUnicode: prepare to output the UTF-32PE BOM */
1031 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
/*
 * Open hook of the BOM-detecting UTF-32 converter: puts both conversion
 * directions into their initial state by calling _UTF32Reset() with
 * UCNV_RESET_BOTH. pArgs and pErrorCode are not used by the visible statement.
 */
1036 _UTF32Open(UConverter *cnv,
1037 UConverterLoadArgs *pArgs,
1038 UErrorCode *pErrorCode) {
1039 _UTF32Reset(cnv, UCNV_RESET_BOTH);
/*
 * Both byte order marks back to back: bytes 0..3 are the big-endian form
 * (00 00 FE FF), bytes 4..7 the little-endian form (FF FE 00 00).
 * _UTF32ToUnicodeWithOffsets() indexes the array with its detection state and
 * uses state&4 as the offset of the signature being matched.
 */
1042 static const char utf32BOM[8]={ 0, 0, (char)0xfe, (char)0xff, (char)0xff, (char)0xfe, 0, 0 };
/*
 * toUnicode entry point of the BOM-detecting UTF-32 converter; _UTF32Impl
 * lists this function twice in a row.
 *
 * The function runs a small state machine over the first input bytes and then
 * delegates to the UTF-32BE or UTF-32LE functions. State values that appear in
 * the visible code and comments:
 *   1    first byte matched, input could start with 00 00 FE FF
 *   5    first byte matched, input could start with FF FE 00 00
 *   8    UTF-32BE selected (detected, or chosen as the default)
 *   9    UTF-32LE selected
 * While detecting, state&3 is the number of signature bytes matched so far and
 * state&4 selects the signature inside utf32BOM.
 *
 * When a partially matched signature turns out not to be a BOM, the bytes
 * already consumed are replayed through T_UConverter_toUnicode_UTF32_BE; bytes
 * that came from an earlier buffer are replayed out of utf32BOM itself.
 *
 * offsetDelta holds the number of BOM bytes found in the current buffer and is
 * added to every offset written during this call, because the delegated
 * functions never see those bytes.
 *
 * With pArgs->flush set and the input exhausted, the selected converter (or
 * UTF-32BE fed with the partially matched bytes) is called once more so that
 * truncated input is handled.
 *
 * NOTE(review): this listing omits the return type, the switch and case
 * labels, the loads and stores of the persistent state (presumably cnv->mode -
 * confirm), the initialisation of offsetDelta, several else branches and most
 * closing braces; the description above follows the visible statements and
 * the original comments only.
 */
1045 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1046 UErrorCode *pErrorCode) {
1047 UConverter *cnv=pArgs->converter;
1048 const char *source=pArgs->source;
1049 const char *sourceLimit=pArgs->sourceLimit;
/* Start of the offsets written by this call, kept for the fix-up at the end. */
1050 int32_t *offsets=pArgs->offsets;
1052 int32_t state, offsetDelta;
1058 * If we detect a BOM in this buffer, then we must add the BOM size to the
1059 * offsets because the actual converter function will not see and count the BOM.
1060 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
/* Process input until it is exhausted or a delegated call reports an error. */
1064 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1069 state=1; /* could be 00 00 FE FF */
1070 } else if(b==(char)0xff) {
1071 state=5; /* could be FF FE 00 00 */
1073 state=8; /* default to UTF-32BE */
/* Compare the next input byte with the next expected signature byte. */
1084 if(*source==utf32BOM[state]) {
1088 state=8; /* detect UTF-32BE */
1089 offsetDelta=(int32_t)(source-pArgs->source);
1090 } else if(state==8) {
1091 state=9; /* detect UTF-32LE */
1092 offsetDelta=(int32_t)(source-pArgs->source);
1095 /* switch to UTF-32BE and pass the previous bytes */
1096 int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */
1098 /* reset the source */
1099 source=pArgs->source;
1101 if(count==(state&3)) {
1102 /* simple: all in the same buffer, just reset source */
1104 UBool oldFlush=pArgs->flush;
1106 /* some of the bytes are from a previous buffer, replay those first */
1107 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1108 pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
1109 pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
1111 /* no offsets: bytes from previous buffer, and not enough for output */
1112 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1114 /* restore real pointers; pArgs->source will be set in case 8/9 */
1115 pArgs->sourceLimit=sourceLimit;
1116 pArgs->flush=oldFlush;
/* Big-endian input: delegate, choosing the plain or the offsets variant. */
1124 pArgs->source=source;
1126 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1128 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
1130 source=pArgs->source;
/* Little-endian input: delegate, choosing the plain or the offsets variant. */
1134 pArgs->source=source;
1136 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1138 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
1140 source=pArgs->source;
1143 break; /* does not occur */
1147 /* add BOM size to offsets - see comment at offsetDelta declaration */
1148 if(offsets!=NULL && offsetDelta!=0) {
/* pArgs->offsets now points past the last offset written by the delegates. */
1149 int32_t *offsetsLimit=pArgs->offsets;
1150 while(offsets<offsetsLimit) {
1151 *offsets++ += offsetDelta;
1155 pArgs->source=source;
1157 if(source==sourceLimit && pArgs->flush) {
1158 /* handle truncated input */
1161 break; /* no input at all, nothing to do */
1163 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
1166 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
1169 /* handle 0<state<8: call UTF-32BE with too-short input */
1170 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
1171 pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
1173 /* no offsets: not enough for output */
1174 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
/* Put the caller's real pointers back after the replay. */
1175 pArgs->source=source;
1176 pArgs->sourceLimit=sourceLimit;
/*
 * getNextUChar entry point of the BOM-detecting UTF-32 converter.
 * Dispatches on pArgs->converter->mode to the big-endian or little-endian
 * getNextUChar function; for any other mode it returns
 * UCNV_GET_NEXT_UCHAR_USE_TO_U.
 * NOTE(review): the return type and the case labels are not visible in this
 * listing; presumably they are 8 and 9, the values
 * _UTF32ToUnicodeWithOffsets() uses for detected UTF-32BE/LE - confirm.
 */
1186 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
1187 UErrorCode *pErrorCode) {
1188 switch(pArgs->converter->mode) {
1190 return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
1192 return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
1194 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
/*
 * Implementation table for the BOM-detecting UTF-32 converter.
 * _UTF32ToUnicodeWithOffsets appears twice in a row, so it fills two
 * consecutive entries of the table.
 * NOTE(review): both the big-endian and the little-endian fromUnicode pairs
 * are listed below. Presumably a preprocessor conditional that is not visible
 * in this listing keeps only one pair; as shown, the initializer would contain
 * four fromUnicode entries. Confirm against the complete source.
 * NOTE(review): the other initializer entries and the closing brace are not
 * present in this listing.
 */
1198 static const UConverterImpl _UTF32Impl = {
1208 _UTF32ToUnicodeWithOffsets,
1209 _UTF32ToUnicodeWithOffsets,
1211 T_UConverter_fromUnicode_UTF32_BE,
1212 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
1214 T_UConverter_fromUnicode_UTF32_LE,
1215 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
1219 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
1223 ucnv_getNonSurrogateUnicodeSet
1226 /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianness of UTF-32 */
/*
 * Static description of the BOM-detecting UTF-32 converter.
 * Both serializations of U+FFFD are listed (00 00 FF FD and FD FF 00 00), each
 * followed by the length 4; presumably only one of them is compiled in,
 * selected by a preprocessor conditional that is not visible in this listing -
 * confirm against the complete source.
 * NOTE(review): the name, code page and closing lines of the initializer are
 * missing from this listing.
 */
1227 static const UConverterStaticData _UTF32StaticData = {
1228 sizeof(UConverterStaticData),
1231 UCNV_IBM, UCNV_UTF32, 4, 4,
1233 { 0, 0, 0xff, 0xfd }, 4,
1235 { 0xfd, 0xff, 0, 0 }, 4,
1240 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared-data record for the BOM-detecting UTF-32 converter; it points at
 * _UTF32StaticData and _UTF32Impl. Unlike those two objects it is not
 * declared static.
 * NOTE(review): the trailing initializer entries are not visible in this listing.
 */
1243 const UConverterSharedData _UTF32Data = {
1244 sizeof(UConverterSharedData), ~((uint32_t) 0),
1245 NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl,