2 **********************************************************************
3 * Copyright (C) 2000-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv2022.cpp
8 * tab size: 8 (not used)
11 * created on: 2000feb03
12 * created by: Markus W. Scherer
16 * 06/29/2000 helena Major rewrite of the callback APIs.
17 * 08/08/2000 Ram Included support for ISO-2022-JP-2
18 * Changed implementation of toUnicode
20 * 08/21/2000 Ram Added support for ISO-2022-KR
21 * 08/29/2000 Ram Separated implementation of EBCDIC to
23 * 09/20/2000 Ram Added support for ISO-2022-CN
24 * Added implementations for getNextUChar()
25 * for specific 2022 country variants.
26 * 10/31/2000 Ram Implemented offsets logic functions
29 #include "unicode/utypes.h"
31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
33 #include "unicode/ucnv.h"
34 #include "unicode/uset.h"
35 #include "unicode/ucnv_err.h"
36 #include "unicode/ucnv_cb.h"
37 #include "unicode/utf16.h"
46 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
48 #ifdef U_ENABLE_GENERIC_ISO_2022
50 * I am disabling the generic ISO-2022 converter after proposing to do so on
51 * the icu mailing list two days ago.
54 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
55 * its designation sequences, single shifts with return to the previous state,
56 * switch-with-no-return to UTF-16BE or similar, etc.
57 * This is unlike the language-specific variants like ISO-2022-JP which
58 * require a much smaller repertoire of ISO-2022 features.
59 * These variants continue to be supported.
60 * 2. I believe that no one is really using the generic ISO-2022 converter
61 * but rather always one of the language-specific variants.
62 * Note that ICU's generic ISO-2022 converter has always output one escape
63 * sequence followed by UTF-8 for the whole stream.
64 * 3. Switching between subcharsets is extremely slow, because each time
65 * the previous converter is closed and a new one opened,
66 * without any kind of caching, least-recently-used list, etc.
67 * 4. The code is currently buggy, and given the above it does not seem
68 * reasonable to spend the time on maintenance.
69 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
70 * This means, for example, that when ISO-8859-7 is designated, the following
71 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
72 * The ICU ISO-2022 converter does not handle this - and has no information
73 * about which subconverter would have to be shifted vs. which is designed
76 * Markus Scherer 2003-dec-03
80 static const char SHIFT_IN_STR[] = "\x0F";
81 // static const char SHIFT_OUT_STR[] = "\x0E";
95 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
96 * as bytes 21..7E. (Subtract 0x80.)
97 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
98 * as bytes 20..7F. (Subtract 0x80.)
99 * Do not encode C1 control codes with native bytes 80..9F
100 * as bytes 00..1F (C0 control codes).
110 * ISO 2022 control codes must not be converted from Unicode
111 * because they would mess up the byte stream.
112 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
113 * corresponding to SO, SI, and ESC.
115 #define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
117 /* for ISO-2022-JP and -CN implementations */
134 HWKANA_7BIT=8, /* Halfwidth Katakana 7 bit */
137 /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
143 * these are used in StateEnum and ISO2022State variables,
144 * but CNS_11643 must be used to index into myConverterArray[]
156 /* is the StateEnum charset value for a DBCS charset? */
157 #if UCONFIG_NO_NON_HTML5_CONVERSION
158 #define IS_JP_DBCS(cs) (JISX208==(cs))
160 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
163 #define CSM(cs) ((uint16_t)1<<(cs))
166 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
167 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
169 * Note: The converter uses some leniency:
170 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
171 * all versions, not just JIS7 and JIS8.
172 * - ICU does not distinguish between different versions of JIS X 0208.
174 #if UCONFIG_NO_NON_HTML5_CONVERSION
175 enum { MAX_JA_VERSION=0 };
177 enum { MAX_JA_VERSION=4 };
179 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
181 * TODO(jshin): The encoding spec has JISX212, but we don't support it.
182 * See https://www.w3.org/Bugs/Public/show_bug.cgi?id=26885
184 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
185 #if !UCONFIG_NO_NON_HTML5_CONVERSION
186 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
187 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
188 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
189 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
202 typedef struct ISO2022State {
203 int8_t cs[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
204 int8_t g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
205 int8_t prevG; /* g before single shift (SS2 or SS3) */
208 #define UCNV_OPTIONS_VERSION_MASK 0xf
209 #define UCNV_2022_MAX_CONVERTERS 10
212 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
213 UConverter *currentConverter;
214 Cnv2022Type currentType;
215 ISO2022State toU2022State, fromU2022State;
218 #ifdef U_ENABLE_GENERIC_ISO_2022
221 UBool isEmptySegment;
224 }UConverterDataISO2022;
227 /* ISO-2022 ----------------------------------------------------------------- */
229 /*Forward declaration */
231 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
234 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
237 #define ESC_2022 0x1B /*ESC*/
241 INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
242 VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
243 VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
244 VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
245 } UCNV_TableStates_2022;
248 * The way these state transition arrays work is:
249 * ex : ESC$B is the sequence for JISX208
250 * a) First Iteration: char is ESC
251 * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
252 * int x = normalize_esq_chars_2022[27] which is equal to 1
253 * ii) Search for this value in escSeqStateTable_Key_2022[]
254 * value of x is stored at escSeqStateTable_Key_2022[0]
255 * iii) Save this index as offset
256 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
257 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
258 * b) Switch on this state and continue to next char
259 * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
260 * which is normalize_esq_chars_2022[36] == 4
261 * ii) x is currently 1(from above)
262 * x<<=5 -- x is now 32
263 * x+=normalize_esq_chars_2022[36]
265 * iii) Search for this value in escSeqStateTable_Key_2022[]
266 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
267 * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
268 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
269 * c) Switch on this state and continue to next char
270 * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
271 * ii) x is currently 36 (from above)
272 * x<<=5 -- x is now 1152
273 * x+=normalize_esq_chars_2022[66]
275 * iii) Search for this value in escSeqStateTable_Key_2022[]
276 * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
277 * iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
278 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
279 * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
283 /*Below are the 3 arrays depicting a state transition table*/
284 static const int8_t normalize_esq_chars_2022[256] = {
285 /* 0 1 2 3 4 5 6 7 8 9 */
287 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
288 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
289 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0
290 ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0
291 ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0
292 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
293 ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12
294 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28
295 ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0
296 ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
297 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
298 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
299 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
300 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
301 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
302 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
303 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
304 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
305 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
306 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
307 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
308 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
309 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
310 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
311 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
315 #ifdef U_ENABLE_GENERIC_ISO_2022
317 * When the generic ISO-2022 converter is completely removed, not just disabled
318 * per #ifdef, then the following state table and the associated tables that are
319 * dimensioned with MAX_STATES_2022 should be trimmed.
321 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
322 * the associated escape sequences starting with ESC ( B should be removed.
323 * This includes the ones with key values 1097 and all of the ones above 1000000.
325 * For the latter, the tables can simply be truncated.
326 * For the former, since the tables must be kept parallel, it is probably best
327 * to simply duplicate an adjacent table cell, parallel in all tables.
329 * It may make sense to restructure the tables, especially by using small search
330 * tables for the variants instead of indexing them parallel to the table here.
334 #define MAX_STATES_2022 74
335 static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
336 /* 0 1 2 3 4 5 6 7 8 9 */
338 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096
339 ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106
340 ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257
341 ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940
342 ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644
343 ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138
344 ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630
345 ,35947631 ,35947635 ,35947636 ,35947638
348 #ifdef U_ENABLE_GENERIC_ISO_2022
350 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
351 /* 0 1 2 3 4 5 6 7 8 9 */
353 NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1"
354 ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
355 ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8"
356 ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
357 ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
358 ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
359 ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
360 ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
365 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
366 /* 0 1 2 3 4 5 6 7 8 9 */
367 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
368 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
369 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
370 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
371 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
372 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
373 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
374 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
377 /* Type def for refactoring changeState_2022 code*/
379 #ifdef U_ENABLE_GENERIC_ISO_2022
382 #if UCONFIG_NO_NON_HTML5_CONVERSION
391 /*********** ISO 2022 Converter Protos ***********/
393 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
396 _ISO2022Close(UConverter *converter);
399 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
402 _ISO2022getName(const UConverter* cnv);
405 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
408 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
410 #ifdef U_ENABLE_GENERIC_ISO_2022
412 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
417 /*const UConverterSharedData _ISO2022Data;*/
418 extern const UConverterSharedData _ISO2022JPData;
419 extern const UConverterSharedData _ISO2022KRData;
420 extern const UConverterSharedData _ISO2022CNData;
424 /*************** Converter implementations ******************/
426 /* The purpose of this function is to get around gcc compiler warnings. */
428 fromUWriteUInt8(UConverter *cnv,
429 const char *bytes, int32_t length,
430 uint8_t **target, const char *targetLimit,
433 UErrorCode *pErrorCode)
435 char *targetChars = (char *)*target;
436 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
437 offsets, sourceIndex, pErrorCode);
438 *target = (uint8_t*)targetChars;
443 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
444 if(myConverterData->version == 1) {
445 UConverter *cnv = myConverterData->currentConverter;
447 cnv->toUnicodeStatus=0; /* offset */
448 cnv->mode=0; /* state */
449 cnv->toULength=0; /* byteIndex */
454 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
455 /* in ISO-2022-KR the designator sequence appears only once
456 * in a file so we append it only once
458 if( converter->charErrorBufferLength==0){
460 converter->charErrorBufferLength = 4;
461 converter->charErrorBuffer[0] = 0x1b;
462 converter->charErrorBuffer[1] = 0x24;
463 converter->charErrorBuffer[2] = 0x29;
464 converter->charErrorBuffer[3] = 0x43;
466 if(myConverterData->version == 1) {
467 UConverter *cnv = myConverterData->currentConverter;
470 cnv->fromUnicodeStatus=1; /* prevLength */
475 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
477 char myLocale[6]={' ',' ',' ',' ',' ',' '};
479 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
480 if(cnv->extraInfo != NULL) {
481 UConverterNamePieces stackPieces;
482 UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
483 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
486 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
488 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
489 myConverterData->currentType = ASCII1;
490 cnv->fromUnicodeStatus =FALSE;
492 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
494 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
495 myConverterData->version = version;
496 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
497 (myLocale[2]=='_' || myLocale[2]=='\0'))
500 /* open the required converters and cache them */
501 if(version>MAX_JA_VERSION) {
502 /* prevent indexing beyond jpCharsetMasks[] */
503 myConverterData->version = version = 0;
505 #if !UCONFIG_NO_NON_HTML5_CONVERSION
506 if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
507 myConverterData->myConverterArray[ISO8859_7] =
508 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
511 myConverterData->myConverterArray[JISX208] =
512 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
513 #if !UCONFIG_NO_NON_HTML5_CONVERSION
514 if(jpCharsetMasks[version]&CSM(JISX212)) {
515 myConverterData->myConverterArray[JISX212] =
516 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
518 if(jpCharsetMasks[version]&CSM(GB2312)) {
519 myConverterData->myConverterArray[GB2312] =
520 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
522 if(jpCharsetMasks[version]&CSM(KSC5601)) {
523 myConverterData->myConverterArray[KSC5601] =
524 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
528 /* set the function pointers to appropriate functions */
529 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
530 uprv_strcpy(myConverterData->locale,"ja");
532 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
533 len = uprv_strlen(myConverterData->name);
534 myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
535 myConverterData->name[len+1]='\0';
537 #if !UCONFIG_NO_NON_HTML5_CONVERSION
538 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
539 (myLocale[2]=='_' || myLocale[2]=='\0'))
543 cnvName="icu-internal-25546";
546 myConverterData->version=version=0;
548 if(pArgs->onlyTestIsLoadable) {
549 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */
550 uprv_free(cnv->extraInfo);
554 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
555 if (U_FAILURE(*errorCode)) {
561 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
562 uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
563 cnv->subCharLen = myConverterData->currentConverter->subCharLen;
565 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
568 /* initialize the state variables */
569 setInitialStateToUnicodeKR(cnv, myConverterData);
570 setInitialStateFromUnicodeKR(cnv, myConverterData);
572 /* set the function pointers to appropriate functions */
573 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
574 uprv_strcpy(myConverterData->locale,"ko");
577 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
578 (myLocale[2]=='_' || myLocale[2]=='\0'))
581 /* open the required converters and cache them */
582 myConverterData->myConverterArray[GB2312_1] =
583 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
585 myConverterData->myConverterArray[ISO_IR_165] =
586 ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
588 myConverterData->myConverterArray[CNS_11643] =
589 ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
592 /* set the function pointers to appropriate functions */
593 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
594 uprv_strcpy(myConverterData->locale,"cn");
597 myConverterData->version = 0;
598 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
599 }else if (version==1){
600 myConverterData->version = 1;
601 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
603 myConverterData->version = 2;
604 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
607 #endif // !UCONFIG_NO_NON_HTML5_CONVERSION
609 #ifdef U_ENABLE_GENERIC_ISO_2022
610 myConverterData->isFirstBuffer = TRUE;
612 /* append the UTF-8 escape sequence */
613 cnv->charErrorBufferLength = 3;
614 cnv->charErrorBuffer[0] = 0x1b;
615 cnv->charErrorBuffer[1] = 0x25;
616 cnv->charErrorBuffer[2] = 0x42;
618 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
619 /* initialize the state variables */
620 uprv_strcpy(myConverterData->name,"ISO_2022");
622 *errorCode = U_UNSUPPORTED_ERROR;
627 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
629 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
633 *errorCode = U_MEMORY_ALLOCATION_ERROR;
639 _ISO2022Close(UConverter *converter) {
640 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
641 UConverterSharedData **array = myData->myConverterArray;
644 if (converter->extraInfo != NULL) {
645 /*close the array of converter pointers and free the memory*/
646 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
648 ucnv_unloadSharedDataIfReady(array[i]);
652 ucnv_close(myData->currentConverter);
654 if(!converter->isExtraLocal){
655 uprv_free (converter->extraInfo);
656 converter->extraInfo = NULL;
662 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
663 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
664 if(choice<=UCNV_RESET_TO_UNICODE) {
665 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
666 myConverterData->key = 0;
667 myConverterData->isEmptySegment = FALSE;
669 if(choice!=UCNV_RESET_TO_UNICODE) {
670 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
672 #ifdef U_ENABLE_GENERIC_ISO_2022
673 if(myConverterData->locale[0] == 0){
674 if(choice<=UCNV_RESET_TO_UNICODE) {
675 myConverterData->isFirstBuffer = TRUE;
676 myConverterData->key = 0;
677 if (converter->mode == UCNV_SO){
678 ucnv_close (myConverterData->currentConverter);
679 myConverterData->currentConverter=NULL;
681 converter->mode = UCNV_SI;
683 if(choice!=UCNV_RESET_TO_UNICODE) {
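684 /* re-append UTF-8 escape sequence; NOTE(review): the bytes below are ESC ( B (0x1b 0x28 0x42), whereas _ISO2022Open appends ESC % B (0x1b 0x25 0x42) as the UTF-8 escape sequence - confirm which is intended */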
685 converter->charErrorBufferLength = 3;
686 converter->charErrorBuffer[0] = 0x1b;
687 converter->charErrorBuffer[1] = 0x28;
688 converter->charErrorBuffer[2] = 0x42;
694 /* reset the state variables */
695 if(myConverterData->locale[0] == 'k'){
696 if(choice<=UCNV_RESET_TO_UNICODE) {
697 setInitialStateToUnicodeKR(converter, myConverterData);
699 if(choice!=UCNV_RESET_TO_UNICODE) {
700 setInitialStateFromUnicodeKR(converter, myConverterData);
707 _ISO2022getName(const UConverter* cnv){
709 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
716 /*************** to unicode *******************/
717 /****************************************************************************
718 * Recognized escape sequences are
730 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
731 /* 0 1 2 3 4 5 6 7 8 9 */
732 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
733 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
734 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
735 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
736 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
737 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
738 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
739 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
742 #if !UCONFIG_NO_NON_HTML5_CONVERSION
743 /*************** to unicode *******************/
744 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
745 /* 0 1 2 3 4 5 6 7 8 9 */
746 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
747 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
748 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
749 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
750 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
751 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
752 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
753 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
758 static UCNV_TableStates_2022
759 getKey_2022(char c,int32_t* key,int32_t* offset){
762 int32_t hi = MAX_STATES_2022;
765 togo = normalize_esq_chars_2022[(uint8_t)c];
767 /* not a valid character anywhere in an escape sequence */
772 togo = (*key << 5) + togo;
774 while (hi != low) /*binary search*/{
776 register int32_t mid = (hi+low) >> 1; /*Finds median*/
781 if (escSeqStateTable_Key_2022[mid] > togo){
784 else if (escSeqStateTable_Key_2022[mid] < togo){
787 else /*we found it*/{
790 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
801 /*runs through a state machine to determine the escape sequence - codepage correspondence
804 changeState_2022(UConverter* _this,
806 const char* sourceLimit,
809 UCNV_TableStates_2022 value;
810 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
811 uint32_t key = myData2022->key;
813 int8_t initialToULength = _this->toULength;
816 value = VALID_NON_TERMINAL_2022;
817 while (*source < sourceLimit) {
819 _this->toUBytes[_this->toULength++]=(uint8_t)c;
820 value = getKey_2022(c,(int32_t *) &key, &offset);
824 case VALID_NON_TERMINAL_2022 :
825 /* continue with the loop */
828 case VALID_TERMINAL_2022:
835 case VALID_MAYBE_TERMINAL_2022:
836 #ifdef U_ENABLE_GENERIC_ISO_2022
837 /* ESC ( B is ambiguous only for ISO_2022 itself */
838 if(var == ISO_2022) {
839 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
840 _this->toULength = 0;
842 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
844 /* continue with the loop */
845 value = VALID_NON_TERMINAL_2022;
850 /* not ISO_2022 itself, finish here */
851 value = VALID_TERMINAL_2022;
859 myData2022->key = key;
861 if (value == VALID_NON_TERMINAL_2022) {
862 /* indicate that the escape sequence is incomplete: key!=0 */
864 } else if (value == INVALID_2022 ) {
865 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
866 } else /* value == VALID_TERMINAL_2022 */ {
868 #ifdef U_ENABLE_GENERIC_ISO_2022
871 const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
872 if(chosenConverterName == NULL) {
874 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
875 _this->toUCallbackReason = UCNV_UNASSIGNED;
879 _this->mode = UCNV_SI;
880 ucnv_close(myData2022->currentConverter);
881 myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
882 if(U_SUCCESS(*err)) {
883 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
884 _this->mode = UCNV_SO;
891 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
894 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
897 if(myData2022->toU2022State.cs[2]!=0) {
898 if(myData2022->toU2022State.g<2) {
899 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
901 myData2022->toU2022State.g=2;
903 /* illegal to have SS2 before a matching designator */
904 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
907 /* case SS3_STATE: not used in ISO-2022-JP-x */
908 #if !UCONFIG_NO_NON_HTML5_CONVERSION
911 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
912 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
914 /* G2 charset for SS2 */
915 myData2022->toU2022State.cs[2]=(int8_t)tempState;
920 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
921 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
924 myData2022->toU2022State.cs[0]=(int8_t)tempState;
930 #if !UCONFIG_NO_NON_HTML5_CONVERSION
933 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
936 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
939 if(myData2022->toU2022State.cs[2]!=0) {
940 if(myData2022->toU2022State.g<2) {
941 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
943 myData2022->toU2022State.g=2;
945 /* illegal to have SS2 before a matching designator */
946 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
950 if(myData2022->toU2022State.cs[3]!=0) {
951 if(myData2022->toU2022State.g<2) {
952 myData2022->toU2022State.prevG=myData2022->toU2022State.g;
954 myData2022->toU2022State.g=3;
956 /* illegal to have SS3 before a matching designator */
957 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
961 if(myData2022->version==0) {
962 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
969 myData2022->toU2022State.cs[1]=(int8_t)tempState;
972 myData2022->toU2022State.cs[2]=(int8_t)tempState;
975 /* other CNS 11643 planes */
976 if(myData2022->version==0) {
977 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
979 myData2022->toU2022State.cs[3]=(int8_t)tempState;
987 /* nothing to be done, just accept this one escape sequence */
989 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
992 #endif /* #if !UCONFIG_NO_NON_HTML5_CONVERSION */
995 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
999 if(U_SUCCESS(*err)) {
1000 _this->toULength = 0;
1001 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
1002 if(_this->toULength>1) {
1004 * Ticket 5691: consistent illegal sequences:
1005 * - We include at least the first byte (ESC) in the illegal sequence.
1006 * - If any of the non-initial bytes could be the start of a character,
1007 * we stop the illegal sequence before the first one of those.
1008 * In escape sequences, all following bytes are "printable", that is,
1009 * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
1010 * they are valid single/lead bytes.
1011 * For simplicity, we always only report the initial ESC byte as the
1012 * illegal sequence and back out all other bytes we looked at.
1014 /* Back out some bytes. */
1015 int8_t backOutDistance=_this->toULength-1;
1016 int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
1017 if(backOutDistance<=bytesFromThisBuffer) {
1018 /* same as initialToULength<=1 */
1019 *source-=backOutDistance;
1021 /* Back out bytes from the previous buffer: Need to replay them. */
1022 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
1023 /* same as -(initialToULength-1) */
1024 /* preToULength is negative! */
1025 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
1026 *source-=bytesFromThisBuffer;
1030 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1031 _this->toUCallbackReason = UCNV_UNASSIGNED;
1035 /*Checks the characters of the buffer against valid 2022 escape sequences.
1036 *If they match, we return a pointer to the initial start of the sequence; otherwise
1037 *we return sourceLimit.
1039 /*for 2022 looks ahead in the stream
1040 *to determine the longest possible convertible
1043 static inline const char*
1044 getEndOfBuffer_2022(const char** source,
1045 const char* sourceLimit,
1048 const char* mySource = *source;
1050 #ifdef U_ENABLE_GENERIC_ISO_2022
1051 if (*source >= sourceLimit)
1056 if (*mySource == ESC_2022){
1060 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1062 /* Kludge: I could not
1063 * figure out the reason for validating an escape sequence
1064 * twice - once here and once in changeState_2022().
1065 * is it possible to have an ESC character in a ISO2022
1066 * byte stream which is valid in a code page? Is it legal?
1069 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1071 value = getKey_2022(*(mySource+i), &key, &offset);
1073 if (value > 0 || *mySource==ESC_2022)
1076 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1079 }while (++mySource < sourceLimit);
1083 while(mySource < sourceLimit && *mySource != ESC_2022) {
1091 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1092 * any future change in _MBCSFromUChar32() function should be reflected here.
1093 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1095 static inline int32_t
1096 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1103 const uint16_t *table;
1104 uint32_t stage2Entry;
1109 * TODO(markus): Use and require new, faster MBCS conversion table structures.
1110 * Use internal version of ucnv_open() that verifies that the new structures are available,
1111 * else U_INTERNAL_PROGRAM_ERROR.
1113 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1114 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1115 table=sharedData->mbcs.fromUnicodeTable;
1116 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1117 /* get the bytes and the length for the output */
1118 if(outputType==MBCS_OUTPUT_2){
1119 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1125 } else /* outputType==MBCS_OUTPUT_3 */ {
1126 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1127 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1130 } else if(myValue<=0xffff) {
1136 /* is this code point assigned, or do we use fallbacks? */
1137 if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1141 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1143 * We allow a 0 byte output if the "assigned" bit is set for this entry.
1144 * There is no way with this data structure for fallback output
1145 * to be a zero byte.
1152 cx=sharedData->mbcs.extIndexes;
1154 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1161 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1162 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1163 * @param retval pointer to output byte
1164 * @return 1 roundtrip byte 0 no mapping -1 fallback byte
1166 static inline int32_t
1167 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1172 const uint16_t *table;
1174 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1175 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1178 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1179 table=sharedData->mbcs.fromUnicodeTable;
1180 /* get the byte for the output */
1181 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1182 /* is this code point assigned, or do we use fallbacks? */
/* the low 8 bits of the table result are written to *retval before the result is classified below */
1183 *retval=(uint32_t)(value&0xff);
1185 return 1; /* roundtrip */
/* the threshold for accepting a result as a fallback is 0x800 when useFallback is set and 0xc00 when it is not */
1186 } else if(useFallback ? value>=0x800 : value>=0xc00) {
1187 return -1; /* fallback taken */
1189 return 0; /* no mapping */
1194 * Check that the result is a 2-byte value with each byte in the range A1..FE
1195 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
1196 * to move it to the ISO 2022 range 21..7E.
1197 * Return 0 if out of range.
/*
 * Convert a strict EUC DBCS byte pair (both bytes in A1..FE) to the
 * ISO 2022 range (both bytes in 21..7E) by subtracting 0x80 from each byte.
 *
 * @param value candidate byte pair: lead byte in bits 15..8, trail byte in bits 7..0
 * @return the shifted byte pair, or 0 if value is not a 2-byte value
 *         with both bytes in A1..FE
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    /*
     * Compare the full 32-bit value: truncating to 16 bits before the
     * range test would let inputs wider than two bytes (e.g. 0x1A1A1) pass
     * and return a result with bits above bit 15 set.
     * The first two comparisons bound the lead byte to A1..FE,
     * the third one bounds the trail byte to A1..FE.
     */
    if( value >= 0xa1a1 && value <= 0xfefe &&
        (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
    ) {
        return value - 0x8080;  /* shift down to 21..7e byte range */
    } else {
        return 0;  /* not valid for ISO 2022 */
    }
}
1210 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
1212 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
1213 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
1216 static inline uint32_t
1217 _2022ToGR94DBCS(uint32_t value) {
1218 uint32_t returnValue = value + 0x8080;
1219 if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
1220 (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
1228 #ifdef U_ENABLE_GENERIC_ISO_2022
1230 /**********************************************************************************
1231 * ISO-2022 Converter
1237 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1239 const char* mySourceLimit, *realSourceLimit;
1240 const char* sourceStart;
1241 const UChar* myTargetStart;
1242 UConverter* saveThis;
1243 UConverterDataISO2022* myData;
1246 saveThis = args->converter;
1247 myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1249 realSourceLimit = args->sourceLimit;
1250 while (args->source < realSourceLimit) {
1251 if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1252 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1253 mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1255 if(args->source < mySourceLimit) {
1256 if(myData->currentConverter==NULL) {
1257 myData->currentConverter = ucnv_open("ASCII",err);
1258 if(U_FAILURE(*err)){
1262 myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1263 saveThis->mode = UCNV_SO;
1266 /* convert to before the ESC or until the end of the buffer */
1267 myData->isFirstBuffer=FALSE;
1268 sourceStart = args->source;
1269 myTargetStart = args->target;
1270 args->converter = myData->currentConverter;
1271 ucnv_toUnicode(args->converter,
1277 (UBool)(args->flush && mySourceLimit == realSourceLimit),
1279 args->converter = saveThis;
1281 if (*err == U_BUFFER_OVERFLOW_ERROR) {
1282 /* move the overflow buffer */
1283 length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1284 myData->currentConverter->UCharErrorBufferLength = 0;
1286 uprv_memcpy(saveThis->UCharErrorBuffer,
1287 myData->currentConverter->UCharErrorBuffer,
1288 length*U_SIZEOF_UCHAR);
1295 * -Error while converting
1296 * -Done with entire buffer
1297 * -Need to write offsets or update the current offset
1298 * (leave that up to the code in ucnv.c)
1300 * or else we just stopped at an ESC byte and continue with changeState_2022()
1302 if (U_FAILURE(*err) ||
1303 (args->source == realSourceLimit) ||
1304 (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
1305 (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1307 /* copy partial or error input for truncated detection and error handling */
1308 if(U_FAILURE(*err)) {
1309 length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1311 uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1314 length = saveThis->toULength = myData->currentConverter->toULength;
1316 uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1317 if(args->source < mySourceLimit) {
1318 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1327 sourceStart = args->source;
1328 changeState_2022(args->converter,
1333 if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
1334 /* let the ucnv.c code update its current offset */
1343 * To Unicode Callback helper function
1346 toUnicodeCallback(UConverter *cnv,
1347 const uint32_t sourceChar, const uint32_t targetUniChar,
/* a source value above 0xff is stored into cnv->toUBytes as two bytes, high byte first */
1349 if(sourceChar>0xff){
1350 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1351 cnv->toUBytes[1] = (uint8_t)sourceChar;
/* single-byte store; NOTE(review): presumably the else branch of the test above - confirm */
1355 cnv->toUBytes[0] =(char) sourceChar;
/* targetUniChar equal to missingCharMarker-1 selects U_INVALID_CHAR_FOUND; the other error code set here is U_ILLEGAL_CHAR_FOUND */
1359 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1360 *err = U_INVALID_CHAR_FOUND;
1363 *err = U_ILLEGAL_CHAR_FOUND;
1367 /**************************************ISO-2022-JP*************************************************/
1369 /************************************** IMPORTANT **************************************************
1370 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1371 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1372 * The converter iterates over each Unicode codepoint
1373 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1374 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1375 * would do as far as possible.
1377 * If the implementation of these macros or structure of sharedData struct change in the future, make
1378 * sure that ISO-2022 is also changed.
1379 ***************************************************************************************************
1382 /***************************************************************************************************
1383 * Rules for ISO-2022-jp encoding
1384 * (i) Escape sequences must be fully contained within a line they should not
1385 * span new lines or CRs
1386 * (ii) If the last character on a line is represented by two bytes then an ASCII or
1387 * JIS-Roman character escape sequence should follow before the line terminates
1388 * (iii) If the first character on the line is represented by two bytes then a two
1389 * byte character escape sequence should precede it
1390 * (iv) If no escape sequence is encountered then the characters are ASCII
1391 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1392 * and invoked with SS2 (ESC N).
1393 * (vi) If there is any G0 designation in text, there must be a switch to
1394 * ASCII or to JIS X 0201-Roman before a space character (but not
1395 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1396 * characters such as tab or CRLF.
1397 * (vii) Supported encodings:
1398 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1402 * JISX201, JISX208,JISX212 : new .cnv data files created
1403 * KSC5601 : alias to ibm-949 mapping table
1404 * GB2312 : alias to ibm-1386 mapping table
1405 * ISO-8859-1 : Algorithmic implemented as LATIN1 case
1406 * ISO-8859-7 : alias to ibm-9409 mapping table
1409 /* preference order of JP charsets */
1410 static const StateEnum jpCharsetPref[]={
1413 #if !UCONFIG_NO_NON_HTML5_CONVERSION
1418 #if !UCONFIG_NO_NON_HTML5_CONVERSION
1427 * The escape sequences must be in order of the enum constants like JISX201 = 3,
1428 * not in order of jpCharsetPref[]!
1430 static const char escSeqChars[][6] ={
1431 "\x1B\x28\x42", /* <ESC>(B ASCII */
1432 "\x1B\x2E\x41", /* <ESC>.A ISO-8859-1 */
1433 "\x1B\x2E\x46", /* <ESC>.F ISO-8859-7 */
1434 "\x1B\x28\x4A", /* <ESC>(J JISX-201 */
1435 "\x1B\x24\x42", /* <ESC>$B JISX-208 */
1436 "\x1B\x24\x28\x44", /* <ESC>$(D JISX-212 */
1437 "\x1B\x24\x41", /* <ESC>$A GB2312 */
1438 "\x1B\x24\x28\x43", /* <ESC>$(C KSC5601 */
1439 "\x1B\x28\x49" /* <ESC>(I HWKANA_7BIT */
1442 static const int8_t escSeqCharsLen[] ={
1443 3, /* length of <ESC>(B ASCII */
1444 3, /* length of <ESC>.A ISO-8859-1 */
1445 3, /* length of <ESC>.F ISO-8859-7 */
1446 3, /* length of <ESC>(J JISX-201 */
1447 3, /* length of <ESC>$B JISX-208 */
1448 4, /* length of <ESC>$(D JISX-212 */
1449 3, /* length of <ESC>$A GB2312 */
1450 4, /* length of <ESC>$(C KSC5601 */
1451 3 /* length of <ESC>(I HWKANA_7BIT */
1455 * The iteration over various code pages works this way:
1456 * i) Get the currentState from myConverterData->currentState
1457 * ii) Check if the character is mapped to a valid character in the currentState
1458 * Yes -> a) set the initIterState to currentState
1459 * b) remain in this state until an invalid character is found
1460 * No -> a) go to the next code page and find the character
1461 * iii) Before changing the state, increment the current state and check whether the current state
1462 * is equal to the initIterState
1463 * Yes -> A character that cannot be represented in any of the supported encodings
1464 * break and return a U_INVALID_CHARACTER error
1465 * No -> Continue and find the character in next code page
1468 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1471 /* Map 00..7F to Unicode according to JIS X 0201. */
1472 static inline uint32_t
1473 jisx201ToU(uint32_t value) {
1476 } else if(value == 0x5c) {
1478 } else if(value == 0x7e) {
1480 } else /* value <= 0x7f */ {
1485 /* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */
1486 static inline uint32_t
1487 jisx201FromU(uint32_t value) {
1489 if(value!=0x5c && value!=0x7e) {
1492 } else if(value==0xa5) {
1494 } else if(value==0x203e) {
1501 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
1502 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
1503 * Return 0 if the byte pair is out of range.
1505 static inline uint32_t
1506 _2022FromSJIS(uint32_t value) {
1509 if(value > 0xEFFC) {
1510 return 0; /* beyond JIS X 0208 */
1513 trail = (uint8_t)value;
1515 value &= 0xff00; /* lead byte */
1516 if(value <= 0x9f00) {
1518 } else /* 0xe000 <= value <= 0xef00 */ {
1526 value |= trail - 0x1f;
1528 value |= trail - 0x20;
1530 } else /* trail <= 0xfc */ {
1531 value |= trail - 0x7e;
1537 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
1538 * If either byte is outside 21..7E make sure that the result is not valid
1539 * for Shift-JIS so that the converter catches it.
1540 * Some invalid byte values already turn into equally invalid Shift-JIS
1541 * byte values and need not be tested explicitly.
1544 _2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
1549 } else if(c2 <= 0x7e) {
1552 c2 = 0; /* invalid */
1555 if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
1558 c2 = 0; /* invalid */
1564 } else if(c1 <= 0x3f) {
1567 c1 = 0; /* invalid */
1569 bytes[0] = (char)c1;
1570 bytes[1] = (char)c2;
1574 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1576 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1577 * because Shift-JIS roundtrips half-width Katakana to single bytes.
1578 * These were the only fallbacks in ICU's jisx-208.ucm file.
1580 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1581 0x2123, /* U+FF61 */
1596 0x213C, /* U+FF70 */
1612 0x253F, /* U+FF80 */
1628 0x255F, /* U+FF90 */
1647 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1648 UConverter *cnv = args->converter;
1649 UConverterDataISO2022 *converterData;
1650 ISO2022State *pFromU2022State;
1651 uint8_t *target = (uint8_t *) args->target;
1652 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1653 const UChar* source = args->source;
1654 const UChar* sourceLimit = args->sourceLimit;
1655 int32_t* offsets = args->offsets;
1658 int32_t len, outLen;
1660 int32_t choiceCount;
1661 uint32_t targetValue = 0;
1667 /* set up the state */
1668 converterData = (UConverterDataISO2022*)cnv->extraInfo;
1669 pFromU2022State = &converterData->fromU2022State;
1673 /* check if the last codepoint of previous buffer was a lead surrogate*/
1674 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1678 while(source < sourceLimit) {
1679 if(target < targetLimit) {
1681 sourceChar = *(source++);
1682 /*check if the char is a First surrogate*/
1683 if(U16_IS_SURROGATE(sourceChar)) {
1684 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
1686 /*look ahead to find the trail surrogate*/
1687 if(source < sourceLimit) {
1688 /* test the following code unit */
1689 UChar trail=(UChar) *source;
1690 if(U16_IS_TRAIL(trail)) {
1692 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
1693 cnv->fromUChar32=0x00;
1694 /* convert this supplementary code point */
1695 /* exit this condition tree */
1697 /* this is an unmatched lead code unit (1st surrogate) */
1698 /* callback(illegal) */
1699 *err=U_ILLEGAL_CHAR_FOUND;
1700 cnv->fromUChar32=sourceChar;
1705 cnv->fromUChar32=sourceChar;
1709 /* this is an unmatched trail code unit (2nd surrogate) */
1710 /* callback(illegal) */
1711 *err=U_ILLEGAL_CHAR_FOUND;
1712 cnv->fromUChar32=sourceChar;
1717 /* do not convert SO/SI/ESC */
1718 if(IS_2022_CONTROL(sourceChar)) {
1719 /* callback(illegal) */
1720 *err=U_ILLEGAL_CHAR_FOUND;
1721 cnv->fromUChar32=sourceChar;
1725 /* do the conversion */
1727 if(choiceCount == 0) {
1731 * The csm variable keeps track of which charsets are allowed
1732 * and not used yet while building the choices[].
1734 csm = jpCharsetMasks[converterData->version];
1737 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1738 if(converterData->version == 3 || converterData->version == 4) {
1739 choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1741 /* Do not try single-byte half-width Katakana for other versions. */
1742 csm &= ~CSM(HWKANA_7BIT);
1744 /* try the current G0 charset */
1745 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1748 /* try the current G2 charset */
1749 if((cs = pFromU2022State->cs[2]) != 0) {
1750 choices[choiceCount++] = cs;
1754 /* try all the other possible charsets */
1755 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1756 cs = (int8_t)jpCharsetPref[i];
1758 choices[choiceCount++] = cs;
1766 * len==0: no mapping found yet
1767 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1768 * len>0: found a roundtrip result, done
1772 * We will turn off useFallback after finding a fallback,
1773 * but we still get fallbacks from PUA code points as usual.
1774 * Therefore, we will also need to check that we don't overwrite
1775 * an early fallback with a later one.
1777 useFallback = cnv->useFallback;
1779 for(i = 0; i < choiceCount && len <= 0; ++i) {
1782 int8_t cs0 = choices[i];
1785 if(sourceChar <= 0x7f) {
1786 targetValue = (uint32_t)sourceChar;
1792 #if !UCONFIG_NO_NON_HTML5_CONVERSION
1794 if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1795 targetValue = (uint32_t)sourceChar - 0x80;
1803 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1804 if(converterData->version==3) {
1805 /* JIS7: use G1 (SO) */
1806 /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1807 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1809 pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1811 } else if(converterData->version==4) {
1812 /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1813 /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1814 targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1817 cs = pFromU2022State->cs[0];
1818 if(IS_JP_DBCS(cs)) {
1819 /* switch from a DBCS charset to JISX201 */
1820 cs = (int8_t)JISX201;
1822 /* else stay in the current G0 charset */
1825 /* else do not use HWKANA_7BIT with other versions */
1830 value = jisx201FromU(sourceChar);
1832 targetValue = value;
1836 useFallback = FALSE;
1840 /* G0 DBCS from Shift-JIS table */
1841 len2 = MBCS_FROM_UCHAR32_ISO2022(
1842 converterData->myConverterArray[cs0],
1844 useFallback, MBCS_OUTPUT_2);
1845 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1846 value = _2022FromSJIS(value);
1848 targetValue = value;
1852 useFallback = FALSE;
1854 } else if(len == 0 && useFallback &&
1855 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1856 targetValue = hwkana_fb[sourceChar - HWKANA_START];
1860 useFallback = FALSE;
1863 #if !UCONFIG_NO_NON_HTML5_CONVERSION
1865 /* G0 SBCS forced to 7-bit output */
1866 len2 = MBCS_SINGLE_FROM_UCHAR32(
1867 converterData->myConverterArray[cs0],
1870 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1871 targetValue = value - 0x80;
1875 useFallback = FALSE;
1881 len2 = MBCS_FROM_UCHAR32_ISO2022(
1882 converterData->myConverterArray[cs0],
1884 useFallback, MBCS_OUTPUT_2);
1885 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1886 #if !UCONFIG_NO_NON_HTML5_CONVERSION
1887 if(cs0 == KSC5601) {
1889 * Check for valid bytes for the encoding scheme.
1890 * This is necessary because the sub-converter (windows-949)
1891 * has a broader encoding scheme than is valid for 2022.
1893 value = _2022FromGR94DBCS(value);
1899 targetValue = value;
1903 useFallback = FALSE;
1911 len = -len; /* fallback */
1913 outLen = 0; /* count output bytes */
1915 /* write SI if necessary (only for JIS7) */
1916 if(pFromU2022State->g == 1 && g == 0) {
1917 buffer[outLen++] = UCNV_SI;
1918 pFromU2022State->g = 0;
1921 /* write the designation sequence if necessary */
1922 if(cs != pFromU2022State->cs[g]) {
1923 int32_t escLen = escSeqCharsLen[cs];
1924 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1926 pFromU2022State->cs[g] = cs;
1928 /* invalidate the choices[] */
1932 /* write the shift sequence if necessary */
1933 if(g != pFromU2022State->g) {
1935 /* case 0 handled before writing escapes */
1937 buffer[outLen++] = UCNV_SO;
1938 pFromU2022State->g = 1;
1940 default: /* case 2 */
1941 buffer[outLen++] = 0x1b;
1942 buffer[outLen++] = 0x4e;
1944 /* no case 3: no SS3 in ISO-2022-JP-x */
1948 /* write the output bytes */
1950 buffer[outLen++] = (char)targetValue;
1951 } else /* len == 2 */ {
1952 buffer[outLen++] = (char)(targetValue >> 8);
1953 buffer[outLen++] = (char)targetValue;
1957 * if we cannot find the character after checking all codepages
1958 * then this is an error
1960 *err = U_INVALID_CHAR_FOUND;
1961 cnv->fromUChar32=sourceChar;
1965 if(sourceChar == CR || sourceChar == LF) {
1966 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1967 pFromU2022State->cs[2] = 0;
1971 /* output outLen>0 bytes in buffer[] */
1973 *target++ = buffer[0];
1975 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1977 } else if(outLen == 2 && (target + 2) <= targetLimit) {
1978 *target++ = buffer[0];
1979 *target++ = buffer[1];
1981 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1982 *offsets++ = sourceIndex;
1983 *offsets++ = sourceIndex;
1989 &target, (const char *)targetLimit,
1990 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1992 if(U_FAILURE(*err)) {
1996 } /* end if(myTargetIndex<myTargetLength) */
1998 *err =U_BUFFER_OVERFLOW_ERROR;
2002 }/* end while(mySourceIndex<mySourceLength) */
2005 * the end of the input stream and detection of truncated input
2006 * are handled by the framework, but for ISO-2022-JP conversion
2007 * we need to be in ASCII mode at the very end
2011 * in SO mode or not in ASCII mode
2012 * end of input and no truncated input
2014 if( U_SUCCESS(*err) &&
2015 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
2016 args->flush && source>=sourceLimit && cnv->fromUChar32==0
2018 int32_t sourceIndex;
2022 if(pFromU2022State->g != 0) {
2023 buffer[outLen++] = UCNV_SI;
2024 pFromU2022State->g = 0;
2027 if(pFromU2022State->cs[0] != ASCII) {
2028 int32_t escLen = escSeqCharsLen[ASCII];
2029 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
2031 pFromU2022State->cs[0] = (int8_t)ASCII;
2034 /* get the source index of the last input character */
2036 * TODO this would be simpler and more reliable if we used a pair
2037 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2038 * so that we could simply use the prevSourceIndex here;
2039 * this code gives an incorrect result for the rare case of an unmatched
2040 * trail surrogate that is alone in the last buffer of the text stream
2042 sourceIndex=(int32_t)(source-args->source);
2045 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2046 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2057 &target, (const char *)targetLimit,
2058 &offsets, sourceIndex,
2062 /*save the state and return */
2063 args->source = source;
2064 args->target = (char*)target;
2067 /*************** to unicode *******************/
2070 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2073 const char *mySource = (char *) args->source;
2074 UChar *myTarget = args->target;
2075 const char *mySourceLimit = args->sourceLimit;
2076 uint32_t targetUniChar = 0x0000;
2077 uint32_t mySourceChar = 0x0000;
2078 uint32_t tmpSourceChar = 0x0000;
2079 UConverterDataISO2022* myData;
2080 ISO2022State *pToU2022State;
2083 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2084 pToU2022State = &myData->toU2022State;
2086 if(myData->key != 0) {
2087 /* continue with a partial escape sequence */
2089 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2090 /* continue with a partial double-byte character */
2091 mySourceChar = args->converter->toUBytes[0];
2092 args->converter->toULength = 0;
2093 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2094 targetUniChar = missingCharMarker;
2098 while(mySource < mySourceLimit){
2100 targetUniChar =missingCharMarker;
2102 if(myTarget < args->targetLimit){
2104 mySourceChar= (unsigned char) *mySource++;
2106 switch(mySourceChar) {
2108 if(myData->version==3) {
2112 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2113 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2118 if(myData->version==3) {
2119 /* JIS7: switch to G1 half-width Katakana */
2120 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2124 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2125 myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2133 const char * mySourceBefore = mySource;
2134 int8_t toULengthBefore = args->converter->toULength;
2136 changeState_2022(args->converter,&(mySource),
2137 mySourceLimit, ISO_2022_JP,err);
1139 /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
2140 if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2141 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2142 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2143 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2147 /* invalid or illegal escape sequence */
2148 if(U_FAILURE(*err)){
2149 args->target = myTarget;
2150 args->source = mySource;
2151 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
2154 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2155 if(myData->key==0) {
2156 myData->isEmptySegment = TRUE;
2160 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2165 /* automatically reset to single-byte mode */
2166 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2167 pToU2022State->cs[0] = (int8_t)ASCII;
2169 pToU2022State->cs[2] = 0;
2170 pToU2022State->g = 0;
2173 /* convert one or two bytes */
2174 myData->isEmptySegment = FALSE;
2175 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2176 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2179 /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2180 targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2182 /* return from a single-shift state to the previous one */
2183 if(pToU2022State->g >= 2) {
2184 pToU2022State->g=pToU2022State->prevG;
2188 if(mySourceChar <= 0x7f) {
2189 targetUniChar = mySourceChar;
2192 #if !UCONFIG_NO_NON_HTML5_CONVERSION
2194 if(mySourceChar <= 0x7f) {
2195 targetUniChar = mySourceChar + 0x80;
2197 /* return from a single-shift state to the previous one */
2198 pToU2022State->g=pToU2022State->prevG;
2201 if(mySourceChar <= 0x7f) {
2202 /* convert mySourceChar+0x80 to use a normal 8-bit table */
2204 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2205 myData->myConverterArray[cs],
2206 mySourceChar + 0x80);
2208 /* return from a single-shift state to the previous one */
2209 pToU2022State->g=pToU2022State->prevG;
2213 if(mySourceChar <= 0x7f) {
2214 targetUniChar = jisx201ToU(mySourceChar);
2218 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2219 /* 7-bit halfwidth Katakana */
2220 targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2225 if(mySource < mySourceLimit) {
2226 int leadIsOk, trailIsOk;
2229 trailByte = (uint8_t)*mySource;
2231 * Ticket 5691: consistent illegal sequences:
2232 * - We include at least the first byte in the illegal sequence.
2233 * - If any of the non-initial bytes could be the start of a character,
2234 * we stop the illegal sequence before the first one of those.
2236 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2237 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2238 * Otherwise we convert or report the pair of bytes.
2240 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2241 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2242 if (leadIsOk && trailIsOk) {
2244 tmpSourceChar = (mySourceChar << 8) | trailByte;
2246 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2247 mySourceChar = tmpSourceChar;
2249 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2250 mySourceChar = tmpSourceChar;
2251 #if !UCONFIG_NO_NON_HTML5_CONVERSION
2252 if (cs == KSC5601) {
2253 tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
2256 tempBuf[0] = (char)(tmpSourceChar >> 8);
2257 tempBuf[1] = (char)(tmpSourceChar);
2259 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2260 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2261 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2263 /* add another bit so that the code below writes 2 bytes in case of error */
2264 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2267 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2268 args->converter->toULength = 1;
2271 } /* End of inner switch */
2273 } /* End of outer switch */
2274 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2276 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2278 *(myTarget++)=(UChar)targetUniChar;
2280 else if(targetUniChar > missingCharMarker){
2281 /* disassemble the surrogate pair and write to output*/
2282 targetUniChar-=0x0010000;
2283 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2285 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2288 if(myTarget< args->targetLimit){
2289 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2291 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2295 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2296 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2301 /* Call the callback function*/
2302 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2306 else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
2307 *err =U_BUFFER_OVERFLOW_ERROR;
2312 args->target = myTarget;
2313 args->source = mySource;
2317 #if !UCONFIG_NO_NON_HTML5_CONVERSION
2318 /***************************************************************
2319 * Rules for ISO-2022-KR encoding
2320 * i) The KSC5601 designator sequence should appear only once in a file,
2321 * at the beginning of a line before any KSC5601 characters. This usually
2322 * means that it appears by itself on the first line of the file
2323 * ii) There are only 2 shifting sequences SO to shift into double byte mode
2324 * and SI to shift into single byte mode
// From-Unicode path used when the ISO-2022-KR converter version is 1.
// The whole conversion is handed to the converter stored in
// extraInfo->currentConverter through ucnv_MBCSFromUnicodeWithOffsets();
// args->converter is pointed at that converter for the call and restored at the end.
// NOTE(review): this listing omits short source lines (for example lone braces and
// the line that opens the copy call before original line 2340); confirm the exact
// structure against the complete file.
2327 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2329 UConverter* saveConv = args->converter;
2330 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
// from here on args refers to the delegate converter
2331 args->converter=myConverterData->currentConverter;
// hand the pending code point (fromUChar32) to the delegate, convert, then read it back
2333 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2334 ucnv_MBCSFromUnicodeWithOffsets(args,err);
2335 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
// on overflow, the bytes the delegate parked in its charErrorBuffer are transferred
// to the public converter and the delegate buffer length is reset to 0
2337 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2338 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
// arguments of the copy call (destination, source, byte count)
2340 saveConv->charErrorBuffer,
2341 myConverterData->currentConverter->charErrorBuffer,
2342 myConverterData->currentConverter->charErrorBufferLength);
2344 saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2345 myConverterData->currentConverter->charErrorBufferLength = 0;
// restore the public converter for the caller
2347 args->converter=saveConv;
// From-Unicode conversion for ISO-2022-KR, with offsets.
// Version 1 is forwarded to the _IBM variant. Otherwise each UTF-16 code unit is
// looked up with MBCS_FROM_UCHAR32_ISO2022 and written either as one 7-bit byte or
// as two bytes with 0x80 subtracted from each; UCNV_SO or UCNV_SI is written first
// whenever the byte width changes. The shift state is read from and saved back to
// converter->fromUnicodeStatus, and bytes that do not fit in the target go to
// converter->charErrorBuffer with U_BUFFER_OVERFLOW_ERROR.
// NOTE(review): this listing omits short source lines (for example lone braces,
// else lines and short declarations); confirm control flow against the complete file.
2351 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2353 const UChar *source = args->source;
2354 const UChar *sourceLimit = args->sourceLimit;
2355 unsigned char *target = (unsigned char *) args->target;
2356 unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2357 int32_t* offsets = args->offsets;
2358 uint32_t targetByteUnit = 0x0000;
2359 UChar32 sourceChar = 0x0000;
2360 UBool isTargetByteDBCS;
2361 UBool oldIsTargetByteDBCS;
2362 UConverterDataISO2022 *converterData;
2363 UConverterSharedData* sharedData;
2367 converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2368 /* if the version is 1 then the user is requesting
2369 * conversion with ibm-25546 pass the arguments to
2370 * MBCS converter and return
2372 if(converterData->version==1){
2373 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2377 /* initialize data */
2378 sharedData = converterData->currentConverter->sharedData;
2379 useFallback = args->converter->useFallback;
// fromUnicodeStatus carries the shift state between calls:
// nonzero means the output is in double-byte mode (after SO)
2380 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2381 oldIsTargetByteDBCS = isTargetByteDBCS;
// NOTE(review): same assignment as on original line 2380; looks redundant
2383 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
// a code point left pending in fromUChar32 by an earlier call is picked up here
// (the body of this if is not shown in this listing)
2384 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2387 while(source < sourceLimit){
2389 targetByteUnit = missingCharMarker;
2391 if(target < (unsigned char*) args->targetLimit){
2392 sourceChar = *source++;
2394 /* do not convert SO/SI/ESC */
2395 if(IS_2022_CONTROL(sourceChar)) {
2396 /* callback(illegal) */
2397 *err=U_ILLEGAL_CHAR_FOUND;
2398 args->converter->fromUChar32=sourceChar;
// look the code point up in the shared MBCS data, asking for at most 2 output bytes
2402 length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2404 length = -length; /* fallback */
2406 /* only DBCS or SBCS characters are expected*/
2407 /* DB characters with high bit set to 1 are expected */
// the two range checks below reject values unless the 16-bit value lies in
// 0xa1a1..0xfefe and its low byte lies in 0xa1..0xfe
// (one line of this condition, original line 2410, is not shown)
2408 if( length > 2 || length==0 ||
2409 (length == 1 && targetByteUnit > 0x7f) ||
2411 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2412 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2414 targetByteUnit=missingCharMarker;
2416 if (targetByteUnit != missingCharMarker){
// a result above 0xff is a double-byte character and needs the SO state
2418 oldIsTargetByteDBCS = isTargetByteDBCS;
2419 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2420 /* append the shift sequence */
2421 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2423 if (isTargetByteDBCS)
2424 *target++ = UCNV_SO;
2426 *target++ = UCNV_SI;
// offsets record the index of the code unit just consumed (source was already advanced)
2428 *(offsets++) = (int32_t)(source - args->source-1);
2430 /* write the targetUniChar to target */
2431 if(targetByteUnit <= 0x00FF){
2432 if( target < targetLimit){
2433 *(target++) = (unsigned char) targetByteUnit;
2435 *(offsets++) = (int32_t)(source - args->source-1);
// no room left: park the byte in charErrorBuffer and report overflow
2439 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2440 *err = U_BUFFER_OVERFLOW_ERROR;
// double-byte output: 0x80 is subtracted from each byte before it is written
2443 if(target < targetLimit){
2444 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2446 *(offsets++) = (int32_t)(source - args->source-1);
2448 if(target < targetLimit){
2449 *(target++) =(unsigned char) (targetByteUnit -0x80);
2451 *(offsets++) = (int32_t)(source - args->source-1);
// only the first byte fit: the second byte goes to charErrorBuffer
2454 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2455 *err = U_BUFFER_OVERFLOW_ERROR;
// neither byte fit: both go to charErrorBuffer
2458 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2459 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2460 *err = U_BUFFER_OVERFLOW_ERROR;
2466 /* oops.. the code point is unassigned
2467 * set the error and reason
2470 /*check if the char is a First surrogate*/
2471 if(U16_IS_SURROGATE(sourceChar)) {
2472 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2474 /*look ahead to find the trail surrogate*/
2475 if(source < sourceLimit) {
2476 /* test the following code unit */
2477 UChar trail=(UChar) *source;
2478 if(U16_IS_TRAIL(trail)) {
// the pair is combined into one supplementary code point and then reported
// with U_INVALID_CHAR_FOUND
2480 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2481 *err = U_INVALID_CHAR_FOUND;
2482 /* convert this surrogate code point */
2483 /* exit this condition tree */
2485 /* this is an unmatched lead code unit (1st surrogate) */
2486 /* callback(illegal) */
2487 *err=U_ILLEGAL_CHAR_FOUND;
// NOTE(review): presumably the branch for a lead surrogate at the end of the
// input buffer (no error yet); the enclosing else line is not shown -- confirm
2491 *err = U_ZERO_ERROR;
2494 /* this is an unmatched trail code unit (2nd surrogate) */
2495 /* callback(illegal) */
2496 *err=U_ILLEGAL_CHAR_FOUND;
2499 /* callback(unassigned) for a BMP code point */
2500 *err = U_INVALID_CHAR_FOUND;
// remember the offending or pending code point on the converter
2503 args->converter->fromUChar32=sourceChar;
2506 } /* end if(myTargetIndex<myTargetLength) */
2508 *err =U_BUFFER_OVERFLOW_ERROR;
2512 }/* end while(mySourceIndex<mySourceLength) */
2515 * the end of the input stream and detection of truncated input
2516 * are handled by the framework, but for ISO-2022-KR conversion
2517 * we need to be in ASCII mode at the very end
2522 * end of input and no truncated input
// runs only when there is no error, flush is set, all input is consumed and no
// code point is pending (one line of this condition, original line 2525, is not shown)
2524 if( U_SUCCESS(*err) &&
2526 args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2528 int32_t sourceIndex;
2530 /* we are switching to ASCII */
2531 isTargetByteDBCS=FALSE;
2533 /* get the source index of the last input character */
2535 * TODO this would be simpler and more reliable if we used a pair
2536 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2537 * so that we could simply use the prevSourceIndex here;
2538 * this code gives an incorrect result for the rare case of an unmatched
2539 * trail surrogate that is alone in the last buffer of the text stream
2541 sourceIndex=(int32_t)(source-args->source);
// tests whether the unit at sourceIndex is a trail surrogate that is either first
// in the buffer or preceded by a lead surrogate; the index adjustments made
// around this test are not shown in this listing
2544 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2545 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
// arguments of a write call whose name and leading arguments are not shown;
// presumably it emits the final shift byte -- confirm
2556 &target, (const char *)targetLimit,
2557 &offsets, sourceIndex,
2561 /*save the state and return */
2562 args->source = source;
2563 args->target = (char*)target;
2564 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2567 /************************ To Unicode ***************************************/
// To-Unicode path used when the ISO-2022-KR converter version is 1.
// The input is processed in runs: getEndOfBuffer_2022() yields the end of the
// current run, the run is converted by the delegate (myData->currentConverter)
// through ucnv_MBCSToUnicodeWithOffsets() using a copy of the arguments, and
// changeState_2022() is called afterwards unless an error occurred or the input
// is exhausted. Partial input bytes (toUBytes) and overflow output
// (UCharErrorBuffer) are moved between the public converter and the delegate.
// NOTE(review): this listing omits short source lines (for example lone braces,
// else lines, comment openers and the second line of the signature); confirm
// control flow against the complete file.
2570 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2572 char const* sourceStart;
2573 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2575 UConverterToUnicodeArgs subArgs;
2576 int32_t minArgsSize;
2578 /* set up the subconverter arguments */
// copy no more than the smaller of args->size and the size of the struct
2579 if(args->size<sizeof(UConverterToUnicodeArgs)) {
2580 minArgsSize = args->size;
2582 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2585 uprv_memcpy(&subArgs, args, minArgsSize);
2586 subArgs.size = (uint16_t)minArgsSize;
2587 subArgs.converter = myData->currentConverter;
2589 /* remember the original start of the input for offsets */
2590 sourceStart = args->source;
// a nonzero key means an escape sequence was left unfinished by an earlier call
// (the statements of this branch are not shown in this listing)
2592 if(myData->key != 0) {
2593 /* continue with a partial escape sequence */
2597 while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2598 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2599 subArgs.source = args->source;
2600 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
// only call the delegate when the run is not empty
2601 if(subArgs.source != subArgs.sourceLimit) {
2603 * get the current partial byte sequence
2605 * it needs to be moved between the public and the subconverter
2606 * so that the conversion framework, which only sees the public
2607 * converter, can handle truncated and illegal input etc.
2609 if(args->converter->toULength > 0) {
2610 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2612 subArgs.converter->toULength = args->converter->toULength;
2615 * Convert up to the end of the input, or to before the next escape character.
2616 * Does not handle conversion extensions because the preToU[] state etc.
2619 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
// the delegate numbers offsets from the start of its own run; delta is the
// distance of that run from the original start of the input
2621 if(args->offsets != NULL && sourceStart != args->source) {
2622 /* update offsets to base them on the actual start of the input */
2623 int32_t *offsets = args->offsets;
2624 UChar *target = args->target;
2625 int32_t delta = (int32_t)(args->source - sourceStart);
// walks the newly written output units (the loop body is not shown in this listing)
2626 while(target < subArgs.target) {
// publish the progress made by the delegate
2634 args->source = subArgs.source;
2635 args->target = subArgs.target;
2636 args->offsets = subArgs.offsets;
2638 /* copy input/error/overflow buffers */
2639 if(subArgs.converter->toULength > 0) {
2640 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2642 args->converter->toULength = subArgs.converter->toULength;
// on overflow, move the pending output units to the public converter and
// reset the delegate buffer length to 0
2644 if(*err == U_BUFFER_OVERFLOW_ERROR) {
2645 if(subArgs.converter->UCharErrorBufferLength > 0) {
2646 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2647 subArgs.converter->UCharErrorBufferLength);
2649 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2650 subArgs.converter->UCharErrorBufferLength = 0;
// stop on any failure or when the input is exhausted
// (the statement of this branch is not shown; presumably it leaves the function)
2654 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
// otherwise process the escape sequence (remaining arguments are not shown)
2659 changeState_2022(args->converter,
// To-Unicode conversion for ISO-2022-KR, with offsets.
// Version 1 is forwarded to the _IBM variant. Otherwise the input is read byte by
// byte: SI selects g=0, SO selects g=1, ESC is passed to changeState_2022(), and
// any other byte is converted. With g==1 two bytes in 0x21..0x7e are converted as
// a pair after adding 0x80 to each; with g!=1 a byte up to 0x7f is converted on
// its own. Results below 0xfffe are written to the target, anything else goes to
// toUnicodeCallback().
// NOTE(review): this listing omits short source lines (for example lone braces,
// else lines, short declarations and the second line of the signature); confirm
// control flow against the complete file.
2668 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2671 const char *mySource = ( char *) args->source;
2672 UChar *myTarget = args->target;
2673 const char *mySourceLimit = args->sourceLimit;
2674 UChar32 targetUniChar = 0x0000;
// NOTE(review): mySourceChar is a 16-bit UChar here, so the 0x10000 marker bit set
// on original line 2769 cannot be stored in it; the ISO-2022-CN counterpart
// declares this variable as uint32_t -- confirm whether this is intended
2675 UChar mySourceChar = 0x0000;
2676 UConverterDataISO2022* myData;
2677 UConverterSharedData* sharedData ;
2680 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2681 if(myData->version==1){
2682 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2686 /* initialize state */
2687 sharedData = myData->currentConverter->sharedData;
2688 useFallback = args->converter->useFallback;
// resume work left unfinished by an earlier call: either an escape sequence
// (key != 0) or the first byte of a double-byte character saved in toUBytes[0]
// (the statements that re-enter the loop are not shown in this listing)
2690 if(myData->key != 0) {
2691 /* continue with a partial escape sequence */
2693 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2694 /* continue with a partial double-byte character */
2695 mySourceChar = args->converter->toUBytes[0];
2696 args->converter->toULength = 0;
2700 while(mySource< mySourceLimit){
2702 if(myTarget < args->targetLimit){
2704 mySourceChar= (unsigned char) *mySource++;
// SI: back to g=0; an SI that closes a segment with no characters in it is
// reported as U_ILLEGAL_ESCAPE_SEQUENCE with reason UCNV_IRREGULAR
2706 if(mySourceChar==UCNV_SI){
2707 myData->toU2022State.g = 0;
2708 if (myData->isEmptySegment) {
2709 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
2710 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2711 args->converter->toUCallbackReason = UCNV_IRREGULAR;
2712 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2713 args->converter->toULength = 1;
2714 args->target = myTarget;
2715 args->source = mySource;
2718 /*consume the source */
// SO: switch to g=1 and mark the new segment as empty until a character arrives
2720 }else if(mySourceChar==UCNV_SO){
2721 myData->toU2022State.g = 1;
2722 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
2723 /*consume the source */
2725 }else if(mySourceChar==ESC_2022){
2728 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
2729 changeState_2022(args->converter,&(mySource),
2730 mySourceLimit, ISO_2022_KR, err);
// on failure, publish the current positions before leaving
2731 if(U_FAILURE(*err)){
2732 args->target = myTarget;
2733 args->source = mySource;
2739 myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
// g==1: double-byte mode, a second byte is needed
2740 if(myData->toU2022State.g == 1) {
2741 if(mySource < mySourceLimit) {
2742 int leadIsOk, trailIsOk;
2745 targetUniChar = missingCharMarker;
// peek at the second byte without consuming it yet
2746 trailByte = (uint8_t)*mySource;
2748 * Ticket 5691: consistent illegal sequences:
2749 * - We include at least the first byte in the illegal sequence.
2750 * - If any of the non-initial bytes could be the start of a character,
2751 * we stop the illegal sequence before the first one of those.
2753 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2754 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2755 * Otherwise we convert or report the pair of bytes.
// the uint8_t subtraction makes each test a single range check for 0x21..0x7e
2757 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2758 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2759 if (leadIsOk && trailIsOk) {
// the lookup is done with 0x80 added to both bytes
2761 tempBuf[0] = (char)(mySourceChar + 0x80);
2762 tempBuf[1] = (char)(trailByte + 0x80);
2763 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
// keep both original bytes in mySourceChar for offsets and error reporting
2764 mySourceChar = (mySourceChar << 8) | trailByte;
2765 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2766 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2768 /* add another bit so that the code below writes 2 bytes in case of error */
2769 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
// no second byte available yet: save the first byte on the converter
// (presumably the else branch of the check on original line 2741 -- confirm)
2772 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2773 args->converter->toULength = 1;
// single-byte mode: only bytes up to 0x7f are looked up
2777 else if(mySourceChar <= 0x7f) {
2778 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2780 targetUniChar = 0xffff;
// results below 0xfffe are written out; everything else goes to the callback
2782 if(targetUniChar < 0xfffe){
// the offset points at the first source byte of the character: step back 1 byte
// when mySourceChar holds a single byte, 2 bytes otherwise
2784 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2786 *(myTarget++)=(UChar)targetUniChar;
2789 /* Call the callback function*/
2790 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
// target is full (pairs with the target check on original line 2702)
2795 *err =U_BUFFER_OVERFLOW_ERROR;
// publish the final positions to the caller
2799 args->target = myTarget;
2800 args->source = mySource;
2803 /*************************** END ISO2022-KR *********************************/
2805 /*************************** ISO-2022-CN *********************************
2807 * Rules for ISO-2022-CN Encoding:
2808 * i) The designator sequence must appear once on a line before any instance
2809 * of character set it designates.
2810 * ii) If two lines contain characters from the same character set, both lines
2811 * must include the designator sequence.
2812 * iii) Once the designator sequence is known, a shifting sequence has to be found
2813 * to invoke the shifting
2814 * iv) All lines start in ASCII and end in ASCII.
2815 * v) Four shifting sequences are employed for this purpose:
2817 * Sequence ASCII Eq Charsets
2818 * ---------- ------- ---------
2820 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2821 * SS2 <ESC>N CNS-11643-1992 Plane 2
2822 * SS3 <ESC>O CNS-11643-1992 Planes 3-7
2825 * SOdesignator : ESC "$" ")" finalchar_for_SO
2826 * SS2designator : ESC "$" "*" finalchar_for_SS2
2827 * SS3designator : ESC "$" "+" finalchar_for_SS3
2829 * ESC $ ) A Indicates the bytes following SO are Chinese
2830 * characters as defined in GB 2312-80, until
2831 * another SOdesignation appears
2834 * ESC $ ) E Indicates the bytes following SO are as defined
2835 * in ISO-IR-165 (for details, see section 2.1),
2836 * until another SOdesignation appears
2838 * ESC $ ) G Indicates the bytes following SO are as defined
2839 * in CNS 11643-plane-1, until another
2840 * SOdesignation appears
2842 * ESC $ * H Indicates the two bytes immediately following
2843 * SS2 is a Chinese character as defined in CNS
2844 * 11643-plane-2, until another SS2designation
2846 * (Meaning <ESC>N must precede every 2 byte
2849 * ESC $ + I Indicates the immediate two bytes following SS3
2850 * is a Chinese character as defined in CNS
2851 * 11643-plane-3, until another SS3designation
2853 * (Meaning <ESC>O must precede every 2 byte
2856 * ESC $ + J Indicates the immediate two bytes following SS3
2857 * is a Chinese character as defined in CNS
2858 * 11643-plane-4, until another SS3designation
2860 * (In English: <ESC>O must precede every 2 byte
2863 * ESC $ + K Indicates the immediate two bytes following SS3
2864 * is a Chinese character as defined in CNS
2865 * 11643-plane-5, until another SS3designation
2868 * ESC $ + L Indicates the immediate two bytes following SS3
2869 * is a Chinese character as defined in CNS
2870 * 11643-plane-6, until another SS3designation
2873 * ESC $ + M Indicates the immediate two bytes following SS3
2874 * is a Chinese character as defined in CNS
2875 * 11643-plane-7, until another SS3designation
2878 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2879 * has its own designation information before any Chinese characters
2884 /* The following are defined this way to make the strings truly readonly */
// Each string is a 4-byte designator: ESC (0x1B), $ (0x24), then ) (0x29),
// * (0x2A) or + (0x2B), then a final byte from A (0x41) to M (0x4D).
// ESC $ ) A
2885 static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";
// ESC $ ) E
2886 static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";
// ESC $ ) G
2887 static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";
// ESC $ * H
2888 static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";
// ESC $ + I through ESC $ + M
2889 static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";
2890 static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";
2891 static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";
2892 static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";
2893 static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";
2895 /********************** ISO2022-CN Data **************************/
// Designator lookup table with 10 entries. Entry 0 is SHIFT_IN_STR (defined
// outside this excerpt); entries 1..9 are the designators above. The ISO-2022-CN
// from-Unicode converter copies 4 bytes from the selected entry.
// NOTE(review): the closing line of this initializer is not shown in this listing.
2896 static const char* const escSeqCharsCN[10] ={
2897 SHIFT_IN_STR, /* 0 ASCII */
2898 GB_2312_80_STR, /* 1 GB2312_1 */
2899 ISO_IR_165_STR, /* 2 ISO_IR_165 */
2900 CNS_11643_1992_Plane_1_STR, // 3 CNS 11643 plane 1
2901 CNS_11643_1992_Plane_2_STR, // 4 CNS 11643 plane 2
2902 CNS_11643_1992_Plane_3_STR, // 5 CNS 11643 plane 3
2903 CNS_11643_1992_Plane_4_STR, // 6 CNS 11643 plane 4
2904 CNS_11643_1992_Plane_5_STR, // 7 CNS 11643 plane 5
2905 CNS_11643_1992_Plane_6_STR, // 8 CNS 11643 plane 6
2906 CNS_11643_1992_Plane_7_STR // 9 CNS 11643 plane 7
// From-Unicode conversion for ISO-2022-CN, with offsets.
// For each code point: code points up to U+007F are written as a single byte
// (after UCNV_SI when the state is not g==0); anything else is looked up in a
// list of candidate charsets (choices[]) and written as designator (if the
// charset for that G set changed) + shift (if g changed) + two bytes.
// The state lives in converterData->fromU2022State and a pending or offending
// code point is stored in cnv->fromUChar32.
// NOTE(review): this listing omits short source lines (for example lone braces,
// else lines, case labels and short declarations); confirm control flow against
// the complete file.
2910 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2911 UConverter *cnv = args->converter;
2912 UConverterDataISO2022 *converterData;
2913 ISO2022State *pFromU2022State;
2914 uint8_t *target = (uint8_t *) args->target;
2915 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2916 const UChar* source = args->source;
2917 const UChar* sourceLimit = args->sourceLimit;
2918 int32_t* offsets = args->offsets;
2923 int32_t choiceCount;
2924 uint32_t targetValue = 0;
2927 /* set up the state */
2928 converterData = (UConverterDataISO2022*)cnv->extraInfo;
2929 pFromU2022State = &converterData->fromU2022State;
2933 /* check if the last codepoint of previous buffer was a lead surrogate*/
// (the body of this if is not shown in this listing)
2934 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2938 while( source < sourceLimit){
2939 if(target < targetLimit){
2941 sourceChar = *(source++);
2942 /*check if the char is a First surrogate*/
2943 if(U16_IS_SURROGATE(sourceChar)) {
2944 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2946 /*look ahead to find the trail surrogate*/
2947 if(source < sourceLimit) {
2948 /* test the following code unit */
2949 UChar trail=(UChar) *source;
2950 if(U16_IS_TRAIL(trail)) {
// combine the pair and clear the pending code point
2952 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2953 cnv->fromUChar32=0x00;
2954 /* convert this supplementary code point */
2955 /* exit this condition tree */
2957 /* this is an unmatched lead code unit (1st surrogate) */
2958 /* callback(illegal) */
2959 *err=U_ILLEGAL_CHAR_FOUND;
2960 cnv->fromUChar32=sourceChar;
// NOTE(review): presumably the branch for a lead surrogate at the end of the
// input buffer; it is kept pending in fromUChar32 -- confirm
2965 cnv->fromUChar32=sourceChar;
2969 /* this is an unmatched trail code unit (2nd surrogate) */
2970 /* callback(illegal) */
2971 *err=U_ILLEGAL_CHAR_FOUND;
2972 cnv->fromUChar32=sourceChar;
2977 /* do the conversion */
2978 if(sourceChar <= 0x007f ){
2979 /* do not convert SO/SI/ESC */
2980 if(IS_2022_CONTROL(sourceChar)) {
2981 /* callback(illegal) */
2982 *err=U_ILLEGAL_CHAR_FOUND;
2983 cnv->fromUChar32=sourceChar;
// already in g==0: the byte is written as is
2988 if(pFromU2022State->g == 0) {
2989 buffer[0] = (char)sourceChar;
// otherwise UCNV_SI is written before the byte and the state returns to g=0
2992 buffer[0] = UCNV_SI;
2993 buffer[1] = (char)sourceChar;
2995 pFromU2022State->g = 0;
2998 if(sourceChar == CR || sourceChar == LF) {
2999 /* reset the state at the end of a line */
3000 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
3005 /* convert U+0080..U+10ffff */
// choices[] is rebuilt only when choiceCount is 0
3009 if(choiceCount == 0) {
3010 /* try the current SO/G1 converter first */
3011 choices[0] = pFromU2022State->cs[1];
3013 /* default to GB2312_1 if none is designated yet */
3014 if(choices[0] == 0) {
3015 choices[0] = GB2312_1;
// version 0: one alternative besides the current charset
3018 if(converterData->version == 0) {
3021 /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
3022 if(choices[0] == GB2312_1) {
3023 choices[1] = (int8_t)CNS_11643_1;
3025 choices[1] = (int8_t)GB2312_1;
3029 } else if (converterData->version == 1) {
3030 /* ISO-2022-CN-EXT */
3032 /* try one of the other converters */
// two alternatives are added, chosen by the current charset
// (the case labels are not shown in this listing)
3033 switch(choices[0]) {
3035 choices[1] = (int8_t)CNS_11643_1;
3036 choices[2] = (int8_t)ISO_IR_165;
3039 choices[1] = (int8_t)GB2312_1;
3040 choices[2] = (int8_t)CNS_11643_1;
3042 default: /* CNS_11643_x */
3043 choices[1] = (int8_t)GB2312_1;
3044 choices[2] = (int8_t)ISO_IR_165;
// any other version: fixed order, CNS 11643 first and then GB 2312
3050 choices[0] = (int8_t)CNS_11643_1;
3051 choices[1] = (int8_t)GB2312_1;
3057 * len==0: no mapping found yet
3058 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3059 * len>0: found a roundtrip result, done
3063 * We will turn off useFallback after finding a fallback,
3064 * but we still get fallbacks from PUA code points as usual.
3065 * Therefore, we will also need to check that we don't overwrite
3066 * an early fallback with a later one.
3068 useFallback = cnv->useFallback;
// try the candidates in order until a roundtrip mapping (len > 0) is found
3070 for(i = 0; i < choiceCount && len <= 0; ++i) {
3071 int8_t cs0 = choices[i];
// all CNS 11643 planes are looked up through the single CNS_11643 table
3075 if(cs0 >= CNS_11643_0) {
3076 len2 = MBCS_FROM_UCHAR32_ISO2022(
3077 converterData->myConverterArray[CNS_11643],
// accept a 3-byte roundtrip, or a 3-byte fallback if nothing was found before
3082 if(len2 == 3 || (len2 == -3 && len == 0)) {
3083 targetValue = value;
// the charset is derived from bits 16 and up of the result, minus 0x80
3084 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3089 useFallback = FALSE;
// the statements of these branches are not shown; presumably they choose the
// G set (g) that matches the plane -- confirm
3091 if(cs == CNS_11643_1) {
3093 } else if(cs == CNS_11643_2) {
3095 } else /* plane 3..7 */ if(converterData->version == 1) {
3098 /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3103 /* GB2312_1 or ISO-IR-165 */
3104 U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
3105 len2 = MBCS_FROM_UCHAR32_ISO2022(
3106 converterData->myConverterArray[cs0],
// accept a 2-byte roundtrip, or a 2-byte fallback if nothing was found before
3111 if(len2 == 2 || (len2 == -2 && len == 0)) {
3112 targetValue = value;
3116 useFallback = FALSE;
3123 len = 0; /* count output bytes; it must have been abs(len) == 2 */
3125 /* write the designation sequence if necessary */
3126 if(cs != pFromU2022State->cs[g]) {
// charsets below CNS_11643 index the table directly; the CNS planes are
// indexed relative to CNS_11643_1
3127 if(cs < CNS_11643) {
3128 uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3130 U_ASSERT(cs >= CNS_11643_1);
3131 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3134 pFromU2022State->cs[g] = cs;
3136 /* changing the SO/G1 charset invalidates the choices[] */
3141 /* write the shift sequence if necessary */
3142 if(g != pFromU2022State->g) {
// SO for G1 (the switch line and case labels are not shown in this listing)
3145 buffer[len++] = UCNV_SO;
3147 /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3148 pFromU2022State->g = 1;
// 0x1b 0x4e is ESC N
3151 buffer[len++] = 0x1b;
3152 buffer[len++] = 0x4e;
3154 default: /* case 3 */
// 0x1b 0x4f is ESC O
3155 buffer[len++] = 0x1b;
3156 buffer[len++] = 0x4f;
3161 /* write the two output bytes */
3162 buffer[len++] = (char)(targetValue >> 8);
3163 buffer[len++] = (char)targetValue;
3165 /* if we cannot find the character after checking all codepages
3166 * then this is an error
3168 *err = U_INVALID_CHAR_FOUND;
3169 cnv->fromUChar32=sourceChar;
3174 /* output len>0 bytes in buffer[] */
// one byte (the guarding condition is not shown; presumably len == 1)
3176 *target++ = buffer[0];
3178 *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
// two bytes that both fit in the target
3180 } else if(len == 2 && (target + 2) <= targetLimit) {
3181 *target++ = buffer[0];
3182 *target++ = buffer[1];
// both bytes map back to the first code unit of the source character
3184 int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3185 *offsets++ = sourceIndex;
3186 *offsets++ = sourceIndex;
// remaining cases: arguments of a write call whose name and leading arguments
// are not shown in this listing
3192 &target, (const char *)targetLimit,
3193 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3195 if(U_FAILURE(*err)) {
3199 } /* end if(myTargetIndex<myTargetLength) */
3201 *err =U_BUFFER_OVERFLOW_ERROR;
3205 }/* end while(mySourceIndex<mySourceLength) */
3208 * the end of the input stream and detection of truncated input
3209 * are handled by the framework, but for ISO-2022-CN conversion
3210 * we need to be in ASCII mode at the very end
3215 * end of input and no truncated input
// runs only when there is no error, the state is not g==0, flush is set,
// all input is consumed and no code point is pending
3217 if( U_SUCCESS(*err) &&
3218 pFromU2022State->g!=0 &&
3219 args->flush && source>=sourceLimit && cnv->fromUChar32==0
3221 int32_t sourceIndex;
3223 /* we are switching to ASCII */
3224 pFromU2022State->g=0;
3226 /* get the source index of the last input character */
3228 * TODO this would be simpler and more reliable if we used a pair
3229 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3230 * so that we could simply use the prevSourceIndex here;
3231 * this code gives an incorrect result for the rare case of an unmatched
3232 * trail surrogate that is alone in the last buffer of the text stream
3234 sourceIndex=(int32_t)(source-args->source);
// tests whether the unit at sourceIndex is a trail surrogate that is either first
// in the buffer or preceded by a lead surrogate; the index adjustments made
// around this test are not shown in this listing
3237 if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3238 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
// arguments of a write call whose name and leading arguments are not shown;
// presumably it emits the final shift byte -- confirm
3249 &target, (const char *)targetLimit,
3250 &offsets, sourceIndex,
3254 /*save the state and return */
3255 args->source = source;
3256 args->target = (char*)target;
// To-Unicode conversion for ISO-2022-CN, with offsets.
// The input is read byte by byte and dispatched in a switch: shift and escape
// bytes update the state in myData->toU2022State, everything else is converted.
// With g!=0 two bytes in 0x21..0x7e are converted as a pair through the table
// selected by cs[g]; with g==0 a byte up to 0x7f is copied as is. Results below
// 0xfffe are written as one UTF-16 unit, results above 0xffff as a surrogate
// pair, and everything else goes to toUnicodeCallback().
// NOTE(review): this listing omits short source lines (for example lone braces,
// else lines, case labels, short declarations and the second line of the
// signature); confirm control flow against the complete file.
3261 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3264 const char *mySource = (char *) args->source;
3265 UChar *myTarget = args->target;
3266 const char *mySourceLimit = args->sourceLimit;
3267 uint32_t targetUniChar = 0x0000;
3268 uint32_t mySourceChar = 0x0000;
3269 UConverterDataISO2022* myData;
3270 ISO2022State *pToU2022State;
3272 myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3273 pToU2022State = &myData->toU2022State;
// resume work left unfinished by an earlier call: either an escape sequence
// (key != 0) or the first byte of a double-byte character saved in toUBytes[0]
// (the statements that re-enter the loop are not shown in this listing)
3275 if(myData->key != 0) {
3276 /* continue with a partial escape sequence */
3278 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3279 /* continue with a partial double-byte character */
3280 mySourceChar = args->converter->toUBytes[0];
3281 args->converter->toULength = 0;
3282 targetUniChar = missingCharMarker;
3286 while(mySource < mySourceLimit){
3288 targetUniChar =missingCharMarker;
3290 if(myTarget < args->targetLimit){
3292 mySourceChar= (unsigned char) *mySource++;
3294 switch(mySourceChar){
// presumably the SI case (its label is not shown): a segment that ends with no
// characters in it is reported as U_ILLEGAL_ESCAPE_SEQUENCE with UCNV_IRREGULAR
3297 if (myData->isEmptySegment) {
3298 myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
3299 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3300 args->converter->toUCallbackReason = UCNV_IRREGULAR;
3301 args->converter->toUBytes[0] = mySourceChar;
3302 args->converter->toULength = 1;
3303 args->target = myTarget;
3304 args->source = mySource;
// presumably the SO case (its label is not shown): only accepted when a charset
// has been designated for G1 (cs[1] != 0)
3310 if(pToU2022State->cs[1] != 0) {
3312 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
3315 /* illegal to have SO before a matching designator */
3316 myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
// presumably the ESC case (its label is not shown): remember where the sequence
// started so its length can be reported if it turns out to be irregular
3324 const char * mySourceBefore = mySource;
3325 int8_t toULengthBefore = args->converter->toULength;
3327 changeState_2022(args->converter,&(mySource),
3328 mySourceLimit, ISO_2022_CN,err);
3330 /* After SO there must be at least one character before a designator (designator error handled separately) */
3331 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3332 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3333 args->converter->toUCallbackReason = UCNV_IRREGULAR;
// length = bytes already held before the call plus bytes consumed by it
3334 args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
3338 /* invalid or illegal escape sequence */
3339 if(U_FAILURE(*err)){
3340 args->target = myTarget;
3341 args->source = mySource;
3342 myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
3347 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
// clears the whole to-Unicode state (the case labels that lead here are not
// shown; presumably CR and LF, matching the from-Unicode side -- confirm)
3352 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3355 /* convert one or two bytes */
3356 myData->isEmptySegment = FALSE;
// g != 0: double-byte mode, a second byte is needed
3357 if(pToU2022State->g != 0) {
3358 if(mySource < mySourceLimit) {
3359 UConverterSharedData *cnv;
3360 StateEnum tempState;
3362 int leadIsOk, trailIsOk;
// peek at the second byte without consuming it yet
3365 trailByte = (uint8_t)*mySource;
3367 * Ticket 5691: consistent illegal sequences:
3368 * - We include at least the first byte in the illegal sequence.
3369 * - If any of the non-initial bytes could be the start of a character,
3370 * we stop the illegal sequence before the first one of those.
3372 * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3373 * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3374 * Otherwise we convert or report the pair of bytes.
// the uint8_t subtraction makes each test a single range check for 0x21..0x7e
3376 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
3377 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
3378 if (leadIsOk && trailIsOk) {
// the charset currently designated for the active G set
3380 tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
// CNS 11643 planes share one table; the lookup uses three bytes where the first
// byte is 0x80 plus the offset of the charset from CNS_11643_0
3381 if(tempState >= CNS_11643_0) {
3382 cnv = myData->myConverterArray[CNS_11643];
3383 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
3384 tempBuf[1] = (char) (mySourceChar);
3385 tempBuf[2] = (char) trailByte;
// other charsets: the two bytes are looked up unchanged in their own table
3389 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
3390 cnv = myData->myConverterArray[tempState];
3391 tempBuf[0] = (char) (mySourceChar);
3392 tempBuf[1] = (char) trailByte;
3395 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
// keep both original bytes in mySourceChar for offsets and error reporting
3396 mySourceChar = (mySourceChar << 8) | trailByte;
3397 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3398 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3400 /* add another bit so that the code below writes 2 bytes in case of error */
3401 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3403 if(pToU2022State->g>=2) {
3404 /* return from a single-shift state to the previous one */
3405 pToU2022State->g=pToU2022State->prevG;
// no second byte available yet: save the first byte on the converter
// (presumably the else branch of the check on original line 3358 -- confirm)
3408 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3409 args->converter->toULength = 1;
// g == 0: bytes up to 0x7f are copied to the output unchanged
3414 if(mySourceChar <= 0x7f) {
3415 targetUniChar = (UChar) mySourceChar;
// results below 0xfffe fit in a single UTF-16 unit
3420 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
// the offset points at the first source byte of the character: step back 1 byte
// when mySourceChar holds a single byte, 2 bytes otherwise
3422 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3424 *(myTarget++)=(UChar)targetUniChar;
3426 else if(targetUniChar > missingCharMarker){
3427 /* disassemble the surrogate pair and write to output*/
3428 targetUniChar-=0x0010000;
3429 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
3431 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
// second half of the pair: written to the target if there is room, otherwise
// parked in UCharErrorBuffer
3434 if(myTarget< args->targetLimit){
3435 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3437 args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3441 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3442 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3447 /* Call the callback function*/
3448 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
// target is full (pairs with the target check on original line 3290)
3453 *err =U_BUFFER_OVERFLOW_ERROR;
// publish the final positions to the caller
3458 args->target = myTarget;
3459 args->source = mySource;
3461 #endif /* #if !UCONFIG_NO_NON_HTML5_CONVERSION */
/*
 * Write the substitution bytes of an ISO-2022 converter during fromUnicode
 * conversion.
 *
 * The substitution string is taken from the top-level converter
 * (cnv->subChars / cnv->subCharLen). Depending on the variant selected by
 * myConverterData->locale[0], the fromUnicode shift/designation state is first
 * adjusted so that it matches the bytes about to be written, or the work is
 * handed to the current subconverter via ucnv_cbFromUWriteSub().
 *
 * args         fromUnicode arguments; args->converter is the ISO-2022 converter
 * offsetIndex  NOTE(review): presumably forwarded to the final
 *              ucnv_cbFromUWriteBytes() call -- confirm against the full source
 * err          ICU error code (in/out); U_BUFFER_OVERFLOW_ERROR coming back
 *              from the subconverter is handled below
 */
3464 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3465 UConverter *cnv = args->converter;
3466 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3467 ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
/* start from the outer converter's own substitution string */
3472 subchar=(char *)cnv->subChars;
3473 length=cnv->subCharLen; /* assume length==1 for most variants */
/* the first character of the locale selects the variant-specific handling */
3476 switch(myConverterData->locale[0]){
3481 if(pFromU2022State->g == 1) {
3482 /* JIS7: switch from G1 to G0 */
3483 pFromU2022State->g = 0;
3487 cs = pFromU2022State->cs[0];
3488 if(cs != ASCII && cs != JISX201) {
3489 /* not in ASCII or JIS X 0201: switch to ASCII */
3490 pFromU2022State->cs[0] = (int8_t)ASCII;
3500 if(pFromU2022State->g != 0) {
3501 /* not in ASCII mode: switch to ASCII */
3502 pFromU2022State->g = 0;
/*
 * For version 0, fromUnicodeStatus is used as the SBCS(0)/DBCS(1) mode flag
 * and is flipped when it does not match the substitution string length.
 */
3508 if(myConverterData->version == 0) {
3510 if((UBool)args->converter->fromUnicodeStatus) {
3511 /* in DBCS mode: switch to SBCS */
3512 args->converter->fromUnicodeStatus = 0;
3516 } else /* length == 2*/ {
3517 if(!(UBool)args->converter->fromUnicodeStatus) {
3518 /* in SBCS mode: switch to DBCS */
3519 args->converter->fromUnicodeStatus = 1;
3527 /* save the subconverter's substitution string */
3528 uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3529 int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3531 /* set our substitution string into the subconverter */
3532 myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3533 myConverterData->currentConverter->subCharLen = (int8_t)length;
/*
 * args->converter points at the subconverter only for the duration of the
 * callback; fromUChar32 is copied in before and copied back afterwards.
 */
3535 /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3536 args->converter = myConverterData->currentConverter;
3537 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3538 ucnv_cbFromUWriteSub(args, 0, err);
3539 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3540 args->converter = cnv;
3542 /* restore the subconverter's substitution string */
3543 myConverterData->currentConverter->subChars = currentSubChars;
3544 myConverterData->currentConverter->subCharLen = currentSubCharLen;
/*
 * On overflow, bytes left in the subconverter's charErrorBuffer are handed
 * over to the outer converter's charErrorBuffer and the subconverter's
 * buffer length is reset to 0.
 */
3546 if(*err == U_BUFFER_OVERFLOW_ERROR) {
3547 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3549 cnv->charErrorBuffer,
3550 myConverterData->currentConverter->charErrorBuffer,
3551 myConverterData->currentConverter->charErrorBufferLength);
3553 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3554 myConverterData->currentConverter->charErrorBufferLength = 0;
/* write out the (p - buffer) bytes collected in buffer */
3562 ucnv_cbFromUWriteBytes(args,
3563 buffer, (int32_t)(p - buffer),
3568 * Structure for cloning an ISO 2022 converter into a single memory block.
3569 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3570 * and then ucnv_safeClone() of the sub-converter may additionally align
3571 * currentConverter inside the cloneStruct, for which we need the deadSpace
3572 * after currentConverter.
3573 * This is because UAlignedMemory may be larger than the actually
3574 * necessary alignment size for the platform.
3575 * The other cloneStruct fields will not be moved around,
3576 * and are aligned properly with cloneStruct's alignment.
3581 UConverter currentConverter; /* buffer handed to ucnv_safeClone() for the clone of the current subconverter */
3582 UAlignedMemory deadSpace; /* padding after currentConverter; its size is included in the buffer size offered for that clone */
3583 UConverterDataISO2022 mydata; /* the clone's own copy of the source converter's UConverterDataISO2022 */
/*
 * Safe-clone hook for the ISO-2022 converters.
 *
 * With *pBufferSize == 0 this is a preflighting request and the size of
 * struct cloneStruct is stored into *pBufferSize. Otherwise stackBuffer is
 * used as a struct cloneStruct: the ISO-2022 private data is copied into it,
 * the current subconverter (if any) is cloned into storage embedded in the
 * same struct, and the reference count of every non-NULL entry of
 * myConverterArray is incremented.
 *
 * cnv          the converter being cloned
 * pBufferSize  in/out buffer size; 0 requests preflighting
 *
 * Returns a pointer to the cnv member of the clone structure.
 */
3588 _ISO_2022_SafeClone(
3589 const UConverter *cnv,
3591 int32_t *pBufferSize,
3594 struct cloneStruct * localClone;
3595 UConverterDataISO2022 *cnvData;
3598 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3599 *pBufferSize = (int32_t)sizeof(struct cloneStruct);
/* the caller's buffer is laid out as a struct cloneStruct */
3603 cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3604 localClone = (struct cloneStruct *)stackBuffer;
3606 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3608 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3609 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3610 localClone->cnv.isExtraLocal = TRUE;
/*
 * currentConverter gets a real clone placed in localClone->currentConverter;
 * the size offered to ucnv_safeClone() also covers the deadSpace padding.
 */
3612 /* share the subconverters */
3614 if(cnvData->currentConverter != NULL) {
3615 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3616 localClone->mydata.currentConverter =
3617 ucnv_safeClone(cnvData->currentConverter,
3618 &localClone->currentConverter,
3620 if(U_FAILURE(*status)) {
/*
 * The myConverterArray entries are not cloned here; only the reference
 * count of each non-NULL entry is raised.
 */
3625 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3626 if(cnvData->myConverterArray[i] != NULL) {
3627 ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3631 return &localClone->cnv;
/*
 * Add to the set behind sa the Unicode code points that this ISO-2022
 * converter can convert.
 *
 * Outline of the visible logic:
 *  - with U_ENABLE_GENERIC_ISO_2022 and the generic converter (_ISO2022Data):
 *    U+0000..U+D7FF and U+E000..U+10FFFF are added, because UTF-8 is used;
 *  - code points handled without a subconverter table are added according to
 *    cnvData->locale[0] and cnvData->version (JP: U+00A5, U+203E, ASCII or
 *    Latin-1, optionally half-width Katakana; CN: ASCII); KR delegates to the
 *    getUnicodeSet function of currentConverter;
 *  - every non-NULL entry of myConverterArray contributes its code points
 *    through ucnv_MBCSGetFilteredUnicodeSetForUnicode() with a filter chosen
 *    per charset;
 *  - SO, SI, ESC and the C1 controls U+0080..U+009F are removed at the end.
 *
 * cnv         the ISO-2022 converter; cnv->extraInfo is its UConverterDataISO2022
 * sa          set adder (add/addRange/remove/removeRange callbacks plus the set)
 * which       which set to build; UCNV_ROUNDTRIP_AND_FALLBACK_SET also pulls in
 *             half-width Katakana for all JP versions
 * pErrorCode  ICU error code (in/out); checked with U_FAILURE() on entry
 */
3635 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3636 const USetAdder *sa,
3637 UConverterUnicodeSet which,
3638 UErrorCode *pErrorCode)
3641 UConverterDataISO2022* cnvData;
3643 if (U_FAILURE(*pErrorCode)) {
3646 #ifdef U_ENABLE_GENERIC_ISO_2022
3647 if (cnv->sharedData == &_ISO2022Data) {
3648 /* We use UTF-8 in this case */
3649 sa->addRange(sa->set, 0, 0xd7FF);
3650 sa->addRange(sa->set, 0xE000, 0x10FFFF);
3655 cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3657 /* open a set and initialize it with code points that are algorithmically round-tripped */
3658 switch(cnvData->locale[0]){
3660 /* include JIS X 0201 which is hardcoded */
3661 sa->add(sa->set, 0xa5);
3662 sa->add(sa->set, 0x203e);
3663 #if !UCONFIG_NO_NON_HTML5_CONVERSION
3664 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3665 /* include Latin-1 for some variants of JP */
3666 sa->addRange(sa->set, 0, 0xff);
3668 /* include ASCII for JP */
3669 sa->addRange(sa->set, 0, 0x7f);
3672 /* include ASCII for JP */
3673 sa->addRange(sa->set, 0, 0x7f);
3675 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3677 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3678 * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3679 * use half-width Katakana.
3680 * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3681 * half-width Katakana via the ESC ( I sequence.
3682 * However, we only emit (fromUnicode) half-width Katakana according to the
3683 * definition of each variant.
3685 * When including fallbacks,
3686 * we need to include half-width Katakana Unicode code points for all JP variants because
3687 * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3689 /* include half-width Katakana for JP */
3690 sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3693 #if !UCONFIG_NO_NON_HTML5_CONVERSION
3696 /* include ASCII for CN */
3697 sa->addRange(sa->set, 0, 0x7f);
3700 /* there is only one converter for KR, and it is not in the myConverterArray[] */
3701 cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3702 cnvData->currentConverter, sa, which, pErrorCode);
3703 /* the loop over myConverterArray[] will simply not find another converter */
3710 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3711 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3712 cnvData->version==0 && i==CNS_11643
3714 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3715 ucnv_MBCSGetUnicodeSetForBytes(
3716 cnvData->myConverterArray[i],
3717 sa, UCNV_ROUNDTRIP_SET,
/*
 * Walk all subconverter slots; each loaded subconverter adds its code points,
 * limited by the filter selected for that slot index and variant.
 */
3723 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3724 UConverterSetFilter filter;
3725 if(cnvData->myConverterArray[i]!=NULL) {
3726 if(cnvData->locale[0]=='j' && i==JISX208) {
3728 * Only add code points that map to Shift-JIS codes
3729 * corresponding to JIS X 0208.
3731 filter=UCNV_SET_FILTER_SJIS;
3732 #if !UCONFIG_NO_NON_HTML5_CONVERSION
3733 } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3734 cnvData->version==0 && i==CNS_11643) {
3736 * Version-specific for CN:
3737 * CN version 0 does not map CNS planes 3..7 although
3738 * they are all available in the CNS conversion table;
3739 * CN version 1 (-EXT) does map them all.
3740 * The two versions create different Unicode sets.
3742 filter=UCNV_SET_FILTER_2022_CN;
3743 } else if(i==KSC5601) {
3745 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3746 * are broader than GR94.
3748 filter=UCNV_SET_FILTER_GR94DBCS;
3751 filter=UCNV_SET_FILTER_NONE;
3753 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3758 * ISO 2022 converters must not convert SO/SI/ESC despite what
3759 * sub-converters do by themselves.
3760 * Remove these characters from the set.
3762 sa->remove(sa->set, 0x0e);
3763 sa->remove(sa->set, 0x0f);
3764 sa->remove(sa->set, 0x1b);
3766 /* ISO 2022 converters do not convert C1 controls either */
3767 sa->removeRange(sa->set, 0x80, 0x9f);
/*
 * Function table for the generic ISO-2022 converter.
 * When U_ENABLE_GENERIC_ISO_2022 is defined, the toUnicode entries use
 * T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC and the fromUnicode entries
 * use the UTF-8 implementations. Cloning and Unicode-set queries go through
 * _ISO_2022_SafeClone and _ISO_2022_GetUnicodeSet.
 */
3770 static const UConverterImpl _ISO2022Impl={
3780 #ifdef U_ENABLE_GENERIC_ISO_2022
3781 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3782 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3783 ucnv_fromUnicode_UTF8,
3784 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3796 _ISO_2022_SafeClone,
3797 _ISO_2022_GetUnicodeSet,
/*
 * Static description of the generic ISO-2022 converter: structure size,
 * a maximum of 3 bytes per UChar (UTF-8 output), and a zeroed reserved area.
 */
3802 static const UConverterStaticData _ISO2022StaticData={
3803 sizeof(UConverterStaticData),
3809 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3816 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/*
 * Shared data object of the generic ISO-2022 converter; it points at
 * _ISO2022StaticData. _ISO_2022_GetUnicodeSet compares cnv->sharedData with
 * the address of this object to recognize the generic converter.
 */
3818 const UConverterSharedData _ISO2022Data={
3819 sizeof(UConverterSharedData),
3823 &_ISO2022StaticData,
3826 0, UCNV_MBCS_TABLE_INITIALIZER
3829 /*************JP****************/
/*
 * Function table for ISO-2022-JP.
 * The OFFSETS_LOGIC functions are each listed in two consecutive entries
 * (NOTE(review): presumably the plain and the with-offsets slots of
 * UConverterImpl -- confirm against its declaration).
 */
3830 static const UConverterImpl _ISO2022JPImpl={
3840 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3841 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3842 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3843 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3849 _ISO_2022_SafeClone,
3850 _ISO_2022_GetUnicodeSet,
/*
 * Static description of the ISO-2022-JP converter: structure size, a maximum
 * of 6 bytes per UChar (escape sequence plus DBCS), and a zeroed reserved area.
 */
3855 static const UConverterStaticData _ISO2022JPStaticData={
3856 sizeof(UConverterStaticData),
3862 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3869 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Shared data object of the ISO-2022-JP converter; points at _ISO2022JPStaticData. */
3874 const UConverterSharedData _ISO2022JPData={
3875 sizeof(UConverterSharedData),
3879 &_ISO2022JPStaticData,
3882 0, UCNV_MBCS_TABLE_INITIALIZER
3887 #if !UCONFIG_NO_NON_HTML5_CONVERSION
3888 /************* KR ***************/
/*
 * Function table for ISO-2022-KR.
 * The OFFSETS_LOGIC functions are each listed in two consecutive entries
 * (NOTE(review): presumably the plain and the with-offsets slots of
 * UConverterImpl -- confirm against its declaration).
 */
3889 static const UConverterImpl _ISO2022KRImpl={
3899 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3900 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3901 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3902 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3908 _ISO_2022_SafeClone,
3909 _ISO_2022_GetUnicodeSet,
/*
 * Static description of the ISO-2022-KR converter: structure size, a maximum
 * of 3 bytes per UChar (SO plus DBCS), and a zeroed reserved area.
 */
3914 static const UConverterStaticData _ISO2022KRStaticData={
3915 sizeof(UConverterStaticData),
3921 3, /* max 3 bytes per UChar: SO+DBCS */
3928 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Shared data object of the ISO-2022-KR converter; points at _ISO2022KRStaticData. */
3933 const UConverterSharedData _ISO2022KRData={
3934 sizeof(UConverterSharedData),
3938 &_ISO2022KRStaticData,
3941 0, UCNV_MBCS_TABLE_INITIALIZER
3946 /*************** CN ***************/
/*
 * Function table for ISO-2022-CN.
 * The OFFSETS_LOGIC functions are each listed in two consecutive entries
 * (NOTE(review): presumably the plain and the with-offsets slots of
 * UConverterImpl -- confirm against its declaration).
 */
3947 static const UConverterImpl _ISO2022CNImpl={
3958 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3959 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3960 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3961 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3967 _ISO_2022_SafeClone,
3968 _ISO_2022_GetUnicodeSet,
/*
 * Static description of the ISO-2022-CN converter: structure size, a maximum
 * of 8 bytes per UChar (designator, single shift and DBCS bytes), and a
 * zeroed reserved area.
 */
3973 static const UConverterStaticData _ISO2022CNStaticData={
3974 sizeof(UConverterStaticData),
3980 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3987 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
/* Shared data object of the ISO-2022-CN converter; points at _ISO2022CNStaticData. */
3992 const UConverterSharedData _ISO2022CNData={
3993 sizeof(UConverterSharedData),
3997 &_ISO2022CNStaticData,
4000 0, UCNV_MBCS_TABLE_INITIALIZER
4004 #endif /* #if !UCONFIG_NO_NON_HTML5_CONVERSION */
4006 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */