Imported Upstream version 58.1
[platform/upstream/icu.git] / source / common / ucnv2022.cpp
1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 2000-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   file name:  ucnv2022.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2000feb03
14 *   created by: Markus W. Scherer
15 *
16 *   Change history:
17 *
18 *   06/29/2000  helena  Major rewrite of the callback APIs.
19 *   08/08/2000  Ram     Included support for ISO-2022-JP-2
20 *                       Changed implementation of toUnicode
21 *                       function
22 *   08/21/2000  Ram     Added support for ISO-2022-KR
23 *   08/29/2000  Ram     Separated implementation of EBCDIC to
24 *                       ucnvebdc.c
25 *   09/20/2000  Ram     Added support for ISO-2022-CN
26 *                       Added implementations for getNextUChar()
27 *                       for specific 2022 country variants.
28 *   10/31/2000  Ram     Implemented offsets logic functions
29 */
30
31 #include "unicode/utypes.h"
32
33 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
34
35 #include "unicode/ucnv.h"
36 #include "unicode/uset.h"
37 #include "unicode/ucnv_err.h"
38 #include "unicode/ucnv_cb.h"
39 #include "unicode/utf16.h"
40 #include "ucnv_imp.h"
41 #include "ucnv_bld.h"
42 #include "ucnv_cnv.h"
43 #include "ucnvmbcs.h"
44 #include "cstring.h"
45 #include "cmemory.h"
46 #include "uassert.h"
47
48 #ifdef U_ENABLE_GENERIC_ISO_2022
49 /*
50  * I am disabling the generic ISO-2022 converter after proposing to do so on
51  * the icu mailing list two days ago.
52  *
53  * Reasons:
54  * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
55  *    its designation sequences, single shifts with return to the previous state,
56  *    switch-with-no-return to UTF-16BE or similar, etc.
57  *    This is unlike the language-specific variants like ISO-2022-JP which
58  *    require a much smaller repertoire of ISO-2022 features.
59  *    These variants continue to be supported.
60  * 2. I believe that no one is really using the generic ISO-2022 converter
61  *    but rather always one of the language-specific variants.
62  *    Note that ICU's generic ISO-2022 converter has always output one escape
63  *    sequence followed by UTF-8 for the whole stream.
64  * 3. Switching between subcharsets is extremely slow, because each time
65  *    the previous converter is closed and a new one opened,
66  *    without any kind of caching, least-recently-used list, etc.
67  * 4. The code is currently buggy, and given the above it does not seem
68  *    reasonable to spend the time on maintenance.
69  * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
70  *    This means, for example, that when ISO-8859-7 is designated, the following
71  *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
72  *    The ICU ISO-2022 converter does not handle this - and has no information
73  *    about which subconverter would have to be shifted vs. which is designed
74  *    for 7-bit ISO-2022.
75  *
76  * Markus Scherer 2003-dec-03
77  */
78 #endif
79
80 #if !UCONFIG_ONLY_HTML_CONVERSION
81 static const char SHIFT_IN_STR[]  = "\x0F";
82 // static const char SHIFT_OUT_STR[] = "\x0E";
83 #endif
84
/* ASCII code points of the whitespace/control characters named below. */
#define H_TAB   0x09    /* horizontal tab */
#define LF      0x0A    /* line feed */
#define V_TAB   0x0B    /* vertical tab */
#define CR      0x0D    /* carriage return */
#define SPACE   0x20    /* space */
90
/* First and last code point of the halfwidth Katakana range U+FF61..U+FF9F. */
enum {
    HWKANA_START = 0xff61,
    HWKANA_END   = 0xff9f
};
95
96 /*
97  * 94-character sets with native byte values A1..FE are encoded in ISO 2022
98  * as bytes 21..7E. (Subtract 0x80.)
99  * 96-character sets with native byte values A0..FF are encoded in ISO 2022
100  * as bytes 20..7F. (Subtract 0x80.)
101  * Do not encode C1 control codes with native bytes 80..9F
102  * as bytes 00..1F (C0 control codes).
103  */
/* Native (8-bit, GR) byte ranges of 94-character and 96-character sets. */
enum {
    /* 94-character sets: A1..FE */
    GR94_START = 0xa1,
    GR94_END   = 0xfe,

    /* 96-character sets: A0..FF */
    GR96_START = 0xa0,
    GR96_END   = 0xff
};
110
111 /*
112  * ISO 2022 control codes must not be converted from Unicode
113  * because they would mess up the byte stream.
114  * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
115  * corresponding to SO, SI, and ESC.
116  */
#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)0x0800c000>>(c))&1)!=0)
118
119 /* for ISO-2022-JP and -CN implementations */
typedef enum {
    /* values shared by the JP and CN implementations */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE     = 0x10,
    SS3_STATE,                  /* 0x11 */

    /* JP charsets */
    ISO8859_1     = 1,
    ISO8859_7     = 2,
    JISX201       = 3,
    JISX208       = 4,
    JISX212       = 5,
    GB2312        = 6,
    KSC5601       = 7,
    HWKANA_7BIT   = 8,          /* Halfwidth Katakana 7 bit */

    /*
     * CN charsets.
     * These first constants must keep their values because they
     * correspond to myConverterArray[] slots.
     */
    GB2312_1      = 1,
    ISO_IR_165    = 2,
    CNS_11643     = 3,

    /*
     * The plane-specific values below are used in StateEnum and ISO2022State
     * variables, but CNS_11643 must be used to index into myConverterArray[].
     */
    CNS_11643_0   = 0x20,
    CNS_11643_1,                /* 0x21 */
    CNS_11643_2,                /* 0x22 */
    CNS_11643_3,                /* 0x23 */
    CNS_11643_4,                /* 0x24 */
    CNS_11643_5,                /* 0x25 */
    CNS_11643_6,                /* 0x26 */
    CNS_11643_7                 /* 0x27 */
} StateEnum;
157
158 /* is the StateEnum charset value for a DBCS charset? */
#if UCONFIG_ONLY_HTML_CONVERSION
#define IS_JP_DBCS(cs) ((cs)==JISX208)
#else
#define IS_JP_DBCS(cs) ((cs)>=JISX208 && (cs)<=KSC5601)
#endif

/* charset mask: a value with only bit number cs set */
#define CSM(cs) ((uint16_t)1 << (cs))
166
167 /*
168  * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
169  * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
170  *
171  * Note: The converter uses some leniency:
172  * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
173  *   all versions, not just JIS7 and JIS8.
174  * - ICU does not distinguish between different versions of JIS X 0208.
175  */
176 #if UCONFIG_ONLY_HTML_CONVERSION
177 enum { MAX_JA_VERSION=0 };
178 #else
179 enum { MAX_JA_VERSION=4 };
180 #endif
181 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
182     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
183 #if !UCONFIG_ONLY_HTML_CONVERSION
184     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
185     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
186     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
187     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
188 #endif
189 };
190
/* Kind of charset the converter is currently working with. */
typedef enum {
    ASCII1 = 0,
    LATIN1,     /* 1 */
    SBCS,       /* 2 */
    DBCS,       /* 3 */
    MBCS,       /* 4 */
    HWKANA      /* 5 */
} Cnv2022Type;
199
/* Designation and shift state for one conversion direction. */
typedef struct ISO2022State {
    /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t cs[4];
    /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* g before single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
205
/* low bits of the converter options that carry the "version" number */
#define UCNV_OPTIONS_VERSION_MASK   0xf
/* number of slots in UConverterDataISO2022.myConverterArray[] */
#define UCNV_2022_MAX_CONVERTERS    10
208
209 typedef struct{
210     UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
211     UConverter *currentConverter;
212     Cnv2022Type currentType;
213     ISO2022State toU2022State, fromU2022State;
214     uint32_t key;
215     uint32_t version;
216 #ifdef U_ENABLE_GENERIC_ISO_2022
217     UBool isFirstBuffer;
218 #endif
219     UBool isEmptySegment;
220     char name[30];
221     char locale[3];
222 }UConverterDataISO2022;
223
224 /* Protos */
225 /* ISO-2022 ----------------------------------------------------------------- */
226
227 /*Forward declaration */
228 U_CFUNC void U_CALLCONV
229 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
230                       UErrorCode * err);
231 U_CFUNC void U_CALLCONV
232 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
233                                     UErrorCode * err);
234
235 #define ESC_2022 0x1B /*ESC*/
236
/* Classification of the bytes collected so far for an escape sequence. */
typedef enum
{
    /* does not correspond to a valid ISO 2022 escape sequence */
    INVALID_2022              = -1,
    /* so far corresponds to a valid ISO 2022 escape sequence */
    VALID_NON_TERMINAL_2022   = 0,
    /* corresponds to a complete, valid ISO 2022 escape sequence */
    VALID_TERMINAL_2022       = 1,
    /* matches one escape sequence so far, but more characters might match another one */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
244
245 /*
246 * The way these state transition arrays work is:
247 * ex : ESC$B is the sequence for JISX208
248 *      a) First Iteration: char is ESC
249 *          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
250 *             int x = normalize_esq_chars_2022[27] which is equal to 1
251 *         ii) Search for this value in escSeqStateTable_Key_2022[]
252 *             value of x is stored at escSeqStateTable_Key_2022[0]
253 *        iii) Save this index as offset
254 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
255 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
256 *     b) Switch on this state and continue to next char
257 *          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
258 *             which is normalize_esq_chars_2022[36] == 4
259 *         ii) x is currently 1(from above)
260 *               x<<=5 -- x is now 32
261 *               x+=normalize_esq_chars_2022[36]
262 *               now x is 36
263 *        iii) Search for this value in escSeqStateTable_Key_2022[]
264 *             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
265 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
266 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
267 *     c) Switch on this state and continue to next char
268 *        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
269 *        ii) x is currently 36 (from above)
270 *            x<<=5 -- x is now 1152
271 *            x+=normalize_esq_chars_2022[66]
272 *            now x is 1161
273 *       iii) Search for this value in escSeqStateTable_Key_2022[]
274 *            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
275 *        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
276 *            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
277 *         v) Get the converter name from escSeqStateTable_Result_2022[24] which is "JISX-208"
278 */
279
280
281 /*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps a byte value to its small code (1..29) used for building escape
 * sequence keys; 0 for bytes that have no code.
 * Laid out 16 entries per row: row = high nibble, column = low nibble.
 */
static const int8_t normalize_esq_chars_2022[256] = {
    /*         x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xA  xB  xC  xD  xE  xF */
    /* 0x */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 1x */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,
    /* 2x */    0,  0,  0,  0,  4,  7, 29,  0,  2, 24, 26, 27,  0,  3, 23,  6,
    /* 3x */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 4x */    5,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,
    /* 5x */    0,  0, 21,  0,  0,  0,  0,  0,  0,  0, 22,  0,  0,  0,  0,  0,
    /* 6x */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 7x */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 8x */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 9x */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* Ax */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* Bx */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* Cx */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* Dx */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* Ex */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* Fx */    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
312
313 #ifdef U_ENABLE_GENERIC_ISO_2022
314 /*
315  * When the generic ISO-2022 converter is completely removed, not just disabled
316  * per #ifdef, then the following state table and the associated tables that are
317  * dimensioned with MAX_STATES_2022 should be trimmed.
318  *
319  * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
320  * the associated escape sequences starting with ESC ( B should be removed.
321  * This includes the ones with key values 1097 and all of the ones above 1000000.
322  *
323  * For the latter, the tables can simply be truncated.
324  * For the former, since the tables must be kept parallel, it is probably best
325  * to simply duplicate an adjacent table cell, parallel in all tables.
326  *
327  * It may make sense to restructure the tables, especially by using small search
328  * tables for the variants instead of indexing them parallel to the table here.
329  */
330 #endif
331
#define MAX_STATES_2022 74

/*
 * Escape sequence keys in ascending order. The index at which a key is found
 * is the offset into the parallel escSeqStateTable_*_2022[] tables.
 * The leading comment on each row is the index of its first entry.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    /*  0 */ 1,        34,       36,       39,       55,
    /*  5 */ 57,       60,       61,       1093,     1096,
    /* 10 */ 1097,     1098,     1099,     1100,     1101,
    /* 15 */ 1102,     1103,     1104,     1105,     1106,
    /* 20 */ 1109,     1154,     1157,     1160,     1161,
    /* 25 */ 1176,     1178,     1179,     1254,     1257,
    /* 30 */ 1768,     1773,     1957,     35105,    36933,
    /* 35 */ 36936,    36937,    36938,    36939,    36940,
    /* 40 */ 36942,    36943,    36944,    36945,    36946,
    /* 45 */ 36947,    36948,    37640,    37642,    37644,
    /* 50 */ 37646,    37711,    37744,    37745,    37746,
    /* 55 */ 37747,    37748,    40133,    40136,    40138,
    /* 60 */ 40139,    40140,    40141,    1123363,  35947624,
    /* 65 */ 35947625, 35947626, 35947627, 35947629, 35947630,
    /* 70 */ 35947631, 35947635, 35947636, 35947638
};
345
346 #ifdef U_ENABLE_GENERIC_ISO_2022
347
348 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
349  /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */
350
351      NULL                   ,NULL                   ,NULL                   ,NULL               ,NULL               ,NULL                   ,NULL                   ,NULL                   ,"latin1"               ,"latin1"
352     ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
353     ,"latin1"               ,NULL                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,NULL                   ,NULL                   ,NULL                   ,NULL                   ,"UTF8"
354     ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,NULL               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
355     ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
356     ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
357     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
358     ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
359 };
360
361 #endif
362
363 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
364 /*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
365      VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
366     ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
367     ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
368     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
369     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
370     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
371     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
372     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
373 };
374
375 /* Type def for refactoring changeState_2022 code*/
/* Identifies which ISO-2022 variant is being processed. */
typedef enum {
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022    = 0,
#endif
    ISO_2022_JP = 1,
#if !UCONFIG_ONLY_HTML_CONVERSION
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
#endif
} Variant2022;
386
387 /*********** ISO 2022 Converter Protos ***********/
388 static void U_CALLCONV
389 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
390
391 static void U_CALLCONV
392  _ISO2022Close(UConverter *converter);
393
394 static void U_CALLCONV
395 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
396
397 U_CDECL_BEGIN
398 static const char * U_CALLCONV
399 _ISO2022getName(const UConverter* cnv);
400 U_CDECL_END
401
402 static void  U_CALLCONV
403 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
404
405 U_CDECL_BEGIN
406 static UConverter * U_CALLCONV
407 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
408
409 U_CDECL_END
410
411 #ifdef U_ENABLE_GENERIC_ISO_2022
412 static void U_CALLCONV
413 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
414 #endif
415
416 namespace {
417
418 /*const UConverterSharedData _ISO2022Data;*/
419 extern const UConverterSharedData _ISO2022JPData;
420
421 #if !UCONFIG_ONLY_HTML_CONVERSION
422 extern const UConverterSharedData _ISO2022KRData;
423 extern const UConverterSharedData _ISO2022CNData;
424 #endif
425
426 }  // namespace
427
428 /*************** Converter implementations ******************/
429
430 /* The purpose of this function is to get around gcc compiler warnings. */
431 static inline void
432 fromUWriteUInt8(UConverter *cnv,
433                  const char *bytes, int32_t length,
434                  uint8_t **target, const char *targetLimit,
435                  int32_t **offsets,
436                  int32_t sourceIndex,
437                  UErrorCode *pErrorCode)
438 {
439     char *targetChars = (char *)*target;
440     ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
441                          offsets, sourceIndex, pErrorCode);
442     *target = (uint8_t*)targetChars;
443
444 }
445
446 static inline void
447 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
448     if(myConverterData->version == 1) {
449         UConverter *cnv = myConverterData->currentConverter;
450
451         cnv->toUnicodeStatus=0;     /* offset */
452         cnv->mode=0;                /* state */
453         cnv->toULength=0;           /* byteIndex */
454     }
455 }
456
457 static inline void
458 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
459    /* in ISO-2022-KR the designator sequence appears only once
460     * in a file so we append it only once
461     */
462     if( converter->charErrorBufferLength==0){
463
464         converter->charErrorBufferLength = 4;
465         converter->charErrorBuffer[0] = 0x1b;
466         converter->charErrorBuffer[1] = 0x24;
467         converter->charErrorBuffer[2] = 0x29;
468         converter->charErrorBuffer[3] = 0x43;
469     }
470     if(myConverterData->version == 1) {
471         UConverter *cnv = myConverterData->currentConverter;
472
473         cnv->fromUChar32=0;
474         cnv->fromUnicodeStatus=1;   /* prevLength */
475     }
476 }
477
478 static void U_CALLCONV
479 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
480
481     char myLocale[6]={' ',' ',' ',' ',' ',' '};
482
483     cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
484     if(cnv->extraInfo != NULL) {
485         UConverterNamePieces stackPieces;
486         UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
487         UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
488         uint32_t version;
489
490         stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
491
492         uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
493         myConverterData->currentType = ASCII1;
494         cnv->fromUnicodeStatus =FALSE;
495         if(pArgs->locale){
496             uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
497         }
498         version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
499         myConverterData->version = version;
500         if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
501             (myLocale[2]=='_' || myLocale[2]=='\0'))
502         {
503             /* open the required converters and cache them */
504             if(version>MAX_JA_VERSION) {
505                 // ICU 55 fails to open a converter for an unsupported version.
506                 // Previously, it fell back to version 0, but that would yield
507                 // unexpected behavior.
508                 *errorCode = U_MISSING_RESOURCE_ERROR;
509                 return;
510             }
511             if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
512                 myConverterData->myConverterArray[ISO8859_7] =
513                     ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
514             }
515             myConverterData->myConverterArray[JISX208] =
516                 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
517             if(jpCharsetMasks[version]&CSM(JISX212)) {
518                 myConverterData->myConverterArray[JISX212] =
519                     ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
520             }
521             if(jpCharsetMasks[version]&CSM(GB2312)) {
522                 myConverterData->myConverterArray[GB2312] =
523                     ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);   /* gb_2312_80-1 */
524             }
525             if(jpCharsetMasks[version]&CSM(KSC5601)) {
526                 myConverterData->myConverterArray[KSC5601] =
527                     ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
528             }
529
530             /* set the function pointers to appropriate funtions */
531             cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
532             uprv_strcpy(myConverterData->locale,"ja");
533
534             (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
535             size_t len = uprv_strlen(myConverterData->name);
536             myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
537             myConverterData->name[len+1]='\0';
538         }
539 #if !UCONFIG_ONLY_HTML_CONVERSION
540         else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
541             (myLocale[2]=='_' || myLocale[2]=='\0'))
542         {
543             if(version>1) {
544                 // ICU 55 fails to open a converter for an unsupported version.
545                 // Previously, it fell back to version 0, but that would yield
546                 // unexpected behavior.
547                 *errorCode = U_MISSING_RESOURCE_ERROR;
548                 return;
549             }
550             const char *cnvName;
551             if(version==1) {
552                 cnvName="icu-internal-25546";
553             } else {
554                 cnvName="ibm-949";
555                 myConverterData->version=version=0;
556             }
557             if(pArgs->onlyTestIsLoadable) {
558                 ucnv_canCreateConverter(cnvName, errorCode);  /* errorCode carries result */
559                 uprv_free(cnv->extraInfo);
560                 cnv->extraInfo=NULL;
561                 return;
562             } else {
563                 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
564                 if (U_FAILURE(*errorCode)) {
565                     _ISO2022Close(cnv);
566                     return;
567                 }
568
569                 if(version==1) {
570                     (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
571                     uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
572                     cnv->subCharLen = myConverterData->currentConverter->subCharLen;
573                 }else{
574                     (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
575                 }
576
577                 /* initialize the state variables */
578                 setInitialStateToUnicodeKR(cnv, myConverterData);
579                 setInitialStateFromUnicodeKR(cnv, myConverterData);
580
581                 /* set the function pointers to appropriate funtions */
582                 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
583                 uprv_strcpy(myConverterData->locale,"ko");
584             }
585         }
586         else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
587             (myLocale[2]=='_' || myLocale[2]=='\0'))
588         {
589             if(version>2) {
590                 // ICU 55 fails to open a converter for an unsupported version.
591                 // Previously, it fell back to version 0, but that would yield
592                 // unexpected behavior.
593                 *errorCode = U_MISSING_RESOURCE_ERROR;
594                 return;
595             }
596
597             /* open the required converters and cache them */
598             myConverterData->myConverterArray[GB2312_1] =
599                 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
600             if(version==1) {
601                 myConverterData->myConverterArray[ISO_IR_165] =
602                     ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
603             }
604             myConverterData->myConverterArray[CNS_11643] =
605                 ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
606
607
608             /* set the function pointers to appropriate funtions */
609             cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
610             uprv_strcpy(myConverterData->locale,"cn");
611
612             if (version==0){
613                 myConverterData->version = 0;
614                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
615             }else if (version==1){
616                 myConverterData->version = 1;
617                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
618             }else {
619                 myConverterData->version = 2;
620                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
621             }
622         }
623 #endif  // !UCONFIG_ONLY_HTML_CONVERSION
624         else{
625 #ifdef U_ENABLE_GENERIC_ISO_2022
626             myConverterData->isFirstBuffer = TRUE;
627
628             /* append the UTF-8 escape sequence */
629             cnv->charErrorBufferLength = 3;
630             cnv->charErrorBuffer[0] = 0x1b;
631             cnv->charErrorBuffer[1] = 0x25;
632             cnv->charErrorBuffer[2] = 0x42;
633
634             cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
635             /* initialize the state variables */
636             uprv_strcpy(myConverterData->name,"ISO_2022");
637 #else
638             *errorCode = U_MISSING_RESOURCE_ERROR;
639             // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard
640             // data loading error code.
641             return;
642 #endif
643         }
644
645         cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
646
647         if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
648             _ISO2022Close(cnv);
649         }
650     } else {
651         *errorCode = U_MEMORY_ALLOCATION_ERROR;
652     }
653 }
654
655
656 static void U_CALLCONV
657 _ISO2022Close(UConverter *converter) {
658     UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
659     UConverterSharedData **array = myData->myConverterArray;
660     int32_t i;
661
662     if (converter->extraInfo != NULL) {
663         /*close the array of converter pointers and free the memory*/
664         for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
665             if(array[i]!=NULL) {
666                 ucnv_unloadSharedDataIfReady(array[i]);
667             }
668         }
669
670         ucnv_close(myData->currentConverter);
671
672         if(!converter->isExtraLocal){
673             uprv_free (converter->extraInfo);
674             converter->extraInfo = NULL;
675         }
676     }
677 }
678
679 static void U_CALLCONV
680 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
681     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
682     if(choice<=UCNV_RESET_TO_UNICODE) {
683         uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
684         myConverterData->key = 0;
685         myConverterData->isEmptySegment = FALSE;
686     }
687     if(choice!=UCNV_RESET_TO_UNICODE) {
688         uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
689     }
690 #ifdef U_ENABLE_GENERIC_ISO_2022
691     if(myConverterData->locale[0] == 0){
692         if(choice<=UCNV_RESET_TO_UNICODE) {
693             myConverterData->isFirstBuffer = TRUE;
694             myConverterData->key = 0;
695             if (converter->mode == UCNV_SO){
696                 ucnv_close (myConverterData->currentConverter);
697                 myConverterData->currentConverter=NULL;
698             }
699             converter->mode = UCNV_SI;
700         }
701         if(choice!=UCNV_RESET_TO_UNICODE) {
702             /* re-append UTF-8 escape sequence */
703             converter->charErrorBufferLength = 3;
704             converter->charErrorBuffer[0] = 0x1b;
705             converter->charErrorBuffer[1] = 0x28;
706             converter->charErrorBuffer[2] = 0x42;
707         }
708     }
709     else
710 #endif
711     {
712         /* reset the state variables */
713         if(myConverterData->locale[0] == 'k'){
714             if(choice<=UCNV_RESET_TO_UNICODE) {
715                 setInitialStateToUnicodeKR(converter, myConverterData);
716             }
717             if(choice!=UCNV_RESET_TO_UNICODE) {
718                 setInitialStateFromUnicodeKR(converter, myConverterData);
719             }
720         }
721     }
722 }
723
724 U_CDECL_BEGIN
725
726 static const char * U_CALLCONV
727 _ISO2022getName(const UConverter* cnv){
728     if(cnv->extraInfo){
729         UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
730         return myData->name;
731     }
732     return NULL;
733 }
734
735 U_CDECL_END
736
737
738 /*************** to unicode *******************/
739 /****************************************************************************
740  * Recognized escape sequences are
741  * <ESC>(B  ASCII
742  * <ESC>.A  ISO-8859-1
743  * <ESC>.F  ISO-8859-7
744  * <ESC>(J  JISX-201
745  * <ESC>(I  JISX-201
746  * <ESC>$B  JISX-208
747  * <ESC>$@  JISX-208
748  * <ESC>$(D JISX-212
749  * <ESC>$A  GB2312
750  * <ESC>$(C KSC5601
751  */
752 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
753 /*      0                1               2               3               4               5               6               7               8               9    */
754     INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
755     ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
756     ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
757     ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
758     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
759     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
760     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
761     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
762 };
763
764 #if !UCONFIG_ONLY_HTML_CONVERSION
765 /*************** to unicode *******************/
766 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
767 /*      0                1               2               3               4               5               6               7               8               9    */
768      INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
769     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
770     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
771     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
772     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
773     ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
774     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
775     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
776 };
777 #endif
778
779
780 static UCNV_TableStates_2022
781 getKey_2022(char c,int32_t* key,int32_t* offset){
782     int32_t togo;
783     int32_t low = 0;
784     int32_t hi = MAX_STATES_2022;
785     int32_t oldmid=0;
786
787     togo = normalize_esq_chars_2022[(uint8_t)c];
788     if(togo == 0) {
789         /* not a valid character anywhere in an escape sequence */
790         *key = 0;
791         *offset = 0;
792         return INVALID_2022;
793     }
794     togo = (*key << 5) + togo;
795
796     while (hi != low)  /*binary search*/{
797
798         int32_t mid = (hi+low) >> 1; /*Finds median*/
799
800         if (mid == oldmid)
801             break;
802
803         if (escSeqStateTable_Key_2022[mid] > togo){
804             hi = mid;
805         }
806         else if (escSeqStateTable_Key_2022[mid] < togo){
807             low = mid;
808         }
809         else /*we found it*/{
810             *key = togo;
811             *offset = mid;
812             return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
813         }
814         oldmid = mid;
815
816     }
817
818     *key = 0;
819     *offset = 0;
820     return INVALID_2022;
821 }
822
823 /*runs through a state machine to determine the escape sequence - codepage correspondance
824  */
825 static void
826 changeState_2022(UConverter* _this,
827                 const char** source,
828                 const char* sourceLimit,
829                 Variant2022 var,
830                 UErrorCode* err){
831     UCNV_TableStates_2022 value;
832     UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
833     uint32_t key = myData2022->key;
834     int32_t offset = 0;
835     int8_t initialToULength = _this->toULength;
836     char c;
837
838     value = VALID_NON_TERMINAL_2022;
839     while (*source < sourceLimit) {
840         c = *(*source)++;
841         _this->toUBytes[_this->toULength++]=(uint8_t)c;
842         value = getKey_2022(c,(int32_t *) &key, &offset);
843
844         switch (value){
845
846         case VALID_NON_TERMINAL_2022 :
847             /* continue with the loop */
848             break;
849
850         case VALID_TERMINAL_2022:
851             key = 0;
852             goto DONE;
853
854         case INVALID_2022:
855             goto DONE;
856
857         case VALID_MAYBE_TERMINAL_2022:
858 #ifdef U_ENABLE_GENERIC_ISO_2022
859             /* ESC ( B is ambiguous only for ISO_2022 itself */
860             if(var == ISO_2022) {
861                 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
862                 _this->toULength = 0;
863
864                 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
865
866                 /* continue with the loop */
867                 value = VALID_NON_TERMINAL_2022;
868                 break;
869             } else
870 #endif
871             {
872                 /* not ISO_2022 itself, finish here */
873                 value = VALID_TERMINAL_2022;
874                 key = 0;
875                 goto DONE;
876             }
877         }
878     }
879
880 DONE:
881     myData2022->key = key;
882
883     if (value == VALID_NON_TERMINAL_2022) {
884         /* indicate that the escape sequence is incomplete: key!=0 */
885         return;
886     } else if (value == INVALID_2022 ) {
887         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
888     } else /* value == VALID_TERMINAL_2022 */ {
889         switch(var){
890 #ifdef U_ENABLE_GENERIC_ISO_2022
891         case ISO_2022:
892         {
893             const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
894             if(chosenConverterName == NULL) {
895                 /* SS2 or SS3 */
896                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
897                 _this->toUCallbackReason = UCNV_UNASSIGNED;
898                 return;
899             }
900
901             _this->mode = UCNV_SI;
902             ucnv_close(myData2022->currentConverter);
903             myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
904             if(U_SUCCESS(*err)) {
905                 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
906                 _this->mode = UCNV_SO;
907             }
908             break;
909         }
910 #endif
911         case ISO_2022_JP:
912             {
913                 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
914                 switch(tempState) {
915                 case INVALID_STATE:
916                     *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
917                     break;
918                 case SS2_STATE:
919                     if(myData2022->toU2022State.cs[2]!=0) {
920                         if(myData2022->toU2022State.g<2) {
921                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
922                         }
923                         myData2022->toU2022State.g=2;
924                     } else {
925                         /* illegal to have SS2 before a matching designator */
926                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
927                     }
928                     break;
929                 /* case SS3_STATE: not used in ISO-2022-JP-x */
930                 case ISO8859_1:
931                 case ISO8859_7:
932                     if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
933                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
934                     } else {
935                         /* G2 charset for SS2 */
936                         myData2022->toU2022State.cs[2]=(int8_t)tempState;
937                     }
938                     break;
939                 default:
940                     if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
941                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
942                     } else {
943                         /* G0 charset */
944                         myData2022->toU2022State.cs[0]=(int8_t)tempState;
945                     }
946                     break;
947                 }
948             }
949             break;
950 #if !UCONFIG_ONLY_HTML_CONVERSION
951         case ISO_2022_CN:
952             {
953                 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
954                 switch(tempState) {
955                 case INVALID_STATE:
956                     *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
957                     break;
958                 case SS2_STATE:
959                     if(myData2022->toU2022State.cs[2]!=0) {
960                         if(myData2022->toU2022State.g<2) {
961                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
962                         }
963                         myData2022->toU2022State.g=2;
964                     } else {
965                         /* illegal to have SS2 before a matching designator */
966                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
967                     }
968                     break;
969                 case SS3_STATE:
970                     if(myData2022->toU2022State.cs[3]!=0) {
971                         if(myData2022->toU2022State.g<2) {
972                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
973                         }
974                         myData2022->toU2022State.g=3;
975                     } else {
976                         /* illegal to have SS3 before a matching designator */
977                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
978                     }
979                     break;
980                 case ISO_IR_165:
981                     if(myData2022->version==0) {
982                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
983                         break;
984                     }
985                     U_FALLTHROUGH;
986                 case GB2312_1:
987                     U_FALLTHROUGH;
988                 case CNS_11643_1:
989                     myData2022->toU2022State.cs[1]=(int8_t)tempState;
990                     break;
991                 case CNS_11643_2:
992                     myData2022->toU2022State.cs[2]=(int8_t)tempState;
993                     break;
994                 default:
995                     /* other CNS 11643 planes */
996                     if(myData2022->version==0) {
997                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
998                     } else {
999                        myData2022->toU2022State.cs[3]=(int8_t)tempState;
1000                     }
1001                     break;
1002                 }
1003             }
1004             break;
1005         case ISO_2022_KR:
1006             if(offset==0x30){
1007                 /* nothing to be done, just accept this one escape sequence */
1008             } else {
1009                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
1010             }
1011             break;
1012 #endif  // !UCONFIG_ONLY_HTML_CONVERSION
1013
1014         default:
1015             *err = U_ILLEGAL_ESCAPE_SEQUENCE;
1016             break;
1017         }
1018     }
1019     if(U_SUCCESS(*err)) {
1020         _this->toULength = 0;
1021     } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
1022         if(_this->toULength>1) {
1023             /*
1024              * Ticket 5691: consistent illegal sequences:
1025              * - We include at least the first byte (ESC) in the illegal sequence.
1026              * - If any of the non-initial bytes could be the start of a character,
1027              *   we stop the illegal sequence before the first one of those.
1028              *   In escape sequences, all following bytes are "printable", that is,
1029              *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
1030              *   they are valid single/lead bytes.
1031              *   For simplicity, we always only report the initial ESC byte as the
1032              *   illegal sequence and back out all other bytes we looked at.
1033              */
1034             /* Back out some bytes. */
1035             int8_t backOutDistance=_this->toULength-1;
1036             int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
1037             if(backOutDistance<=bytesFromThisBuffer) {
1038                 /* same as initialToULength<=1 */
1039                 *source-=backOutDistance;
1040             } else {
1041                 /* Back out bytes from the previous buffer: Need to replay them. */
1042                 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
1043                 /* same as -(initialToULength-1) */
1044                 /* preToULength is negative! */
1045                 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
1046                 *source-=bytesFromThisBuffer;
1047             }
1048             _this->toULength=1;
1049         }
1050     } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1051         _this->toUCallbackReason = UCNV_UNASSIGNED;
1052     }
1053 }
1054
1055 #if !UCONFIG_ONLY_HTML_CONVERSION
1056 /*Checks the characters of the buffer against valid 2022 escape sequences
1057 *if the match we return a pointer to the initial start of the sequence otherwise
1058 *we return sourceLimit
1059 */
1060 /*for 2022 looks ahead in the stream
1061  *to determine the longest possible convertible
1062  *data stream
1063  */
1064 static inline const char*
1065 getEndOfBuffer_2022(const char** source,
1066                    const char* sourceLimit,
1067                    UBool /*flush*/){
1068
1069     const char* mySource = *source;
1070
1071 #ifdef U_ENABLE_GENERIC_ISO_2022
1072     if (*source >= sourceLimit)
1073         return sourceLimit;
1074
1075     do{
1076
1077         if (*mySource == ESC_2022){
1078             int8_t i;
1079             int32_t key = 0;
1080             int32_t offset;
1081             UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1082
1083             /* Kludge: I could not
1084             * figure out the reason for validating an escape sequence
1085             * twice - once here and once in changeState_2022().
1086             * is it possible to have an ESC character in a ISO2022
1087             * byte stream which is valid in a code page? Is it legal?
1088             */
1089             for (i=0;
1090             (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1091             i++) {
1092                 value =  getKey_2022(*(mySource+i), &key, &offset);
1093             }
1094             if (value > 0 || *mySource==ESC_2022)
1095                 return mySource;
1096
1097             if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1098                 return sourceLimit;
1099         }
1100     }while (++mySource < sourceLimit);
1101
1102     return sourceLimit;
1103 #else
1104     while(mySource < sourceLimit && *mySource != ESC_2022) {
1105         ++mySource;
1106     }
1107     return mySource;
1108 #endif
1109 }
1110 #endif
1111
1112 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1113  * any future change in _MBCSFromUChar32() function should be reflected here.
1114  * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1115  */
1116 static inline int32_t
1117 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1118                                          UChar32 c,
1119                                          uint32_t* value,
1120                                          UBool useFallback,
1121                                          int outputType)
1122 {
1123     const int32_t *cx;
1124     const uint16_t *table;
1125     uint32_t stage2Entry;
1126     uint32_t myValue;
1127     int32_t length;
1128     const uint8_t *p;
1129     /*
1130      * TODO(markus): Use and require new, faster MBCS conversion table structures.
1131      * Use internal version of ucnv_open() that verifies that the new structures are available,
1132      * else U_INTERNAL_PROGRAM_ERROR.
1133      */
1134     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1135     if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1136         table=sharedData->mbcs.fromUnicodeTable;
1137         stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1138         /* get the bytes and the length for the output */
1139         if(outputType==MBCS_OUTPUT_2){
1140             myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1141             if(myValue<=0xff) {
1142                 length=1;
1143             } else {
1144                 length=2;
1145             }
1146         } else /* outputType==MBCS_OUTPUT_3 */ {
1147             p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1148             myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1149             if(myValue<=0xff) {
1150                 length=1;
1151             } else if(myValue<=0xffff) {
1152                 length=2;
1153             } else {
1154                 length=3;
1155             }
1156         }
1157         /* is this code point assigned, or do we use fallbacks? */
1158         if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1159             /* assigned */
1160             *value=myValue;
1161             return length;
1162         } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1163             /*
1164              * We allow a 0 byte output if the "assigned" bit is set for this entry.
1165              * There is no way with this data structure for fallback output
1166              * to be a zero byte.
1167              */
1168             *value=myValue;
1169             return -length;
1170         }
1171     }
1172
1173     cx=sharedData->mbcs.extIndexes;
1174     if(cx!=NULL) {
1175         return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1176     }
1177
1178     /* unassigned */
1179     return 0;
1180 }
1181
1182 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1183  * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1184  * @param retval pointer to output byte
1185  * @return 1 roundtrip byte  0 no mapping  -1 fallback byte
1186  */
1187 static inline int32_t
1188 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1189                                        UChar32 c,
1190                                        uint32_t* retval,
1191                                        UBool useFallback)
1192 {
1193     const uint16_t *table;
1194     int32_t value;
1195     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1196     if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1197         return 0;
1198     }
1199     /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1200     table=sharedData->mbcs.fromUnicodeTable;
1201     /* get the byte for the output */
1202     value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1203     /* is this code point assigned, or do we use fallbacks? */
1204     *retval=(uint32_t)(value&0xff);
1205     if(value>=0xf00) {
1206         return 1;  /* roundtrip */
1207     } else if(useFallback ? value>=0x800 : value>=0xc00) {
1208         return -1;  /* fallback taken */
1209     } else {
1210         return 0;  /* no mapping */
1211     }
1212 }
1213
/*
 * Converts a strict EUC DBCS value to the ISO 2022 byte range.
 *
 * The value is accepted only if both of its low two bytes lie in A1..FE;
 * 0x80 is then subtracted from each byte, moving them to 21..7E.
 * Returns 0 if either byte is out of range.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint16_t pairDistance = (uint16_t)(value - 0xa1a1);
    const uint8_t trailDistance = (uint8_t)(value - 0xa1);

    if(pairDistance > (0xfefe - 0xa1a1) || trailDistance > (0xfe - 0xa1)) {
        return 0;  /* not valid for ISO 2022 */
    }
    return value - 0x8080;  /* shift down to 21..7e byte range */
}
1230
1231 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * Reverse of _2022FromGR94DBCS(): adds 0x80 to each byte of the given 2022
 * code point and returns the result if both bytes then lie in A1..FE.
 * Otherwise the 2022 code point is returned unchanged.
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    const uint32_t shifted = value + 0x8080;
    const uint16_t pairDistance = (uint16_t)(shifted - 0xa1a1);
    const uint8_t trailDistance = (uint8_t)(shifted - 0xa1);

    if(pairDistance > (0xfefe - 0xa1a1) || trailDistance > (0xfe - 0xa1)) {
        return value;
    }
    return shifted;
}
1247 #endif
1248
1249 #ifdef U_ENABLE_GENERIC_ISO_2022
1250
1251 /**********************************************************************************
1252 *  ISO-2022 Converter
1253 *
1254 *
1255 */
1256
1257 static void U_CALLCONV
1258 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1259                                                            UErrorCode* err){
1260     const char* mySourceLimit, *realSourceLimit;
1261     const char* sourceStart;
1262     const UChar* myTargetStart;
1263     UConverter* saveThis;
1264     UConverterDataISO2022* myData;
1265     int8_t length;
1266
1267     saveThis = args->converter;
1268     myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1269
1270     realSourceLimit = args->sourceLimit;
1271     while (args->source < realSourceLimit) {
1272         if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1273             /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1274             mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1275
1276             if(args->source < mySourceLimit) {
1277                 if(myData->currentConverter==NULL) {
1278                     myData->currentConverter = ucnv_open("ASCII",err);
1279                     if(U_FAILURE(*err)){
1280                         return;
1281                     }
1282
1283                     myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1284                     saveThis->mode = UCNV_SO;
1285                 }
1286
1287                 /* convert to before the ESC or until the end of the buffer */
1288                 myData->isFirstBuffer=FALSE;
1289                 sourceStart = args->source;
1290                 myTargetStart = args->target;
1291                 args->converter = myData->currentConverter;
1292                 ucnv_toUnicode(args->converter,
1293                     &args->target,
1294                     args->targetLimit,
1295                     &args->source,
1296                     mySourceLimit,
1297                     args->offsets,
1298                     (UBool)(args->flush && mySourceLimit == realSourceLimit),
1299                     err);
1300                 args->converter = saveThis;
1301
1302                 if (*err == U_BUFFER_OVERFLOW_ERROR) {
1303                     /* move the overflow buffer */
1304                     length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1305                     myData->currentConverter->UCharErrorBufferLength = 0;
1306                     if(length > 0) {
1307                         uprv_memcpy(saveThis->UCharErrorBuffer,
1308                                     myData->currentConverter->UCharErrorBuffer,
1309                                     length*U_SIZEOF_UCHAR);
1310                     }
1311                     return;
1312                 }
1313
1314                 /*
1315                  * At least one of:
1316                  * -Error while converting
1317                  * -Done with entire buffer
1318                  * -Need to write offsets or update the current offset
1319                  *  (leave that up to the code in ucnv.c)
1320                  *
1321                  * or else we just stopped at an ESC byte and continue with changeState_2022()
1322                  */
1323                 if (U_FAILURE(*err) ||
1324                     (args->source == realSourceLimit) ||
1325                     (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
1326                     (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1327                 ) {
1328                     /* copy partial or error input for truncated detection and error handling */
1329                     if(U_FAILURE(*err)) {
1330                         length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1331                         if(length > 0) {
1332                             uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1333                         }
1334                     } else {
1335                         length = saveThis->toULength = myData->currentConverter->toULength;
1336                         if(length > 0) {
1337                             uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1338                             if(args->source < mySourceLimit) {
1339                                 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1340                             }
1341                         }
1342                     }
1343                     return;
1344                 }
1345             }
1346         }
1347
1348         sourceStart = args->source;
1349         changeState_2022(args->converter,
1350                &(args->source),
1351                realSourceLimit,
1352                ISO_2022,
1353                err);
1354         if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
1355             /* let the ucnv.c code update its current offset */
1356             return;
1357         }
1358     }
1359 }
1360
1361 #endif
1362
1363 /*
1364  * To Unicode Callback helper function
1365  */
1366 static void
1367 toUnicodeCallback(UConverter *cnv,
1368                   const uint32_t sourceChar, const uint32_t targetUniChar,
1369                   UErrorCode* err){
1370     if(sourceChar>0xff){
1371         cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1372         cnv->toUBytes[1] = (uint8_t)sourceChar;
1373         cnv->toULength = 2;
1374     }
1375     else{
1376         cnv->toUBytes[0] =(char) sourceChar;
1377         cnv->toULength = 1;
1378     }
1379
1380     if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1381         *err = U_INVALID_CHAR_FOUND;
1382     }
1383     else{
1384         *err = U_ILLEGAL_CHAR_FOUND;
1385     }
1386 }
1387
1388 /**************************************ISO-2022-JP*************************************************/
1389
1390 /************************************** IMPORTANT **************************************************
1391 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1392 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1393 * The converter iterates over each Unicode codepoint
1394 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1395 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1396 * would do as far as possible.
1397 *
1398 * If the implementation of these macros or structure of sharedData struct change in the future, make
1399 * sure that ISO-2022 is also changed.
1400 ***************************************************************************************************
1401 */
1402
/***************************************************************************************************
* Rules for ISO-2022-jp encoding
* (i)   Escape sequences must be fully contained within a line; they should not
*       span new lines or CRs
* (ii)  If the last character on a line is represented by two bytes then an ASCII or
*       JIS-Roman character escape sequence should follow before the line terminates
* (iii) If the first character on the line is represented by two bytes then a two
*       byte character escape sequence should precede it
* (iv)  If no escape sequence is encountered then the characters are ASCII
* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
*       and invoked with SS2 (ESC N).
* (vi)  If there is any G0 designation in text, there must be a switch to
*       ASCII or to JIS X 0201-Roman before a space character (but not
*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
*       characters such as tab or CRLF.
* (vii) Supported encodings:
*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
*
*  source : RFC-1554
*
*          JISX201, JISX208,JISX212 : new .cnv data files created
*          KSC5601 : alias to ibm-949 mapping table
*          GB2312 : alias to ibm-1386 mapping table
*          ISO-8859-1 : Algorithmic implemented as LATIN1 case
*          ISO-8859-7 : alias to ibm-9409 mapping table
*/
1429
1430 /* preference order of JP charsets */
1431 static const StateEnum jpCharsetPref[]={
1432     ASCII,
1433     JISX201,
1434     ISO8859_1,
1435     JISX208,
1436     ISO8859_7,
1437     JISX212,
1438     GB2312,
1439     KSC5601,
1440     HWKANA_7BIT
1441 };
1442
/*
 * Designation escape sequences, indexed by the charset enum constant
 * (e.g. JISX201 = 3) -- NOT by position in jpCharsetPref[]!
 * Entries are NUL-padded to 6 bytes; the significant lengths are kept
 * separately in escSeqCharsLen[].
 * ESC is written as its own literal so that the hex escape cannot swallow
 * the characters that follow it.
 */
static const char escSeqChars[][6] ={
    "\x1B" "(B",            /* ASCII       */
    "\x1B" ".A",            /* ISO-8859-1  */
    "\x1B" ".F",            /* ISO-8859-7  */
    "\x1B" "(J",            /* JISX-201    */
    "\x1B" "$B",            /* JISX-208    */
    "\x1B" "$(D",           /* JISX-212    */
    "\x1B" "$A",            /* GB2312      */
    "\x1B" "$(C",           /* KSC5601     */
    "\x1B" "(I"             /* HWKANA_7BIT */
};
/*
 * Number of significant bytes in each escSeqChars[] entry, in the same
 * (charset enum constant) order.
 */
static  const int8_t escSeqCharsLen[] ={
    /* ASCII  8859-1  8859-7  JISX-201  JISX-208  JISX-212  GB2312  KSC5601  HWKANA_7BIT */
       3,     3,      3,      3,        3,        4,        3,      4,       3
};
1470
/*
* The iteration over various code pages works this way:
* i)   Get the currentState from myConverterData->currentState
* ii)  Check if the character is mapped to a valid character in the currentState
*      Yes ->  a) set the initIterState to currentState
*              b) remain in this state until an invalid character is found
*      No  ->  a) go to the next code page and find the character
* iii) Before changing the state, increment the current state and check if the current state
*      is equal to the initIterState
*      Yes ->  A character that cannot be represented in any of the supported encodings;
*              break and return a U_INVALID_CHAR_FOUND error
*      No  ->  Continue and find the character in the next code page
*
*
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
*/
1487
/*
 * Map a JIS X 0201 (Roman) byte 00..7F to Unicode.
 * Only two positions differ from the identity mapping:
 * 5C -> U+00A5 and 7E -> U+203E. Every other value is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1501
/*
 * Map a Unicode code point to a JIS X 0201 (Roman) byte 00..7F.
 * U+00A5 -> 5C and U+203E -> 7E; U+0000..U+007F map to themselves except
 * U+005C and U+007E, which have no mapping.
 * Returns U+FFFE for anything that cannot be mapped.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        /* these two byte positions are occupied by U+00A5 and U+203E */
        return 0xfffe;
    default:
        return (value <= 0x7f) ? value : 0xfffe;
    }
}
1516
/*
 * Convert a valid Shift-JIS byte pair (lead byte in bits 15..8, trail byte in
 * bits 7..0) to the corresponding JIS X 0208 pair of 21..7E bytes.
 * Returns 0 when the pair lies beyond the JIS X 0208 range (above 0xEFFC).
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    const uint8_t trail = (uint8_t)value;

    /* fold the two Shift-JIS lead byte ranges together; each lead byte covers two rows */
    uint32_t row = value & 0xff00;
    row -= (row <= 0x9f00) ? 0x7000 : 0xb000;  /* else: 0xe000 <= lead <= 0xef00 */
    row <<= 1;

    if(trail > 0x9e) {
        /* trail 9F..FC: second (even) row of the pair */
        return row | (uint32_t)(trail - 0x7e);
    }

    /* trail <= 9E: first (odd) row of the pair; skip the gap at 7F */
    row -= 0x100;
    return row | (uint32_t)(trail - ((trail <= 0x7e) ? 0x1f : 0x20));
}
1552
/*
 * Convert a JIS X 0208 byte pair (c1, c2), each expected in 21..7E, to the
 * Shift-JIS pair written to bytes[0] (lead) and bytes[1] (trail).
 * An out-of-range input byte is turned into a result byte that is not valid
 * for Shift-JIS (0, or a value that is already invalid) so that the
 * Shift-JIS converter rejects it; not every invalid value is tested
 * explicitly.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead = c1;
    uint8_t trail;

    if((c1 & 1) == 0) {
        /* even row: cells 21..7E become trail bytes 9F..FC */
        if((uint8_t)(c2 - 0x21) <= ((0x7e) - 0x21)) {
            trail = (uint8_t)(c2 + 0x7e);
        } else {
            trail = 0;  /* invalid */
        }
    } else {
        /* odd row: round the row up so that the halving below pairs it with the next row */
        ++lead;
        if(c2 > 0x7e) {
            trail = 0;  /* invalid */
        } else if(c2 > 0x5f) {
            trail = (uint8_t)(c2 + 0x20);
        } else {
            trail = (uint8_t)(c2 + 0x1f);
        }
    }

    /* two JIS rows share one Shift-JIS lead byte */
    lead >>= 1;
    if(lead > 0x3f) {
        lead = 0;  /* invalid */
    } else if(lead > 0x2f) {
        lead = (uint8_t)(lead + 0xb0);
    } else {
        lead = (uint8_t)(lead + 0x70);
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
1589
1590 /*
1591  * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1592  * Katakana.
1593  * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1594  * because Shift-JIS roundtrips half-width Katakana to single bytes.
1595  * These were the only fallbacks in ICU's jisx-208.ucm file.
1596  */
1597 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1598     0x2123,  /* U+FF61 */
1599     0x2156,
1600     0x2157,
1601     0x2122,
1602     0x2126,
1603     0x2572,
1604     0x2521,
1605     0x2523,
1606     0x2525,
1607     0x2527,
1608     0x2529,
1609     0x2563,
1610     0x2565,
1611     0x2567,
1612     0x2543,
1613     0x213C,  /* U+FF70 */
1614     0x2522,
1615     0x2524,
1616     0x2526,
1617     0x2528,
1618     0x252A,
1619     0x252B,
1620     0x252D,
1621     0x252F,
1622     0x2531,
1623     0x2533,
1624     0x2535,
1625     0x2537,
1626     0x2539,
1627     0x253B,
1628     0x253D,
1629     0x253F,  /* U+FF80 */
1630     0x2541,
1631     0x2544,
1632     0x2546,
1633     0x2548,
1634     0x254A,
1635     0x254B,
1636     0x254C,
1637     0x254D,
1638     0x254E,
1639     0x254F,
1640     0x2552,
1641     0x2555,
1642     0x2558,
1643     0x255B,
1644     0x255E,
1645     0x255F,  /* U+FF90 */
1646     0x2560,
1647     0x2561,
1648     0x2562,
1649     0x2564,
1650     0x2566,
1651     0x2568,
1652     0x2569,
1653     0x256A,
1654     0x256B,
1655     0x256C,
1656     0x256D,
1657     0x256F,
1658     0x2573,
1659     0x212B,
1660     0x212C   /* U+FF9F */
1661 };
1662
/*
 * fromUnicode implementation for the ISO-2022-JP converter family, with
 * optional offsets output.
 *
 * Consumes UTF-16 code units from args->source up to args->sourceLimit and
 * writes bytes to args->target up to args->targetLimit. When args->offsets is
 * non-NULL, source indexes are recorded for the output bytes. On return,
 * args->source and args->target have been advanced.
 *
 * For each code point the function
 *   1. pairs up surrogates; a lead surrogate at the very end of the input is
 *      parked in cnv->fromUChar32 and picked up again on the next call
 *      (see the getTrail label);
 *   2. rejects SO/SI/ESC (IS_2022_CONTROL);
 *   3. (re)builds choices[], the ordered list of charsets to try, if it was
 *      invalidated;
 *   4. tries the charsets in order until a roundtrip mapping is found,
 *      remembering at most the first fallback mapping;
 *   5. stages SI, a designation escape sequence and SO/SS2 as needed, followed
 *      by the one or two character bytes, in buffer[] and writes them out.
 *
 * Errors reported through *err by this function itself:
 *   U_ILLEGAL_CHAR_FOUND     unpaired surrogate, or an ISO 2022 control character
 *   U_INVALID_CHAR_FOUND     none of the tried charsets maps the code point
 *   U_BUFFER_OVERFLOW_ERROR  the target is full while input remains
 * For the first two, the offending code point is left in cnv->fromUChar32.
 * (fromUWriteUInt8() receives err as well and may set it; it is defined
 * elsewhere.)
 *
 * When args->flush is set and all input has been consumed successfully, the
 * output state is returned to G0/ASCII (SI and/or ESC ( B are written).
 */
static void U_CALLCONV
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
    UConverter *cnv = args->converter;
    UConverterDataISO2022 *converterData;
    ISO2022State *pFromU2022State;
    uint8_t *target = (uint8_t *) args->target;
    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
    const UChar* source = args->source;
    const UChar* sourceLimit = args->sourceLimit;
    int32_t* offsets = args->offsets;
    UChar32 sourceChar;
    /* staging area for everything that one code point produces: SI, escape sequence, shift, bytes */
    char buffer[8];
    int32_t len, outLen;
    /* charsets to try for the current code point, in order; valid while choiceCount != 0 */
    int8_t choices[10];
    int32_t choiceCount;
    uint32_t targetValue = 0;
    UBool useFallback;

    int32_t i;
    /* cs: charset chosen for the current code point; g: the G-set (0, 1 or 2) it is output through */
    int8_t cs, g;

    /* set up the state */
    converterData     = (UConverterDataISO2022*)cnv->extraInfo;
    pFromU2022State   = &converterData->fromU2022State;

    /* 0 means "choices[] must be rebuilt before use" */
    choiceCount = 0;

    /* check if the last codepoint of previous buffer was a lead surrogate*/
    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
        /* jumps into the loop body below, straight to the trail surrogate lookup */
        goto getTrail;
    }

    while(source < sourceLimit) {
        if(target < targetLimit) {

            sourceChar  = *(source++);
            /*check if the char is a First surrogate*/
            if(U16_IS_SURROGATE(sourceChar)) {
                if(U16_IS_SURROGATE_LEAD(sourceChar)) {
getTrail:
                    /*look ahead to find the trail surrogate*/
                    if(source < sourceLimit) {
                        /* test the following code unit */
                        UChar trail=(UChar) *source;
                        if(U16_IS_TRAIL(trail)) {
                            source++;
                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
                            cnv->fromUChar32=0x00;
                            /* convert this supplementary code point */
                            /* exit this condition tree */
                        } else {
                            /* this is an unmatched lead code unit (1st surrogate) */
                            /* callback(illegal) */
                            *err=U_ILLEGAL_CHAR_FOUND;
                            cnv->fromUChar32=sourceChar;
                            break;
                        }
                    } else {
                        /* no more input */
                        /* keep the lead surrogate for the next call; *err is left untouched here */
                        cnv->fromUChar32=sourceChar;
                        break;
                    }
                } else {
                    /* this is an unmatched trail code unit (2nd surrogate) */
                    /* callback(illegal) */
                    *err=U_ILLEGAL_CHAR_FOUND;
                    cnv->fromUChar32=sourceChar;
                    break;
                }
            }

            /* do not convert SO/SI/ESC */
            if(IS_2022_CONTROL(sourceChar)) {
                /* callback(illegal) */
                *err=U_ILLEGAL_CHAR_FOUND;
                cnv->fromUChar32=sourceChar;
                break;
            }

            /* do the conversion */

            /*
             * Rebuild the list of charsets to try. It is reused for the following
             * code points until a new designation or a CR/LF invalidates it.
             */
            if(choiceCount == 0) {
                uint16_t csm;

                /*
                 * The csm variable keeps track of which charsets are allowed
                 * and not used yet while building the choices[].
                 */
                csm = jpCharsetMasks[converterData->version];
                choiceCount = 0;

                /* JIS7/8: try single-byte half-width Katakana before JISX208 */
                if(converterData->version == 3 || converterData->version == 4) {
                    choices[choiceCount++] = (int8_t)HWKANA_7BIT;
                }
                /* Do not try single-byte half-width Katakana for other versions. */
                csm &= ~CSM(HWKANA_7BIT);

                /* try the current G0 charset */
                choices[choiceCount++] = cs = pFromU2022State->cs[0];
                csm &= ~CSM(cs);

                /* try the current G2 charset */
                if((cs = pFromU2022State->cs[2]) != 0) {
                    choices[choiceCount++] = cs;
                    csm &= ~CSM(cs);
                }

                /* try all the other possible charsets */
                for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) {
                    cs = (int8_t)jpCharsetPref[i];
                    if(CSM(cs) & csm) {
                        choices[choiceCount++] = cs;
                        csm &= ~CSM(cs);
                    }
                }
            }

            cs = g = 0;
            /*
             * len==0: no mapping found yet
             * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
             * len>0: found a roundtrip result, done
             */
            len = 0;
            /*
             * We will turn off useFallback after finding a fallback,
             * but we still get fallbacks from PUA code points as usual.
             * Therefore, we will also need to check that we don't overwrite
             * an early fallback with a later one.
             */
            useFallback = cnv->useFallback;

            /* the loop condition stops at the first roundtrip (len > 0) */
            for(i = 0; i < choiceCount && len <= 0; ++i) {
                uint32_t value;
                /* result length for this charset; only copied to len when the result is accepted */
                int32_t len2;
                int8_t cs0 = choices[i];
                switch(cs0) {
                case ASCII:
                    if(sourceChar <= 0x7f) {
                        targetValue = (uint32_t)sourceChar;
                        len = 1;
                        cs = cs0;
                        g = 0;
                    }
                    break;
                case ISO8859_1:
                    if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
                        /* output through G2 as a 7-bit byte */
                        targetValue = (uint32_t)sourceChar - 0x80;
                        len = 1;
                        cs = cs0;
                        g = 2;
                    }
                    break;
                case HWKANA_7BIT:
                    /* unsigned compare: a single range check for HWKANA_START..HWKANA_END */
                    if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
                        if(converterData->version==3) {
                            /* JIS7: use G1 (SO) */
                            /* Shift U+FF61..U+FF9F to bytes 21..5F. */
                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
                            len = 1;
                            pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
                            g = 1;
                        } else if(converterData->version==4) {
                            /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
                            /* Shift U+FF61..U+FF9F to bytes A1..DF. */
                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
                            len = 1;

                            cs = pFromU2022State->cs[0];
                            if(IS_JP_DBCS(cs)) {
                                /* switch from a DBCS charset to JISX201 */
                                cs = (int8_t)JISX201;
                            }
                            /* else stay in the current G0 charset */
                            g = 0;
                        }
                        /* else do not use HWKANA_7BIT with other versions */
                    }
                    break;
                case JISX201:
                    /* G0 SBCS */
                    value = jisx201FromU(sourceChar);
                    /* jisx201FromU() returns 0xfffe for unmappable input, which fails this test */
                    if(value <= 0x7f) {
                        targetValue = value;
                        len = 1;
                        cs = cs0;
                        g = 0;
                        useFallback = FALSE;
                    }
                    break;
                case JISX208:
                    /* G0 DBCS from Shift-JIS table */
                    len2 = MBCS_FROM_UCHAR32_ISO2022(
                                converterData->myConverterArray[cs0],
                                sourceChar, &value,
                                useFallback, MBCS_OUTPUT_2);
                    /* a fallback (-2) is only taken if no earlier fallback was recorded (len == 0) */
                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
                        /* _2022FromSJIS() returns 0 for pairs beyond JIS X 0208; those are skipped */
                        value = _2022FromSJIS(value);
                        if(value != 0) {
                            targetValue = value;
                            len = len2;
                            cs = cs0;
                            g = 0;
                            useFallback = FALSE;
                        }
                    } else if(len == 0 && useFallback &&
                              (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
                        /* hardcoded half-width to full-width Katakana fallback, recorded as a fallback (negative len) */
                        targetValue = hwkana_fb[sourceChar - HWKANA_START];
                        len = -2;
                        cs = cs0;
                        g = 0;
                        useFallback = FALSE;
                    }
                    break;
                case ISO8859_7:
                    /* G0 SBCS forced to 7-bit output */
                    len2 = MBCS_SINGLE_FROM_UCHAR32(
                                converterData->myConverterArray[cs0],
                                sourceChar, &value,
                                useFallback);
                    /* accept a mapping into the GR96 range, but do not replace an earlier fallback with another one */
                    if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
                        targetValue = value - 0x80;
                        len = len2;
                        cs = cs0;
                        g = 2;
                        useFallback = FALSE;
                    }
                    break;
                default:
                    /* G0 DBCS */
                    len2 = MBCS_FROM_UCHAR32_ISO2022(
                                converterData->myConverterArray[cs0],
                                sourceChar, &value,
                                useFallback, MBCS_OUTPUT_2);
                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
                        if(cs0 == KSC5601) {
                            /*
                             * Check for valid bytes for the encoding scheme.
                             * This is necessary because the sub-converter (windows-949)
                             * has a broader encoding scheme than is valid for 2022.
                             */
                            value = _2022FromGR94DBCS(value);
                            if(value == 0) {
                                break;
                            }
                        }
                        targetValue = value;
                        len = len2;
                        cs = cs0;
                        g = 0;
                        useFallback = FALSE;
                    }
                    break;
                }
            }

            if(len != 0) {
                if(len < 0) {
                    len = -len;  /* fallback */
                }
                outLen = 0; /* count output bytes */

                /* write SI if necessary (only for JIS7) */
                if(pFromU2022State->g == 1 && g == 0) {
                    buffer[outLen++] = UCNV_SI;
                    pFromU2022State->g = 0;
                }

                /* write the designation sequence if necessary */
                if(cs != pFromU2022State->cs[g]) {
                    int32_t escLen = escSeqCharsLen[cs];
                    uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
                    outLen += escLen;
                    pFromU2022State->cs[g] = cs;

                    /* invalidate the choices[] */
                    choiceCount = 0;
                }

                /* write the shift sequence if necessary */
                if(g != pFromU2022State->g) {
                    switch(g) {
                    /* case 0 handled before writing escapes */
                    case 1:
                        buffer[outLen++] = UCNV_SO;
                        pFromU2022State->g = 1;
                        break;
                    default: /* case 2 */
                        /* ESC N; the state's g is not changed in this branch */
                        buffer[outLen++] = 0x1b;
                        buffer[outLen++] = 0x4e;
                        break;
                    /* no case 3: no SS3 in ISO-2022-JP-x */
                    }
                }

                /* write the output bytes */
                if(len == 1) {
                    buffer[outLen++] = (char)targetValue;
                } else /* len == 2 */ {
                    buffer[outLen++] = (char)(targetValue >> 8);
                    buffer[outLen++] = (char)targetValue;
                }
            } else {
                /*
                 * if we cannot find the character after checking all codepages
                 * then this is an error
                 */
                *err = U_INVALID_CHAR_FOUND;
                cnv->fromUChar32=sourceChar;
                break;
            }

            if(sourceChar == CR || sourceChar == LF) {
                /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
                pFromU2022State->cs[2] = 0;
                choiceCount = 0;
            }

            /* output outLen>0 bytes in buffer[] */
            /* one and two bytes are written inline; everything else goes through fromUWriteUInt8() */
            if(outLen == 1) {
                *target++ = buffer[0];
                if(offsets) {
                    *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
                }
            } else if(outLen == 2 && (target + 2) <= targetLimit) {
                *target++ = buffer[0];
                *target++ = buffer[1];
                if(offsets) {
                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
                    *offsets++ = sourceIndex;
                    *offsets++ = sourceIndex;
                }
            } else {
                /* NOTE(review): presumably also handles output that does not fit the target -- confirm in fromUWriteUInt8() */
                fromUWriteUInt8(
                    cnv,
                    buffer, outLen,
                    &target, (const char *)targetLimit,
                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
                    err);
                if(U_FAILURE(*err)) {
                    break;
                }
            }
        } /* end if(target < targetLimit) */
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }

    }/* end while(source < sourceLimit) */

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but for ISO-2022-JP conversion
     * we need to be in ASCII mode at the very end
     *
     * conditions:
     *   successful
     *   in SO mode or not in ASCII mode
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*err) &&
        (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
        args->flush && source>=sourceLimit && cnv->fromUChar32==0
    ) {
        int32_t sourceIndex;

        outLen = 0;

        /* leave SO mode first ... */
        if(pFromU2022State->g != 0) {
            buffer[outLen++] = UCNV_SI;
            pFromU2022State->g = 0;
        }

        /* ... then designate ASCII to G0 */
        if(pFromU2022State->cs[0] != ASCII) {
            int32_t escLen = escSeqCharsLen[ASCII];
            uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
            outLen += escLen;
            pFromU2022State->cs[0] = (int8_t)ASCII;
        }

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex=(int32_t)(source-args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            /* step back once more if the last code unit is the trail of a surrogate pair */
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            /* nothing was consumed from this buffer */
            sourceIndex=-1;
        }

        fromUWriteUInt8(
            cnv,
            buffer, outLen,
            &target, (const char *)targetLimit,
            &offsets, sourceIndex,
            err);
    }

    /*save the state and return */
    args->source = source;
    args->target = (char*)target;
}
2077
2078 /*************** to unicode *******************/
2079
2080 static void U_CALLCONV
2081 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2082                                                UErrorCode* err){
2083     char tempBuf[2];
2084     const char *mySource = (char *) args->source;
2085     UChar *myTarget = args->target;
2086     const char *mySourceLimit = args->sourceLimit;
2087     uint32_t targetUniChar = 0x0000;
2088     uint32_t mySourceChar = 0x0000;
2089     uint32_t tmpSourceChar = 0x0000;
2090     UConverterDataISO2022* myData;
2091     ISO2022State *pToU2022State;
2092     StateEnum cs;
2093
2094     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2095     pToU2022State = &myData->toU2022State;
2096
2097     if(myData->key != 0) {
2098         /* continue with a partial escape sequence */
2099         goto escape;
2100     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2101         /* continue with a partial double-byte character */
2102         mySourceChar = args->converter->toUBytes[0];
2103         args->converter->toULength = 0;
2104         cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2105         targetUniChar = missingCharMarker;
2106         goto getTrailByte;
2107     }
2108
2109     while(mySource < mySourceLimit){
2110
2111         targetUniChar =missingCharMarker;
2112
2113         if(myTarget < args->targetLimit){
2114
2115             mySourceChar= (unsigned char) *mySource++;
2116
2117             switch(mySourceChar) {
2118             case UCNV_SI:
2119                 if(myData->version==3) {
2120                     pToU2022State->g=0;
2121                     continue;
2122                 } else {
2123                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2124                     myData->isEmptySegment = FALSE;     /* reset this, we have a different error */
2125                     break;
2126                 }
2127
2128             case UCNV_SO:
2129                 if(myData->version==3) {
2130                     /* JIS7: switch to G1 half-width Katakana */
2131                     pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2132                     pToU2022State->g=1;
2133                     continue;
2134                 } else {
2135                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2136                     myData->isEmptySegment = FALSE;     /* reset this, we have a different error */
2137                     break;
2138                 }
2139
2140             case ESC_2022:
2141                 mySource--;
2142 escape:
2143                 {
2144                     const char * mySourceBefore = mySource;
2145                     int8_t toULengthBefore = args->converter->toULength;
2146
2147                     changeState_2022(args->converter,&(mySource),
2148                         mySourceLimit, ISO_2022_JP,err);
2149
2150                     /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
2151                     if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2152                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2153                         args->converter->toUCallbackReason = UCNV_IRREGULAR;
2154                         args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2155                     }
2156                 }
2157
2158                 /* invalid or illegal escape sequence */
2159                 if(U_FAILURE(*err)){
2160                     args->target = myTarget;
2161                     args->source = mySource;
2162                     myData->isEmptySegment = FALSE;     /* Reset to avoid future spurious errors */
2163                     return;
2164                 }
2165                 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2166                 if(myData->key==0) {
2167                     myData->isEmptySegment = TRUE;
2168                 }
2169                 continue;
2170
2171             /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2172
2173             case CR:
2174             case LF:
2175                 /* automatically reset to single-byte mode */
2176                 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2177                     pToU2022State->cs[0] = (int8_t)ASCII;
2178                 }
2179                 pToU2022State->cs[2] = 0;
2180                 pToU2022State->g = 0;
2181                 U_FALLTHROUGH;
2182             default:
2183                 /* convert one or two bytes */
2184                 myData->isEmptySegment = FALSE;
2185                 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2186                 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2187                     !IS_JP_DBCS(cs)
2188                 ) {
2189                     /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2190                     targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2191
2192                     /* return from a single-shift state to the previous one */
2193                     if(pToU2022State->g >= 2) {
2194                         pToU2022State->g=pToU2022State->prevG;
2195                     }
2196                 } else switch(cs) {
2197                 case ASCII:
2198                     if(mySourceChar <= 0x7f) {
2199                         targetUniChar = mySourceChar;
2200                     }
2201                     break;
2202                 case ISO8859_1:
2203                     if(mySourceChar <= 0x7f) {
2204                         targetUniChar = mySourceChar + 0x80;
2205                     }
2206                     /* return from a single-shift state to the previous one */
2207                     pToU2022State->g=pToU2022State->prevG;
2208                     break;
2209                 case ISO8859_7:
2210                     if(mySourceChar <= 0x7f) {
2211                         /* convert mySourceChar+0x80 to use a normal 8-bit table */
2212                         targetUniChar =
2213                             _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2214                                 myData->myConverterArray[cs],
2215                                 mySourceChar + 0x80);
2216                     }
2217                     /* return from a single-shift state to the previous one */
2218                     pToU2022State->g=pToU2022State->prevG;
2219                     break;
2220                 case JISX201:
2221                     if(mySourceChar <= 0x7f) {
2222                         targetUniChar = jisx201ToU(mySourceChar);
2223                     }
2224                     break;
2225                 case HWKANA_7BIT:
2226                     if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2227                         /* 7-bit halfwidth Katakana */
2228                         targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2229                     }
2230                     break;
2231                 default:
2232                     /* G0 DBCS */
2233                     if(mySource < mySourceLimit) {
2234                         int leadIsOk, trailIsOk;
2235                         uint8_t trailByte;
2236 getTrailByte:
2237                         trailByte = (uint8_t)*mySource;
2238                         /*
2239                          * Ticket 5691: consistent illegal sequences:
2240                          * - We include at least the first byte in the illegal sequence.
2241                          * - If any of the non-initial bytes could be the start of a character,
2242                          *   we stop the illegal sequence before the first one of those.
2243                          *
2244                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2245                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2246                          * Otherwise we convert or report the pair of bytes.
2247                          */
2248                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2249                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2250                         if (leadIsOk && trailIsOk) {
2251                             ++mySource;
2252                             tmpSourceChar = (mySourceChar << 8) | trailByte;
2253                             if(cs == JISX208) {
2254                                 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2255                                 mySourceChar = tmpSourceChar;
2256                             } else {
2257                                 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2258                                 mySourceChar = tmpSourceChar;
2259                                 if (cs == KSC5601) {
2260                                     tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
2261                                 }
2262                                 tempBuf[0] = (char)(tmpSourceChar >> 8);
2263                                 tempBuf[1] = (char)(tmpSourceChar);
2264                             }
2265                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2266                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2267                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2268                             ++mySource;
2269                             /* add another bit so that the code below writes 2 bytes in case of error */
2270                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2271                         }
2272                     } else {
2273                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2274                         args->converter->toULength = 1;
2275                         goto endloop;
2276                     }
2277                 }  /* End of inner switch */
2278                 break;
2279             }  /* End of outer switch */
2280             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2281                 if(args->offsets){
2282                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2283                 }
2284                 *(myTarget++)=(UChar)targetUniChar;
2285             }
2286             else if(targetUniChar > missingCharMarker){
2287                 /* disassemble the surrogate pair and write to output*/
2288                 targetUniChar-=0x0010000;
2289                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2290                 if(args->offsets){
2291                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2292                 }
2293                 ++myTarget;
2294                 if(myTarget< args->targetLimit){
2295                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2296                     if(args->offsets){
2297                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2298                     }
2299                     ++myTarget;
2300                 }else{
2301                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2302                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2303                 }
2304
2305             }
2306             else{
2307                 /* Call the callback function*/
2308                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2309                 break;
2310             }
2311         }
2312         else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
2313             *err =U_BUFFER_OVERFLOW_ERROR;
2314             break;
2315         }
2316     }
2317 endloop:
2318     args->target = myTarget;
2319     args->source = mySource;
2320 }
2321
2322
2323 #if !UCONFIG_ONLY_HTML_CONVERSION
/***************************************************************
*   Rules for ISO-2022-KR encoding
*   i) The KSC5601 designator sequence should appear only once in a file,
*      at the beginning of a line before any KSC5601 characters. This usually
*      means that it appears by itself on the first line of the file.
*  ii) There are only 2 shifting sequences: SO to shift into double-byte mode
*      and SI to shift into single-byte mode.
*/
2332 static void U_CALLCONV
2333 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2334
2335     UConverter* saveConv = args->converter;
2336     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2337     args->converter=myConverterData->currentConverter;
2338
2339     myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2340     ucnv_MBCSFromUnicodeWithOffsets(args,err);
2341     saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2342
2343     if(*err == U_BUFFER_OVERFLOW_ERROR) {
2344         if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2345             uprv_memcpy(
2346                 saveConv->charErrorBuffer,
2347                 myConverterData->currentConverter->charErrorBuffer,
2348                 myConverterData->currentConverter->charErrorBufferLength);
2349         }
2350         saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2351         myConverterData->currentConverter->charErrorBufferLength = 0;
2352     }
2353     args->converter=saveConv;
2354 }
2355
2356 static void U_CALLCONV
2357 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2358
2359     const UChar *source = args->source;
2360     const UChar *sourceLimit = args->sourceLimit;
2361     unsigned char *target = (unsigned char *) args->target;
2362     unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2363     int32_t* offsets = args->offsets;
2364     uint32_t targetByteUnit = 0x0000;
2365     UChar32 sourceChar = 0x0000;
2366     UBool isTargetByteDBCS;
2367     UBool oldIsTargetByteDBCS;
2368     UConverterDataISO2022 *converterData;
2369     UConverterSharedData* sharedData;
2370     UBool useFallback;
2371     int32_t length =0;
2372
2373     converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2374     /* if the version is 1 then the user is requesting
2375      * conversion with ibm-25546 pass the arguments to
2376      * MBCS converter and return
2377      */
2378     if(converterData->version==1){
2379         UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2380         return;
2381     }
2382
2383     /* initialize data */
2384     sharedData = converterData->currentConverter->sharedData;
2385     useFallback = args->converter->useFallback;
2386     isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2387     oldIsTargetByteDBCS = isTargetByteDBCS;
2388
2389     isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
2390     if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2391         goto getTrail;
2392     }
2393     while(source < sourceLimit){
2394
2395         targetByteUnit = missingCharMarker;
2396
2397         if(target < (unsigned char*) args->targetLimit){
2398             sourceChar = *source++;
2399
2400             /* do not convert SO/SI/ESC */
2401             if(IS_2022_CONTROL(sourceChar)) {
2402                 /* callback(illegal) */
2403                 *err=U_ILLEGAL_CHAR_FOUND;
2404                 args->converter->fromUChar32=sourceChar;
2405                 break;
2406             }
2407
2408             length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2409             if(length < 0) {
2410                 length = -length;  /* fallback */
2411             }
2412             /* only DBCS or SBCS characters are expected*/
2413             /* DB characters with high bit set to 1 are expected */
2414             if( length > 2 || length==0 ||
2415                 (length == 1 && targetByteUnit > 0x7f) ||
2416                 (length == 2 &&
2417                     ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2418                     (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2419             ) {
2420                 targetByteUnit=missingCharMarker;
2421             }
2422             if (targetByteUnit != missingCharMarker){
2423
2424                 oldIsTargetByteDBCS = isTargetByteDBCS;
2425                 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2426                   /* append the shift sequence */
2427                 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2428
2429                     if (isTargetByteDBCS)
2430                         *target++ = UCNV_SO;
2431                     else
2432                         *target++ = UCNV_SI;
2433                     if(offsets)
2434                         *(offsets++) = (int32_t)(source - args->source-1);
2435                 }
2436                 /* write the targetUniChar  to target */
2437                 if(targetByteUnit <= 0x00FF){
2438                     if( target < targetLimit){
2439                         *(target++) = (unsigned char) targetByteUnit;
2440                         if(offsets){
2441                             *(offsets++) = (int32_t)(source - args->source-1);
2442                         }
2443
2444                     }else{
2445                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2446                         *err = U_BUFFER_OVERFLOW_ERROR;
2447                     }
2448                 }else{
2449                     if(target < targetLimit){
2450                         *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2451                         if(offsets){
2452                             *(offsets++) = (int32_t)(source - args->source-1);
2453                         }
2454                         if(target < targetLimit){
2455                             *(target++) =(unsigned char) (targetByteUnit -0x80);
2456                             if(offsets){
2457                                 *(offsets++) = (int32_t)(source - args->source-1);
2458                             }
2459                         }else{
2460                             args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2461                             *err = U_BUFFER_OVERFLOW_ERROR;
2462                         }
2463                     }else{
2464                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2465                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2466                         *err = U_BUFFER_OVERFLOW_ERROR;
2467                     }
2468                 }
2469
2470             }
2471             else{
2472                 /* oops.. the code point is unassingned
2473                  * set the error and reason
2474                  */
2475
2476                 /*check if the char is a First surrogate*/
2477                 if(U16_IS_SURROGATE(sourceChar)) {
2478                     if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2479 getTrail:
2480                         /*look ahead to find the trail surrogate*/
2481                         if(source <  sourceLimit) {
2482                             /* test the following code unit */
2483                             UChar trail=(UChar) *source;
2484                             if(U16_IS_TRAIL(trail)) {
2485                                 source++;
2486                                 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2487                                 *err = U_INVALID_CHAR_FOUND;
2488                                 /* convert this surrogate code point */
2489                                 /* exit this condition tree */
2490                             } else {
2491                                 /* this is an unmatched lead code unit (1st surrogate) */
2492                                 /* callback(illegal) */
2493                                 *err=U_ILLEGAL_CHAR_FOUND;
2494                             }
2495                         } else {
2496                             /* no more input */
2497                             *err = U_ZERO_ERROR;
2498                         }
2499                     } else {
2500                         /* this is an unmatched trail code unit (2nd surrogate) */
2501                         /* callback(illegal) */
2502                         *err=U_ILLEGAL_CHAR_FOUND;
2503                     }
2504                 } else {
2505                     /* callback(unassigned) for a BMP code point */
2506                     *err = U_INVALID_CHAR_FOUND;
2507                 }
2508
2509                 args->converter->fromUChar32=sourceChar;
2510                 break;
2511             }
2512         } /* end if(myTargetIndex<myTargetLength) */
2513         else{
2514             *err =U_BUFFER_OVERFLOW_ERROR;
2515             break;
2516         }
2517
2518     }/* end while(mySourceIndex<mySourceLength) */
2519
2520     /*
2521      * the end of the input stream and detection of truncated input
2522      * are handled by the framework, but for ISO-2022-KR conversion
2523      * we need to be in ASCII mode at the very end
2524      *
2525      * conditions:
2526      *   successful
2527      *   not in ASCII mode
2528      *   end of input and no truncated input
2529      */
2530     if( U_SUCCESS(*err) &&
2531         isTargetByteDBCS &&
2532         args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2533     ) {
2534         int32_t sourceIndex;
2535
2536         /* we are switching to ASCII */
2537         isTargetByteDBCS=FALSE;
2538
2539         /* get the source index of the last input character */
2540         /*
2541          * TODO this would be simpler and more reliable if we used a pair
2542          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2543          * so that we could simply use the prevSourceIndex here;
2544          * this code gives an incorrect result for the rare case of an unmatched
2545          * trail surrogate that is alone in the last buffer of the text stream
2546          */
2547         sourceIndex=(int32_t)(source-args->source);
2548         if(sourceIndex>0) {
2549             --sourceIndex;
2550             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2551                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2552             ) {
2553                 --sourceIndex;
2554             }
2555         } else {
2556             sourceIndex=-1;
2557         }
2558
2559         fromUWriteUInt8(
2560             args->converter,
2561             SHIFT_IN_STR, 1,
2562             &target, (const char *)targetLimit,
2563             &offsets, sourceIndex,
2564             err);
2565     }
2566
2567     /*save the state and return */
2568     args->source = source;
2569     args->target = (char*)target;
2570     args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2571 }
2572
2573 /************************ To Unicode ***************************************/
2574
2575 static void U_CALLCONV
2576 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2577                                                             UErrorCode* err){
2578     char const* sourceStart;
2579     UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2580
2581     UConverterToUnicodeArgs subArgs;
2582     int32_t minArgsSize;
2583
2584     /* set up the subconverter arguments */
2585     if(args->size<sizeof(UConverterToUnicodeArgs)) {
2586         minArgsSize = args->size;
2587     } else {
2588         minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2589     }
2590
2591     uprv_memcpy(&subArgs, args, minArgsSize);
2592     subArgs.size = (uint16_t)minArgsSize;
2593     subArgs.converter = myData->currentConverter;
2594
2595     /* remember the original start of the input for offsets */
2596     sourceStart = args->source;
2597
2598     if(myData->key != 0) {
2599         /* continue with a partial escape sequence */
2600         goto escape;
2601     }
2602
2603     while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2604         /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2605         subArgs.source = args->source;
2606         subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2607         if(subArgs.source != subArgs.sourceLimit) {
2608             /*
2609              * get the current partial byte sequence
2610              *
2611              * it needs to be moved between the public and the subconverter
2612              * so that the conversion framework, which only sees the public
2613              * converter, can handle truncated and illegal input etc.
2614              */
2615             if(args->converter->toULength > 0) {
2616                 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2617             }
2618             subArgs.converter->toULength = args->converter->toULength;
2619
2620             /*
2621              * Convert up to the end of the input, or to before the next escape character.
2622              * Does not handle conversion extensions because the preToU[] state etc.
2623              * is not copied.
2624              */
2625             ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2626
2627             if(args->offsets != NULL && sourceStart != args->source) {
2628                 /* update offsets to base them on the actual start of the input */
2629                 int32_t *offsets = args->offsets;
2630                 UChar *target = args->target;
2631                 int32_t delta = (int32_t)(args->source - sourceStart);
2632                 while(target < subArgs.target) {
2633                     if(*offsets >= 0) {
2634                         *offsets += delta;
2635                     }
2636                     ++offsets;
2637                     ++target;
2638                 }
2639             }
2640             args->source = subArgs.source;
2641             args->target = subArgs.target;
2642             args->offsets = subArgs.offsets;
2643
2644             /* copy input/error/overflow buffers */
2645             if(subArgs.converter->toULength > 0) {
2646                 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2647             }
2648             args->converter->toULength = subArgs.converter->toULength;
2649
2650             if(*err == U_BUFFER_OVERFLOW_ERROR) {
2651                 if(subArgs.converter->UCharErrorBufferLength > 0) {
2652                     uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2653                                 subArgs.converter->UCharErrorBufferLength);
2654                 }
2655                 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2656                 subArgs.converter->UCharErrorBufferLength = 0;
2657             }
2658         }
2659
2660         if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2661             return;
2662         }
2663
2664 escape:
2665         changeState_2022(args->converter,
2666                &(args->source),
2667                args->sourceLimit,
2668                ISO_2022_KR,
2669                err);
2670     }
2671 }
2672
2673 static void U_CALLCONV
2674 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2675                                                             UErrorCode* err){
2676     char tempBuf[2];
2677     const char *mySource = ( char *) args->source;
2678     UChar *myTarget = args->target;
2679     const char *mySourceLimit = args->sourceLimit;
2680     UChar32 targetUniChar = 0x0000;
2681     UChar mySourceChar = 0x0000;
2682     UConverterDataISO2022* myData;
2683     UConverterSharedData* sharedData ;
2684     UBool useFallback;
2685
2686     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2687     if(myData->version==1){
2688         UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2689         return;
2690     }
2691
2692     /* initialize state */
2693     sharedData = myData->currentConverter->sharedData;
2694     useFallback = args->converter->useFallback;
2695
2696     if(myData->key != 0) {
2697         /* continue with a partial escape sequence */
2698         goto escape;
2699     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2700         /* continue with a partial double-byte character */
2701         mySourceChar = args->converter->toUBytes[0];
2702         args->converter->toULength = 0;
2703         goto getTrailByte;
2704     }
2705
2706     while(mySource< mySourceLimit){
2707
2708         if(myTarget < args->targetLimit){
2709
2710             mySourceChar= (unsigned char) *mySource++;
2711
2712             if(mySourceChar==UCNV_SI){
2713                 myData->toU2022State.g = 0;
2714                 if (myData->isEmptySegment) {
2715                     myData->isEmptySegment = FALSE;     /* we are handling it, reset to avoid future spurious errors */
2716                     *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2717                     args->converter->toUCallbackReason = UCNV_IRREGULAR;
2718                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2719                     args->converter->toULength = 1;
2720                     args->target = myTarget;
2721                     args->source = mySource;
2722                     return;
2723                 }
2724                 /*consume the source */
2725                 continue;
2726             }else if(mySourceChar==UCNV_SO){
2727                 myData->toU2022State.g = 1;
2728                 myData->isEmptySegment = TRUE;  /* Begin a new segment, empty so far */
2729                 /*consume the source */
2730                 continue;
2731             }else if(mySourceChar==ESC_2022){
2732                 mySource--;
2733 escape:
2734                 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
2735                 changeState_2022(args->converter,&(mySource),
2736                                 mySourceLimit, ISO_2022_KR, err);
2737                 if(U_FAILURE(*err)){
2738                     args->target = myTarget;
2739                     args->source = mySource;
2740                     return;
2741                 }
2742                 continue;
2743             }
2744
2745             myData->isEmptySegment = FALSE;     /* Any invalid char errors will be detected separately, so just reset this */
2746             if(myData->toU2022State.g == 1) {
2747                 if(mySource < mySourceLimit) {
2748                     int leadIsOk, trailIsOk;
2749                     uint8_t trailByte;
2750 getTrailByte:
2751                     targetUniChar = missingCharMarker;
2752                     trailByte = (uint8_t)*mySource;
2753                     /*
2754                      * Ticket 5691: consistent illegal sequences:
2755                      * - We include at least the first byte in the illegal sequence.
2756                      * - If any of the non-initial bytes could be the start of a character,
2757                      *   we stop the illegal sequence before the first one of those.
2758                      *
2759                      * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2760                      * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2761                      * Otherwise we convert or report the pair of bytes.
2762                      */
2763                     leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2764                     trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2765                     if (leadIsOk && trailIsOk) {
2766                         ++mySource;
2767                         tempBuf[0] = (char)(mySourceChar + 0x80);
2768                         tempBuf[1] = (char)(trailByte + 0x80);
2769                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2770                         mySourceChar = (mySourceChar << 8) | trailByte;
2771                     } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2772                         /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2773                         ++mySource;
2774                         /* add another bit so that the code below writes 2 bytes in case of error */
2775                         mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2776                     }
2777                 } else {
2778                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2779                     args->converter->toULength = 1;
2780                     break;
2781                 }
2782             }
2783             else if(mySourceChar <= 0x7f) {
2784                 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2785             } else {
2786                 targetUniChar = 0xffff;
2787             }
2788             if(targetUniChar < 0xfffe){
2789                 if(args->offsets) {
2790                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2791                 }
2792                 *(myTarget++)=(UChar)targetUniChar;
2793             }
2794             else {
2795                 /* Call the callback function*/
2796                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2797                 break;
2798             }
2799         }
2800         else{
2801             *err =U_BUFFER_OVERFLOW_ERROR;
2802             break;
2803         }
2804     }
2805     args->target = myTarget;
2806     args->source = mySource;
2807 }
2808
2809 /*************************** END ISO2022-KR *********************************/
2810
2811 /*************************** ISO-2022-CN *********************************
2812 *
2813 * Rules for ISO-2022-CN Encoding:
2814 * i)   The designator sequence must appear once on a line before any instance
2815 *      of character set it designates.
2816 * ii)  If two lines contain characters from the same character set, both lines
2817 *      must include the designator sequence.
* iii) Once the designator sequence is known, a shifting sequence still has to be
*      found to invoke the designated character set.
2820 * iv)  All lines start in ASCII and end in ASCII.
2821 * v)   Four shifting sequences are employed for this purpose:
2822 *
*      Sequence    ASCII Eq    Charsets
2824 *      ----------  -------    ---------
2825 *      SI           <SI>        US-ASCII
2826 *      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2827 *      SS2          <ESC>N      CNS-11643-1992 Plane 2
2828 *      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
2829 *
2830 * vi)
2831 *      SOdesignator  : ESC "$" ")" finalchar_for_SO
2832 *      SS2designator : ESC "$" "*" finalchar_for_SS2
2833 *      SS3designator : ESC "$" "+" finalchar_for_SS3
2834 *
2835 *      ESC $ ) A       Indicates the bytes following SO are Chinese
2836 *       characters as defined in GB 2312-80, until
2837 *       another SOdesignation appears
2838 *
2839 *
2840 *      ESC $ ) E       Indicates the bytes following SO are as defined
2841 *       in ISO-IR-165 (for details, see section 2.1),
2842 *       until another SOdesignation appears
2843 *
2844 *      ESC $ ) G       Indicates the bytes following SO are as defined
2845 *       in CNS 11643-plane-1, until another
2846 *       SOdesignation appears
2847 *
2848 *      ESC $ * H       Indicates the two bytes immediately following
2849 *       SS2 is a Chinese character as defined in CNS
2850 *       11643-plane-2, until another SS2designation
2851 *       appears
2852 *       (Meaning <ESC>N must precede every 2-byte
2853 *        sequence.)
2854 *
2855 *      ESC $ + I       Indicates the immediate two bytes following SS3
2856 *       is a Chinese character as defined in CNS
2857 *       11643-plane-3, until another SS3designation
2858 *       appears
2859 *       (Meaning <ESC>O must precede every 2-byte
2860 *        sequence.)
2861 *
2862 *      ESC $ + J       Indicates the immediate two bytes following SS3
2863 *       is a Chinese character as defined in CNS
2864 *       11643-plane-4, until another SS3designation
2865 *       appears
2866 *       (In English: <ESC>O must precede every 2-byte
2867 *        sequence.)
2868 *
2869 *      ESC $ + K       Indicates the immediate two bytes following SS3
2870 *       is a Chinese character as defined in CNS
2871 *       11643-plane-5, until another SS3designation
2872 *       appears
2873 *
2874 *      ESC $ + L       Indicates the immediate two bytes following SS3
2875 *       is a Chinese character as defined in CNS
2876 *       11643-plane-6, until another SS3designation
2877 *       appears
2878 *
2879 *      ESC $ + M       Indicates the immediate two bytes following SS3
2880 *       is a Chinese character as defined in CNS
2881 *       11643-plane-7, until another SS3designation
2882 *       appears
2883 *
2884 *       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2885 *       has its own designation information before any Chinese characters
2886 *       appear
2887 *
2888 */
2889
2890 /* The following are defined this way to make the strings truly readonly */
     /*
      * Each string is a 4-byte designator escape sequence:
      * ESC (0x1B), '$' (0x24), an intermediate byte ')' (0x29), '*' (0x2A) or '+' (0x2B),
      * and a final byte selecting the character set.
      */
2891 static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";              /* ESC $ ) A */
2892 static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";              /* ESC $ ) E */
2893 static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";  /* ESC $ ) G */
2894 static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";  /* ESC $ * H */
2895 static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";  /* ESC $ + I */
2896 static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";  /* ESC $ + J */
2897 static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";  /* ESC $ + K */
2898 static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";  /* ESC $ + L */
2899 static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";  /* ESC $ + M */
2900
2901 /********************** ISO2022-CN Data **************************/
     /*
      * Table of the sequences above, indexed by character set.
      * The fromUnicode code in this file indexes it with cs directly for cs < CNS_11643
      * and with CNS_11643 + (cs - CNS_11643_1) for the CNS planes.
      */
2902 static const char* const escSeqCharsCN[10] ={
2903         SHIFT_IN_STR,                   /* 0 ASCII */
2904         GB_2312_80_STR,                 /* 1 GB2312_1 */
2905         ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
2906         CNS_11643_1992_Plane_1_STR,     /* 3 CNS 11643 plane 1 */
2907         CNS_11643_1992_Plane_2_STR,     /* 4 CNS 11643 plane 2 */
2908         CNS_11643_1992_Plane_3_STR,     /* 5 CNS 11643 plane 3 */
2909         CNS_11643_1992_Plane_4_STR,     /* 6 CNS 11643 plane 4 */
2910         CNS_11643_1992_Plane_5_STR,     /* 7 CNS 11643 plane 5 */
2911         CNS_11643_1992_Plane_6_STR,     /* 8 CNS 11643 plane 6 */
2912         CNS_11643_1992_Plane_7_STR      /* 9 CNS 11643 plane 7 */
2913 };
2914
2915 static void U_CALLCONV
2916 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2917     UConverter *cnv = args->converter;
2918     UConverterDataISO2022 *converterData;
2919     ISO2022State *pFromU2022State;
2920     uint8_t *target = (uint8_t *) args->target;
2921     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2922     const UChar* source = args->source;
2923     const UChar* sourceLimit = args->sourceLimit;
2924     int32_t* offsets = args->offsets;
2925     UChar32 sourceChar;
2926     char buffer[8];
2927     int32_t len;
2928     int8_t choices[3];
2929     int32_t choiceCount;
2930     uint32_t targetValue = 0;
2931     UBool useFallback;
2932
2933     /* set up the state */
2934     converterData     = (UConverterDataISO2022*)cnv->extraInfo;
2935     pFromU2022State   = &converterData->fromU2022State;
2936
2937     choiceCount = 0;
2938
2939     /* check if the last codepoint of previous buffer was a lead surrogate*/
2940     if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2941         goto getTrail;
2942     }
2943
2944     while( source < sourceLimit){
2945         if(target < targetLimit){
2946
2947             sourceChar  = *(source++);
2948             /*check if the char is a First surrogate*/
2949              if(U16_IS_SURROGATE(sourceChar)) {
2950                 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2951 getTrail:
2952                     /*look ahead to find the trail surrogate*/
2953                     if(source < sourceLimit) {
2954                         /* test the following code unit */
2955                         UChar trail=(UChar) *source;
2956                         if(U16_IS_TRAIL(trail)) {
2957                             source++;
2958                             sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2959                             cnv->fromUChar32=0x00;
2960                             /* convert this supplementary code point */
2961                             /* exit this condition tree */
2962                         } else {
2963                             /* this is an unmatched lead code unit (1st surrogate) */
2964                             /* callback(illegal) */
2965                             *err=U_ILLEGAL_CHAR_FOUND;
2966                             cnv->fromUChar32=sourceChar;
2967                             break;
2968                         }
2969                     } else {
2970                         /* no more input */
2971                         cnv->fromUChar32=sourceChar;
2972                         break;
2973                     }
2974                 } else {
2975                     /* this is an unmatched trail code unit (2nd surrogate) */
2976                     /* callback(illegal) */
2977                     *err=U_ILLEGAL_CHAR_FOUND;
2978                     cnv->fromUChar32=sourceChar;
2979                     break;
2980                 }
2981             }
2982
2983             /* do the conversion */
2984             if(sourceChar <= 0x007f ){
2985                 /* do not convert SO/SI/ESC */
2986                 if(IS_2022_CONTROL(sourceChar)) {
2987                     /* callback(illegal) */
2988                     *err=U_ILLEGAL_CHAR_FOUND;
2989                     cnv->fromUChar32=sourceChar;
2990                     break;
2991                 }
2992
2993                 /* US-ASCII */
2994                 if(pFromU2022State->g == 0) {
2995                     buffer[0] = (char)sourceChar;
2996                     len = 1;
2997                 } else {
2998                     buffer[0] = UCNV_SI;
2999                     buffer[1] = (char)sourceChar;
3000                     len = 2;
3001                     pFromU2022State->g = 0;
3002                     choiceCount = 0;
3003                 }
3004                 if(sourceChar == CR || sourceChar == LF) {
3005                     /* reset the state at the end of a line */
3006                     uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
3007                     choiceCount = 0;
3008                 }
3009             }
3010             else{
3011                 /* convert U+0080..U+10ffff */
3012                 int32_t i;
3013                 int8_t cs, g;
3014
3015                 if(choiceCount == 0) {
3016                     /* try the current SO/G1 converter first */
3017                     choices[0] = pFromU2022State->cs[1];
3018
3019                     /* default to GB2312_1 if none is designated yet */
3020                     if(choices[0] == 0) {
3021                         choices[0] = GB2312_1;
3022                     }
3023
3024                     if(converterData->version == 0) {
3025                         /* ISO-2022-CN */
3026
3027                         /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
3028                         if(choices[0] == GB2312_1) {
3029                             choices[1] = (int8_t)CNS_11643_1;
3030                         } else {
3031                             choices[1] = (int8_t)GB2312_1;
3032                         }
3033
3034                         choiceCount = 2;
3035                     } else if (converterData->version == 1) {
3036                         /* ISO-2022-CN-EXT */
3037
3038                         /* try one of the other converters */
3039                         switch(choices[0]) {
3040                         case GB2312_1:
3041                             choices[1] = (int8_t)CNS_11643_1;
3042                             choices[2] = (int8_t)ISO_IR_165;
3043                             break;
3044                         case ISO_IR_165:
3045                             choices[1] = (int8_t)GB2312_1;
3046                             choices[2] = (int8_t)CNS_11643_1;
3047                             break;
3048                         default: /* CNS_11643_x */
3049                             choices[1] = (int8_t)GB2312_1;
3050                             choices[2] = (int8_t)ISO_IR_165;
3051                             break;
3052                         }
3053
3054                         choiceCount = 3;
3055                     } else {
3056                         choices[0] = (int8_t)CNS_11643_1;
3057                         choices[1] = (int8_t)GB2312_1;
3058                     }
3059                 }
3060
3061                 cs = g = 0;
3062                 /*
3063                  * len==0: no mapping found yet
3064                  * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3065                  * len>0: found a roundtrip result, done
3066                  */
3067                 len = 0;
3068                 /*
3069                  * We will turn off useFallback after finding a fallback,
3070                  * but we still get fallbacks from PUA code points as usual.
3071                  * Therefore, we will also need to check that we don't overwrite
3072                  * an early fallback with a later one.
3073                  */
3074                 useFallback = cnv->useFallback;
3075
3076                 for(i = 0; i < choiceCount && len <= 0; ++i) {
3077                     int8_t cs0 = choices[i];
3078                     if(cs0 > 0) {
3079                         uint32_t value;
3080                         int32_t len2;
3081                         if(cs0 >= CNS_11643_0) {
3082                             len2 = MBCS_FROM_UCHAR32_ISO2022(
3083                                         converterData->myConverterArray[CNS_11643],
3084                                         sourceChar,
3085                                         &value,
3086                                         useFallback,
3087                                         MBCS_OUTPUT_3);
3088                             if(len2 == 3 || (len2 == -3 && len == 0)) {
3089                                 targetValue = value;
3090                                 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3091                                 if(len2 >= 0) {
3092                                     len = 2;
3093                                 } else {
3094                                     len = -2;
3095                                     useFallback = FALSE;
3096                                 }
3097                                 if(cs == CNS_11643_1) {
3098                                     g = 1;
3099                                 } else if(cs == CNS_11643_2) {
3100                                     g = 2;
3101                                 } else /* plane 3..7 */ if(converterData->version == 1) {
3102                                     g = 3;
3103                                 } else {
3104                                     /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3105                                     len = 0;
3106                                 }
3107                             }
3108                         } else {
3109                             /* GB2312_1 or ISO-IR-165 */
3110                             U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
3111                             len2 = MBCS_FROM_UCHAR32_ISO2022(
3112                                         converterData->myConverterArray[cs0],
3113                                         sourceChar,
3114                                         &value,
3115                                         useFallback,
3116                                         MBCS_OUTPUT_2);
3117                             if(len2 == 2 || (len2 == -2 && len == 0)) {
3118                                 targetValue = value;
3119                                 len = len2;
3120                                 cs = cs0;
3121                                 g = 1;
3122                                 useFallback = FALSE;
3123                             }
3124                         }
3125                     }
3126                 }
3127
3128                 if(len != 0) {
3129                     len = 0; /* count output bytes; it must have been abs(len) == 2 */
3130
3131                     /* write the designation sequence if necessary */
3132                     if(cs != pFromU2022State->cs[g]) {
3133                         if(cs < CNS_11643) {
3134                             uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3135                         } else {
3136                             U_ASSERT(cs >= CNS_11643_1);
3137                             uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3138                         }
3139                         len = 4;
3140                         pFromU2022State->cs[g] = cs;
3141                         if(g == 1) {
3142                             /* changing the SO/G1 charset invalidates the choices[] */
3143                             choiceCount = 0;
3144                         }
3145                     }
3146
3147                     /* write the shift sequence if necessary */
3148                     if(g != pFromU2022State->g) {
3149                         switch(g) {
3150                         case 1:
3151                             buffer[len++] = UCNV_SO;
3152
3153                             /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3154                             pFromU2022State->g = 1;
3155                             break;
3156                         case 2:
3157                             buffer[len++] = 0x1b;
3158                             buffer[len++] = 0x4e;
3159                             break;
3160                         default: /* case 3 */
3161                             buffer[len++] = 0x1b;
3162                             buffer[len++] = 0x4f;
3163                             break;
3164                         }
3165                     }
3166
3167                     /* write the two output bytes */
3168                     buffer[len++] = (char)(targetValue >> 8);
3169                     buffer[len++] = (char)targetValue;
3170                 } else {
3171                     /* if we cannot find the character after checking all codepages
3172                      * then this is an error
3173                      */
3174                     *err = U_INVALID_CHAR_FOUND;
3175                     cnv->fromUChar32=sourceChar;
3176                     break;
3177                 }
3178             }
3179
3180             /* output len>0 bytes in buffer[] */
3181             if(len == 1) {
3182                 *target++ = buffer[0];
3183                 if(offsets) {
3184                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3185                 }
3186             } else if(len == 2 && (target + 2) <= targetLimit) {
3187                 *target++ = buffer[0];
3188                 *target++ = buffer[1];
3189                 if(offsets) {
3190                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3191                     *offsets++ = sourceIndex;
3192                     *offsets++ = sourceIndex;
3193                 }
3194             } else {
3195                 fromUWriteUInt8(
3196                     cnv,
3197                     buffer, len,
3198                     &target, (const char *)targetLimit,
3199                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3200                     err);
3201                 if(U_FAILURE(*err)) {
3202                     break;
3203                 }
3204             }
3205         } /* end if(myTargetIndex<myTargetLength) */
3206         else{
3207             *err =U_BUFFER_OVERFLOW_ERROR;
3208             break;
3209         }
3210
3211     }/* end while(mySourceIndex<mySourceLength) */
3212
3213     /*
3214      * the end of the input stream and detection of truncated input
3215      * are handled by the framework, but for ISO-2022-CN conversion
3216      * we need to be in ASCII mode at the very end
3217      *
3218      * conditions:
3219      *   successful
3220      *   not in ASCII mode
3221      *   end of input and no truncated input
3222      */
3223     if( U_SUCCESS(*err) &&
3224         pFromU2022State->g!=0 &&
3225         args->flush && source>=sourceLimit && cnv->fromUChar32==0
3226     ) {
3227         int32_t sourceIndex;
3228
3229         /* we are switching to ASCII */
3230         pFromU2022State->g=0;
3231
3232         /* get the source index of the last input character */
3233         /*
3234          * TODO this would be simpler and more reliable if we used a pair
3235          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3236          * so that we could simply use the prevSourceIndex here;
3237          * this code gives an incorrect result for the rare case of an unmatched
3238          * trail surrogate that is alone in the last buffer of the text stream
3239          */
3240         sourceIndex=(int32_t)(source-args->source);
3241         if(sourceIndex>0) {
3242             --sourceIndex;
3243             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3244                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3245             ) {
3246                 --sourceIndex;
3247             }
3248         } else {
3249             sourceIndex=-1;
3250         }
3251
3252         fromUWriteUInt8(
3253             cnv,
3254             SHIFT_IN_STR, 1,
3255             &target, (const char *)targetLimit,
3256             &offsets, sourceIndex,
3257             err);
3258     }
3259
3260     /*save the state and return */
3261     args->source = source;
3262     args->target = (char*)target;
3263 }
3264
3265
/*
 * toUnicode implementation for ISO-2022-CN, with offsets support.
 *
 * Reads bytes from args->source and writes UTF-16 code units to args->target;
 * when args->offsets is not NULL, a source index is stored for each code unit
 * written to the target.
 *
 * - SI selects g=0; SO selects g=1, but only if cs[1] has been designated.
 *   An SO without a designation is passed to toUnicodeCallback().
 * - An SI or a completed escape sequence that follows an SO with no character
 *   in between (isEmptySegment) is reported as U_ILLEGAL_ESCAPE_SEQUENCE with
 *   toUCallbackReason set to UCNV_IRREGULAR.
 * - ESC sequences are parsed by changeState_2022(); a partial sequence from a
 *   previous call (myData->key != 0) is resumed on entry.
 * - CR and LF clear toU2022State and are then converted like any other byte.
 * - With g==0, bytes up to 0x7f map to themselves. Otherwise two bytes are read
 *   and looked up in the converter selected by cs[g]. A lead byte at the end of
 *   the input is saved in toUBytes/toULength and resumed on the next call.
 * - Bytes that produce no result are passed to toUnicodeCallback().
 *
 * args->source and args->target are updated before returning.
 */
3266 static void U_CALLCONV
3267 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3268                                                UErrorCode* err){
         /* holds up to three bytes for the lookup: optional plane byte plus the two input bytes */
3269     char tempBuf[3];
3270     const char *mySource = (char *) args->source;
3271     UChar *myTarget = args->target;
3272     const char *mySourceLimit = args->sourceLimit;
3273     uint32_t targetUniChar = 0x0000;
3274     uint32_t mySourceChar = 0x0000;
3275     UConverterDataISO2022* myData;
3276     ISO2022State *pToU2022State;
3277
3278     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3279     pToU2022State = &myData->toU2022State;
3280
3281     if(myData->key != 0) {
3282         /* continue with a partial escape sequence */
3283         goto escape;
3284     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3285         /* continue with a partial double-byte character */
3286         mySourceChar = args->converter->toUBytes[0];
3287         args->converter->toULength = 0;
3288         targetUniChar = missingCharMarker;
3289         goto getTrailByte;
3290     }
3291
3292     while(mySource < mySourceLimit){
3293
3294         targetUniChar =missingCharMarker;
3295
3296         if(myTarget < args->targetLimit){
3297
3298             mySourceChar= (unsigned char) *mySource++;
3299
3300             switch(mySourceChar){
3301             case UCNV_SI:
3302                 pToU2022State->g=0;
3303                 if (myData->isEmptySegment) {
3304                     myData->isEmptySegment = FALSE;     /* we are handling it, reset to avoid future spurious errors */
3305                     *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3306                     args->converter->toUCallbackReason = UCNV_IRREGULAR;
3307                     args->converter->toUBytes[0] = mySourceChar;
3308                     args->converter->toULength = 1;
3309                     args->target = myTarget;
3310                     args->source = mySource;
3311                     return;
3312                 }
3313                 continue;
3314
3315             case UCNV_SO:
3316                 if(pToU2022State->cs[1] != 0) {
3317                     pToU2022State->g=1;
3318                     myData->isEmptySegment = TRUE;      /* Begin a new segment, empty so far */
3319                     continue;
3320                 } else {
3321                     /* illegal to have SO before a matching designator */
3322                     myData->isEmptySegment = FALSE;     /* Handling a different error, reset this to avoid future spurious errs */
                         /* targetUniChar is still missingCharMarker, so the code after the switch calls the callback */
3323                     break;
3324                 }
3325
3326             case ESC_2022:
                     /* step back so that changeState_2022() sees the ESC byte itself */
3327                 mySource--;
3328 escape:
3329                 {
3330                     const char * mySourceBefore = mySource;
3331                     int8_t toULengthBefore = args->converter->toULength;
3332
3333                     changeState_2022(args->converter,&(mySource),
3334                         mySourceLimit, ISO_2022_CN,err);
3335
3336                     /* After SO there must be at least one character before a designator (designator error handled separately) */
3337                     if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3338                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3339                         args->converter->toUCallbackReason = UCNV_IRREGULAR;
3340                         args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
3341                     }
3342                 }
3343
3344                 /* invalid or illegal escape sequence */
3345                 if(U_FAILURE(*err)){
3346                     args->target = myTarget;
3347                     args->source = mySource;
3348                     myData->isEmptySegment = FALSE;     /* Reset to avoid future spurious errors */
3349                     return;
3350                 }
3351                 continue;
3352
3353             /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3354
3355             case CR:
3356             case LF:
                     /* end of line: clear the whole toUnicode state, then convert the byte itself below */
3357                 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3358                 U_FALLTHROUGH;
3359             default:
3360                 /* convert one or two bytes */
3361                 myData->isEmptySegment = FALSE;
3362                 if(pToU2022State->g != 0) {
3363                     if(mySource < mySourceLimit) {
3364                         UConverterSharedData *cnv;
3365                         StateEnum tempState;
3366                         int32_t tempBufLen;
3367                         int leadIsOk, trailIsOk;
3368                         uint8_t trailByte;
3369 getTrailByte:
3370                         trailByte = (uint8_t)*mySource;
3371                         /*
3372                          * Ticket 5691: consistent illegal sequences:
3373                          * - We include at least the first byte in the illegal sequence.
3374                          * - If any of the non-initial bytes could be the start of a character,
3375                          *   we stop the illegal sequence before the first one of those.
3376                          *
3377                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3378                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3379                          * Otherwise we convert or report the pair of bytes.
3380                          */
3381                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
3382                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
3383                         if (leadIsOk && trailIsOk) {
3384                             ++mySource;
3385                             tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3386                             if(tempState >= CNS_11643_0) {
                                     /* CNS planes share one converter; the plane is passed as a leading byte 0x80+plane */
3387                                 cnv = myData->myConverterArray[CNS_11643];
3388                                 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
3389                                 tempBuf[1] = (char) (mySourceChar);
3390                                 tempBuf[2] = (char) trailByte;
3391                                 tempBufLen = 3;
3392
3393                             }else{
3394                                 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
3395                                 cnv = myData->myConverterArray[tempState];
3396                                 tempBuf[0] = (char) (mySourceChar);
3397                                 tempBuf[1] = (char) trailByte;
3398                                 tempBufLen = 2;
3399                             }
3400                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
                                 /* remember both bytes; a value > 0xff also makes the offset code below subtract 2 */
3401                             mySourceChar = (mySourceChar << 8) | trailByte;
3402                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3403                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3404                             ++mySource;
3405                             /* add another bit so that the code below writes 2 bytes in case of error */
3406                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3407                         }
3408                         if(pToU2022State->g>=2) {
3409                             /* return from a single-shift state to the previous one */
3410                             pToU2022State->g=pToU2022State->prevG;
3411                         }
3412                     } else {
                             /* the trail byte is not available yet: save the lead byte for the next call */
3413                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3414                         args->converter->toULength = 1;
3415                         goto endloop;
3416                     }
3417                 }
3418                 else{
                         /* g==0: only bytes up to 0x7f are converted; anything else stays unassigned */
3419                     if(mySourceChar <= 0x7f) {
3420                         targetUniChar = (UChar) mySourceChar;
3421                     }
3422                 }
3423                 break;
3424             }
3425             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3426                 if(args->offsets){
3427                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3428                 }
3429                 *(myTarget++)=(UChar)targetUniChar;
3430             }
3431             else if(targetUniChar > missingCharMarker){
3432                 /* disassemble the surrogate pair and write to output*/
3433                 targetUniChar-=0x0010000;
3434                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
3435                 if(args->offsets){
3436                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3437                 }
3438                 ++myTarget;
3439                 if(myTarget< args->targetLimit){
3440                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3441                     if(args->offsets){
3442                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3443                     }
3444                     ++myTarget;
3445                 }else{
                         /* no room for the trail surrogate: keep it in the converter's overflow buffer */
3446                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3447                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3448                 }
3449
3450             }
3451             else{
3452                 /* Call the callback function*/
3453                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3454                 break;
3455             }
3456         }
3457         else{
3458             *err =U_BUFFER_OVERFLOW_ERROR;
3459             break;
3460         }
3461     }
3462 endloop:
3463     args->target = myTarget;
3464     args->source = mySource;
3465 }
3466 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3467
3468 static void U_CALLCONV
3469 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3470     UConverter *cnv = args->converter;
3471     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3472     ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3473     char *p, *subchar;
3474     char buffer[8];
3475     int32_t length;
3476
3477     subchar=(char *)cnv->subChars;
3478     length=cnv->subCharLen; /* assume length==1 for most variants */
3479
3480     p = buffer;
3481     switch(myConverterData->locale[0]){
3482     case 'j':
3483         {
3484             int8_t cs;
3485
3486             if(pFromU2022State->g == 1) {
3487                 /* JIS7: switch from G1 to G0 */
3488                 pFromU2022State->g = 0;
3489                 *p++ = UCNV_SI;
3490             }
3491
3492             cs = pFromU2022State->cs[0];
3493             if(cs != ASCII && cs != JISX201) {
3494                 /* not in ASCII or JIS X 0201: switch to ASCII */
3495                 pFromU2022State->cs[0] = (int8_t)ASCII;
3496                 *p++ = '\x1b';
3497                 *p++ = '\x28';
3498                 *p++ = '\x42';
3499             }
3500
3501             *p++ = subchar[0];
3502             break;
3503         }
3504     case 'c':
3505         if(pFromU2022State->g != 0) {
3506             /* not in ASCII mode: switch to ASCII */
3507             pFromU2022State->g = 0;
3508             *p++ = UCNV_SI;
3509         }
3510         *p++ = subchar[0];
3511         break;
3512     case 'k':
3513         if(myConverterData->version == 0) {
3514             if(length == 1) {
3515                 if((UBool)args->converter->fromUnicodeStatus) {
3516                     /* in DBCS mode: switch to SBCS */
3517                     args->converter->fromUnicodeStatus = 0;
3518                     *p++ = UCNV_SI;
3519                 }
3520                 *p++ = subchar[0];
3521             } else /* length == 2*/ {
3522                 if(!(UBool)args->converter->fromUnicodeStatus) {
3523                     /* in SBCS mode: switch to DBCS */
3524                     args->converter->fromUnicodeStatus = 1;
3525                     *p++ = UCNV_SO;
3526                 }
3527                 *p++ = subchar[0];
3528                 *p++ = subchar[1];
3529             }
3530             break;
3531         } else {
3532             /* save the subconverter's substitution string */
3533             uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3534             int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3535
3536             /* set our substitution string into the subconverter */
3537             myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3538             myConverterData->currentConverter->subCharLen = (int8_t)length;
3539
3540             /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3541             args->converter = myConverterData->currentConverter;
3542             myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3543             ucnv_cbFromUWriteSub(args, 0, err);
3544             cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3545             args->converter = cnv;
3546
3547             /* restore the subconverter's substitution string */
3548             myConverterData->currentConverter->subChars = currentSubChars;
3549             myConverterData->currentConverter->subCharLen = currentSubCharLen;
3550
3551             if(*err == U_BUFFER_OVERFLOW_ERROR) {
3552                 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3553                     uprv_memcpy(
3554                         cnv->charErrorBuffer,
3555                         myConverterData->currentConverter->charErrorBuffer,
3556                         myConverterData->currentConverter->charErrorBufferLength);
3557                 }
3558                 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3559                 myConverterData->currentConverter->charErrorBufferLength = 0;
3560             }
3561             return;
3562         }
3563     default:
3564         /* not expected */
3565         break;
3566     }
3567     ucnv_cbFromUWriteBytes(args,
3568                            buffer, (int32_t)(p - buffer),
3569                            offsetIndex, err);
3570 }
3571
3572 /*
3573  * Structure for cloning an ISO 2022 converter into a single memory block.
3574  * ucnv_safeClone() of the converter will align the entire cloneStruct,
3575  * and then ucnv_safeClone() of the sub-converter may additionally align
3576  * currentConverter inside the cloneStruct, for which we need the deadSpace
3577  * after currentConverter.
3578  * This is because UAlignedMemory may be larger than the actually
3579  * necessary alignment size for the platform.
3580  * The other cloneStruct fields will not be moved around,
3581  * and are aligned properly with cloneStruct's alignment.
3582  */
3583 struct cloneStruct
3584 {
3585     UConverter cnv;
3586     UConverter currentConverter;
3587     UAlignedMemory deadSpace;
3588     UConverterDataISO2022 mydata;
3589 };
3590
3591
3592 U_CDECL_BEGIN
3593
3594 static UConverter * U_CALLCONV
3595 _ISO_2022_SafeClone(
3596             const UConverter *cnv,
3597             void *stackBuffer,
3598             int32_t *pBufferSize,
3599             UErrorCode *status)
3600 {
3601     struct cloneStruct * localClone;
3602     UConverterDataISO2022 *cnvData;
3603     int32_t i, size;
3604
3605     if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3606         *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3607         return NULL;
3608     }
3609
3610     cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3611     localClone = (struct cloneStruct *)stackBuffer;
3612
3613     /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3614
3615     uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3616     localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3617     localClone->cnv.isExtraLocal = TRUE;
3618
3619     /* share the subconverters */
3620
3621     if(cnvData->currentConverter != NULL) {
3622         size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3623         localClone->mydata.currentConverter =
3624             ucnv_safeClone(cnvData->currentConverter,
3625                             &localClone->currentConverter,
3626                             &size, status);
3627         if(U_FAILURE(*status)) {
3628             return NULL;
3629         }
3630     }
3631
3632     for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3633         if(cnvData->myConverterArray[i] != NULL) {
3634             ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3635         }
3636     }
3637
3638     return &localClone->cnv;
3639 }
3640
3641 U_CDECL_END
3642
3643 static void U_CALLCONV
3644 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3645                     const USetAdder *sa,
3646                     UConverterUnicodeSet which,
3647                     UErrorCode *pErrorCode)
3648 {
3649     int32_t i;
3650     UConverterDataISO2022* cnvData;
3651
3652     if (U_FAILURE(*pErrorCode)) {
3653         return;
3654     }
3655 #ifdef U_ENABLE_GENERIC_ISO_2022
3656     if (cnv->sharedData == &_ISO2022Data) {
3657         /* We use UTF-8 in this case */
3658         sa->addRange(sa->set, 0, 0xd7FF);
3659         sa->addRange(sa->set, 0xE000, 0x10FFFF);
3660         return;
3661     }
3662 #endif
3663
3664     cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3665
3666     /* open a set and initialize it with code points that are algorithmically round-tripped */
3667     switch(cnvData->locale[0]){
3668     case 'j':
3669         /* include JIS X 0201 which is hardcoded */
3670         sa->add(sa->set, 0xa5);
3671         sa->add(sa->set, 0x203e);
3672         if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3673             /* include Latin-1 for some variants of JP */
3674             sa->addRange(sa->set, 0, 0xff);
3675         } else {
3676             /* include ASCII for JP */
3677             sa->addRange(sa->set, 0, 0x7f);
3678         }
3679         if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3680             /*
3681              * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3682              * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3683              * use half-width Katakana.
3684              * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3685              * half-width Katakana via the ESC ( I sequence.
3686              * However, we only emit (fromUnicode) half-width Katakana according to the
3687              * definition of each variant.
3688              *
3689              * When including fallbacks,
3690              * we need to include half-width Katakana Unicode code points for all JP variants because
3691              * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3692              */
3693             /* include half-width Katakana for JP */
3694             sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3695         }
3696         break;
3697 #if !UCONFIG_ONLY_HTML_CONVERSION
3698     case 'c':
3699     case 'z':
3700         /* include ASCII for CN */
3701         sa->addRange(sa->set, 0, 0x7f);
3702         break;
3703     case 'k':
3704         /* there is only one converter for KR, and it is not in the myConverterArray[] */
3705         cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3706                 cnvData->currentConverter, sa, which, pErrorCode);
3707         /* the loop over myConverterArray[] will simply not find another converter */
3708         break;
3709 #endif
3710     default:
3711         break;
3712     }
3713
3714 #if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3715             if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3716                 cnvData->version==0 && i==CNS_11643
3717             ) {
3718                 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3719                 ucnv_MBCSGetUnicodeSetForBytes(
3720                         cnvData->myConverterArray[i],
3721                         sa, UCNV_ROUNDTRIP_SET,
3722                         0, 0x81, 0x82,
3723                         pErrorCode);
3724             }
3725 #endif
3726
3727     for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3728         UConverterSetFilter filter;
3729         if(cnvData->myConverterArray[i]!=NULL) {
3730             if(cnvData->locale[0]=='j' && i==JISX208) {
3731                 /*
3732                  * Only add code points that map to Shift-JIS codes
3733                  * corresponding to JIS X 0208.
3734                  */
3735                 filter=UCNV_SET_FILTER_SJIS;
3736 #if !UCONFIG_ONLY_HTML_CONVERSION
3737             } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3738                        cnvData->version==0 && i==CNS_11643) {
3739                 /*
3740                  * Version-specific for CN:
3741                  * CN version 0 does not map CNS planes 3..7 although
3742                  * they are all available in the CNS conversion table;
3743                  * CN version 1 (-EXT) does map them all.
3744                  * The two versions create different Unicode sets.
3745                  */
3746                 filter=UCNV_SET_FILTER_2022_CN;
3747             } else if(i==KSC5601) {
3748                 /*
3749                  * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3750                  * are broader than GR94.
3751                  */
3752                 filter=UCNV_SET_FILTER_GR94DBCS;
3753 #endif
3754             } else {
3755                 filter=UCNV_SET_FILTER_NONE;
3756             }
3757             ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3758         }
3759     }
3760
3761     /*
3762      * ISO 2022 converters must not convert SO/SI/ESC despite what
3763      * sub-converters do by themselves.
3764      * Remove these characters from the set.
3765      */
3766     sa->remove(sa->set, 0x0e);
3767     sa->remove(sa->set, 0x0f);
3768     sa->remove(sa->set, 0x1b);
3769
3770     /* ISO 2022 converters do not convert C1 controls either */
3771     sa->removeRange(sa->set, 0x80, 0x9f);
3772 }
3773
3774 static const UConverterImpl _ISO2022Impl={
3775     UCNV_ISO_2022,
3776
3777     NULL,
3778     NULL,
3779
3780     _ISO2022Open,
3781     _ISO2022Close,
3782     _ISO2022Reset,
3783
3784 #ifdef U_ENABLE_GENERIC_ISO_2022
3785     T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3786     T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3787     ucnv_fromUnicode_UTF8,
3788     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3789 #else
3790     NULL,
3791     NULL,
3792     NULL,
3793     NULL,
3794 #endif
3795     NULL,
3796
3797     NULL,
3798     _ISO2022getName,
3799     _ISO_2022_WriteSub,
3800     _ISO_2022_SafeClone,
3801     _ISO_2022_GetUnicodeSet,
3802
3803     NULL,
3804     NULL
3805 };
3806 static const UConverterStaticData _ISO2022StaticData={
3807     sizeof(UConverterStaticData),
3808     "ISO_2022",
3809     2022,
3810     UCNV_IBM,
3811     UCNV_ISO_2022,
3812     1,
3813     3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3814     { 0x1a, 0, 0, 0 },
3815     1,
3816     FALSE,
3817     FALSE,
3818     0,
3819     0,
3820     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3821 };
3822 const UConverterSharedData _ISO2022Data=
3823         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022StaticData, &_ISO2022Impl);
3824
3825 /*************JP****************/
3826 static const UConverterImpl _ISO2022JPImpl={
3827     UCNV_ISO_2022,
3828
3829     NULL,
3830     NULL,
3831
3832     _ISO2022Open,
3833     _ISO2022Close,
3834     _ISO2022Reset,
3835
3836     UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3837     UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3838     UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3839     UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3840     NULL,
3841
3842     NULL,
3843     _ISO2022getName,
3844     _ISO_2022_WriteSub,
3845     _ISO_2022_SafeClone,
3846     _ISO_2022_GetUnicodeSet,
3847
3848     NULL,
3849     NULL
3850 };
3851 static const UConverterStaticData _ISO2022JPStaticData={
3852     sizeof(UConverterStaticData),
3853     "ISO_2022_JP",
3854     0,
3855     UCNV_IBM,
3856     UCNV_ISO_2022,
3857     1,
3858     6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3859     { 0x1a, 0, 0, 0 },
3860     1,
3861     FALSE,
3862     FALSE,
3863     0,
3864     0,
3865     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3866 };
3867
3868 namespace {
3869
3870 const UConverterSharedData _ISO2022JPData=
3871         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022JPStaticData, &_ISO2022JPImpl);
3872
3873 }  // namespace
3874
3875 #if !UCONFIG_ONLY_HTML_CONVERSION
3876 /************* KR ***************/
3877 static const UConverterImpl _ISO2022KRImpl={
3878     UCNV_ISO_2022,
3879
3880     NULL,
3881     NULL,
3882
3883     _ISO2022Open,
3884     _ISO2022Close,
3885     _ISO2022Reset,
3886
3887     UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3888     UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3889     UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3890     UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3891     NULL,
3892
3893     NULL,
3894     _ISO2022getName,
3895     _ISO_2022_WriteSub,
3896     _ISO_2022_SafeClone,
3897     _ISO_2022_GetUnicodeSet,
3898
3899     NULL,
3900     NULL
3901 };
3902 static const UConverterStaticData _ISO2022KRStaticData={
3903     sizeof(UConverterStaticData),
3904     "ISO_2022_KR",
3905     0,
3906     UCNV_IBM,
3907     UCNV_ISO_2022,
3908     1,
3909     8, /* max 8 bytes per UChar */
3910     { 0x1a, 0, 0, 0 },
3911     1,
3912     FALSE,
3913     FALSE,
3914     0,
3915     0,
3916     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3917 };
3918
3919 namespace {
3920
3921 const UConverterSharedData _ISO2022KRData=
3922         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022KRStaticData, &_ISO2022KRImpl);
3923
3924 }  // namespace
3925
3926 /*************** CN ***************/
3927 static const UConverterImpl _ISO2022CNImpl={
3928
3929     UCNV_ISO_2022,
3930
3931     NULL,
3932     NULL,
3933
3934     _ISO2022Open,
3935     _ISO2022Close,
3936     _ISO2022Reset,
3937
3938     UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3939     UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3940     UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3941     UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3942     NULL,
3943
3944     NULL,
3945     _ISO2022getName,
3946     _ISO_2022_WriteSub,
3947     _ISO_2022_SafeClone,
3948     _ISO_2022_GetUnicodeSet,
3949
3950     NULL,
3951     NULL
3952 };
3953 static const UConverterStaticData _ISO2022CNStaticData={
3954     sizeof(UConverterStaticData),
3955     "ISO_2022_CN",
3956     0,
3957     UCNV_IBM,
3958     UCNV_ISO_2022,
3959     1,
3960     8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3961     { 0x1a, 0, 0, 0 },
3962     1,
3963     FALSE,
3964     FALSE,
3965     0,
3966     0,
3967     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3968 };
3969
3970 namespace {
3971
3972 const UConverterSharedData _ISO2022CNData=
3973         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022CNStaticData, &_ISO2022CNImpl);
3974
3975 }  // namespace
3976 #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3977
3978 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */