Update To 11.40.268.0
[platform/framework/web/crosswalk.git] / src / third_party / icu / source / common / ucnv2022.cpp
1 /*
2 **********************************************************************
3 *   Copyright (C) 2000-2012, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   file name:  ucnv2022.cpp
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 *   created on: 2000feb03
12 *   created by: Markus W. Scherer
13 *
14 *   Change history:
15 *
16 *   06/29/2000  helena  Major rewrite of the callback APIs.
17 *   08/08/2000  Ram     Included support for ISO-2022-JP-2
18 *                       Changed implementation of toUnicode
19 *                       function
20 *   08/21/2000  Ram     Added support for ISO-2022-KR
21 *   08/29/2000  Ram     Seperated implementation of EBCDIC to
22 *                       ucnvebdc.c
23 *   09/20/2000  Ram     Added support for ISO-2022-CN
24 *                       Added implementations for getNextUChar()
25 *                       for specific 2022 country variants.
26 *   10/31/2000  Ram     Implemented offsets logic functions
27 */
28
29 #include "unicode/utypes.h"
30
31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
32
33 #include "unicode/ucnv.h"
34 #include "unicode/uset.h"
35 #include "unicode/ucnv_err.h"
36 #include "unicode/ucnv_cb.h"
37 #include "unicode/utf16.h"
38 #include "ucnv_imp.h"
39 #include "ucnv_bld.h"
40 #include "ucnv_cnv.h"
41 #include "ucnvmbcs.h"
42 #include "cstring.h"
43 #include "cmemory.h"
44 #include "uassert.h"
45
46 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
47
48 #ifdef U_ENABLE_GENERIC_ISO_2022
49 /*
50  * I am disabling the generic ISO-2022 converter after proposing to do so on
51  * the icu mailing list two days ago.
52  *
53  * Reasons:
54  * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
55  *    its designation sequences, single shifts with return to the previous state,
56  *    switch-with-no-return to UTF-16BE or similar, etc.
57  *    This is unlike the language-specific variants like ISO-2022-JP which
58  *    require a much smaller repertoire of ISO-2022 features.
59  *    These variants continue to be supported.
60  * 2. I believe that no one is really using the generic ISO-2022 converter
61  *    but rather always one of the language-specific variants.
62  *    Note that ICU's generic ISO-2022 converter has always output one escape
63  *    sequence followed by UTF-8 for the whole stream.
64  * 3. Switching between subcharsets is extremely slow, because each time
65  *    the previous converter is closed and a new one opened,
66  *    without any kind of caching, least-recently-used list, etc.
67  * 4. The code is currently buggy, and given the above it does not seem
68  *    reasonable to spend the time on maintenance.
69  * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
70  *    This means, for example, that when ISO-8859-7 is designated, the following
71  *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
72  *    The ICU ISO-2022 converter does not handle this - and has no information
73  *    about which subconverter would have to be shifted vs. which is designed
74  *    for 7-bit ISO-2022.
75  *
76  * Markus Scherer 2003-dec-03
77  */
78 #endif
79
80 static const char SHIFT_IN_STR[]  = "\x0F";
81 // static const char SHIFT_OUT_STR[] = "\x0E";
82
83 #define CR      0x0D
84 #define LF      0x0A
85 #define H_TAB   0x09
86 #define V_TAB   0x0B
87 #define SPACE   0x20
88
89 enum {
90     HWKANA_START=0xff61,
91     HWKANA_END=0xff9f
92 };
93
94 /*
95  * 94-character sets with native byte values A1..FE are encoded in ISO 2022
96  * as bytes 21..7E. (Subtract 0x80.)
97  * 96-character sets with native byte values A0..FF are encoded in ISO 2022
98  * as bytes 20..7F. (Subtract 0x80.)
99  * Do not encode C1 control codes with native bytes 80..9F
100  * as bytes 00..1F (C0 control codes).
101  */
102 enum {
103     GR94_START=0xa1,
104     GR94_END=0xfe,
105     GR96_START=0xa0,
106     GR96_END=0xff
107 };
108
109 /*
110  * ISO 2022 control codes must not be converted from Unicode
111  * because they would mess up the byte stream.
112  * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
113  * corresponding to SO, SI, and ESC.
114  */
115 #define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
116
117 /* for ISO-2022-JP and -CN implementations */
118 typedef enum  {
119         /* shared values */
120         INVALID_STATE=-1,
121         ASCII = 0,
122
123         SS2_STATE=0x10,
124         SS3_STATE,
125
126         /* JP */
127         ISO8859_1 = 1 ,
128         ISO8859_7 = 2 ,
129         JISX201  = 3,
130         JISX208 = 4,
131         JISX212 = 5,
132         GB2312  =6,
133         KSC5601 =7,
134         HWKANA_7BIT=8,    /* Halfwidth Katakana 7 bit */
135
136         /* CN */
137         /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
138         GB2312_1=1,
139         ISO_IR_165=2,
140         CNS_11643=3,
141
142         /*
143          * these are used in StateEnum and ISO2022State variables,
144          * but CNS_11643 must be used to index into myConverterArray[]
145          */
146         CNS_11643_0=0x20,
147         CNS_11643_1,
148         CNS_11643_2,
149         CNS_11643_3,
150         CNS_11643_4,
151         CNS_11643_5,
152         CNS_11643_6,
153         CNS_11643_7
154 } StateEnum;
155
156 /* is the StateEnum charset value for a DBCS charset? */
157 #if UCONFIG_NO_NON_HTML5_CONVERSION
158 #define IS_JP_DBCS(cs) (JISX208==(cs))
159 #else
160 #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
161 #endif
162
163 #define CSM(cs) ((uint16_t)1<<(cs))
164
165 /*
166  * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
167  * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
168  *
169  * Note: The converter uses some leniency:
170  * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
171  *   all versions, not just JIS7 and JIS8.
172  * - ICU does not distinguish between different versions of JIS X 0208.
173  */
174 #if UCONFIG_NO_NON_HTML5_CONVERSION
175 enum { MAX_JA_VERSION=0 };
176 #else
177 enum { MAX_JA_VERSION=4 };
178 #endif
179 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
180 /* 
181  * TODO(jshin): The encoding spec has JISX212, but we don't support it.
182  * See https://www.w3.org/Bugs/Public/show_bug.cgi?id=26885
183  */
184     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
185 #if !UCONFIG_NO_NON_HTML5_CONVERSION
186     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
187     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
188     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
189     CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
190 #endif
191 };
192
193 typedef enum {
194         ASCII1=0,
195         LATIN1,
196         SBCS,
197         DBCS,
198         MBCS,
199         HWKANA
200 }Cnv2022Type;
201
202 typedef struct ISO2022State {
203     int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
204     int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
205     int8_t prevG;       /* g before single shift (SS2 or SS3) */
206 } ISO2022State;
207
208 #define UCNV_OPTIONS_VERSION_MASK 0xf
209 #define UCNV_2022_MAX_CONVERTERS 10
210
211 typedef struct{
212     UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
213     UConverter *currentConverter;
214     Cnv2022Type currentType;
215     ISO2022State toU2022State, fromU2022State;
216     uint32_t key;
217     uint32_t version;
218 #ifdef U_ENABLE_GENERIC_ISO_2022
219     UBool isFirstBuffer;
220 #endif
221     UBool isEmptySegment;
222     char name[30];
223     char locale[3];
224 }UConverterDataISO2022;
225
226 /* Protos */
227 /* ISO-2022 ----------------------------------------------------------------- */
228
229 /*Forward declaration */
230 U_CFUNC void
231 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
232                       UErrorCode * err);
233 U_CFUNC void
234 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
235                                     UErrorCode * err);
236
237 #define ESC_2022 0x1B /*ESC*/
238
239 typedef enum
240 {
241         INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
242         VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
243         VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
244         VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
245 } UCNV_TableStates_2022;
246
247 /*
248 * The way these state transition arrays work is:
249 * ex : ESC$B is the sequence for JISX208
250 *      a) First Iteration: char is ESC
251 *          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
252 *             int x = normalize_esq_chars_2022[27] which is equal to 1
253 *         ii) Search for this value in escSeqStateTable_Key_2022[]
254 *             value of x is stored at escSeqStateTable_Key_2022[0]
255 *        iii) Save this index as offset
256 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
257 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
258 *     b) Switch on this state and continue to next char
259 *          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
260 *             which is normalize_esq_chars_2022[36] == 4
261 *         ii) x is currently 1(from above)
262 *               x<<=5 -- x is now 32
263 *               x+=normalize_esq_chars_2022[36]
264 *               now x is 36
265 *        iii) Search for this value in escSeqStateTable_Key_2022[]
266 *             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
267 *         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
268 *             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
269 *     c) Switch on this state and continue to next char
270 *        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
271 *        ii) x is currently 36 (from above)
272 *            x<<=5 -- x is now 1152
273 *            x+=normalize_esq_chars_2022[66]
274 *            now x is 1161
275 *       iii) Search for this value in escSeqStateTable_Key_2022[]
276 *            value of x is stored at escSeqStateTable_Key_2022[21], so offset is 21
277 *        iv) Get state of this sequence from escSeqStateTable_Value_2022[21]
278 *            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
279 *         v) Get the converter name form escSeqStateTable_Result_2022[21] which is JISX208
280 */
281
282
283 /*Below are the 3 arrays depicting a state transition table*/
284 static const int8_t normalize_esq_chars_2022[256] = {
285 /*       0      1       2       3       4      5       6        7       8       9           */
286
287          0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
288         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
289         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
290         ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29      ,0
291         ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
292         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
293         ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
294         ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
295         ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
296         ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
297         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
298         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
299         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
300         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
301         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
302         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
303         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
304         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
305         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
306         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
307         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
308         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
309         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
310         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
311         ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
312         ,0     ,0      ,0      ,0      ,0      ,0
313 };
314
315 #ifdef U_ENABLE_GENERIC_ISO_2022
316 /*
317  * When the generic ISO-2022 converter is completely removed, not just disabled
318  * per #ifdef, then the following state table and the associated tables that are
319  * dimensioned with MAX_STATES_2022 should be trimmed.
320  *
321  * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
322  * the associated escape sequences starting with ESC ( B should be removed.
323  * This includes the ones with key values 1097 and all of the ones above 1000000.
324  *
325  * For the latter, the tables can simply be truncated.
326  * For the former, since the tables must be kept parallel, it is probably best
327  * to simply duplicate an adjacent table cell, parallel in all tables.
328  *
329  * It may make sense to restructure the tables, especially by using small search
330  * tables for the variants instead of indexing them parallel to the table here.
331  */
332 #endif
333
334 #define MAX_STATES_2022 74
335 static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
336 /*   0           1           2           3           4           5           6           7           8           9           */
337
338      1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
339     ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
340     ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
341     ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
342     ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
343     ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
344     ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
345     ,35947631   ,35947635   ,35947636   ,35947638
346 };
347
348 #ifdef U_ENABLE_GENERIC_ISO_2022
349
350 static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
351  /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */
352
353      NULL                   ,NULL                   ,NULL                   ,NULL               ,NULL               ,NULL                   ,NULL                   ,NULL                   ,"latin1"               ,"latin1"
354     ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
355     ,"latin1"               ,NULL                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,NULL                   ,NULL                   ,NULL                   ,NULL                   ,"UTF8"
356     ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,NULL               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
357     ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
358     ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
359     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
360     ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
361 };
362
363 #endif
364
365 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
366 /*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
367      VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
368     ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
369     ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
370     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
371     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
372     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
373     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
374     ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
375 };
376
377 /* Type def for refactoring changeState_2022 code*/
378 typedef enum{
379 #ifdef U_ENABLE_GENERIC_ISO_2022
380     ISO_2022=0,
381 #endif
382 #if UCONFIG_NO_NON_HTML5_CONVERSION
383     ISO_2022_JP=1
384 #else
385     ISO_2022_JP=1,
386     ISO_2022_KR=2,
387     ISO_2022_CN=3
388 #endif
389 } Variant2022;
390
391 /*********** ISO 2022 Converter Protos ***********/
392 static void
393 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
394
395 static void
396  _ISO2022Close(UConverter *converter);
397
398 static void
399 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
400
401 static const char*
402 _ISO2022getName(const UConverter* cnv);
403
404 static void
405 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
406
407 static UConverter *
408 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
409
410 #ifdef U_ENABLE_GENERIC_ISO_2022
411 static void
412 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
413 #endif
414
415 namespace {
416
417 /*const UConverterSharedData _ISO2022Data;*/
418 extern const UConverterSharedData _ISO2022JPData;
419 extern const UConverterSharedData _ISO2022KRData;
420 extern const UConverterSharedData _ISO2022CNData;
421
422 }  // namespace
423
424 /*************** Converter implementations ******************/
425
426 /* The purpose of this function is to get around gcc compiler warnings. */
427 static inline void
428 fromUWriteUInt8(UConverter *cnv,
429                  const char *bytes, int32_t length,
430                  uint8_t **target, const char *targetLimit,
431                  int32_t **offsets,
432                  int32_t sourceIndex,
433                  UErrorCode *pErrorCode)
434 {
435     char *targetChars = (char *)*target;
436     ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
437                          offsets, sourceIndex, pErrorCode);
438     *target = (uint8_t*)targetChars;
439
440 }
441
442 static inline void
443 setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
444     if(myConverterData->version == 1) {
445         UConverter *cnv = myConverterData->currentConverter;
446
447         cnv->toUnicodeStatus=0;     /* offset */
448         cnv->mode=0;                /* state */
449         cnv->toULength=0;           /* byteIndex */
450     }
451 }
452
453 static inline void
454 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
455    /* in ISO-2022-KR the designator sequence appears only once
456     * in a file so we append it only once
457     */
458     if( converter->charErrorBufferLength==0){
459
460         converter->charErrorBufferLength = 4;
461         converter->charErrorBuffer[0] = 0x1b;
462         converter->charErrorBuffer[1] = 0x24;
463         converter->charErrorBuffer[2] = 0x29;
464         converter->charErrorBuffer[3] = 0x43;
465     }
466     if(myConverterData->version == 1) {
467         UConverter *cnv = myConverterData->currentConverter;
468
469         cnv->fromUChar32=0;
470         cnv->fromUnicodeStatus=1;   /* prevLength */
471     }
472 }
473
474 static void
475 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
476
477     char myLocale[6]={' ',' ',' ',' ',' ',' '};
478
479     cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
480     if(cnv->extraInfo != NULL) {
481         UConverterNamePieces stackPieces;
482         UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
483         UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
484         uint32_t version;
485
486         stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
487
488         uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
489         myConverterData->currentType = ASCII1;
490         cnv->fromUnicodeStatus =FALSE;
491         if(pArgs->locale){
492             uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
493         }
494         version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
495         myConverterData->version = version;
496         if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
497             (myLocale[2]=='_' || myLocale[2]=='\0'))
498         {
499             size_t len=0;
500             /* open the required converters and cache them */
501             if(version>MAX_JA_VERSION) {
502                 /* prevent indexing beyond jpCharsetMasks[] */
503                 myConverterData->version = version = 0;
504             }
505 #if !UCONFIG_NO_NON_HTML5_CONVERSION
506             if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
507                 myConverterData->myConverterArray[ISO8859_7] =
508                     ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
509             }
510 #endif
511             myConverterData->myConverterArray[JISX208] =
512                 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
513 #if !UCONFIG_NO_NON_HTML5_CONVERSION
514             if(jpCharsetMasks[version]&CSM(JISX212)) {
515                 myConverterData->myConverterArray[JISX212] =
516                     ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
517             }
518             if(jpCharsetMasks[version]&CSM(GB2312)) {
519                 myConverterData->myConverterArray[GB2312] =
520                     ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);   /* gb_2312_80-1 */
521             }
522             if(jpCharsetMasks[version]&CSM(KSC5601)) {
523                 myConverterData->myConverterArray[KSC5601] =
524                     ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
525             }
526 #endif
527
528             /* set the function pointers to appropriate funtions */
529             cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
530             uprv_strcpy(myConverterData->locale,"ja");
531
532             (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
533             len = uprv_strlen(myConverterData->name);
534             myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
535             myConverterData->name[len+1]='\0';
536         }
537 #if !UCONFIG_NO_NON_HTML5_CONVERSION
538         else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
539             (myLocale[2]=='_' || myLocale[2]=='\0'))
540         {
541             const char *cnvName;
542             if(version==1) {
543                 cnvName="icu-internal-25546";
544             } else {
545                 cnvName="ibm-949";
546                 myConverterData->version=version=0;
547             }
548             if(pArgs->onlyTestIsLoadable) {
549                 ucnv_canCreateConverter(cnvName, errorCode);  /* errorCode carries result */
550                 uprv_free(cnv->extraInfo);
551                 cnv->extraInfo=NULL;
552                 return;
553             } else {
554                 myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
555                 if (U_FAILURE(*errorCode)) {
556                     _ISO2022Close(cnv);
557                     return;
558                 }
559
560                 if(version==1) {
561                     (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
562                     uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
563                     cnv->subCharLen = myConverterData->currentConverter->subCharLen;
564                 }else{
565                     (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
566                 }
567
568                 /* initialize the state variables */
569                 setInitialStateToUnicodeKR(cnv, myConverterData);
570                 setInitialStateFromUnicodeKR(cnv, myConverterData);
571
572                 /* set the function pointers to appropriate funtions */
573                 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
574                 uprv_strcpy(myConverterData->locale,"ko");
575             }
576         }
577         else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
578             (myLocale[2]=='_' || myLocale[2]=='\0'))
579         {
580
581             /* open the required converters and cache them */
582             myConverterData->myConverterArray[GB2312_1] =
583                 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
584             if(version==1) {
585                 myConverterData->myConverterArray[ISO_IR_165] =
586                     ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
587             }
588             myConverterData->myConverterArray[CNS_11643] =
589                 ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
590
591
592             /* set the function pointers to appropriate funtions */
593             cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
594             uprv_strcpy(myConverterData->locale,"cn");
595
596             if (version==0){
597                 myConverterData->version = 0;
598                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
599             }else if (version==1){
600                 myConverterData->version = 1;
601                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
602             }else {
603                 myConverterData->version = 2;
604                 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
605             }
606         }
607 #endif // !UCONFIG_NO_NON_HTML5_CONVERSION
608         else{
609 #ifdef U_ENABLE_GENERIC_ISO_2022
610             myConverterData->isFirstBuffer = TRUE;
611
612             /* append the UTF-8 escape sequence */
613             cnv->charErrorBufferLength = 3;
614             cnv->charErrorBuffer[0] = 0x1b;
615             cnv->charErrorBuffer[1] = 0x25;
616             cnv->charErrorBuffer[2] = 0x42;
617
618             cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
619             /* initialize the state variables */
620             uprv_strcpy(myConverterData->name,"ISO_2022");
621 #else
622             *errorCode = U_UNSUPPORTED_ERROR;
623             return;
624 #endif
625         }
626
627         cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
628
629         if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
630             _ISO2022Close(cnv);
631         }
632     } else {
633         *errorCode = U_MEMORY_ALLOCATION_ERROR;
634     }
635 }
636
637
638 static void
639 _ISO2022Close(UConverter *converter) {
640     UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
641     UConverterSharedData **array = myData->myConverterArray;
642     int32_t i;
643
644     if (converter->extraInfo != NULL) {
645         /*close the array of converter pointers and free the memory*/
646         for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
647             if(array[i]!=NULL) {
648                 ucnv_unloadSharedDataIfReady(array[i]);
649             }
650         }
651
652         ucnv_close(myData->currentConverter);
653
654         if(!converter->isExtraLocal){
655             uprv_free (converter->extraInfo);
656             converter->extraInfo = NULL;
657         }
658     }
659 }
660
661 static void
662 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
663     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
664     if(choice<=UCNV_RESET_TO_UNICODE) {
665         uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
666         myConverterData->key = 0;
667         myConverterData->isEmptySegment = FALSE;
668     }
669     if(choice!=UCNV_RESET_TO_UNICODE) {
670         uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
671     }
672 #ifdef U_ENABLE_GENERIC_ISO_2022
673     if(myConverterData->locale[0] == 0){
674         if(choice<=UCNV_RESET_TO_UNICODE) {
675             myConverterData->isFirstBuffer = TRUE;
676             myConverterData->key = 0;
677             if (converter->mode == UCNV_SO){
678                 ucnv_close (myConverterData->currentConverter);
679                 myConverterData->currentConverter=NULL;
680             }
681             converter->mode = UCNV_SI;
682         }
683         if(choice!=UCNV_RESET_TO_UNICODE) {
684             /* re-append UTF-8 escape sequence */
685             converter->charErrorBufferLength = 3;
686             converter->charErrorBuffer[0] = 0x1b;
687             converter->charErrorBuffer[1] = 0x28;
688             converter->charErrorBuffer[2] = 0x42;
689         }
690     }
691     else
692 #endif
693     {
694         /* reset the state variables */
695         if(myConverterData->locale[0] == 'k'){
696             if(choice<=UCNV_RESET_TO_UNICODE) {
697                 setInitialStateToUnicodeKR(converter, myConverterData);
698             }
699             if(choice!=UCNV_RESET_TO_UNICODE) {
700                 setInitialStateFromUnicodeKR(converter, myConverterData);
701             }
702         }
703     }
704 }
705
706 static const char*
707 _ISO2022getName(const UConverter* cnv){
708     if(cnv->extraInfo){
709         UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
710         return myData->name;
711     }
712     return NULL;
713 }
714
715
716 /*************** to unicode *******************/
717 /****************************************************************************
718  * Recognized escape sequences are
719  * <ESC>(B  ASCII
720  * <ESC>.A  ISO-8859-1
721  * <ESC>.F  ISO-8859-7
722  * <ESC>(J  JISX-201
723  * <ESC>(I  JISX-201
724  * <ESC>$B  JISX-208
725  * <ESC>$@  JISX-208
726  * <ESC>$(D JISX-212
727  * <ESC>$A  GB2312
728  * <ESC>$(C KSC5601
729  */
730 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
731 /*      0                1               2               3               4               5               6               7               8               9    */
732     INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
733     ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
734     ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
735     ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
736     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
737     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
738     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
739     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
740 };
741
742 #if !UCONFIG_NO_NON_HTML5_CONVERSION
743 /*************** to unicode *******************/
744 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
745 /*      0                1               2               3               4               5               6               7               8               9    */
746      INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
747     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
748     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
749     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
750     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
751     ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
752     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
753     ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
754 };
755 #endif
756
757
758 static UCNV_TableStates_2022
759 getKey_2022(char c,int32_t* key,int32_t* offset){
760     int32_t togo;
761     int32_t low = 0;
762     int32_t hi = MAX_STATES_2022;
763     int32_t oldmid=0;
764
765     togo = normalize_esq_chars_2022[(uint8_t)c];
766     if(togo == 0) {
767         /* not a valid character anywhere in an escape sequence */
768         *key = 0;
769         *offset = 0;
770         return INVALID_2022;
771     }
772     togo = (*key << 5) + togo;
773
774     while (hi != low)  /*binary search*/{
775
776         register int32_t mid = (hi+low) >> 1; /*Finds median*/
777
778         if (mid == oldmid)
779             break;
780
781         if (escSeqStateTable_Key_2022[mid] > togo){
782             hi = mid;
783         }
784         else if (escSeqStateTable_Key_2022[mid] < togo){
785             low = mid;
786         }
787         else /*we found it*/{
788             *key = togo;
789             *offset = mid;
790             return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
791         }
792         oldmid = mid;
793
794     }
795
796     *key = 0;
797     *offset = 0;
798     return INVALID_2022;
799 }
800
801 /*runs through a state machine to determine the escape sequence - codepage correspondance
802  */
803 static void
804 changeState_2022(UConverter* _this,
805                 const char** source,
806                 const char* sourceLimit,
807                 Variant2022 var,
808                 UErrorCode* err){
809     UCNV_TableStates_2022 value;
810     UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
811     uint32_t key = myData2022->key;
812     int32_t offset = 0;
813     int8_t initialToULength = _this->toULength;
814     char c;
815
816     value = VALID_NON_TERMINAL_2022;
817     while (*source < sourceLimit) {
818         c = *(*source)++;
819         _this->toUBytes[_this->toULength++]=(uint8_t)c;
820         value = getKey_2022(c,(int32_t *) &key, &offset);
821
822         switch (value){
823
824         case VALID_NON_TERMINAL_2022 :
825             /* continue with the loop */
826             break;
827
828         case VALID_TERMINAL_2022:
829             key = 0;
830             goto DONE;
831
832         case INVALID_2022:
833             goto DONE;
834
835         case VALID_MAYBE_TERMINAL_2022:
836 #ifdef U_ENABLE_GENERIC_ISO_2022
837             /* ESC ( B is ambiguous only for ISO_2022 itself */
838             if(var == ISO_2022) {
839                 /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
840                 _this->toULength = 0;
841
842                 /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
843
844                 /* continue with the loop */
845                 value = VALID_NON_TERMINAL_2022;
846                 break;
847             } else
848 #endif
849             {
850                 /* not ISO_2022 itself, finish here */
851                 value = VALID_TERMINAL_2022;
852                 key = 0;
853                 goto DONE;
854             }
855         }
856     }
857
858 DONE:
859     myData2022->key = key;
860
861     if (value == VALID_NON_TERMINAL_2022) {
862         /* indicate that the escape sequence is incomplete: key!=0 */
863         return;
864     } else if (value == INVALID_2022 ) {
865         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
866     } else /* value == VALID_TERMINAL_2022 */ {
867         switch(var){
868 #ifdef U_ENABLE_GENERIC_ISO_2022
869         case ISO_2022:
870         {
871             const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
872             if(chosenConverterName == NULL) {
873                 /* SS2 or SS3 */
874                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
875                 _this->toUCallbackReason = UCNV_UNASSIGNED;
876                 return;
877             }
878
879             _this->mode = UCNV_SI;
880             ucnv_close(myData2022->currentConverter);
881             myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
882             if(U_SUCCESS(*err)) {
883                 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
884                 _this->mode = UCNV_SO;
885             }
886             break;
887         }
888 #endif
889         case ISO_2022_JP:
890             {
891                 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
892                 switch(tempState) {
893                 case INVALID_STATE:
894                     *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
895                     break;
896                 case SS2_STATE:
897                     if(myData2022->toU2022State.cs[2]!=0) {
898                         if(myData2022->toU2022State.g<2) {
899                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
900                         }
901                         myData2022->toU2022State.g=2;
902                     } else {
903                         /* illegal to have SS2 before a matching designator */
904                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
905                     }
906                     break;
907                 /* case SS3_STATE: not used in ISO-2022-JP-x */
908 #if !UCONFIG_NO_NON_HTML5_CONVERSION
909                 case ISO8859_1:
910                 case ISO8859_7:
911                     if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
912                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
913                     } else {
914                         /* G2 charset for SS2 */
915                         myData2022->toU2022State.cs[2]=(int8_t)tempState;
916                     }
917                     break;
918 #endif
919                 default:
920                     if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
921                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
922                     } else {
923                         /* G0 charset */
924                         myData2022->toU2022State.cs[0]=(int8_t)tempState;
925                     }
926                     break;
927                 }
928             }
929             break;
930 #if !UCONFIG_NO_NON_HTML5_CONVERSION
931         case ISO_2022_CN:
932             {
933                 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
934                 switch(tempState) {
935                 case INVALID_STATE:
936                     *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
937                     break;
938                 case SS2_STATE:
939                     if(myData2022->toU2022State.cs[2]!=0) {
940                         if(myData2022->toU2022State.g<2) {
941                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
942                         }
943                         myData2022->toU2022State.g=2;
944                     } else {
945                         /* illegal to have SS2 before a matching designator */
946                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
947                     }
948                     break;
949                 case SS3_STATE:
950                     if(myData2022->toU2022State.cs[3]!=0) {
951                         if(myData2022->toU2022State.g<2) {
952                             myData2022->toU2022State.prevG=myData2022->toU2022State.g;
953                         }
954                         myData2022->toU2022State.g=3;
955                     } else {
956                         /* illegal to have SS3 before a matching designator */
957                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
958                     }
959                     break;
960                 case ISO_IR_165:
961                     if(myData2022->version==0) {
962                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
963                         break;
964                     }
965                     /*fall through*/
966                 case GB2312_1:
967                     /*fall through*/
968                 case CNS_11643_1:
969                     myData2022->toU2022State.cs[1]=(int8_t)tempState;
970                     break;
971                 case CNS_11643_2:
972                     myData2022->toU2022State.cs[2]=(int8_t)tempState;
973                     break;
974                 default:
975                     /* other CNS 11643 planes */
976                     if(myData2022->version==0) {
977                         *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
978                     } else {
979                        myData2022->toU2022State.cs[3]=(int8_t)tempState;
980                     }
981                     break;
982                 }
983             }
984             break;
985         case ISO_2022_KR:
986             if(offset==0x30){
987                 /* nothing to be done, just accept this one escape sequence */
988             } else {
989                 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
990             }
991             break;
992 #endif /* #if !UCONFIG_NO_NON_HTML5_CONVERSION */
993
994         default:
995             *err = U_ILLEGAL_ESCAPE_SEQUENCE;
996             break;
997         }
998     }
999     if(U_SUCCESS(*err)) {
1000         _this->toULength = 0;
1001     } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
1002         if(_this->toULength>1) {
1003             /*
1004              * Ticket 5691: consistent illegal sequences:
1005              * - We include at least the first byte (ESC) in the illegal sequence.
1006              * - If any of the non-initial bytes could be the start of a character,
1007              *   we stop the illegal sequence before the first one of those.
1008              *   In escape sequences, all following bytes are "printable", that is,
1009              *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
1010              *   they are valid single/lead bytes.
1011              *   For simplicity, we always only report the initial ESC byte as the
1012              *   illegal sequence and back out all other bytes we looked at.
1013              */
1014             /* Back out some bytes. */
1015             int8_t backOutDistance=_this->toULength-1;
1016             int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
1017             if(backOutDistance<=bytesFromThisBuffer) {
1018                 /* same as initialToULength<=1 */
1019                 *source-=backOutDistance;
1020             } else {
1021                 /* Back out bytes from the previous buffer: Need to replay them. */
1022                 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
1023                 /* same as -(initialToULength-1) */
1024                 /* preToULength is negative! */
1025                 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
1026                 *source-=bytesFromThisBuffer;
1027             }
1028             _this->toULength=1;
1029         }
1030     } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1031         _this->toUCallbackReason = UCNV_UNASSIGNED;
1032     }
1033 }
1034
1035 /*Checks the characters of the buffer against valid 2022 escape sequences
1036 *if the match we return a pointer to the initial start of the sequence otherwise
1037 *we return sourceLimit
1038 */
1039 /*for 2022 looks ahead in the stream
1040  *to determine the longest possible convertible
1041  *data stream
1042  */
1043 static inline const char*
1044 getEndOfBuffer_2022(const char** source,
1045                    const char* sourceLimit,
1046                    UBool /*flush*/){
1047
1048     const char* mySource = *source;
1049
1050 #ifdef U_ENABLE_GENERIC_ISO_2022
1051     if (*source >= sourceLimit)
1052         return sourceLimit;
1053
1054     do{
1055
1056         if (*mySource == ESC_2022){
1057             int8_t i;
1058             int32_t key = 0;
1059             int32_t offset;
1060             UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1061
1062             /* Kludge: I could not
1063             * figure out the reason for validating an escape sequence
1064             * twice - once here and once in changeState_2022().
1065             * is it possible to have an ESC character in a ISO2022
1066             * byte stream which is valid in a code page? Is it legal?
1067             */
1068             for (i=0;
1069             (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1070             i++) {
1071                 value =  getKey_2022(*(mySource+i), &key, &offset);
1072             }
1073             if (value > 0 || *mySource==ESC_2022)
1074                 return mySource;
1075
1076             if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1077                 return sourceLimit;
1078         }
1079     }while (++mySource < sourceLimit);
1080
1081     return sourceLimit;
1082 #else
1083     while(mySource < sourceLimit && *mySource != ESC_2022) {
1084         ++mySource;
1085     }
1086     return mySource;
1087 #endif
1088 }
1089
1090
1091 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1092  * any future change in _MBCSFromUChar32() function should be reflected here.
1093  * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1094  */
1095 static inline int32_t
1096 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1097                                          UChar32 c,
1098                                          uint32_t* value,
1099                                          UBool useFallback,
1100                                          int outputType)
1101 {
1102     const int32_t *cx;
1103     const uint16_t *table;
1104     uint32_t stage2Entry;
1105     uint32_t myValue;
1106     int32_t length;
1107     const uint8_t *p;
1108     /*
1109      * TODO(markus): Use and require new, faster MBCS conversion table structures.
1110      * Use internal version of ucnv_open() that verifies that the new structures are available,
1111      * else U_INTERNAL_PROGRAM_ERROR.
1112      */
1113     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1114     if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1115         table=sharedData->mbcs.fromUnicodeTable;
1116         stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1117         /* get the bytes and the length for the output */
1118         if(outputType==MBCS_OUTPUT_2){
1119             myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1120             if(myValue<=0xff) {
1121                 length=1;
1122             } else {
1123                 length=2;
1124             }
1125         } else /* outputType==MBCS_OUTPUT_3 */ {
1126             p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1127             myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1128             if(myValue<=0xff) {
1129                 length=1;
1130             } else if(myValue<=0xffff) {
1131                 length=2;
1132             } else {
1133                 length=3;
1134             }
1135         }
1136         /* is this code point assigned, or do we use fallbacks? */
1137         if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1138             /* assigned */
1139             *value=myValue;
1140             return length;
1141         } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1142             /*
1143              * We allow a 0 byte output if the "assigned" bit is set for this entry.
1144              * There is no way with this data structure for fallback output
1145              * to be a zero byte.
1146              */
1147             *value=myValue;
1148             return -length;
1149         }
1150     }
1151
1152     cx=sharedData->mbcs.extIndexes;
1153     if(cx!=NULL) {
1154         return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1155     }
1156
1157     /* unassigned */
1158     return 0;
1159 }
1160
1161 /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1162  * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1163  * @param retval pointer to output byte
1164  * @return 1 roundtrip byte  0 no mapping  -1 fallback byte
1165  */
1166 static inline int32_t
1167 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1168                                        UChar32 c,
1169                                        uint32_t* retval,
1170                                        UBool useFallback)
1171 {
1172     const uint16_t *table;
1173     int32_t value;
1174     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1175     if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1176         return 0;
1177     }
1178     /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1179     table=sharedData->mbcs.fromUnicodeTable;
1180     /* get the byte for the output */
1181     value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1182     /* is this code point assigned, or do we use fallbacks? */
1183     *retval=(uint32_t)(value&0xff);
1184     if(value>=0xf00) {
1185         return 1;  /* roundtrip */
1186     } else if(useFallback ? value>=0x800 : value>=0xc00) {
1187         return -1;  /* fallback taken */
1188     } else {
1189         return 0;  /* no mapping */
1190     }
1191 }
1192
1193 /*
1194  * Check that the result is a 2-byte value with each byte in the range A1..FE
1195  * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
1196  * to move it to the ISO 2022 range 21..7E.
1197  * Return 0 if out of range.
1198  */
1199 static inline uint32_t
1200 _2022FromGR94DBCS(uint32_t value) {
1201     if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
1202         (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
1203     ) {
1204         return value - 0x8080;  /* shift down to 21..7e byte range */
1205     } else {
1206         return 0;  /* not valid for ISO 2022 */
1207     }
1208 }
1209
1210 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
1211 /*
1212  * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
1213  * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
1214  * unchanged. 
1215  */
1216 static inline uint32_t
1217 _2022ToGR94DBCS(uint32_t value) {
1218     uint32_t returnValue = value + 0x8080;
1219     if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
1220         (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
1221         return returnValue;
1222     } else {
1223         return value;
1224     }
1225 }
1226 #endif
1227
1228 #ifdef U_ENABLE_GENERIC_ISO_2022
1229
1230 /**********************************************************************************
1231 *  ISO-2022 Converter
1232 *
1233 *
1234 */
1235
1236 static void
1237 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1238                                                            UErrorCode* err){
1239     const char* mySourceLimit, *realSourceLimit;
1240     const char* sourceStart;
1241     const UChar* myTargetStart;
1242     UConverter* saveThis;
1243     UConverterDataISO2022* myData;
1244     int8_t length;
1245
1246     saveThis = args->converter;
1247     myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1248
1249     realSourceLimit = args->sourceLimit;
1250     while (args->source < realSourceLimit) {
1251         if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1252             /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1253             mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1254
1255             if(args->source < mySourceLimit) {
1256                 if(myData->currentConverter==NULL) {
1257                     myData->currentConverter = ucnv_open("ASCII",err);
1258                     if(U_FAILURE(*err)){
1259                         return;
1260                     }
1261
1262                     myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1263                     saveThis->mode = UCNV_SO;
1264                 }
1265
1266                 /* convert to before the ESC or until the end of the buffer */
1267                 myData->isFirstBuffer=FALSE;
1268                 sourceStart = args->source;
1269                 myTargetStart = args->target;
1270                 args->converter = myData->currentConverter;
1271                 ucnv_toUnicode(args->converter,
1272                     &args->target,
1273                     args->targetLimit,
1274                     &args->source,
1275                     mySourceLimit,
1276                     args->offsets,
1277                     (UBool)(args->flush && mySourceLimit == realSourceLimit),
1278                     err);
1279                 args->converter = saveThis;
1280
1281                 if (*err == U_BUFFER_OVERFLOW_ERROR) {
1282                     /* move the overflow buffer */
1283                     length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1284                     myData->currentConverter->UCharErrorBufferLength = 0;
1285                     if(length > 0) {
1286                         uprv_memcpy(saveThis->UCharErrorBuffer,
1287                                     myData->currentConverter->UCharErrorBuffer,
1288                                     length*U_SIZEOF_UCHAR);
1289                     }
1290                     return;
1291                 }
1292
1293                 /*
1294                  * At least one of:
1295                  * -Error while converting
1296                  * -Done with entire buffer
1297                  * -Need to write offsets or update the current offset
1298                  *  (leave that up to the code in ucnv.c)
1299                  *
1300                  * or else we just stopped at an ESC byte and continue with changeState_2022()
1301                  */
1302                 if (U_FAILURE(*err) ||
1303                     (args->source == realSourceLimit) ||
1304                     (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
1305                     (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1306                 ) {
1307                     /* copy partial or error input for truncated detection and error handling */
1308                     if(U_FAILURE(*err)) {
1309                         length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1310                         if(length > 0) {
1311                             uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1312                         }
1313                     } else {
1314                         length = saveThis->toULength = myData->currentConverter->toULength;
1315                         if(length > 0) {
1316                             uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1317                             if(args->source < mySourceLimit) {
1318                                 *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1319                             }
1320                         }
1321                     }
1322                     return;
1323                 }
1324             }
1325         }
1326
1327         sourceStart = args->source;
1328         changeState_2022(args->converter,
1329                &(args->source),
1330                realSourceLimit,
1331                ISO_2022,
1332                err);
1333         if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
1334             /* let the ucnv.c code update its current offset */
1335             return;
1336         }
1337     }
1338 }
1339
1340 #endif
1341
1342 /*
1343  * To Unicode Callback helper function
1344  */
1345 static void
1346 toUnicodeCallback(UConverter *cnv,
1347                   const uint32_t sourceChar, const uint32_t targetUniChar,
1348                   UErrorCode* err){
1349     if(sourceChar>0xff){
1350         cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1351         cnv->toUBytes[1] = (uint8_t)sourceChar;
1352         cnv->toULength = 2;
1353     }
1354     else{
1355         cnv->toUBytes[0] =(char) sourceChar;
1356         cnv->toULength = 1;
1357     }
1358
1359     if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1360         *err = U_INVALID_CHAR_FOUND;
1361     }
1362     else{
1363         *err = U_ILLEGAL_CHAR_FOUND;
1364     }
1365 }
1366
1367 /**************************************ISO-2022-JP*************************************************/
1368
1369 /************************************** IMPORTANT **************************************************
1370 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1371 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1372 * The converter iterates over each Unicode codepoint
1373 * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1374 * processed one char at a time it would make sense to reduce the extra processing a canned converter
1375 * would do as far as possible.
1376 *
1377 * If the implementation of these macros or structure of sharedData struct change in the future, make
1378 * sure that ISO-2022 is also changed.
1379 ***************************************************************************************************
1380 */
1381
1382 /***************************************************************************************************
1383 * Rules for ISO-2022-jp encoding
1384 * (i)   Escape sequences must be fully contained within a line they should not
1385 *       span new lines or CRs
1386 * (ii)  If the last character on a line is represented by two bytes then an ASCII or
1387 *       JIS-Roman character escape sequence should follow before the line terminates
1388 * (iii) If the first character on the line is represented by two bytes then a two
1389 *       byte character escape sequence should precede it
1390 * (iv)  If no escape sequence is encountered then the characters are ASCII
1391 * (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1392 *       and invoked with SS2 (ESC N).
1393 * (vi)  If there is any G0 designation in text, there must be a switch to
1394 *       ASCII or to JIS X 0201-Roman before a space character (but not
1395 *       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1396 *       characters such as tab or CRLF.
1397 * (vi)  Supported encodings:
1398 *          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1399 *
1400 *  source : RFC-1554
1401 *
1402 *          JISX201, JISX208,JISX212 : new .cnv data files created
1403 *          KSC5601 : alias to ibm-949 mapping table
1404 *          GB2312 : alias to ibm-1386 mapping table
1405 *          ISO-8859-1 : Algorithmic implemented as LATIN1 case
1406 *          ISO-8859-7 : alisas to ibm-9409 mapping table
1407 */
1408
1409 /* preference order of JP charsets */
1410 static const StateEnum jpCharsetPref[]={
1411     ASCII,
1412     JISX201,
1413 #if !UCONFIG_NO_NON_HTML5_CONVERSION
1414     ISO8859_1,
1415     ISO8859_7,
1416 #endif
1417     JISX208,
1418 #if !UCONFIG_NO_NON_HTML5_CONVERSION
1419     JISX212,
1420     GB2312,
1421     KSC5601,
1422 #endif
1423     HWKANA_7BIT
1424 };
1425
1426 /*
1427  * The escape sequences must be in order of the enum constants like JISX201  = 3,
1428  * not in order of jpCharsetPref[]!
1429  */
1430 static const char escSeqChars[][6] ={
1431     "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
1432     "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
1433     "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
1434     "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
1435     "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
1436     "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
1437     "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
1438     "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
1439     "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */
1440
1441 };
1442 static  const int8_t escSeqCharsLen[] ={
1443     3, /* length of <ESC>(B  ASCII       */
1444     3, /* length of <ESC>.A  ISO-8859-1  */
1445     3, /* length of <ESC>.F  ISO-8859-7  */
1446     3, /* length of <ESC>(J  JISX-201    */
1447     3, /* length of <ESC>$B  JISX-208    */
1448     4, /* length of <ESC>$(D JISX-212    */
1449     3, /* length of <ESC>$A  GB2312      */
1450     4, /* length of <ESC>$(C KSC5601     */
1451     3  /* length of <ESC>(I  HWKANA_7BIT */
1452 };
1453
1454 /*
1455 * The iteration over various code pages works this way:
1456 * i)   Get the currentState from myConverterData->currentState
1457 * ii)  Check if the character is mapped to a valid character in the currentState
1458 *      Yes ->  a) set the initIterState to currentState
1459 *       b) remain in this state until an invalid character is found
1460 *      No  ->  a) go to the next code page and find the character
1461 * iii) Before changing the state increment the current state check if the current state
1462 *      is equal to the intitIteration state
1463 *      Yes ->  A character that cannot be represented in any of the supported encodings
1464 *       break and return a U_INVALID_CHARACTER error
1465 *      No  ->  Continue and find the character in next code page
1466 *
1467 *
1468 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1469 */
1470
1471 /* Map 00..7F to Unicode according to JIS X 0201. */
1472 static inline uint32_t
1473 jisx201ToU(uint32_t value) {
1474     if(value < 0x5c) {
1475         return value;
1476     } else if(value == 0x5c) {
1477         return 0xa5;
1478     } else if(value == 0x7e) {
1479         return 0x203e;
1480     } else /* value <= 0x7f */ {
1481         return value;
1482     }
1483 }
1484
1485 /* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */
1486 static inline uint32_t
1487 jisx201FromU(uint32_t value) {
1488     if(value<=0x7f) {
1489         if(value!=0x5c && value!=0x7e) {
1490             return value;
1491         }
1492     } else if(value==0xa5) {
1493         return 0x5c;
1494     } else if(value==0x203e) {
1495         return 0x7e;
1496     }
1497     return 0xfffe;
1498 }
1499
1500 /*
1501  * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
1502  * to JIS X 0208, and convert it to a pair of 21..7E bytes.
1503  * Return 0 if the byte pair is out of range.
1504  */
1505 static inline uint32_t
1506 _2022FromSJIS(uint32_t value) {
1507     uint8_t trail;
1508
1509     if(value > 0xEFFC) {
1510         return 0;  /* beyond JIS X 0208 */
1511     }
1512
1513     trail = (uint8_t)value;
1514
1515     value &= 0xff00;  /* lead byte */
1516     if(value <= 0x9f00) {
1517         value -= 0x7000;
1518     } else /* 0xe000 <= value <= 0xef00 */ {
1519         value -= 0xb000;
1520     }
1521     value <<= 1;
1522
1523     if(trail <= 0x9e) {
1524         value -= 0x100;
1525         if(trail <= 0x7e) {
1526             value |= trail - 0x1f;
1527         } else {
1528             value |= trail - 0x20;
1529         }
1530     } else /* trail <= 0xfc */ {
1531         value |= trail - 0x7e;
1532     }
1533     return value;
1534 }
1535
1536 /*
1537  * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
1538  * If either byte is outside 21..7E make sure that the result is not valid
1539  * for Shift-JIS so that the converter catches it.
1540  * Some invalid byte values already turn into equally invalid Shift-JIS
1541  * byte values and need not be tested explicitly.
1542  */
1543 static inline void
1544 _2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
1545     if(c1&1) {
1546         ++c1;
1547         if(c2 <= 0x5f) {
1548             c2 += 0x1f;
1549         } else if(c2 <= 0x7e) {
1550             c2 += 0x20;
1551         } else {
1552             c2 = 0;  /* invalid */
1553         }
1554     } else {
1555         if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
1556             c2 += 0x7e;
1557         } else {
1558             c2 = 0;  /* invalid */
1559         }
1560     }
1561     c1 >>= 1;
1562     if(c1 <= 0x2f) {
1563         c1 += 0x70;
1564     } else if(c1 <= 0x3f) {
1565         c1 += 0xb0;
1566     } else {
1567         c1 = 0;  /* invalid */
1568     }
1569     bytes[0] = (char)c1;
1570     bytes[1] = (char)c2;
1571 }
1572
1573 /*
1574  * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1575  * Katakana.
1576  * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1577  * because Shift-JIS roundtrips half-width Katakana to single bytes.
1578  * These were the only fallbacks in ICU's jisx-208.ucm file.
1579  */
1580 static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1581     0x2123,  /* U+FF61 */
1582     0x2156,
1583     0x2157,
1584     0x2122,
1585     0x2126,
1586     0x2572,
1587     0x2521,
1588     0x2523,
1589     0x2525,
1590     0x2527,
1591     0x2529,
1592     0x2563,
1593     0x2565,
1594     0x2567,
1595     0x2543,
1596     0x213C,  /* U+FF70 */
1597     0x2522,
1598     0x2524,
1599     0x2526,
1600     0x2528,
1601     0x252A,
1602     0x252B,
1603     0x252D,
1604     0x252F,
1605     0x2531,
1606     0x2533,
1607     0x2535,
1608     0x2537,
1609     0x2539,
1610     0x253B,
1611     0x253D,
1612     0x253F,  /* U+FF80 */
1613     0x2541,
1614     0x2544,
1615     0x2546,
1616     0x2548,
1617     0x254A,
1618     0x254B,
1619     0x254C,
1620     0x254D,
1621     0x254E,
1622     0x254F,
1623     0x2552,
1624     0x2555,
1625     0x2558,
1626     0x255B,
1627     0x255E,
1628     0x255F,  /* U+FF90 */
1629     0x2560,
1630     0x2561,
1631     0x2562,
1632     0x2564,
1633     0x2566,
1634     0x2568,
1635     0x2569,
1636     0x256A,
1637     0x256B,
1638     0x256C,
1639     0x256D,
1640     0x256F,
1641     0x2573,
1642     0x212B,
1643     0x212C   /* U+FF9F */
1644 };
1645
1646 static void
1647 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1648     UConverter *cnv = args->converter;
1649     UConverterDataISO2022 *converterData;
1650     ISO2022State *pFromU2022State;
1651     uint8_t *target = (uint8_t *) args->target;
1652     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1653     const UChar* source = args->source;
1654     const UChar* sourceLimit = args->sourceLimit;
1655     int32_t* offsets = args->offsets;
1656     UChar32 sourceChar;
1657     char buffer[8];
1658     int32_t len, outLen;
1659     int8_t choices[10];
1660     int32_t choiceCount;
1661     uint32_t targetValue = 0;
1662     UBool useFallback;
1663
1664     int32_t i;
1665     int8_t cs, g;
1666
1667     /* set up the state */
1668     converterData     = (UConverterDataISO2022*)cnv->extraInfo;
1669     pFromU2022State   = &converterData->fromU2022State;
1670
1671     choiceCount = 0;
1672
1673     /* check if the last codepoint of previous buffer was a lead surrogate*/
1674     if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1675         goto getTrail;
1676     }
1677
1678     while(source < sourceLimit) {
1679         if(target < targetLimit) {
1680
1681             sourceChar  = *(source++);
1682             /*check if the char is a First surrogate*/
1683             if(U16_IS_SURROGATE(sourceChar)) {
1684                 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
1685 getTrail:
1686                     /*look ahead to find the trail surrogate*/
1687                     if(source < sourceLimit) {
1688                         /* test the following code unit */
1689                         UChar trail=(UChar) *source;
1690                         if(U16_IS_TRAIL(trail)) {
1691                             source++;
1692                             sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
1693                             cnv->fromUChar32=0x00;
1694                             /* convert this supplementary code point */
1695                             /* exit this condition tree */
1696                         } else {
1697                             /* this is an unmatched lead code unit (1st surrogate) */
1698                             /* callback(illegal) */
1699                             *err=U_ILLEGAL_CHAR_FOUND;
1700                             cnv->fromUChar32=sourceChar;
1701                             break;
1702                         }
1703                     } else {
1704                         /* no more input */
1705                         cnv->fromUChar32=sourceChar;
1706                         break;
1707                     }
1708                 } else {
1709                     /* this is an unmatched trail code unit (2nd surrogate) */
1710                     /* callback(illegal) */
1711                     *err=U_ILLEGAL_CHAR_FOUND;
1712                     cnv->fromUChar32=sourceChar;
1713                     break;
1714                 }
1715             }
1716
1717             /* do not convert SO/SI/ESC */
1718             if(IS_2022_CONTROL(sourceChar)) {
1719                 /* callback(illegal) */
1720                 *err=U_ILLEGAL_CHAR_FOUND;
1721                 cnv->fromUChar32=sourceChar;
1722                 break;
1723             }
1724
1725             /* do the conversion */
1726
1727             if(choiceCount == 0) {
1728                 uint16_t csm;
1729
1730                 /*
1731                  * The csm variable keeps track of which charsets are allowed
1732                  * and not used yet while building the choices[].
1733                  */
1734                 csm = jpCharsetMasks[converterData->version];
1735                 choiceCount = 0;
1736
1737                 /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1738                 if(converterData->version == 3 || converterData->version == 4) {
1739                     choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1740                 }
1741                 /* Do not try single-byte half-width Katakana for other versions. */
1742                 csm &= ~CSM(HWKANA_7BIT);
1743
1744                 /* try the current G0 charset */
1745                 choices[choiceCount++] = cs = pFromU2022State->cs[0];
1746                 csm &= ~CSM(cs);
1747
1748                 /* try the current G2 charset */
1749                 if((cs = pFromU2022State->cs[2]) != 0) {
1750                     choices[choiceCount++] = cs;
1751                     csm &= ~CSM(cs);
1752                 }
1753
1754                 /* try all the other possible charsets */
1755                 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1756                     cs = (int8_t)jpCharsetPref[i];
1757                     if(CSM(cs) & csm) {
1758                         choices[choiceCount++] = cs;
1759                         csm &= ~CSM(cs);
1760                     }
1761                 }
1762             }
1763
1764             cs = g = 0;
1765             /*
1766              * len==0: no mapping found yet
1767              * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1768              * len>0: found a roundtrip result, done
1769              */
1770             len = 0;
1771             /*
1772              * We will turn off useFallback after finding a fallback,
1773              * but we still get fallbacks from PUA code points as usual.
1774              * Therefore, we will also need to check that we don't overwrite
1775              * an early fallback with a later one.
1776              */
1777             useFallback = cnv->useFallback;
1778
1779             for(i = 0; i < choiceCount && len <= 0; ++i) {
1780                 uint32_t value;
1781                 int32_t len2;
1782                 int8_t cs0 = choices[i];
1783                 switch(cs0) {
1784                 case ASCII:
1785                     if(sourceChar <= 0x7f) {
1786                         targetValue = (uint32_t)sourceChar;
1787                         len = 1;
1788                         cs = cs0;
1789                         g = 0;
1790                     }
1791                     break;
1792 #if !UCONFIG_NO_NON_HTML5_CONVERSION
1793                 case ISO8859_1:
1794                     if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1795                         targetValue = (uint32_t)sourceChar - 0x80;
1796                         len = 1;
1797                         cs = cs0;
1798                         g = 2;
1799                     }
1800                     break;
1801 #endif
1802                 case HWKANA_7BIT:
1803                     if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1804                         if(converterData->version==3) {
1805                             /* JIS7: use G1 (SO) */
1806                             /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1807                             targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1808                             len = 1;
1809                             pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1810                             g = 1;
1811                         } else if(converterData->version==4) {
1812                             /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1813                             /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1814                             targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1815                             len = 1;
1816
1817                             cs = pFromU2022State->cs[0];
1818                             if(IS_JP_DBCS(cs)) {
1819                                 /* switch from a DBCS charset to JISX201 */
1820                                 cs = (int8_t)JISX201;
1821                             }
1822                             /* else stay in the current G0 charset */
1823                             g = 0;
1824                         }
1825                         /* else do not use HWKANA_7BIT with other versions */
1826                     }
1827                     break;
1828                 case JISX201:
1829                     /* G0 SBCS */
1830                     value = jisx201FromU(sourceChar);
1831                     if(value <= 0x7f) {
1832                         targetValue = value;
1833                         len = 1;
1834                         cs = cs0;
1835                         g = 0;
1836                         useFallback = FALSE;
1837                     }
1838                     break;
1839                 case JISX208:
1840                     /* G0 DBCS from Shift-JIS table */
1841                     len2 = MBCS_FROM_UCHAR32_ISO2022(
1842                                 converterData->myConverterArray[cs0],
1843                                 sourceChar, &value,
1844                                 useFallback, MBCS_OUTPUT_2);
1845                     if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1846                         value = _2022FromSJIS(value);
1847                         if(value != 0) {
1848                             targetValue = value;
1849                             len = len2;
1850                             cs = cs0;
1851                             g = 0;
1852                             useFallback = FALSE;
1853                         }
1854                     } else if(len == 0 && useFallback &&
1855                               (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1856                         targetValue = hwkana_fb[sourceChar - HWKANA_START];
1857                         len = -2;
1858                         cs = cs0;
1859                         g = 0;
1860                         useFallback = FALSE;
1861                     }
1862                     break;
1863 #if !UCONFIG_NO_NON_HTML5_CONVERSION
1864                 case ISO8859_7:
1865                     /* G0 SBCS forced to 7-bit output */
1866                     len2 = MBCS_SINGLE_FROM_UCHAR32(
1867                                 converterData->myConverterArray[cs0],
1868                                 sourceChar, &value,
1869                                 useFallback);
1870                     if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1871                         targetValue = value - 0x80;
1872                         len = len2;
1873                         cs = cs0;
1874                         g = 2;
1875                         useFallback = FALSE;
1876                     }
1877                     break;
1878 #endif
1879                 default:
1880                     /* G0 DBCS */
1881                     len2 = MBCS_FROM_UCHAR32_ISO2022(
1882                                 converterData->myConverterArray[cs0],
1883                                 sourceChar, &value,
1884                                 useFallback, MBCS_OUTPUT_2);
1885                     if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1886 #if !UCONFIG_NO_NON_HTML5_CONVERSION
1887                         if(cs0 == KSC5601) {
1888                             /*
1889                              * Check for valid bytes for the encoding scheme.
1890                              * This is necessary because the sub-converter (windows-949)
1891                              * has a broader encoding scheme than is valid for 2022.
1892                              */
1893                             value = _2022FromGR94DBCS(value);
1894                             if(value == 0) {
1895                                 break;
1896                             }
1897                         }
1898 #endif
1899                         targetValue = value;
1900                         len = len2;
1901                         cs = cs0;
1902                         g = 0;
1903                         useFallback = FALSE;
1904                     }
1905                     break;
1906                 }
1907             }
1908
1909             if(len != 0) {
1910                 if(len < 0) {
1911                     len = -len;  /* fallback */
1912                 }
1913                 outLen = 0; /* count output bytes */
1914
1915                 /* write SI if necessary (only for JIS7) */
1916                 if(pFromU2022State->g == 1 && g == 0) {
1917                     buffer[outLen++] = UCNV_SI;
1918                     pFromU2022State->g = 0;
1919                 }
1920
1921                 /* write the designation sequence if necessary */
1922                 if(cs != pFromU2022State->cs[g]) {
1923                     int32_t escLen = escSeqCharsLen[cs];
1924                     uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1925                     outLen += escLen;
1926                     pFromU2022State->cs[g] = cs;
1927
1928                     /* invalidate the choices[] */
1929                     choiceCount = 0;
1930                 }
1931
1932                 /* write the shift sequence if necessary */
1933                 if(g != pFromU2022State->g) {
1934                     switch(g) {
1935                     /* case 0 handled before writing escapes */
1936                     case 1:
1937                         buffer[outLen++] = UCNV_SO;
1938                         pFromU2022State->g = 1;
1939                         break;
1940                     default: /* case 2 */
1941                         buffer[outLen++] = 0x1b;
1942                         buffer[outLen++] = 0x4e;
1943                         break;
1944                     /* no case 3: no SS3 in ISO-2022-JP-x */
1945                     }
1946                 }
1947
1948                 /* write the output bytes */
1949                 if(len == 1) {
1950                     buffer[outLen++] = (char)targetValue;
1951                 } else /* len == 2 */ {
1952                     buffer[outLen++] = (char)(targetValue >> 8);
1953                     buffer[outLen++] = (char)targetValue;
1954                 }
1955             } else {
1956                 /*
1957                  * if we cannot find the character after checking all codepages
1958                  * then this is an error
1959                  */
1960                 *err = U_INVALID_CHAR_FOUND;
1961                 cnv->fromUChar32=sourceChar;
1962                 break;
1963             }
1964
1965             if(sourceChar == CR || sourceChar == LF) {
1966                 /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1967                 pFromU2022State->cs[2] = 0;
1968                 choiceCount = 0;
1969             }
1970
1971             /* output outLen>0 bytes in buffer[] */
1972             if(outLen == 1) {
1973                 *target++ = buffer[0];
1974                 if(offsets) {
1975                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1976                 }
1977             } else if(outLen == 2 && (target + 2) <= targetLimit) {
1978                 *target++ = buffer[0];
1979                 *target++ = buffer[1];
1980                 if(offsets) {
1981                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1982                     *offsets++ = sourceIndex;
1983                     *offsets++ = sourceIndex;
1984                 }
1985             } else {
1986                 fromUWriteUInt8(
1987                     cnv,
1988                     buffer, outLen,
1989                     &target, (const char *)targetLimit,
1990                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1991                     err);
1992                 if(U_FAILURE(*err)) {
1993                     break;
1994                 }
1995             }
1996         } /* end if(myTargetIndex<myTargetLength) */
1997         else{
1998             *err =U_BUFFER_OVERFLOW_ERROR;
1999             break;
2000         }
2001
2002     }/* end while(mySourceIndex<mySourceLength) */
2003
2004     /*
2005      * the end of the input stream and detection of truncated input
2006      * are handled by the framework, but for ISO-2022-JP conversion
2007      * we need to be in ASCII mode at the very end
2008      *
2009      * conditions:
2010      *   successful
2011      *   in SO mode or not in ASCII mode
2012      *   end of input and no truncated input
2013      */
2014     if( U_SUCCESS(*err) &&
2015         (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
2016         args->flush && source>=sourceLimit && cnv->fromUChar32==0
2017     ) {
2018         int32_t sourceIndex;
2019
2020         outLen = 0;
2021
2022         if(pFromU2022State->g != 0) {
2023             buffer[outLen++] = UCNV_SI;
2024             pFromU2022State->g = 0;
2025         }
2026
2027         if(pFromU2022State->cs[0] != ASCII) {
2028             int32_t escLen = escSeqCharsLen[ASCII];
2029             uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
2030             outLen += escLen;
2031             pFromU2022State->cs[0] = (int8_t)ASCII;
2032         }
2033
2034         /* get the source index of the last input character */
2035         /*
2036          * TODO this would be simpler and more reliable if we used a pair
2037          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2038          * so that we could simply use the prevSourceIndex here;
2039          * this code gives an incorrect result for the rare case of an unmatched
2040          * trail surrogate that is alone in the last buffer of the text stream
2041          */
2042         sourceIndex=(int32_t)(source-args->source);
2043         if(sourceIndex>0) {
2044             --sourceIndex;
2045             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2046                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2047             ) {
2048                 --sourceIndex;
2049             }
2050         } else {
2051             sourceIndex=-1;
2052         }
2053
2054         fromUWriteUInt8(
2055             cnv,
2056             buffer, outLen,
2057             &target, (const char *)targetLimit,
2058             &offsets, sourceIndex,
2059             err);
2060     }
2061
2062     /*save the state and return */
2063     args->source = source;
2064     args->target = (char*)target;
2065 }
2066
2067 /*************** to unicode *******************/
2068
2069 static void
2070 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2071                                                UErrorCode* err){
2072     char tempBuf[2];
2073     const char *mySource = (char *) args->source;
2074     UChar *myTarget = args->target;
2075     const char *mySourceLimit = args->sourceLimit;
2076     uint32_t targetUniChar = 0x0000;
2077     uint32_t mySourceChar = 0x0000;
2078     uint32_t tmpSourceChar = 0x0000;
2079     UConverterDataISO2022* myData;
2080     ISO2022State *pToU2022State;
2081     StateEnum cs;
2082
2083     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2084     pToU2022State = &myData->toU2022State;
2085
2086     if(myData->key != 0) {
2087         /* continue with a partial escape sequence */
2088         goto escape;
2089     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2090         /* continue with a partial double-byte character */
2091         mySourceChar = args->converter->toUBytes[0];
2092         args->converter->toULength = 0;
2093         cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2094         targetUniChar = missingCharMarker;
2095         goto getTrailByte;
2096     }
2097
2098     while(mySource < mySourceLimit){
2099
2100         targetUniChar =missingCharMarker;
2101
2102         if(myTarget < args->targetLimit){
2103
2104             mySourceChar= (unsigned char) *mySource++;
2105
2106             switch(mySourceChar) {
2107             case UCNV_SI:
2108                 if(myData->version==3) {
2109                     pToU2022State->g=0;
2110                     continue;
2111                 } else {
2112                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2113                     myData->isEmptySegment = FALSE;     /* reset this, we have a different error */
2114                     break;
2115                 }
2116
2117             case UCNV_SO:
2118                 if(myData->version==3) {
2119                     /* JIS7: switch to G1 half-width Katakana */
2120                     pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2121                     pToU2022State->g=1;
2122                     continue;
2123                 } else {
2124                     /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2125                     myData->isEmptySegment = FALSE;     /* reset this, we have a different error */
2126                     break;
2127                 }
2128
2129             case ESC_2022:
2130                 mySource--;
2131 escape:
2132                 {
2133                     const char * mySourceBefore = mySource;
2134                     int8_t toULengthBefore = args->converter->toULength;
2135
2136                     changeState_2022(args->converter,&(mySource),
2137                         mySourceLimit, ISO_2022_JP,err);
2138
2139                     /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
2140                     if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2141                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2142                         args->converter->toUCallbackReason = UCNV_IRREGULAR;
2143                         args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2144                     }
2145                 }
2146
2147                 /* invalid or illegal escape sequence */
2148                 if(U_FAILURE(*err)){
2149                     args->target = myTarget;
2150                     args->source = mySource;
2151                     myData->isEmptySegment = FALSE;     /* Reset to avoid future spurious errors */
2152                     return;
2153                 }
2154                 /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2155                 if(myData->key==0) {
2156                     myData->isEmptySegment = TRUE;
2157                 }
2158                 continue;
2159
2160             /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2161
2162             case CR:
2163                 /*falls through*/
2164             case LF:
2165                 /* automatically reset to single-byte mode */
2166                 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2167                     pToU2022State->cs[0] = (int8_t)ASCII;
2168                 }
2169                 pToU2022State->cs[2] = 0;
2170                 pToU2022State->g = 0;
2171                 /* falls through */
2172             default:
2173                 /* convert one or two bytes */
2174                 myData->isEmptySegment = FALSE;
2175                 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2176                 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2177                     !IS_JP_DBCS(cs)
2178                 ) {
2179                     /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2180                     targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2181
2182                     /* return from a single-shift state to the previous one */
2183                     if(pToU2022State->g >= 2) {
2184                         pToU2022State->g=pToU2022State->prevG;
2185                     }
2186                 } else switch(cs) {
2187                 case ASCII:
2188                     if(mySourceChar <= 0x7f) {
2189                         targetUniChar = mySourceChar;
2190                     }
2191                     break;
2192 #if !UCONFIG_NO_NON_HTML5_CONVERSION
2193                 case ISO8859_1:
2194                     if(mySourceChar <= 0x7f) {
2195                         targetUniChar = mySourceChar + 0x80;
2196                     }
2197                     /* return from a single-shift state to the previous one */
2198                     pToU2022State->g=pToU2022State->prevG;
2199                     break;
2200                 case ISO8859_7:
2201                     if(mySourceChar <= 0x7f) {
2202                         /* convert mySourceChar+0x80 to use a normal 8-bit table */
2203                         targetUniChar =
2204                             _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2205                                 myData->myConverterArray[cs],
2206                                 mySourceChar + 0x80);
2207                     }
2208                     /* return from a single-shift state to the previous one */
2209                     pToU2022State->g=pToU2022State->prevG;
2210                     break;
2211 #endif
2212                 case JISX201:
2213                     if(mySourceChar <= 0x7f) {
2214                         targetUniChar = jisx201ToU(mySourceChar);
2215                     }
2216                     break;
2217                 case HWKANA_7BIT:
2218                     if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2219                         /* 7-bit halfwidth Katakana */
2220                         targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2221                     }
2222                     break;
2223                 default:
2224                     /* G0 DBCS */
2225                     if(mySource < mySourceLimit) {
2226                         int leadIsOk, trailIsOk;
2227                         uint8_t trailByte;
2228 getTrailByte:
2229                         trailByte = (uint8_t)*mySource;
2230                         /*
2231                          * Ticket 5691: consistent illegal sequences:
2232                          * - We include at least the first byte in the illegal sequence.
2233                          * - If any of the non-initial bytes could be the start of a character,
2234                          *   we stop the illegal sequence before the first one of those.
2235                          *
2236                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2237                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2238                          * Otherwise we convert or report the pair of bytes.
2239                          */
2240                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2241                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2242                         if (leadIsOk && trailIsOk) {
2243                             ++mySource;
2244                             tmpSourceChar = (mySourceChar << 8) | trailByte;
2245                             if(cs == JISX208) {
2246                                 _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2247                                 mySourceChar = tmpSourceChar;
2248                             } else {
2249                                 /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2250                                 mySourceChar = tmpSourceChar;
2251 #if !UCONFIG_NO_NON_HTML5_CONVERSION
2252                                 if (cs == KSC5601) {
2253                                     tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
2254                                 }
2255 #endif
2256                                 tempBuf[0] = (char)(tmpSourceChar >> 8);
2257                                 tempBuf[1] = (char)(tmpSourceChar);
2258                             }
2259                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2260                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2261                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2262                             ++mySource;
2263                             /* add another bit so that the code below writes 2 bytes in case of error */
2264                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2265                         }
2266                     } else {
2267                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2268                         args->converter->toULength = 1;
2269                         goto endloop;
2270                     }
2271                 }  /* End of inner switch */
2272                 break;
2273             }  /* End of outer switch */
2274             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2275                 if(args->offsets){
2276                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2277                 }
2278                 *(myTarget++)=(UChar)targetUniChar;
2279             }
2280             else if(targetUniChar > missingCharMarker){
2281                 /* disassemble the surrogate pair and write to output*/
2282                 targetUniChar-=0x0010000;
2283                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2284                 if(args->offsets){
2285                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2286                 }
2287                 ++myTarget;
2288                 if(myTarget< args->targetLimit){
2289                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2290                     if(args->offsets){
2291                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2292                     }
2293                     ++myTarget;
2294                 }else{
2295                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2296                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2297                 }
2298
2299             }
2300             else{
2301                 /* Call the callback function*/
2302                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2303                 break;
2304             }
2305         }
2306         else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
2307             *err =U_BUFFER_OVERFLOW_ERROR;
2308             break;
2309         }
2310     }
2311 endloop:
2312     args->target = myTarget;
2313     args->source = mySource;
2314 }
2315
2316
2317 #if !UCONFIG_NO_NON_HTML5_CONVERSION
2318 /***************************************************************
2319 *   Rules for ISO-2022-KR encoding
2320 *   i) The KSC5601 designator sequence should appear only once in a file,
2321 *      at the begining of a line before any KSC5601 characters. This usually
2322 *      means that it appears by itself on the first line of the file
2323 *  ii) There are only 2 shifting sequences SO to shift into double byte mode
2324 *      and SI to shift into single byte mode
2325 */
2326 static void
2327 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2328
2329     UConverter* saveConv = args->converter;
2330     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2331     args->converter=myConverterData->currentConverter;
2332
2333     myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2334     ucnv_MBCSFromUnicodeWithOffsets(args,err);
2335     saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2336
2337     if(*err == U_BUFFER_OVERFLOW_ERROR) {
2338         if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2339             uprv_memcpy(
2340                 saveConv->charErrorBuffer,
2341                 myConverterData->currentConverter->charErrorBuffer,
2342                 myConverterData->currentConverter->charErrorBufferLength);
2343         }
2344         saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2345         myConverterData->currentConverter->charErrorBufferLength = 0;
2346     }
2347     args->converter=saveConv;
2348 }
2349
2350 static void
2351 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2352
2353     const UChar *source = args->source;
2354     const UChar *sourceLimit = args->sourceLimit;
2355     unsigned char *target = (unsigned char *) args->target;
2356     unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2357     int32_t* offsets = args->offsets;
2358     uint32_t targetByteUnit = 0x0000;
2359     UChar32 sourceChar = 0x0000;
2360     UBool isTargetByteDBCS;
2361     UBool oldIsTargetByteDBCS;
2362     UConverterDataISO2022 *converterData;
2363     UConverterSharedData* sharedData;
2364     UBool useFallback;
2365     int32_t length =0;
2366
2367     converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2368     /* if the version is 1 then the user is requesting
2369      * conversion with ibm-25546 pass the arguments to
2370      * MBCS converter and return
2371      */
2372     if(converterData->version==1){
2373         UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2374         return;
2375     }
2376
2377     /* initialize data */
2378     sharedData = converterData->currentConverter->sharedData;
2379     useFallback = args->converter->useFallback;
2380     isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2381     oldIsTargetByteDBCS = isTargetByteDBCS;
2382
2383     isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
2384     if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2385         goto getTrail;
2386     }
2387     while(source < sourceLimit){
2388
2389         targetByteUnit = missingCharMarker;
2390
2391         if(target < (unsigned char*) args->targetLimit){
2392             sourceChar = *source++;
2393
2394             /* do not convert SO/SI/ESC */
2395             if(IS_2022_CONTROL(sourceChar)) {
2396                 /* callback(illegal) */
2397                 *err=U_ILLEGAL_CHAR_FOUND;
2398                 args->converter->fromUChar32=sourceChar;
2399                 break;
2400             }
2401
2402             length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2403             if(length < 0) {
2404                 length = -length;  /* fallback */
2405             }
2406             /* only DBCS or SBCS characters are expected*/
2407             /* DB characters with high bit set to 1 are expected */
2408             if( length > 2 || length==0 ||
2409                 (length == 1 && targetByteUnit > 0x7f) ||
2410                 (length == 2 &&
2411                     ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2412                     (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2413             ) {
2414                 targetByteUnit=missingCharMarker;
2415             }
2416             if (targetByteUnit != missingCharMarker){
2417
2418                 oldIsTargetByteDBCS = isTargetByteDBCS;
2419                 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2420                   /* append the shift sequence */
2421                 if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2422
2423                     if (isTargetByteDBCS)
2424                         *target++ = UCNV_SO;
2425                     else
2426                         *target++ = UCNV_SI;
2427                     if(offsets)
2428                         *(offsets++) = (int32_t)(source - args->source-1);
2429                 }
2430                 /* write the targetUniChar  to target */
2431                 if(targetByteUnit <= 0x00FF){
2432                     if( target < targetLimit){
2433                         *(target++) = (unsigned char) targetByteUnit;
2434                         if(offsets){
2435                             *(offsets++) = (int32_t)(source - args->source-1);
2436                         }
2437
2438                     }else{
2439                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2440                         *err = U_BUFFER_OVERFLOW_ERROR;
2441                     }
2442                 }else{
2443                     if(target < targetLimit){
2444                         *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2445                         if(offsets){
2446                             *(offsets++) = (int32_t)(source - args->source-1);
2447                         }
2448                         if(target < targetLimit){
2449                             *(target++) =(unsigned char) (targetByteUnit -0x80);
2450                             if(offsets){
2451                                 *(offsets++) = (int32_t)(source - args->source-1);
2452                             }
2453                         }else{
2454                             args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2455                             *err = U_BUFFER_OVERFLOW_ERROR;
2456                         }
2457                     }else{
2458                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2459                         args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2460                         *err = U_BUFFER_OVERFLOW_ERROR;
2461                     }
2462                 }
2463
2464             }
2465             else{
2466                 /* oops.. the code point is unassingned
2467                  * set the error and reason
2468                  */
2469
2470                 /*check if the char is a First surrogate*/
2471                 if(U16_IS_SURROGATE(sourceChar)) {
2472                     if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2473 getTrail:
2474                         /*look ahead to find the trail surrogate*/
2475                         if(source <  sourceLimit) {
2476                             /* test the following code unit */
2477                             UChar trail=(UChar) *source;
2478                             if(U16_IS_TRAIL(trail)) {
2479                                 source++;
2480                                 sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2481                                 *err = U_INVALID_CHAR_FOUND;
2482                                 /* convert this surrogate code point */
2483                                 /* exit this condition tree */
2484                             } else {
2485                                 /* this is an unmatched lead code unit (1st surrogate) */
2486                                 /* callback(illegal) */
2487                                 *err=U_ILLEGAL_CHAR_FOUND;
2488                             }
2489                         } else {
2490                             /* no more input */
2491                             *err = U_ZERO_ERROR;
2492                         }
2493                     } else {
2494                         /* this is an unmatched trail code unit (2nd surrogate) */
2495                         /* callback(illegal) */
2496                         *err=U_ILLEGAL_CHAR_FOUND;
2497                     }
2498                 } else {
2499                     /* callback(unassigned) for a BMP code point */
2500                     *err = U_INVALID_CHAR_FOUND;
2501                 }
2502
2503                 args->converter->fromUChar32=sourceChar;
2504                 break;
2505             }
2506         } /* end if(myTargetIndex<myTargetLength) */
2507         else{
2508             *err =U_BUFFER_OVERFLOW_ERROR;
2509             break;
2510         }
2511
2512     }/* end while(mySourceIndex<mySourceLength) */
2513
2514     /*
2515      * the end of the input stream and detection of truncated input
2516      * are handled by the framework, but for ISO-2022-KR conversion
2517      * we need to be in ASCII mode at the very end
2518      *
2519      * conditions:
2520      *   successful
2521      *   not in ASCII mode
2522      *   end of input and no truncated input
2523      */
2524     if( U_SUCCESS(*err) &&
2525         isTargetByteDBCS &&
2526         args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2527     ) {
2528         int32_t sourceIndex;
2529
2530         /* we are switching to ASCII */
2531         isTargetByteDBCS=FALSE;
2532
2533         /* get the source index of the last input character */
2534         /*
2535          * TODO this would be simpler and more reliable if we used a pair
2536          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2537          * so that we could simply use the prevSourceIndex here;
2538          * this code gives an incorrect result for the rare case of an unmatched
2539          * trail surrogate that is alone in the last buffer of the text stream
2540          */
2541         sourceIndex=(int32_t)(source-args->source);
2542         if(sourceIndex>0) {
2543             --sourceIndex;
2544             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2545                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2546             ) {
2547                 --sourceIndex;
2548             }
2549         } else {
2550             sourceIndex=-1;
2551         }
2552
2553         fromUWriteUInt8(
2554             args->converter,
2555             SHIFT_IN_STR, 1,
2556             &target, (const char *)targetLimit,
2557             &offsets, sourceIndex,
2558             err);
2559     }
2560
2561     /*save the state and return */
2562     args->source = source;
2563     args->target = (char*)target;
2564     args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2565 }
2566
2567 /************************ To Unicode ***************************************/
2568
2569 static void
2570 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2571                                                             UErrorCode* err){
2572     char const* sourceStart;
2573     UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2574
2575     UConverterToUnicodeArgs subArgs;
2576     int32_t minArgsSize;
2577
2578     /* set up the subconverter arguments */
2579     if(args->size<sizeof(UConverterToUnicodeArgs)) {
2580         minArgsSize = args->size;
2581     } else {
2582         minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2583     }
2584
2585     uprv_memcpy(&subArgs, args, minArgsSize);
2586     subArgs.size = (uint16_t)minArgsSize;
2587     subArgs.converter = myData->currentConverter;
2588
2589     /* remember the original start of the input for offsets */
2590     sourceStart = args->source;
2591
2592     if(myData->key != 0) {
2593         /* continue with a partial escape sequence */
2594         goto escape;
2595     }
2596
2597     while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2598         /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2599         subArgs.source = args->source;
2600         subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2601         if(subArgs.source != subArgs.sourceLimit) {
2602             /*
2603              * get the current partial byte sequence
2604              *
2605              * it needs to be moved between the public and the subconverter
2606              * so that the conversion framework, which only sees the public
2607              * converter, can handle truncated and illegal input etc.
2608              */
2609             if(args->converter->toULength > 0) {
2610                 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2611             }
2612             subArgs.converter->toULength = args->converter->toULength;
2613
2614             /*
2615              * Convert up to the end of the input, or to before the next escape character.
2616              * Does not handle conversion extensions because the preToU[] state etc.
2617              * is not copied.
2618              */
2619             ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2620
2621             if(args->offsets != NULL && sourceStart != args->source) {
2622                 /* update offsets to base them on the actual start of the input */
2623                 int32_t *offsets = args->offsets;
2624                 UChar *target = args->target;
2625                 int32_t delta = (int32_t)(args->source - sourceStart);
2626                 while(target < subArgs.target) {
2627                     if(*offsets >= 0) {
2628                         *offsets += delta;
2629                     }
2630                     ++offsets;
2631                     ++target;
2632                 }
2633             }
2634             args->source = subArgs.source;
2635             args->target = subArgs.target;
2636             args->offsets = subArgs.offsets;
2637
2638             /* copy input/error/overflow buffers */
2639             if(subArgs.converter->toULength > 0) {
2640                 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2641             }
2642             args->converter->toULength = subArgs.converter->toULength;
2643
2644             if(*err == U_BUFFER_OVERFLOW_ERROR) {
2645                 if(subArgs.converter->UCharErrorBufferLength > 0) {
2646                     uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2647                                 subArgs.converter->UCharErrorBufferLength);
2648                 }
2649                 args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2650                 subArgs.converter->UCharErrorBufferLength = 0;
2651             }
2652         }
2653
2654         if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2655             return;
2656         }
2657
2658 escape:
2659         changeState_2022(args->converter,
2660                &(args->source),
2661                args->sourceLimit,
2662                ISO_2022_KR,
2663                err);
2664     }
2665 }
2666
2667 static void
2668 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2669                                                             UErrorCode* err){
2670     char tempBuf[2];
2671     const char *mySource = ( char *) args->source;
2672     UChar *myTarget = args->target;
2673     const char *mySourceLimit = args->sourceLimit;
2674     UChar32 targetUniChar = 0x0000;
2675     UChar mySourceChar = 0x0000;
2676     UConverterDataISO2022* myData;
2677     UConverterSharedData* sharedData ;
2678     UBool useFallback;
2679
2680     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2681     if(myData->version==1){
2682         UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2683         return;
2684     }
2685
2686     /* initialize state */
2687     sharedData = myData->currentConverter->sharedData;
2688     useFallback = args->converter->useFallback;
2689
2690     if(myData->key != 0) {
2691         /* continue with a partial escape sequence */
2692         goto escape;
2693     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2694         /* continue with a partial double-byte character */
2695         mySourceChar = args->converter->toUBytes[0];
2696         args->converter->toULength = 0;
2697         goto getTrailByte;
2698     }
2699
2700     while(mySource< mySourceLimit){
2701
2702         if(myTarget < args->targetLimit){
2703
2704             mySourceChar= (unsigned char) *mySource++;
2705
2706             if(mySourceChar==UCNV_SI){
2707                 myData->toU2022State.g = 0;
2708                 if (myData->isEmptySegment) {
2709                     myData->isEmptySegment = FALSE;     /* we are handling it, reset to avoid future spurious errors */
2710                     *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2711                     args->converter->toUCallbackReason = UCNV_IRREGULAR;
2712                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2713                     args->converter->toULength = 1;
2714                     args->target = myTarget;
2715                     args->source = mySource;
2716                     return;
2717                 }
2718                 /*consume the source */
2719                 continue;
2720             }else if(mySourceChar==UCNV_SO){
2721                 myData->toU2022State.g = 1;
2722                 myData->isEmptySegment = TRUE;  /* Begin a new segment, empty so far */
2723                 /*consume the source */
2724                 continue;
2725             }else if(mySourceChar==ESC_2022){
2726                 mySource--;
2727 escape:
2728                 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
2729                 changeState_2022(args->converter,&(mySource),
2730                                 mySourceLimit, ISO_2022_KR, err);
2731                 if(U_FAILURE(*err)){
2732                     args->target = myTarget;
2733                     args->source = mySource;
2734                     return;
2735                 }
2736                 continue;
2737             }
2738
2739             myData->isEmptySegment = FALSE;     /* Any invalid char errors will be detected separately, so just reset this */
2740             if(myData->toU2022State.g == 1) {
2741                 if(mySource < mySourceLimit) {
2742                     int leadIsOk, trailIsOk;
2743                     uint8_t trailByte;
2744 getTrailByte:
2745                     targetUniChar = missingCharMarker;
2746                     trailByte = (uint8_t)*mySource;
2747                     /*
2748                      * Ticket 5691: consistent illegal sequences:
2749                      * - We include at least the first byte in the illegal sequence.
2750                      * - If any of the non-initial bytes could be the start of a character,
2751                      *   we stop the illegal sequence before the first one of those.
2752                      *
2753                      * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2754                      * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2755                      * Otherwise we convert or report the pair of bytes.
2756                      */
2757                     leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2758                     trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2759                     if (leadIsOk && trailIsOk) {
2760                         ++mySource;
2761                         tempBuf[0] = (char)(mySourceChar + 0x80);
2762                         tempBuf[1] = (char)(trailByte + 0x80);
2763                         targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2764                         mySourceChar = (mySourceChar << 8) | trailByte;
2765                     } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2766                         /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2767                         ++mySource;
2768                         /* add another bit so that the code below writes 2 bytes in case of error */
2769                         mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2770                     }
2771                 } else {
2772                     args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2773                     args->converter->toULength = 1;
2774                     break;
2775                 }
2776             }
2777             else if(mySourceChar <= 0x7f) {
2778                 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2779             } else {
2780                 targetUniChar = 0xffff;
2781             }
2782             if(targetUniChar < 0xfffe){
2783                 if(args->offsets) {
2784                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2785                 }
2786                 *(myTarget++)=(UChar)targetUniChar;
2787             }
2788             else {
2789                 /* Call the callback function*/
2790                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2791                 break;
2792             }
2793         }
2794         else{
2795             *err =U_BUFFER_OVERFLOW_ERROR;
2796             break;
2797         }
2798     }
2799     args->target = myTarget;
2800     args->source = mySource;
2801 }
2802
2803 /*************************** END ISO2022-KR *********************************/
2804
2805 /*************************** ISO-2022-CN *********************************
2806 *
2807 * Rules for ISO-2022-CN Encoding:
2808 * i)   The designator sequence must appear once on a line before any instance
2809 *      of character set it designates.
2810 * ii)  If two lines contain characters from the same character set, both lines
2811 *      must include the designator sequence.
2812 * iii) Once the designator sequence is known, a shifting sequence has to be found
2813 *      to invoke the  shifting
2814 * iv)  All lines start in ASCII and end in ASCII.
2815 * v)   Four shifting sequences are employed for this purpose:
2816 *
2817 *      Sequcence   ASCII Eq    Charsets
2818 *      ----------  -------    ---------
2819 *      SI           <SI>        US-ASCII
2820 *      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2821 *      SS2          <ESC>N      CNS-11643-1992 Plane 2
2822 *      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
2823 *
2824 * vi)
2825 *      SOdesignator  : ESC "$" ")" finalchar_for_SO
2826 *      SS2designator : ESC "$" "*" finalchar_for_SS2
2827 *      SS3designator : ESC "$" "+" finalchar_for_SS3
2828 *
2829 *      ESC $ ) A       Indicates the bytes following SO are Chinese
2830 *       characters as defined in GB 2312-80, until
2831 *       another SOdesignation appears
2832 *
2833 *
2834 *      ESC $ ) E       Indicates the bytes following SO are as defined
2835 *       in ISO-IR-165 (for details, see section 2.1),
2836 *       until another SOdesignation appears
2837 *
2838 *      ESC $ ) G       Indicates the bytes following SO are as defined
2839 *       in CNS 11643-plane-1, until another
2840 *       SOdesignation appears
2841 *
2842 *      ESC $ * H       Indicates the two bytes immediately following
2843 *       SS2 is a Chinese character as defined in CNS
2844 *       11643-plane-2, until another SS2designation
2845 *       appears
2846 *       (Meaning <ESC>N must preceed every 2 byte
2847 *        sequence.)
2848 *
2849 *      ESC $ + I       Indicates the immediate two bytes following SS3
2850 *       is a Chinese character as defined in CNS
2851 *       11643-plane-3, until another SS3designation
2852 *       appears
2853 *       (Meaning <ESC>O must preceed every 2 byte
2854 *        sequence.)
2855 *
2856 *      ESC $ + J       Indicates the immediate two bytes following SS3
2857 *       is a Chinese character as defined in CNS
2858 *       11643-plane-4, until another SS3designation
2859 *       appears
2860 *       (In English: <ESC>O must preceed every 2 byte
2861 *        sequence.)
2862 *
2863 *      ESC $ + K       Indicates the immediate two bytes following SS3
2864 *       is a Chinese character as defined in CNS
2865 *       11643-plane-5, until another SS3designation
2866 *       appears
2867 *
2868 *      ESC $ + L       Indicates the immediate two bytes following SS3
2869 *       is a Chinese character as defined in CNS
2870 *       11643-plane-6, until another SS3designation
2871 *       appears
2872 *
2873 *      ESC $ + M       Indicates the immediate two bytes following SS3
2874 *       is a Chinese character as defined in CNS
2875 *       11643-plane-7, until another SS3designation
2876 *       appears
2877 *
2878 *       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2879 *       has its own designation information before any Chinese characters
2880 *       appear
2881 *
2882 */
2883
2884 /* The following are defined this way to make the strings truly readonly */
2885 static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";
2886 static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";
2887 static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";
2888 static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";
2889 static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";
2890 static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";
2891 static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";
2892 static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";
2893 static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";
2894
2895 /********************** ISO2022-CN Data **************************/
2896 static const char* const escSeqCharsCN[10] ={
2897         SHIFT_IN_STR,                   /* 0 ASCII */
2898         GB_2312_80_STR,                 /* 1 GB2312_1 */
2899         ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
2900         CNS_11643_1992_Plane_1_STR,
2901         CNS_11643_1992_Plane_2_STR,
2902         CNS_11643_1992_Plane_3_STR,
2903         CNS_11643_1992_Plane_4_STR,
2904         CNS_11643_1992_Plane_5_STR,
2905         CNS_11643_1992_Plane_6_STR,
2906         CNS_11643_1992_Plane_7_STR
2907 };
2908
2909 static void
2910 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2911     UConverter *cnv = args->converter;
2912     UConverterDataISO2022 *converterData;
2913     ISO2022State *pFromU2022State;
2914     uint8_t *target = (uint8_t *) args->target;
2915     const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2916     const UChar* source = args->source;
2917     const UChar* sourceLimit = args->sourceLimit;
2918     int32_t* offsets = args->offsets;
2919     UChar32 sourceChar;
2920     char buffer[8];
2921     int32_t len;
2922     int8_t choices[3];
2923     int32_t choiceCount;
2924     uint32_t targetValue = 0;
2925     UBool useFallback;
2926
2927     /* set up the state */
2928     converterData     = (UConverterDataISO2022*)cnv->extraInfo;
2929     pFromU2022State   = &converterData->fromU2022State;
2930
2931     choiceCount = 0;
2932
2933     /* check if the last codepoint of previous buffer was a lead surrogate*/
2934     if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2935         goto getTrail;
2936     }
2937
2938     while( source < sourceLimit){
2939         if(target < targetLimit){
2940
2941             sourceChar  = *(source++);
2942             /*check if the char is a First surrogate*/
2943              if(U16_IS_SURROGATE(sourceChar)) {
2944                 if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2945 getTrail:
2946                     /*look ahead to find the trail surrogate*/
2947                     if(source < sourceLimit) {
2948                         /* test the following code unit */
2949                         UChar trail=(UChar) *source;
2950                         if(U16_IS_TRAIL(trail)) {
2951                             source++;
2952                             sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2953                             cnv->fromUChar32=0x00;
2954                             /* convert this supplementary code point */
2955                             /* exit this condition tree */
2956                         } else {
2957                             /* this is an unmatched lead code unit (1st surrogate) */
2958                             /* callback(illegal) */
2959                             *err=U_ILLEGAL_CHAR_FOUND;
2960                             cnv->fromUChar32=sourceChar;
2961                             break;
2962                         }
2963                     } else {
2964                         /* no more input */
2965                         cnv->fromUChar32=sourceChar;
2966                         break;
2967                     }
2968                 } else {
2969                     /* this is an unmatched trail code unit (2nd surrogate) */
2970                     /* callback(illegal) */
2971                     *err=U_ILLEGAL_CHAR_FOUND;
2972                     cnv->fromUChar32=sourceChar;
2973                     break;
2974                 }
2975             }
2976
2977             /* do the conversion */
2978             if(sourceChar <= 0x007f ){
2979                 /* do not convert SO/SI/ESC */
2980                 if(IS_2022_CONTROL(sourceChar)) {
2981                     /* callback(illegal) */
2982                     *err=U_ILLEGAL_CHAR_FOUND;
2983                     cnv->fromUChar32=sourceChar;
2984                     break;
2985                 }
2986
2987                 /* US-ASCII */
2988                 if(pFromU2022State->g == 0) {
2989                     buffer[0] = (char)sourceChar;
2990                     len = 1;
2991                 } else {
2992                     buffer[0] = UCNV_SI;
2993                     buffer[1] = (char)sourceChar;
2994                     len = 2;
2995                     pFromU2022State->g = 0;
2996                     choiceCount = 0;
2997                 }
2998                 if(sourceChar == CR || sourceChar == LF) {
2999                     /* reset the state at the end of a line */
3000                     uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
3001                     choiceCount = 0;
3002                 }
3003             }
3004             else{
3005                 /* convert U+0080..U+10ffff */
3006                 int32_t i;
3007                 int8_t cs, g;
3008
3009                 if(choiceCount == 0) {
3010                     /* try the current SO/G1 converter first */
3011                     choices[0] = pFromU2022State->cs[1];
3012
3013                     /* default to GB2312_1 if none is designated yet */
3014                     if(choices[0] == 0) {
3015                         choices[0] = GB2312_1;
3016                     }
3017
3018                     if(converterData->version == 0) {
3019                         /* ISO-2022-CN */
3020
3021                         /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
3022                         if(choices[0] == GB2312_1) {
3023                             choices[1] = (int8_t)CNS_11643_1;
3024                         } else {
3025                             choices[1] = (int8_t)GB2312_1;
3026                         }
3027
3028                         choiceCount = 2;
3029                     } else if (converterData->version == 1) {
3030                         /* ISO-2022-CN-EXT */
3031
3032                         /* try one of the other converters */
3033                         switch(choices[0]) {
3034                         case GB2312_1:
3035                             choices[1] = (int8_t)CNS_11643_1;
3036                             choices[2] = (int8_t)ISO_IR_165;
3037                             break;
3038                         case ISO_IR_165:
3039                             choices[1] = (int8_t)GB2312_1;
3040                             choices[2] = (int8_t)CNS_11643_1;
3041                             break;
3042                         default: /* CNS_11643_x */
3043                             choices[1] = (int8_t)GB2312_1;
3044                             choices[2] = (int8_t)ISO_IR_165;
3045                             break;
3046                         }
3047
3048                         choiceCount = 3;
3049                     } else {
3050                         choices[0] = (int8_t)CNS_11643_1;
3051                         choices[1] = (int8_t)GB2312_1;
3052                     }
3053                 }
3054
3055                 cs = g = 0;
3056                 /*
3057                  * len==0: no mapping found yet
3058                  * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3059                  * len>0: found a roundtrip result, done
3060                  */
3061                 len = 0;
3062                 /*
3063                  * We will turn off useFallback after finding a fallback,
3064                  * but we still get fallbacks from PUA code points as usual.
3065                  * Therefore, we will also need to check that we don't overwrite
3066                  * an early fallback with a later one.
3067                  */
3068                 useFallback = cnv->useFallback;
3069
3070                 for(i = 0; i < choiceCount && len <= 0; ++i) {
3071                     int8_t cs0 = choices[i];
3072                     if(cs0 > 0) {
3073                         uint32_t value;
3074                         int32_t len2;
3075                         if(cs0 >= CNS_11643_0) {
3076                             len2 = MBCS_FROM_UCHAR32_ISO2022(
3077                                         converterData->myConverterArray[CNS_11643],
3078                                         sourceChar,
3079                                         &value,
3080                                         useFallback,
3081                                         MBCS_OUTPUT_3);
3082                             if(len2 == 3 || (len2 == -3 && len == 0)) {
3083                                 targetValue = value;
3084                                 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3085                                 if(len2 >= 0) {
3086                                     len = 2;
3087                                 } else {
3088                                     len = -2;
3089                                     useFallback = FALSE;
3090                                 }
3091                                 if(cs == CNS_11643_1) {
3092                                     g = 1;
3093                                 } else if(cs == CNS_11643_2) {
3094                                     g = 2;
3095                                 } else /* plane 3..7 */ if(converterData->version == 1) {
3096                                     g = 3;
3097                                 } else {
3098                                     /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3099                                     len = 0;
3100                                 }
3101                             }
3102                         } else {
3103                             /* GB2312_1 or ISO-IR-165 */
3104                             U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
3105                             len2 = MBCS_FROM_UCHAR32_ISO2022(
3106                                         converterData->myConverterArray[cs0],
3107                                         sourceChar,
3108                                         &value,
3109                                         useFallback,
3110                                         MBCS_OUTPUT_2);
3111                             if(len2 == 2 || (len2 == -2 && len == 0)) {
3112                                 targetValue = value;
3113                                 len = len2;
3114                                 cs = cs0;
3115                                 g = 1;
3116                                 useFallback = FALSE;
3117                             }
3118                         }
3119                     }
3120                 }
3121
3122                 if(len != 0) {
3123                     len = 0; /* count output bytes; it must have been abs(len) == 2 */
3124
3125                     /* write the designation sequence if necessary */
3126                     if(cs != pFromU2022State->cs[g]) {
3127                         if(cs < CNS_11643) {
3128                             uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3129                         } else {
3130                             U_ASSERT(cs >= CNS_11643_1);
3131                             uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3132                         }
3133                         len = 4;
3134                         pFromU2022State->cs[g] = cs;
3135                         if(g == 1) {
3136                             /* changing the SO/G1 charset invalidates the choices[] */
3137                             choiceCount = 0;
3138                         }
3139                     }
3140
3141                     /* write the shift sequence if necessary */
3142                     if(g != pFromU2022State->g) {
3143                         switch(g) {
3144                         case 1:
3145                             buffer[len++] = UCNV_SO;
3146
3147                             /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3148                             pFromU2022State->g = 1;
3149                             break;
3150                         case 2:
3151                             buffer[len++] = 0x1b;
3152                             buffer[len++] = 0x4e;
3153                             break;
3154                         default: /* case 3 */
3155                             buffer[len++] = 0x1b;
3156                             buffer[len++] = 0x4f;
3157                             break;
3158                         }
3159                     }
3160
3161                     /* write the two output bytes */
3162                     buffer[len++] = (char)(targetValue >> 8);
3163                     buffer[len++] = (char)targetValue;
3164                 } else {
3165                     /* if we cannot find the character after checking all codepages
3166                      * then this is an error
3167                      */
3168                     *err = U_INVALID_CHAR_FOUND;
3169                     cnv->fromUChar32=sourceChar;
3170                     break;
3171                 }
3172             }
3173
3174             /* output len>0 bytes in buffer[] */
3175             if(len == 1) {
3176                 *target++ = buffer[0];
3177                 if(offsets) {
3178                     *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3179                 }
3180             } else if(len == 2 && (target + 2) <= targetLimit) {
3181                 *target++ = buffer[0];
3182                 *target++ = buffer[1];
3183                 if(offsets) {
3184                     int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3185                     *offsets++ = sourceIndex;
3186                     *offsets++ = sourceIndex;
3187                 }
3188             } else {
3189                 fromUWriteUInt8(
3190                     cnv,
3191                     buffer, len,
3192                     &target, (const char *)targetLimit,
3193                     &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3194                     err);
3195                 if(U_FAILURE(*err)) {
3196                     break;
3197                 }
3198             }
3199         } /* end if(myTargetIndex<myTargetLength) */
3200         else{
3201             *err =U_BUFFER_OVERFLOW_ERROR;
3202             break;
3203         }
3204
3205     }/* end while(mySourceIndex<mySourceLength) */
3206
3207     /*
3208      * the end of the input stream and detection of truncated input
3209      * are handled by the framework, but for ISO-2022-CN conversion
3210      * we need to be in ASCII mode at the very end
3211      *
3212      * conditions:
3213      *   successful
3214      *   not in ASCII mode
3215      *   end of input and no truncated input
3216      */
3217     if( U_SUCCESS(*err) &&
3218         pFromU2022State->g!=0 &&
3219         args->flush && source>=sourceLimit && cnv->fromUChar32==0
3220     ) {
3221         int32_t sourceIndex;
3222
3223         /* we are switching to ASCII */
3224         pFromU2022State->g=0;
3225
3226         /* get the source index of the last input character */
3227         /*
3228          * TODO this would be simpler and more reliable if we used a pair
3229          * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3230          * so that we could simply use the prevSourceIndex here;
3231          * this code gives an incorrect result for the rare case of an unmatched
3232          * trail surrogate that is alone in the last buffer of the text stream
3233          */
3234         sourceIndex=(int32_t)(source-args->source);
3235         if(sourceIndex>0) {
3236             --sourceIndex;
3237             if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3238                 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3239             ) {
3240                 --sourceIndex;
3241             }
3242         } else {
3243             sourceIndex=-1;
3244         }
3245
3246         fromUWriteUInt8(
3247             cnv,
3248             SHIFT_IN_STR, 1,
3249             &target, (const char *)targetLimit,
3250             &offsets, sourceIndex,
3251             err);
3252     }
3253
3254     /*save the state and return */
3255     args->source = source;
3256     args->target = (char*)target;
3257 }
3258
3259
3260 static void
3261 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3262                                                UErrorCode* err){
3263     char tempBuf[3];
3264     const char *mySource = (char *) args->source;
3265     UChar *myTarget = args->target;
3266     const char *mySourceLimit = args->sourceLimit;
3267     uint32_t targetUniChar = 0x0000;
3268     uint32_t mySourceChar = 0x0000;
3269     UConverterDataISO2022* myData;
3270     ISO2022State *pToU2022State;
3271
3272     myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3273     pToU2022State = &myData->toU2022State;
3274
3275     if(myData->key != 0) {
3276         /* continue with a partial escape sequence */
3277         goto escape;
3278     } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3279         /* continue with a partial double-byte character */
3280         mySourceChar = args->converter->toUBytes[0];
3281         args->converter->toULength = 0;
3282         targetUniChar = missingCharMarker;
3283         goto getTrailByte;
3284     }
3285
3286     while(mySource < mySourceLimit){
3287
3288         targetUniChar =missingCharMarker;
3289
3290         if(myTarget < args->targetLimit){
3291
3292             mySourceChar= (unsigned char) *mySource++;
3293
3294             switch(mySourceChar){
3295             case UCNV_SI:
3296                 pToU2022State->g=0;
3297                 if (myData->isEmptySegment) {
3298                     myData->isEmptySegment = FALSE;     /* we are handling it, reset to avoid future spurious errors */
3299                     *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3300                     args->converter->toUCallbackReason = UCNV_IRREGULAR;
3301                     args->converter->toUBytes[0] = mySourceChar;
3302                     args->converter->toULength = 1;
3303                     args->target = myTarget;
3304                     args->source = mySource;
3305                     return;
3306                 }
3307                 continue;
3308
3309             case UCNV_SO:
3310                 if(pToU2022State->cs[1] != 0) {
3311                     pToU2022State->g=1;
3312                     myData->isEmptySegment = TRUE;      /* Begin a new segment, empty so far */
3313                     continue;
3314                 } else {
3315                     /* illegal to have SO before a matching designator */
3316                     myData->isEmptySegment = FALSE;     /* Handling a different error, reset this to avoid future spurious errs */
3317                     break;
3318                 }
3319
3320             case ESC_2022:
3321                 mySource--;
3322 escape:
3323                 {
3324                     const char * mySourceBefore = mySource;
3325                     int8_t toULengthBefore = args->converter->toULength;
3326
3327                     changeState_2022(args->converter,&(mySource),
3328                         mySourceLimit, ISO_2022_CN,err);
3329
3330                     /* After SO there must be at least one character before a designator (designator error handled separately) */
3331                     if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3332                         *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3333                         args->converter->toUCallbackReason = UCNV_IRREGULAR;
3334                         args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
3335                     }
3336                 }
3337
3338                 /* invalid or illegal escape sequence */
3339                 if(U_FAILURE(*err)){
3340                     args->target = myTarget;
3341                     args->source = mySource;
3342                     myData->isEmptySegment = FALSE;     /* Reset to avoid future spurious errors */
3343                     return;
3344                 }
3345                 continue;
3346
3347             /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3348
3349             case CR:
3350                 /*falls through*/
3351             case LF:
3352                 uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3353                 /* falls through */
3354             default:
3355                 /* convert one or two bytes */
3356                 myData->isEmptySegment = FALSE;
3357                 if(pToU2022State->g != 0) {
3358                     if(mySource < mySourceLimit) {
3359                         UConverterSharedData *cnv;
3360                         StateEnum tempState;
3361                         int32_t tempBufLen;
3362                         int leadIsOk, trailIsOk;
3363                         uint8_t trailByte;
3364 getTrailByte:
3365                         trailByte = (uint8_t)*mySource;
3366                         /*
3367                          * Ticket 5691: consistent illegal sequences:
3368                          * - We include at least the first byte in the illegal sequence.
3369                          * - If any of the non-initial bytes could be the start of a character,
3370                          *   we stop the illegal sequence before the first one of those.
3371                          *
3372                          * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3373                          * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3374                          * Otherwise we convert or report the pair of bytes.
3375                          */
3376                         leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
3377                         trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
3378                         if (leadIsOk && trailIsOk) {
3379                             ++mySource;
3380                             tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3381                             if(tempState >= CNS_11643_0) {
3382                                 cnv = myData->myConverterArray[CNS_11643];
3383                                 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
3384                                 tempBuf[1] = (char) (mySourceChar);
3385                                 tempBuf[2] = (char) trailByte;
3386                                 tempBufLen = 3;
3387
3388                             }else{
3389                                 U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
3390                                 cnv = myData->myConverterArray[tempState];
3391                                 tempBuf[0] = (char) (mySourceChar);
3392                                 tempBuf[1] = (char) trailByte;
3393                                 tempBufLen = 2;
3394                             }
3395                             targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
3396                             mySourceChar = (mySourceChar << 8) | trailByte;
3397                         } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3398                             /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3399                             ++mySource;
3400                             /* add another bit so that the code below writes 2 bytes in case of error */
3401                             mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3402                         }
3403                         if(pToU2022State->g>=2) {
3404                             /* return from a single-shift state to the previous one */
3405                             pToU2022State->g=pToU2022State->prevG;
3406                         }
3407                     } else {
3408                         args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3409                         args->converter->toULength = 1;
3410                         goto endloop;
3411                     }
3412                 }
3413                 else{
3414                     if(mySourceChar <= 0x7f) {
3415                         targetUniChar = (UChar) mySourceChar;
3416                     }
3417                 }
3418                 break;
3419             }
3420             if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3421                 if(args->offsets){
3422                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3423                 }
3424                 *(myTarget++)=(UChar)targetUniChar;
3425             }
3426             else if(targetUniChar > missingCharMarker){
3427                 /* disassemble the surrogate pair and write to output*/
3428                 targetUniChar-=0x0010000;
3429                 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
3430                 if(args->offsets){
3431                     args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3432                 }
3433                 ++myTarget;
3434                 if(myTarget< args->targetLimit){
3435                     *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3436                     if(args->offsets){
3437                         args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3438                     }
3439                     ++myTarget;
3440                 }else{
3441                     args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3442                                     (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3443                 }
3444
3445             }
3446             else{
3447                 /* Call the callback function*/
3448                 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3449                 break;
3450             }
3451         }
3452         else{
3453             *err =U_BUFFER_OVERFLOW_ERROR;
3454             break;
3455         }
3456     }
3457 endloop:
3458     args->target = myTarget;
3459     args->source = mySource;
3460 }
3461 #endif /* #if !UCONFIG_NO_NON_HTML5_CONVERSION */
3462
3463 static void
3464 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3465     UConverter *cnv = args->converter;
3466     UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3467     ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3468     char *p, *subchar;
3469     char buffer[8];
3470     int32_t length;
3471
3472     subchar=(char *)cnv->subChars;
3473     length=cnv->subCharLen; /* assume length==1 for most variants */
3474
3475     p = buffer;
3476     switch(myConverterData->locale[0]){
3477     case 'j':
3478         {
3479             int8_t cs;
3480
3481             if(pFromU2022State->g == 1) {
3482                 /* JIS7: switch from G1 to G0 */
3483                 pFromU2022State->g = 0;
3484                 *p++ = UCNV_SI;
3485             }
3486
3487             cs = pFromU2022State->cs[0];
3488             if(cs != ASCII && cs != JISX201) {
3489                 /* not in ASCII or JIS X 0201: switch to ASCII */
3490                 pFromU2022State->cs[0] = (int8_t)ASCII;
3491                 *p++ = '\x1b';
3492                 *p++ = '\x28';
3493                 *p++ = '\x42';
3494             }
3495
3496             *p++ = subchar[0];
3497             break;
3498         }
3499     case 'c':
3500         if(pFromU2022State->g != 0) {
3501             /* not in ASCII mode: switch to ASCII */
3502             pFromU2022State->g = 0;
3503             *p++ = UCNV_SI;
3504         }
3505         *p++ = subchar[0];
3506         break;
3507     case 'k':
3508         if(myConverterData->version == 0) {
3509             if(length == 1) {
3510                 if((UBool)args->converter->fromUnicodeStatus) {
3511                     /* in DBCS mode: switch to SBCS */
3512                     args->converter->fromUnicodeStatus = 0;
3513                     *p++ = UCNV_SI;
3514                 }
3515                 *p++ = subchar[0];
3516             } else /* length == 2*/ {
3517                 if(!(UBool)args->converter->fromUnicodeStatus) {
3518                     /* in SBCS mode: switch to DBCS */
3519                     args->converter->fromUnicodeStatus = 1;
3520                     *p++ = UCNV_SO;
3521                 }
3522                 *p++ = subchar[0];
3523                 *p++ = subchar[1];
3524             }
3525             break;
3526         } else {
3527             /* save the subconverter's substitution string */
3528             uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3529             int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3530
3531             /* set our substitution string into the subconverter */
3532             myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3533             myConverterData->currentConverter->subCharLen = (int8_t)length;
3534
3535             /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3536             args->converter = myConverterData->currentConverter;
3537             myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3538             ucnv_cbFromUWriteSub(args, 0, err);
3539             cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3540             args->converter = cnv;
3541
3542             /* restore the subconverter's substitution string */
3543             myConverterData->currentConverter->subChars = currentSubChars;
3544             myConverterData->currentConverter->subCharLen = currentSubCharLen;
3545
3546             if(*err == U_BUFFER_OVERFLOW_ERROR) {
3547                 if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3548                     uprv_memcpy(
3549                         cnv->charErrorBuffer,
3550                         myConverterData->currentConverter->charErrorBuffer,
3551                         myConverterData->currentConverter->charErrorBufferLength);
3552                 }
3553                 cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3554                 myConverterData->currentConverter->charErrorBufferLength = 0;
3555             }
3556             return;
3557         }
3558     default:
3559         /* not expected */
3560         break;
3561     }
3562     ucnv_cbFromUWriteBytes(args,
3563                            buffer, (int32_t)(p - buffer),
3564                            offsetIndex, err);
3565 }
3566
3567 /*
3568  * Structure for cloning an ISO 2022 converter into a single memory block.
3569  * ucnv_safeClone() of the converter will align the entire cloneStruct,
3570  * and then ucnv_safeClone() of the sub-converter may additionally align
3571  * currentConverter inside the cloneStruct, for which we need the deadSpace
3572  * after currentConverter.
3573  * This is because UAlignedMemory may be larger than the actually
3574  * necessary alignment size for the platform.
3575  * The other cloneStruct fields will not be moved around,
3576  * and are aligned properly with cloneStruct's alignment.
3577  */
3578 struct cloneStruct
3579 {
3580     UConverter cnv;
3581     UConverter currentConverter;
3582     UAlignedMemory deadSpace;
3583     UConverterDataISO2022 mydata;
3584 };
3585
3586
3587 static UConverter *
3588 _ISO_2022_SafeClone(
3589             const UConverter *cnv,
3590             void *stackBuffer,
3591             int32_t *pBufferSize,
3592             UErrorCode *status)
3593 {
3594     struct cloneStruct * localClone;
3595     UConverterDataISO2022 *cnvData;
3596     int32_t i, size;
3597
3598     if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3599         *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3600         return NULL;
3601     }
3602
3603     cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3604     localClone = (struct cloneStruct *)stackBuffer;
3605
3606     /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3607
3608     uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3609     localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3610     localClone->cnv.isExtraLocal = TRUE;
3611
3612     /* share the subconverters */
3613
3614     if(cnvData->currentConverter != NULL) {
3615         size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3616         localClone->mydata.currentConverter =
3617             ucnv_safeClone(cnvData->currentConverter,
3618                             &localClone->currentConverter,
3619                             &size, status);
3620         if(U_FAILURE(*status)) {
3621             return NULL;
3622         }
3623     }
3624
3625     for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3626         if(cnvData->myConverterArray[i] != NULL) {
3627             ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3628         }
3629     }
3630
3631     return &localClone->cnv;
3632 }
3633
3634 static void
3635 _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3636                     const USetAdder *sa,
3637                     UConverterUnicodeSet which,
3638                     UErrorCode *pErrorCode)
3639 {
3640     int32_t i;
3641     UConverterDataISO2022* cnvData;
3642
3643     if (U_FAILURE(*pErrorCode)) {
3644         return;
3645     }
3646 #ifdef U_ENABLE_GENERIC_ISO_2022
3647     if (cnv->sharedData == &_ISO2022Data) {
3648         /* We use UTF-8 in this case */
3649         sa->addRange(sa->set, 0, 0xd7FF);
3650         sa->addRange(sa->set, 0xE000, 0x10FFFF);
3651         return;
3652     }
3653 #endif
3654
3655     cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3656
3657     /* open a set and initialize it with code points that are algorithmically round-tripped */
3658     switch(cnvData->locale[0]){
3659     case 'j':
3660         /* include JIS X 0201 which is hardcoded */
3661         sa->add(sa->set, 0xa5);
3662         sa->add(sa->set, 0x203e);
3663 #if !UCONFIG_NO_NON_HTML5_CONVERSION
3664         if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3665             /* include Latin-1 for some variants of JP */
3666             sa->addRange(sa->set, 0, 0xff);
3667         } else {
3668             /* include ASCII for JP */
3669             sa->addRange(sa->set, 0, 0x7f);
3670         }
3671 #else
3672         /* include ASCII for JP */
3673         sa->addRange(sa->set, 0, 0x7f);
3674 #endif
3675         if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3676             /*
3677              * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3678              * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3679              * use half-width Katakana.
3680              * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3681              * half-width Katakana via the ESC ( I sequence.
3682              * However, we only emit (fromUnicode) half-width Katakana according to the
3683              * definition of each variant.
3684              *
3685              * When including fallbacks,
3686              * we need to include half-width Katakana Unicode code points for all JP variants because
3687              * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3688              */
3689             /* include half-width Katakana for JP */
3690             sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3691         }
3692         break;
3693 #if !UCONFIG_NO_NON_HTML5_CONVERSION
3694     case 'c':
3695     case 'z':
3696         /* include ASCII for CN */
3697         sa->addRange(sa->set, 0, 0x7f);
3698         break;
3699     case 'k':
3700         /* there is only one converter for KR, and it is not in the myConverterArray[] */
3701         cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3702                 cnvData->currentConverter, sa, which, pErrorCode);
3703         /* the loop over myConverterArray[] will simply not find another converter */
3704         break;
3705 #endif
3706     default:
3707         break;
3708     }
3709
3710 #if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3711             if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3712                 cnvData->version==0 && i==CNS_11643
3713             ) {
3714                 /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3715                 ucnv_MBCSGetUnicodeSetForBytes(
3716                         cnvData->myConverterArray[i],
3717                         sa, UCNV_ROUNDTRIP_SET,
3718                         0, 0x81, 0x82,
3719                         pErrorCode);
3720             }
3721 #endif
3722
3723     for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3724         UConverterSetFilter filter;
3725         if(cnvData->myConverterArray[i]!=NULL) {
3726             if(cnvData->locale[0]=='j' && i==JISX208) {
3727                 /*
3728                  * Only add code points that map to Shift-JIS codes
3729                  * corresponding to JIS X 0208.
3730                  */
3731                 filter=UCNV_SET_FILTER_SJIS;
3732 #if !UCONFIG_NO_NON_HTML5_CONVERSION
3733             } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3734                        cnvData->version==0 && i==CNS_11643) {
3735                 /*
3736                  * Version-specific for CN:
3737                  * CN version 0 does not map CNS planes 3..7 although
3738                  * they are all available in the CNS conversion table;
3739                  * CN version 1 (-EXT) does map them all.
3740                  * The two versions create different Unicode sets.
3741                  */
3742                 filter=UCNV_SET_FILTER_2022_CN;
3743             } else if(i==KSC5601) {
3744                 /*
3745                  * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3746                  * are broader than GR94.
3747                  */
3748                 filter=UCNV_SET_FILTER_GR94DBCS;
3749 #endif
3750             } else {
3751                 filter=UCNV_SET_FILTER_NONE;
3752             }
3753             ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3754         }
3755     }
3756
3757     /*
3758      * ISO 2022 converters must not convert SO/SI/ESC despite what
3759      * sub-converters do by themselves.
3760      * Remove these characters from the set.
3761      */
3762     sa->remove(sa->set, 0x0e);
3763     sa->remove(sa->set, 0x0f);
3764     sa->remove(sa->set, 0x1b);
3765
3766     /* ISO 2022 converters do not convert C1 controls either */
3767     sa->removeRange(sa->set, 0x80, 0x9f);
3768 }
3769
3770 static const UConverterImpl _ISO2022Impl={
3771     UCNV_ISO_2022,
3772
3773     NULL,
3774     NULL,
3775
3776     _ISO2022Open,
3777     _ISO2022Close,
3778     _ISO2022Reset,
3779
3780 #ifdef U_ENABLE_GENERIC_ISO_2022
3781     T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3782     T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3783     ucnv_fromUnicode_UTF8,
3784     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3785 #else
3786     NULL,
3787     NULL,
3788     NULL,
3789     NULL,
3790 #endif
3791     NULL,
3792
3793     NULL,
3794     _ISO2022getName,
3795     _ISO_2022_WriteSub,
3796     _ISO_2022_SafeClone,
3797     _ISO_2022_GetUnicodeSet,
3798
3799     NULL,
3800     NULL
3801 };
3802 static const UConverterStaticData _ISO2022StaticData={
3803     sizeof(UConverterStaticData),
3804     "ISO_2022",
3805     2022,
3806     UCNV_IBM,
3807     UCNV_ISO_2022,
3808     1,
3809     3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3810     { 0x1a, 0, 0, 0 },
3811     1,
3812     FALSE,
3813     FALSE,
3814     0,
3815     0,
3816     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3817 };
3818 const UConverterSharedData _ISO2022Data={
3819     sizeof(UConverterSharedData),
3820     ~((uint32_t) 0),
3821     NULL,
3822     NULL,
3823     &_ISO2022StaticData,
3824     FALSE,
3825     &_ISO2022Impl,
3826     0, UCNV_MBCS_TABLE_INITIALIZER
3827 };
3828
3829 /*************JP****************/
3830 static const UConverterImpl _ISO2022JPImpl={
3831     UCNV_ISO_2022,
3832
3833     NULL,
3834     NULL,
3835
3836     _ISO2022Open,
3837     _ISO2022Close,
3838     _ISO2022Reset,
3839
3840     UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3841     UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3842     UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3843     UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3844     NULL,
3845
3846     NULL,
3847     _ISO2022getName,
3848     _ISO_2022_WriteSub,
3849     _ISO_2022_SafeClone,
3850     _ISO_2022_GetUnicodeSet,
3851
3852     NULL,
3853     NULL
3854 };
3855 static const UConverterStaticData _ISO2022JPStaticData={
3856     sizeof(UConverterStaticData),
3857     "ISO_2022_JP",
3858     0,
3859     UCNV_IBM,
3860     UCNV_ISO_2022,
3861     1,
3862     6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3863     { 0x1a, 0, 0, 0 },
3864     1,
3865     FALSE,
3866     FALSE,
3867     0,
3868     0,
3869     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3870 };
3871
3872 namespace {
3873
3874 const UConverterSharedData _ISO2022JPData={
3875     sizeof(UConverterSharedData),
3876     ~((uint32_t) 0),
3877     NULL,
3878     NULL,
3879     &_ISO2022JPStaticData,
3880     FALSE,
3881     &_ISO2022JPImpl,
3882     0, UCNV_MBCS_TABLE_INITIALIZER
3883 };
3884
3885 }  // namespace
3886
3887 #if !UCONFIG_NO_NON_HTML5_CONVERSION
3888 /************* KR ***************/
3889 static const UConverterImpl _ISO2022KRImpl={
3890     UCNV_ISO_2022,
3891
3892     NULL,
3893     NULL,
3894
3895     _ISO2022Open,
3896     _ISO2022Close,
3897     _ISO2022Reset,
3898
3899     UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3900     UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3901     UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3902     UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3903     NULL,
3904
3905     NULL,
3906     _ISO2022getName,
3907     _ISO_2022_WriteSub,
3908     _ISO_2022_SafeClone,
3909     _ISO_2022_GetUnicodeSet,
3910
3911     NULL,
3912     NULL
3913 };
3914 static const UConverterStaticData _ISO2022KRStaticData={
3915     sizeof(UConverterStaticData),
3916     "ISO_2022_KR",
3917     0,
3918     UCNV_IBM,
3919     UCNV_ISO_2022,
3920     1,
3921     3, /* max 3 bytes per UChar: SO+DBCS */
3922     { 0x1a, 0, 0, 0 },
3923     1,
3924     FALSE,
3925     FALSE,
3926     0,
3927     0,
3928     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3929 };
3930
3931 namespace {
3932
3933 const UConverterSharedData _ISO2022KRData={
3934     sizeof(UConverterSharedData),
3935     ~((uint32_t) 0),
3936     NULL,
3937     NULL,
3938     &_ISO2022KRStaticData,
3939     FALSE,
3940     &_ISO2022KRImpl,
3941     0, UCNV_MBCS_TABLE_INITIALIZER
3942 };
3943
3944 }  // namespace
3945
3946 /*************** CN ***************/
3947 static const UConverterImpl _ISO2022CNImpl={
3948
3949     UCNV_ISO_2022,
3950
3951     NULL,
3952     NULL,
3953
3954     _ISO2022Open,
3955     _ISO2022Close,
3956     _ISO2022Reset,
3957
3958     UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3959     UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3960     UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3961     UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3962     NULL,
3963
3964     NULL,
3965     _ISO2022getName,
3966     _ISO_2022_WriteSub,
3967     _ISO_2022_SafeClone,
3968     _ISO_2022_GetUnicodeSet,
3969
3970     NULL,
3971     NULL
3972 };
3973 static const UConverterStaticData _ISO2022CNStaticData={
3974     sizeof(UConverterStaticData),
3975     "ISO_2022_CN",
3976     0,
3977     UCNV_IBM,
3978     UCNV_ISO_2022,
3979     1,
3980     8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3981     { 0x1a, 0, 0, 0 },
3982     1,
3983     FALSE,
3984     FALSE,
3985     0,
3986     0,
3987     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3988 };
3989
3990 namespace {
3991
3992 const UConverterSharedData _ISO2022CNData={
3993     sizeof(UConverterSharedData),
3994     ~((uint32_t) 0),
3995     NULL,
3996     NULL,
3997     &_ISO2022CNStaticData,
3998     FALSE,
3999     &_ISO2022CNImpl,
4000     0, UCNV_MBCS_TABLE_INITIALIZER
4001 };
4002
4003 }  // namespace
4004 #endif /* #if !UCONFIG_NO_NON_HTML5_CONVERSION */
4005
4006 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */