tizen 2.3.1 release
[framework/base/tizen-locale.git] / iconvdata / iso-2022-cn.c
1 /* Conversion module for ISO-2022-CN.
2    Copyright (C) 1999, 2000-2002, 2007, 2008 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
5
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, write to the Free
18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20
21 #include <dlfcn.h>
22 #include <gconv.h>
23 #include <stdint.h>
24 #include <string.h>
25 #include "gb2312.h"
26 #include "cns11643l1.h"
27 #include "cns11643l2.h"
28
29 #include <assert.h>
30
31 /* This makes obvious what everybody knows: 0x1b is the Esc character.  */
32 #define ESC     0x1b
33
34 /* We have single-byte shift-in and shift-out sequences, and the single
35    shift sequence SS2 (ESC followed by 0x4e, i.e. 'N') after which the
36    next two bytes are taken from CNS 11643 plane 2.  */
37 #define SI      0x0f    /* Shift in: switch back to ASCII.  */
38 #define SO      0x0e    /* Shift out: switch to the two-byte SO set.  */
39 #define SS2_0   ESC     /* First byte of the SS2 sequence.  */
40 #define SS2_1   0x4e    /* Second byte of the SS2 sequence.  */
41
42 /* Definitions used in the body of the `gconv' function.  */
   /* The FROM loop decodes ISO-2022-CN into 4-byte UCS4 values, the TO loop
      encodes 4-byte UCS4 values as ISO-2022-CN.  The *_NEEDED_* macros give
      the per-character byte counts on each side: the FROM loop looks at 1
      to 4 input bytes (designation sequences and SS2 characters are 4
      bytes long) and writes 4 output bytes; the TO loop reads 4 input
      bytes per character.
      NOTE(review): the TO loop body can write up to 8 bytes for a single
      character (4-byte designation + 2-byte SS2 + 2 data bytes) while
      TO_LOOP_MAX_NEEDED_TO is 6.  The body bounds-checks every write
      itself, but confirm against iconv/skeleton.c how this maximum is
      used.  */
43 #define CHARSET_NAME            "ISO-2022-CN//"
44 #define DEFINE_INIT             1
45 #define DEFINE_FINI             1
46 #define FROM_LOOP               from_iso2022cn_loop
47 #define TO_LOOP                 to_iso2022cn_loop
48 #define FROM_LOOP_MIN_NEEDED_FROM       1
49 #define FROM_LOOP_MAX_NEEDED_FROM       4
50 #define FROM_LOOP_MIN_NEEDED_TO         4
51 #define FROM_LOOP_MAX_NEEDED_TO         4
52 #define TO_LOOP_MIN_NEEDED_FROM         4
53 #define TO_LOOP_MAX_NEEDED_FROM         4
54 #define TO_LOOP_MIN_NEEDED_TO           1
55 #define TO_LOOP_MAX_NEEDED_TO           6
   /* PREPARE_LOOP declares SAVE_SET, the scratch copy used by
      SAVE_RESET_STATE, and SETP, a pointer to the COUNT member of the
      conversion state, which holds the selected set and the announcement
      flags.  */
56 #define PREPARE_LOOP \
57   int save_set;                                                               \
58   int *setp = &data->__statep->__count;
   /* SETP is passed on as an extra argument; both loop bodies receive it
      through EXTRA_LOOP_DECLS.  */
59 #define EXTRA_LOOP_ARGS         , setp
60
61
62 /* The COUNT element of the state keeps track of the currently selected
63    character set and of the sets announced so far.  The possible values
      are listed below.  The selection lives in bits 3-4 and the
      announcement flags in bits 5-7; the low three bits are not used here
      (presumably reserved by the conversion skeleton -- TODO confirm).
      The exact numeric values matter: the TO loop derives the announcement
      flag of a set as `16 << (set >> 3)', indexes its escape table with
      `(set >> 3) - 1' and toggles between the two SO sets with
      `GB2312_set + CNS11643_1_set - used'.  */
64 enum
65 {
66   ASCII_set = 0,              /* ASCII is selected (initial state).  */
67   GB2312_set = 8,             /* GB2312 is the selected SO set.  */
68   CNS11643_1_set = 16,        /* CNS 11643 plane 1 is the selected SO set.  */
69   CNS11643_2_set = 24,        /* CNS 11643 plane 2, reached through SS2.  */
70   CURRENT_SEL_MASK = 24,      /* Mask covering the selection bits.  */
71   GB2312_ann = 32,            /* GB2312 has been announced.  */
72   CNS11643_1_ann = 64,        /* CNS 11643 plane 1 has been announced.  */
73   CNS11643_2_ann = 128,       /* CNS 11643 plane 2 has been announced.  */
74   CURRENT_ANN_MASK = 224      /* Mask covering the announcement bits.  */
75 };
76
77
78 /* Since this is a stateful encoding we have to provide code which resets
79    the output state to the initial state.  This has to be done during the
80    flushing.  */
   /* When decoding (FROM_DIRECTION) only the saved state is cleared.  When
      encoding, a single SI byte is written if there is room and the state
      is cleared; otherwise STATUS is set to __GCONV_FULL_OUTPUT and the
      state is left untouched.  The test looks at the whole COUNT value,
      which holds the selection and the announcement flags, so SI is also
      written when ASCII is already selected but announcement flags are
      still set.  */
81 #define EMIT_SHIFT_TO_INIT \
82   if (data->__statep->__count != ASCII_set)                                   \
83     {                                                                         \
84       if (FROM_DIRECTION)                                                     \
85         /* It's easy, we don't have to emit anything, we just reset the       \
86            state for the input.  */                                           \
87         data->__statep->__count = ASCII_set;                                  \
88       else                                                                    \
89         {                                                                     \
90           /* We are not in the initial state.  To switch back we have         \
91              to emit `SI'.  */                                                \
92           if (__builtin_expect (outbuf == outend, 0))                         \
93             /* We don't have enough room in the output buffer.  */            \
94             status = __GCONV_FULL_OUTPUT;                                     \
95           else                                                                \
96             {                                                                 \
97               /* Write out the shift sequence.  */                            \
98               *outbuf++ = SI;                                                 \
99               data->__statep->__count = ASCII_set;                            \
100             }                                                                 \
101         }                                                                     \
102     }
103
104
105 /* Since we might have to reset input pointer we must be able to save
106    and restore the state.  A nonzero SAVE copies the current state value
       into SAVE_SET, zero writes SAVE_SET back into the state.  SAVE_SET and
       SETP are the variables declared by PREPARE_LOOP.  */
107 #define SAVE_RESET_STATE(Save) \
108   if (Save)                                                                   \
109     save_set = *setp;                                                         \
110   else                                                                        \
111     *setp = save_set
112
113
114 /* First define the conversion function from ISO-2022-CN to UCS4.  */
115 #define MIN_NEEDED_INPUT        FROM_LOOP_MIN_NEEDED_FROM
116 #define MAX_NEEDED_INPUT        FROM_LOOP_MAX_NEEDED_FROM
117 #define MIN_NEEDED_OUTPUT       FROM_LOOP_MIN_NEEDED_TO
118 #define MAX_NEEDED_OUTPUT       FROM_LOOP_MAX_NEEDED_TO
119 #define LOOPFCT                 FROM_LOOP
    /* Decode one input unit per iteration.  The `break' and `continue'
       statements act on a loop that is not part of this file (presumably
       provided by iconv/loop.c).
       - Bytes >= 0x7f are handed to the error handler.
         NOTE(review): this also rejects 0x7f, which the encoder below
         writes unchanged -- confirm whether that is intended.
       - ESC: if the sequence may be cut off by the end of the input the
         loop stops with __GCONV_INCOMPLETE_INPUT.  The designations
         ESC $ ) A, ESC $ ) G and ESC $ * H are consumed (4 bytes); the
         first two are remembered in ANN, the last one is only skipped.
       - SO selects CNS 11643 plane 1 if that was the last remembered
         designation and GB2312 otherwise; SI selects ASCII.
       - ESC N (SS2) is followed by two bytes decoded as CNS 11643 plane 2.
         On failure INPTR is moved back to the ESC before the error handler
         runs.
       - Any other byte is passed through in ASCII mode or decoded as a
         GB2312 / CNS 11643 plane 1 character.  A zero result from the
         decoder means the input is incomplete.
       The resulting UCS4 value is stored with put32.  */
120 #define BODY \
121   {                                                                           \
122     uint32_t ch = *inptr;                                                     \
123                                                                               \
124     /* This is a 7bit character set, disallow all 8bit characters.  */        \
125     if (__builtin_expect (ch >= 0x7f, 0))                                     \
126       STANDARD_FROM_LOOP_ERR_HANDLER (1);                                     \
127                                                                               \
128     /* Recognize escape sequences.  */                                        \
129     if (__builtin_expect (ch, 0) == ESC)                                      \
130       {                                                                       \
131         /* There are two kinds of escape sequences we have to handle:         \
132            - those announcing the use of GB and CNS characters on the         \
133              line; we can simply ignore them                                  \
134            - the initial byte of the SS2 sequence.                            \
135         */                                                                    \
136         if (__builtin_expect (inptr + 2 > inend, 0)                           \
137             || (inptr[1] == '$'                                               \
138                 && (__builtin_expect (inptr + 3 > inend, 0)                   \
139                     || (inptr[2] == ')'                                       \
140                         && __builtin_expect (inptr + 4 > inend, 0))           \
141                     || (inptr[2] == '*'                                       \
142                         && __builtin_expect (inptr + 4 > inend, 0))))         \
143             || (inptr[1] == SS2_1                                             \
144                 && __builtin_expect (inptr + 4 > inend, 0)))                  \
145           {                                                                   \
146             result = __GCONV_INCOMPLETE_INPUT;                                \
147             break;                                                            \
148           }                                                                   \
149         if (inptr[1] == '$'                                                   \
150             && ((inptr[2] == ')' && (inptr[3] == 'A' || inptr[3] == 'G'))     \
151                 || (inptr[2] == '*' && inptr[3] == 'H')))                     \
152           {                                                                   \
153             /* OK, we accept those character sets.  */                        \
154             if (inptr[3] == 'A')                                              \
155               ann = GB2312_ann;                                               \
156             else if (inptr[3] == 'G')                                         \
157               ann = CNS11643_1_ann;                                           \
158             inptr += 4;                                                       \
159             continue;                                                         \
160           }                                                                   \
161       }                                                                       \
162     else if (__builtin_expect (ch, 0) == SO)                                  \
163       {                                                                       \
164         /* Switch to use GB2312 or CNS 11643 plane 1, depending on which      \
165            SO designation came last.  The only problem is what to do with     \
166            faulty input files where no designator came.                       \
167            XXX For now I'll default to use GB2312.  If this is not the        \
168            best behaviour (e.g., we should flag an error) let me know.  */    \
169         ++inptr;                                                              \
170         set = ann == CNS11643_1_ann ? CNS11643_1_set : GB2312_set;            \
171         continue;                                                             \
172       }                                                                       \
173     else if (__builtin_expect (ch, 0) == SI)                                  \
174       {                                                                       \
175         /* Switch to use ASCII.  */                                           \
176         ++inptr;                                                              \
177         set = ASCII_set;                                                      \
178         continue;                                                             \
179       }                                                                       \
180                                                                               \
181     if (__builtin_expect (ch, 0) == ESC && inptr[1] == SS2_1)                 \
182       {                                                                       \
183         /* This is a character from CNS 11643 plane 2.                        \
184            XXX We could test here whether the use of this character           \
185            set was announced.  */                                             \
186         inptr += 2;                                                           \
187         ch = cns11643l2_to_ucs4 (&inptr, 2, 0);                               \
188         if (__builtin_expect (ch, 0) == __UNKNOWN_10646_CHAR)                 \
189           {                                                                   \
190             inptr -= 2;                                                       \
191             STANDARD_FROM_LOOP_ERR_HANDLER (2);                               \
192           }                                                                   \
193       }                                                                       \
194     else if (set == ASCII_set)                                                \
195       {                                                                       \
196         /* Almost done, just advance the input pointer.  */                   \
197         ++inptr;                                                              \
198       }                                                                       \
199     else                                                                      \
200       {                                                                       \
201         /* That's pretty easy, we have dedicated functions for this.  */      \
202         if (set == GB2312_set)                                                \
203           ch = gb2312_to_ucs4 (&inptr, inend - inptr, 0);                     \
204         else                                                                  \
205           {                                                                   \
206             assert (set == CNS11643_1_set);                                   \
207             ch = cns11643l1_to_ucs4 (&inptr, inend - inptr, 0);               \
208           }                                                                   \
209                                                                               \
210         if (__builtin_expect (ch, 1) == 0)                                    \
211           {                                                                   \
212             result = __GCONV_INCOMPLETE_INPUT;                                \
213             break;                                                            \
214           }                                                                   \
215         else if (__builtin_expect (ch, 1) == __UNKNOWN_10646_CHAR)            \
216           {                                                                   \
217             STANDARD_FROM_LOOP_ERR_HANDLER (1);                               \
218           }                                                                   \
219       }                                                                       \
220                                                                               \
221     put32 (outptr, ch);                                                       \
222     outptr += 4;                                                              \
223   }
    /* Loop glue for the decoding direction: the loop function takes SETP as
       an extra parameter, splits the saved state into the selected set
       (SET) and the announcement flags (ANN) when it starts, and
       UPDATE_PARAMS combines both again and stores them through SETP.  */
224 #define LOOP_NEED_FLAGS
225 #define EXTRA_LOOP_DECLS        , int *setp
226 #define INIT_PARAMS             int set = *setp & CURRENT_SEL_MASK; \
227                                 int ann = *setp & CURRENT_ANN_MASK
228 #define UPDATE_PARAMS           *setp = set | ann
229 #include <iconv/loop.c>
230
231
232 /* Next, define the other direction.  */
233 #define MIN_NEEDED_INPUT        TO_LOOP_MIN_NEEDED_FROM
234 #define MAX_NEEDED_INPUT        TO_LOOP_MAX_NEEDED_FROM
235 #define MIN_NEEDED_OUTPUT       TO_LOOP_MIN_NEEDED_TO
236 #define MAX_NEEDED_OUTPUT       TO_LOOP_MAX_NEEDED_TO
237 #define LOOPFCT                 TO_LOOP
    /* Encode one UCS4 character per iteration.  The `break' statements act
       on a loop that is not part of this file (presumably provided by
       iconv/loop.c).
       - Characters below 0x80 are written as a single byte, preceded by SI
         if a two-byte set is currently shifted in.  A newline clears the
         announcement flags so that designations are repeated on the next
         line.
       - Other characters are looked up first in GB2312 (if that is the
         selected set or CNS 11643 plane 1 has not been announced) or else
         in CNS 11643 plane 1, then in CNS 11643 plane 2, and finally in
         the other SO set.  If every lookup fails the error handler runs.
       - A 4-byte designation (ESC $ ) A, ESC $ ) G or ESC $ * H) is written
         the first time a set is needed; designating one SO set drops the
         announcement of the other one.
       - Plane 2 characters are written as SS2 (ESC N) plus two bytes.  SS2
         is a single shift: it is written for every plane 2 character and
         is never recorded in SET, which only ever holds ASCII or an SO
         set.
       - SS2 and SO are only written when the two data bytes fit as well,
         so a retry after __GCONV_FULL_OUTPUT cannot emit them twice.  */
238 #define BODY \
239   {                                                                           \
240     uint32_t ch = get32 (inptr);                                              \
241                                                                               \
242     /* First see whether we can write the character using the currently       \
243        selected character set.  */                                            \
244     if (ch < 0x80)                                                            \
245       {                                                                       \
246         if (set != ASCII_set)                                                 \
247           {                                                                   \
248             *outptr++ = SI;                                                   \
249             set = ASCII_set;                                                  \
250             if (__builtin_expect (outptr == outend, 0))                       \
251               {                                                               \
252                 result = __GCONV_FULL_OUTPUT;                                 \
253                 break;                                                        \
254               }                                                               \
255           }                                                                   \
256                                                                               \
257         *outptr++ = ch;                                                       \
258                                                                               \
259         /* At the end of the line we have to clear the `ann' flags since      \
260            every line must contain this information again.  */                \
261         if (ch == L'\n')                                                      \
262           ann = 0;                                                            \
263       }                                                                       \
264     else                                                                      \
265       {                                                                       \
266         unsigned char buf[2];                                                 \
267         /* Fake initialization to keep gcc quiet.  */                         \
268         asm ("" : "=m" (buf));                                                \
269                                                                               \
270         int used;                                                             \
271         size_t written = 0;                                                   \
272                                                                               \
273         if (set == GB2312_set || (ann & CNS11643_1_ann) == 0)                 \
274           {                                                                   \
275             written = ucs4_to_gb2312 (ch, buf, 2);                            \
276             used = GB2312_set;                                                \
277           }                                                                   \
278         else                                                                  \
279           {                                                                   \
280             written = ucs4_to_cns11643l1 (ch, buf, 2);                        \
281             used = CNS11643_1_set;                                            \
282           }                                                                   \
283                                                                               \
284         if (written == __UNKNOWN_10646_CHAR)                                  \
285           {                                                                   \
286             /* Cannot convert it using the currently selected SO set.         \
287                Next try the SS2 set.  */                                      \
288             written = ucs4_to_cns11643l2 (ch, buf, 2);                        \
289             if (written != __UNKNOWN_10646_CHAR)                              \
290               /* Yep, that worked.  */                                        \
291               used = CNS11643_2_set;                                          \
292             else                                                              \
293               {                                                               \
294                 /* Well, see whether we have to change the SO set.  */        \
295                 if (used == GB2312_set)                                       \
296                   written = ucs4_to_cns11643l1 (ch, buf, 2);                  \
297                 else                                                          \
298                   written = ucs4_to_gb2312 (ch, buf, 2);                      \
299                                                                               \
300                 if (__builtin_expect (written, 0) != __UNKNOWN_10646_CHAR)    \
301                   /* Oh well, then switch SO.  */                             \
302                   used = GB2312_set + CNS11643_1_set - used;                  \
303                 else                                                          \
304                   {                                                           \
305                     UNICODE_TAG_HANDLER (ch, 4);                              \
306                                                                               \
307                     /* Even this does not work.  Error.  */                   \
308                     STANDARD_TO_LOOP_ERR_HANDLER (4);                         \
309                   }                                                           \
310               }                                                               \
311           }                                                                   \
312         assert (written == 2);                                                \
313                                                                               \
314         /* See whether we have to emit an escape sequence.  SET never         \
                holds CNS11643_2_set, so this is always true for plane 2         \
                characters.  */                                                   \
315         if (set != used)                                                      \
316           {                                                                   \
317             /* First see whether we announced that we use this                \
318                character set.  */                                             \
319             if ((ann & (16 << (used >> 3))) == 0)                             \
320               {                                                               \
321                 const char *escseq;                                           \
322                                                                               \
323                 if (__builtin_expect (outptr + 4 > outend, 0))                \
324                   {                                                           \
325                     result = __GCONV_FULL_OUTPUT;                             \
326                     break;                                                    \
327                   }                                                           \
328                                                                               \
329                 assert ((used >> 3) >= 1 && (used >> 3) <= 3);                \
330                 escseq = ")A)G*H" + ((used >> 3) - 1) * 2;                    \
331                 *outptr++ = ESC;                                              \
332                 *outptr++ = '$';                                              \
333                 *outptr++ = *escseq++;                                        \
334                 *outptr++ = *escseq++;                                        \
335                                                                               \
336                 if (used == GB2312_set)                                       \
337                   ann = (ann & CNS11643_2_ann) | GB2312_ann;                  \
338                 else if (used == CNS11643_1_set)                              \
339                   ann = (ann & CNS11643_2_ann) | CNS11643_1_ann;              \
340                 else                                                          \
341                   ann |= CNS11643_2_ann;                                      \
342               }                                                               \
343                                                                               \
344             if (used == CNS11643_2_set)                                       \
345               {                                                               \
                    /* SS2 only covers the next character.  Make sure the         \
                       two data bytes fit as well, otherwise a retry after a      \
                       full buffer would write SS2 a second time.  */             \
346                 if (__builtin_expect (outptr + 4 > outend, 0))                \
347                   {                                                           \
348                     result = __GCONV_FULL_OUTPUT;                             \
349                     break;                                                    \
350                   }                                                           \
351                 *outptr++ = SS2_0;                                            \
352                 *outptr++ = SS2_1;                                            \
353               }                                                               \
354             else                                                              \
355               {                                                               \
356                 /* We only have to emit something if ASCII is currently       \
357                    selected.  Otherwise we are switching within the           \
358                    SO charset.  Require room for the data bytes too.  */      \
359                 if (set == ASCII_set)                                         \
360                   {                                                           \
361                     if (__builtin_expect (outptr + 3 > outend, 0))            \
362                       {                                                       \
363                         result = __GCONV_FULL_OUTPUT;                         \
364                         break;                                                \
365                       }                                                       \
366                     *outptr++ = SO;                                           \
367                   }                                                           \
368               }                                                               \
369                                                                               \
370             /* Always test the length here since we have used up all the      \
371                guaranteed output buffer slots.  */                            \
372             if (__builtin_expect (outptr + 2 > outend, 0))                    \
373               {                                                               \
374                 result = __GCONV_FULL_OUTPUT;                                 \
375                 break;                                                        \
376               }                                                               \
377           }                                                                   \
378         else if (__builtin_expect (outptr + 2 > outend, 0))                   \
379           {                                                                   \
380             result = __GCONV_FULL_OUTPUT;                                     \
381             break;                                                            \
382           }                                                                   \
383                                                                               \
384         *outptr++ = buf[0];                                                   \
385         *outptr++ = buf[1];                                                   \
            /* Only an SO set becomes the new shift state; after an SS2          \
               character the previously selected set is still in effect.  */     \
            if (used != CNS11643_2_set)                                           \
386           set = used;                                                         \
387       }                                                                       \
388                                                                               \
389     /* Now that we wrote the output increment the input pointer.  */          \
390     inptr += 4;                                                               \
391   }
    /* Loop glue for the encoding direction: the loop function takes SETP as
       an extra parameter and splits the saved state into the selected set
       (SET) and the announcement flags (ANN) when it starts.  REINIT_PARAMS
       reloads both from SETP (presumably used by the error handling in
       iconv/loop.c after the state may have been changed -- TODO confirm),
       and UPDATE_PARAMS combines them again and stores them through SETP.  */
392 #define LOOP_NEED_FLAGS
393 #define EXTRA_LOOP_DECLS        , int *setp
394 #define INIT_PARAMS             int set = *setp & CURRENT_SEL_MASK; \
395                                 int ann = *setp & CURRENT_ANN_MASK
396 #define REINIT_PARAMS           do                                            \
397                                   {                                           \
398                                     set = *setp & CURRENT_SEL_MASK;           \
399                                     ann = *setp & CURRENT_ANN_MASK;           \
400                                   }                                           \
401                                 while (0)
402 #define UPDATE_PARAMS           *setp = set | ann
403 #include <iconv/loop.c>
404
405
406 /* Now define the toplevel functions.  */
407 #include <iconv/skeleton.c>