Update.
[platform/upstream/glibc.git] / iconvdata / iso-2022-cn.c
1 /* Conversion module for ISO-2022-CN.
2    Copyright (C) 1999, 2000 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
5
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Library General Public License as
8    published by the Free Software Foundation; either version 2 of the
9    License, or (at your option) any later version.
10
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Library General Public License for more details.
15
16    You should have received a copy of the GNU Library General Public
17    License along with the GNU C Library; see the file COPYING.LIB.  If not,
18    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19    Boston, MA 02111-1307, USA.  */
20
21 #include <dlfcn.h>
22 #include <gconv.h>
23 #include <stdint.h>
24 #include <string.h>
25 #include "gb2312.h"
26 #include "cns11643l1.h"
27 #include "cns11643l2.h"
28
29 #include <assert.h>
30
31 /* This makes obvious what everybody knows: 0x1b is the Esc character.  */
32 #define ESC     0x1b
33
34 /* Shift-in (SI) and shift-out (SO) are single bytes.  The single shift
35    SS2 is the two-byte sequence ESC 0x4e; the decoder reads the two bytes
36    following it as one CNS 11643 plane 2 character.  */
37 #define SI      0x0f
38 #define SO      0x0e
39 #define SS2_0   ESC
40 #define SS2_1   0x4e
41
42 /* Definitions used in the body of the `gconv' function.  */
43 #define CHARSET_NAME            "ISO-2022-CN//"
44 #define DEFINE_INIT             1
45 #define DEFINE_FINI             1
46 #define FROM_LOOP               from_iso2022cn_loop   /* ISO-2022-CN -> UCS4 */
47 #define TO_LOOP                 to_iso2022cn_loop     /* UCS4 -> ISO-2022-CN */
48 #define MIN_NEEDED_FROM         1   /* ISO-2022-CN side: 1 to 4 bytes per step */
49 #define MAX_NEEDED_FROM         4
50 #define MIN_NEEDED_TO           4   /* UCS4 side: always 4 bytes */
51 #define MAX_NEEDED_TO           4
52 #define PREPARE_LOOP \
53   int save_set;   /* Backup slot used by SAVE_RESET_STATE.  */                \
54   int *setp = &data->__statep->__count;  /* Shift state, handed to the loops.  */
55 #define EXTRA_LOOP_ARGS         , setp
56
57
/* Layout of the COUNT element of the conversion state.  Bits 3-4 hold
   the currently selected character set (masked by CURRENT_SEL_MASK),
   bits 5-7 hold the `announced' flags (masked by CURRENT_ANN_MASK).  */
enum
{
  /* Selected character set.  */
  ASCII_set = 0,
  GB2312_set = 1 << 3,
  CNS11643_1_set = 2 << 3,
  CNS11643_2_set = 3 << 3,
  CURRENT_SEL_MASK = 3 << 3,

  /* Character sets announced by an escape sequence.  */
  GB2312_ann = 1 << 5,
  CNS11643_1_ann = 1 << 6,
  CNS11643_2_ann = 1 << 7,
  CURRENT_ANN_MASK = GB2312_ann | CNS11643_1_ann | CNS11643_2_ann
};
72
73
/* ISO-2022-CN is a stateful encoding, so flushing has to bring the
   state back to the initial one (ASCII, nothing announced).  When
   decoding this only means resetting the recorded state; when encoding
   a single `SI' byte is appended to the output buffer first.  If the
   output buffer has no room left, STATUS is set to __GCONV_FULL_OUTPUT
   and the state is left untouched.  */
#define EMIT_SHIFT_TO_INIT \
  if (data->__statep->__count != ASCII_set) \
    { \
      if (FROM_DIRECTION) \
        /* Input side: there is nothing to emit, simply forget the \
           current state.  */ \
        data->__statep->__count = ASCII_set; \
      else if (__builtin_expect (data->__outbuf == data->__outbufend, 0)) \
        /* Output side, but not a single byte of room for the `SI'.  */ \
        status = __GCONV_FULL_OUTPUT; \
      else \
        { \
          /* Output side: write the shift sequence and record that we \
             are back in the initial state.  */ \
          *data->__outbuf++ = SI; \
          data->__statep->__count = ASCII_set; \
        } \
    }
102
103
/* The input pointer may have to be reset, so the shift state must be
   savable and restorable.  A nonzero SAVE copies the current state into
   `save_set'; a zero SAVE writes the saved copy back.  */
#define SAVE_RESET_STATE(Save) \
  if (!(Save)) \
    *setp = save_set; \
  else \
    save_set = *setp
111
112
/* First define the conversion function from ISO-2022-CN to UCS4.

   BODY handles one input unit per invocation: a 4-byte announcement
   escape sequence, a single SO or SI byte, an SS2 sequence followed by a
   two-byte CNS 11643 plane 2 character, or one character of the
   currently selected set.  `set' is the character set selected by
   SO/SI, `ann' remembers which SO designation was seen last; both are
   loaded from and stored back to *SETP by INIT_PARAMS/UPDATE_PARAMS.

   Every byte of an escape sequence is checked against INEND before it
   is read: INPTR[k] may only be examined once INPTR + k + 1 <= INEND.  */
#define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT        MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
#define LOOPFCT                 FROM_LOOP
#define BODY \
  { \
    uint32_t ch = *inptr; \
    \
    /* This is a 7bit character set, disallow all 8bit characters.  */ \
    if (__builtin_expect (ch > 0x7f, 0)) \
      { \
        if (! ignore_errors_p ()) \
          { \
            result = __GCONV_ILLEGAL_INPUT; \
            break; \
          } \
        \
        ++inptr; \
        ++*irreversible; \
        continue; \
      } \
    \
    /* Recognize escape sequences.  */ \
    if (__builtin_expect (ch == ESC, 0)) \
      { \
        /* There are two kinds of escape sequences we have to handle: \
           - those announcing the use of GB and CNS characters on the \
             line (ESC $ ) A, ESC $ ) G, ESC $ * H) \
           - the initial byte of the SS2 sequence (ESC N + two bytes). \
           First make sure every byte examined below is really part of \
           the input: one more byte for the sequence type, a third one \
           after `$', and four bytes in total for an announcement or \
           for SS2 plus its character.  */ \
        if (__builtin_expect (inptr + 2 > inend, 0) \
            || (inptr[1] == '$' \
                && (__builtin_expect (inptr + 3 > inend, 0) \
                    || ((inptr[2] == ')' || inptr[2] == '*') \
                        && __builtin_expect (inptr + 4 > inend, 0)))) \
            || (inptr[1] == SS2_1 \
                && __builtin_expect (inptr + 4 > inend, 0))) \
          { \
            result = __GCONV_EMPTY_INPUT; \
            break; \
          } \
        if (inptr[1] == '$' \
            && ((inptr[2] == ')' && (inptr[3] == 'A' || inptr[3] == 'G')) \
                || (inptr[2] == '*' && inptr[3] == 'H'))) \
          { \
            /* OK, we accept those character sets.  Only the SO \
               designations are remembered; ESC $ * H is consumed \
               without being recorded.  */ \
            if (inptr[3] == 'A') \
              ann = GB2312_ann; \
            else if (inptr[3] == 'G') \
              ann = CNS11643_1_ann; \
            inptr += 4; \
            continue; \
          } \
        /* Anything else starting with ESC falls through to the \
           character handling below.  */ \
      } \
    else if (__builtin_expect (ch == SO, 0)) \
      { \
        /* Switch to use GB2312 or CNS 11643 plane 1, depending on which \
           SO designation came last.  The only problem is what to do with \
           faulty input files where no designator came. \
           XXX For now I'll default to use GB2312.  If this is not the \
           best behaviour (e.g., we should flag an error) let me know.  */ \
        ++inptr; \
        set = ann == CNS11643_1_ann ? CNS11643_1_set : GB2312_set; \
        continue; \
      } \
    else if (__builtin_expect (ch == SI, 0)) \
      { \
        /* Switch to use ASCII.  */ \
        ++inptr; \
        set = ASCII_set; \
        continue; \
      } \
    \
    if (__builtin_expect (ch == ESC, 0) && inptr[1] == SS2_1) \
      { \
        /* This is a character from CNS 11643 plane 2.  The length check \
           above guarantees that both of its bytes are available. \
           XXX We could test here whether the use of this character \
           set was announced.  */ \
        inptr += 2; \
        ch = cns11643l2_to_ucs4 (&inptr, 2, 0); \
        if (__builtin_expect (ch == __UNKNOWN_10646_CHAR, 0)) \
          { \
            if (! ignore_errors_p ()) \
              { \
                /* This is an illegal character.  Point back at the \
                   start of the SS2 sequence.  */ \
                inptr -= 2; \
                result = __GCONV_ILLEGAL_INPUT; \
                break; \
              } \
            \
            ++*irreversible; \
            continue; \
          } \
      } \
    else if (set == ASCII_set) \
      { \
        /* Almost done, just advance the input pointer.  */ \
        ++inptr; \
      } \
    else \
      { \
        /* That's pretty easy, we have dedicated functions for this.  */ \
        if (set == GB2312_set) \
          ch = gb2312_to_ucs4 (&inptr, inend - inptr, 0); \
        else \
          { \
            assert (set == CNS11643_1_set); \
            ch = cns11643l1_to_ucs4 (&inptr, inend - inptr, 0); \
          } \
        \
        if (__builtin_expect (ch == 0, 0)) \
          { \
            /* Not enough input for a complete character.  */ \
            result = __GCONV_EMPTY_INPUT; \
            break; \
          } \
        else if (__builtin_expect (ch == __UNKNOWN_10646_CHAR, 0)) \
          { \
            if (! ignore_errors_p ()) \
              { \
                /* This is an illegal character.  */ \
                result = __GCONV_ILLEGAL_INPUT; \
                break; \
              } \
            \
            ++inptr; \
            ++*irreversible; \
            continue; \
          } \
      } \
    \
    put32 (outptr, ch); \
    outptr += 4; \
  }
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS        , int *setp
#define INIT_PARAMS             int set = *setp & CURRENT_SEL_MASK; \
                                int ann = *setp & CURRENT_ANN_MASK
#define UPDATE_PARAMS           *setp = set | ann
254 #include <iconv/loop.c>
255
256
/* Next, define the other direction (UCS4 to ISO-2022-CN).

   BODY converts one UCS4 character per invocation.  `set' is the
   character set currently selected in the output (ASCII, or the SO set
   GB2312 / CNS 11643 plane 1); `ann' holds the designations already
   announced on the current line.  Both are loaded from and stored back
   to *SETP by INIT_PARAMS/UPDATE_PARAMS.

   `set' and `ann' are updated in the same step in which the
   corresponding bytes are written, and an SS2 sequence is only written
   together with the two bytes it applies to.  That way a
   __GCONV_FULL_OUTPUT break between two steps leaves output and state
   consistent.  */
#define MIN_NEEDED_INPUT        MIN_NEEDED_TO
#define MIN_NEEDED_OUTPUT       MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT       MAX_NEEDED_FROM
#define LOOPFCT                 TO_LOOP
#define BODY \
  { \
    uint32_t ch = get32 (inptr); \
    \
    /* First see whether we can write the character using the currently \
       selected character set.  */ \
    if (ch < 0x80) \
      { \
        if (set != ASCII_set) \
          { \
            /* Leave the SO set first.  One output byte is available \
               here; the character itself needs a second one.  */ \
            *outptr++ = SI; \
            set = ASCII_set; \
            if (__builtin_expect (outptr == outend, 0)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
          } \
        \
        *outptr++ = ch; \
        \
        /* At the end of the line we have to clear the `ann' flags since \
           every line must contain this information again.  */ \
        if (ch == L'\n') \
          ann = 0; \
      } \
    else \
      { \
        char buf[2]; \
        int used; \
        int used_ann; \
        size_t written = 0; \
        \
        /* Start with the SO set which is selected or designated.  */ \
        if (set == GB2312_set || (ann & CNS11643_1_ann) == 0) \
          { \
            written = ucs4_to_gb2312 (ch, buf, 2); \
            used = GB2312_set; \
          } \
        else \
          { \
            written = ucs4_to_cns11643l1 (ch, buf, 2); \
            used = CNS11643_1_set; \
          } \
        \
        if (written == __UNKNOWN_10646_CHAR) \
          { \
            /* Cannot convert it using the currently selected SO set. \
               Next try the SS2 set.  */ \
            written = ucs4_to_cns11643l2 (ch, buf, 2); \
            if (written != __UNKNOWN_10646_CHAR) \
              /* Yep, that worked.  */ \
              used = CNS11643_2_set; \
            else \
              { \
                /* Well, see whether we have to change the SO set.  `used' \
                   still names the SO set tried first, so try the other \
                   one.  This must not depend on `set', which may be \
                   ASCII_set here.  */ \
                if (used == GB2312_set) \
                  written = ucs4_to_cns11643l1 (ch, buf, 2); \
                else \
                  written = ucs4_to_gb2312 (ch, buf, 2); \
                \
                if (__builtin_expect (written != __UNKNOWN_10646_CHAR, 1)) \
                  /* Oh well, then switch SO.  */ \
                  used = GB2312_set + CNS11643_1_set - used; \
                else \
                  { \
                    /* Even this does not work.  Error.  */ \
                    if (step_data->__trans.__trans_fct != NULL) \
                      { \
                        result = DL_CALL_FCT (step_data->__trans.__trans_fct, \
                                              (step, step_data, *inptrp, \
                                               &inptr, inend, *outptrp, \
                                               &outptr, outend, \
                                               irreversible)); \
                        if (result != __GCONV_OK) \
                          break; \
                      } \
                    else if (! ignore_errors_p ()) \
                      { \
                        result = __GCONV_ILLEGAL_INPUT; \
                        break; \
                      } \
                    else \
                      { \
                        inptr += 4; \
                        ++*irreversible; \
                      } \
                    continue; \
                  } \
              } \
          } \
        assert (written == 2); \
        \
        /* The announcement flag which belongs to the set we use.  */ \
        used_ann = (used == GB2312_set ? GB2312_ann \
                    : used == CNS11643_1_set ? CNS11643_1_ann \
                    : CNS11643_2_ann); \
        \
        /* See whether we have to emit an escape sequence.  */ \
        if (set != used) \
          { \
            /* First see whether we announced that we use this \
               character set.  */ \
            if ((ann & used_ann) == 0) \
              { \
                const char *escseq; \
                \
                if (__builtin_expect (outptr + 4 > outend, 0)) \
                  { \
                    result = __GCONV_FULL_OUTPUT; \
                    break; \
                  } \
                \
                /* The *_set values are 1, 2 and 3 shifted left by three \
                   bits; undo the shift to index the 4-byte sequences.  */ \
                assert ((used >> 3) >= 1 && (used >> 3) <= 3); \
                escseq = "\e$)A\e$)G\e$*H" + ((used >> 3) - 1) * 4; \
                *outptr++ = *escseq++; \
                *outptr++ = *escseq++; \
                *outptr++ = *escseq++; \
                *outptr++ = *escseq++; \
                \
                /* The two SO designations replace each other, the SS2 \
                   designation is independent of them.  */ \
                if (used == CNS11643_2_set) \
                  ann |= CNS11643_2_ann; \
                else \
                  ann = (ann & CNS11643_2_ann) | used_ann; \
              } \
            \
            if (used == CNS11643_2_set) \
              { \
                /* SS2 only affects the next two bytes.  Write it only \
                   if the character fits as well, otherwise a resumed \
                   call would emit SS2 a second time.  */ \
                if (__builtin_expect (outptr + 4 > outend, 0)) \
                  { \
                    result = __GCONV_FULL_OUTPUT; \
                    break; \
                  } \
                *outptr++ = SS2_0; \
                *outptr++ = SS2_1; \
              } \
            else \
              { \
                /* We only have to emit something if currently ASCII is \
                   selected.  Otherwise we are switching within the \
                   SO charset.  */ \
                if (set == ASCII_set) \
                  { \
                    if (__builtin_expect (outptr + 1 > outend, 0)) \
                      { \
                        result = __GCONV_FULL_OUTPUT; \
                        break; \
                      } \
                    *outptr++ = SO; \
                  } \
                \
                /* Remember the new selection so that SO is not repeated \
                   and SI is written before the next ASCII character.  */ \
                set = used; \
              } \
          } \
        \
        /* Always test the length here since we may have used up all the \
           guaranteed output buffer slots.  */ \
        if (__builtin_expect (outptr + 2 > outend, 0)) \
          { \
            result = __GCONV_FULL_OUTPUT; \
            break; \
          } \
        \
        *outptr++ = buf[0]; \
        *outptr++ = buf[1]; \
      } \
    \
    /* Now that we wrote the output increment the input pointer.  */ \
    inptr += 4; \
  }
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS        , int *setp
#define INIT_PARAMS             int set = *setp & CURRENT_SEL_MASK; \
                                int ann = *setp & CURRENT_ANN_MASK
#define UPDATE_PARAMS           *setp = set | ann
435 #include <iconv/loop.c>
436
437
438 /* Now define the toplevel functions.  */
439 #include <iconv/skeleton.c>