tizen 2.4 release
[framework/base/tizen-locale.git] / iconvdata / utf-16.c
1 /* Conversion module for UTF-16.
2    Copyright (C) 1999-2015 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
5
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, see
18    <http://www.gnu.org/licenses/>.  */
19
20 #include <byteswap.h>
21 #include <dlfcn.h>
22 #include <gconv.h>
23 #include <stddef.h>
24 #include <stdint.h>
25 #include <stdlib.h>
26 #include <string.h>
27
/* This is the Byte Order Mark character (BOM), U+FEFF, as it reads when
   the 16-bit unit is in the same byte order as the host.  */
#define BOM     0xfeff
/* And in the other byte order: the value obtained when the two bytes of
   the mark are swapped.  */
#define BOM_OE  0xfffe
32
33
/* Definitions used in the body of the `gconv' function.  */
#define FROM_LOOP               from_utf16_loop
#define TO_LOOP                 to_utf16_loop
#define DEFINE_INIT             0
#define DEFINE_FINI             0
/* Byte counts per character on the UTF-16 side: one 16-bit unit at
   least, two units (a surrogate pair) at most.  */
#define MIN_NEEDED_FROM         2
#define MAX_NEEDED_FROM         4
/* Byte count per character on the internal (32-bit) side.  */
#define MIN_NEEDED_TO           4
#define ONE_DIRECTION           0
#define FROM_DIRECTION          (dir == from_utf16)
/* Setup code for the conversion function.  It fetches the direction and
   the variant stored in step->__data and, only while
   data->__invocation_counter is still zero, settles the byte order:

   - plain UTF-16, decoding: at least two input bytes are required
     (otherwise __GCONV_EMPTY_INPUT or __GCONV_INCOMPLETE_INPUT is
     returned).  A leading BOM is skipped; a byte-swapped BOM is skipped
     as well and additionally sets __GCONV_SWAP in data->__flags.
     Anything else is left in place and no flag is changed.
   - plain UTF-16, encoding, unless data->__internal_use is set: a BOM is
     written to the output, or __GCONV_FULL_OUTPUT is returned when there
     is no room for it.
   - UTF-16LE / UTF-16BE: __GCONV_SWAP is set when the requested byte
     order differs from BYTE_ORDER.

   On every invocation `swap' is then derived from data->__flags and
   handed to the loop functions through EXTRA_LOOP_ARGS.  */
#define PREPARE_LOOP \
  enum direction dir = ((struct utf16_data *) step->__data)->dir;             \
  enum variant var = ((struct utf16_data *) step->__data)->var;               \
  if (__glibc_unlikely (data->__invocation_counter == 0))                     \
    {                                                                         \
      if (var == UTF_16)                                                      \
        {                                                                     \
          if (FROM_DIRECTION)                                                 \
            {                                                                 \
              /* We have to find out which byte order the file is             \
                 encoded in.  */                                              \
              if (inptr + 2 > inend)                                          \
                return (inptr == inend                                        \
                        ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);    \
                                                                              \
              if (get16u (inptr) == BOM)                                      \
                /* Simply ignore the BOM character.  */                       \
                *inptrp = inptr += 2;                                         \
              else if (get16u (inptr) == BOM_OE)                              \
                {                                                             \
                  /* The mark reads byte-swapped, so the rest does too.  */   \
                  data->__flags |= __GCONV_SWAP;                              \
                  *inptrp = inptr += 2;                                       \
                }                                                             \
            }                                                                 \
          else if (!data->__internal_use)                                     \
            {                                                                 \
              /* Emit the Byte Order Mark.  */                                \
              if (__glibc_unlikely (outbuf + 2 > outend))                     \
                return __GCONV_FULL_OUTPUT;                                   \
                                                                              \
              put16u (outbuf, BOM);                                           \
              outbuf += 2;                                                    \
            }                                                                 \
        }                                                                     \
      else if ((var == UTF_16LE && BYTE_ORDER == BIG_ENDIAN)                  \
               || (var == UTF_16BE && BYTE_ORDER == LITTLE_ENDIAN))           \
        /* Explicit byte order that is not the host's.  */                    \
        data->__flags |= __GCONV_SWAP;                                        \
    }                                                                         \
  const int swap = data->__flags & __GCONV_SWAP;
#define EXTRA_LOOP_ARGS         , swap
84
85
/* Direction of the transformation.  `illegal_dir' is the initial value
   used before a conversion name has been matched.  */
enum direction
{
  illegal_dir,
  to_utf16,     /* Internal format -> UTF-16.  */
  from_utf16    /* UTF-16 -> internal format.  */
};
93
/* Which flavour of UTF-16 was requested.  `illegal_var' is the initial
   value used before a conversion name has been matched.  */
enum variant
{
  illegal_var,
  UTF_16,       /* "UTF-16": byte order settled via the BOM.  */
  UTF_16LE,     /* "UTF-16LE": explicitly little endian.  */
  UTF_16BE      /* "UTF-16BE": explicitly big endian.  */
};
101
102 struct utf16_data
103 {
104   enum direction dir;
105   enum variant var;
106 };
107
108
109 extern int gconv_init (struct __gconv_step *step);
110 int
111 gconv_init (struct __gconv_step *step)
112 {
113   /* Determine which direction.  */
114   struct utf16_data *new_data;
115   enum direction dir = illegal_dir;
116   enum variant var = illegal_var;
117   int result;
118
119   if (__strcasecmp (step->__from_name, "UTF-16//") == 0)
120     {
121       dir = from_utf16;
122       var = UTF_16;
123     }
124   else if (__strcasecmp (step->__to_name, "UTF-16//") == 0)
125     {
126       dir = to_utf16;
127       var = UTF_16;
128     }
129   else if (__strcasecmp (step->__from_name, "UTF-16BE//") == 0)
130     {
131       dir = from_utf16;
132       var = UTF_16BE;
133     }
134   else if (__strcasecmp (step->__to_name, "UTF-16BE//") == 0)
135     {
136       dir = to_utf16;
137       var = UTF_16BE;
138     }
139   else if (__strcasecmp (step->__from_name, "UTF-16LE//") == 0)
140     {
141       dir = from_utf16;
142       var = UTF_16LE;
143     }
144   else if (__strcasecmp (step->__to_name, "UTF-16LE//") == 0)
145     {
146       dir = to_utf16;
147       var = UTF_16LE;
148     }
149
150   result = __GCONV_NOCONV;
151   if (__builtin_expect (dir, to_utf16) != illegal_dir)
152     {
153       new_data = (struct utf16_data *) malloc (sizeof (struct utf16_data));
154
155       result = __GCONV_NOMEM;
156       if (new_data != NULL)
157         {
158           new_data->dir = dir;
159           new_data->var = var;
160           step->__data = new_data;
161
162           if (dir == from_utf16)
163             {
164               step->__min_needed_from = MIN_NEEDED_FROM;
165               step->__max_needed_from = MAX_NEEDED_FROM;
166               step->__min_needed_to = MIN_NEEDED_TO;
167               step->__max_needed_to = MIN_NEEDED_TO;
168             }
169           else
170             {
171               step->__min_needed_from = MIN_NEEDED_TO;
172               step->__max_needed_from = MIN_NEEDED_TO;
173               step->__min_needed_to = MIN_NEEDED_FROM;
174               step->__max_needed_to = MAX_NEEDED_FROM;
175             }
176
177           step->__stateful = 0;
178
179           result = __GCONV_OK;
180         }
181     }
182
183   return result;
184 }
185
186
187 extern void gconv_end (struct __gconv_step *data);
188 void
189 gconv_end (struct __gconv_step *data)
190 {
191   free (data->__data);
192 }
193
194
195 /* Convert from the internal (UCS4-like) format to UTF-16.  */
196 #define MIN_NEEDED_INPUT        MIN_NEEDED_TO
197 #define MIN_NEEDED_OUTPUT       MIN_NEEDED_FROM
198 #define MAX_NEEDED_OUTPUT       MAX_NEEDED_FROM
199 #define LOOPFCT                 TO_LOOP
200 #define BODY \
201   {                                                                           \
202     uint32_t c = get32 (inptr);                                               \
203                                                                               \
204     if (__glibc_unlikely (c >= 0xd800 && c < 0xe000))                         \
205       {                                                                       \
206         /* Surrogate characters in UCS-4 input are not valid.                 \
207            We must catch this.  If we let surrogates pass through,            \
208            attackers could make a security hole exploit by                    \
209            synthesizing any desired plane 1-16 character.  */                 \
210         result = __GCONV_ILLEGAL_INPUT;                                       \
211         if (! ignore_errors_p ())                                             \
212           break;                                                              \
213         inptr += 4;                                                           \
214         ++*irreversible;                                                      \
215         continue;                                                             \
216       }                                                                       \
217                                                                               \
218     if (swap)                                                                 \
219       {                                                                       \
220         if (__glibc_unlikely (c >= 0x10000))                                  \
221           {                                                                   \
222             if (__glibc_unlikely (c >= 0x110000))                             \
223               {                                                               \
224                 STANDARD_TO_LOOP_ERR_HANDLER (4);                             \
225               }                                                               \
226                                                                               \
227             /* Generate a surrogate character.  */                            \
228             if (__glibc_unlikely (outptr + 4 > outend))                       \
229               {                                                               \
230                 /* Overflow in the output buffer.  */                         \
231                 result = __GCONV_FULL_OUTPUT;                                 \
232                 break;                                                        \
233               }                                                               \
234                                                                               \
235             put16 (outptr, bswap_16 (0xd7c0 + (c >> 10)));                    \
236             outptr += 2;                                                      \
237             put16 (outptr, bswap_16 (0xdc00 + (c & 0x3ff)));                  \
238           }                                                                   \
239         else                                                                  \
240           put16 (outptr, bswap_16 (c));                                       \
241       }                                                                       \
242     else                                                                      \
243       {                                                                       \
244         if (__glibc_unlikely (c >= 0x10000))                                  \
245           {                                                                   \
246             if (__glibc_unlikely (c >= 0x110000))                             \
247               {                                                               \
248                 STANDARD_TO_LOOP_ERR_HANDLER (4);                             \
249               }                                                               \
250                                                                               \
251             /* Generate a surrogate character.  */                            \
252             if (__glibc_unlikely (outptr + 4 > outend))                       \
253               {                                                               \
254                 /* Overflow in the output buffer.  */                         \
255                 result = __GCONV_FULL_OUTPUT;                                 \
256                 break;                                                        \
257               }                                                               \
258                                                                               \
259             put16 (outptr, 0xd7c0 + (c >> 10));                               \
260             outptr += 2;                                                      \
261             put16 (outptr, 0xdc00 + (c & 0x3ff));                             \
262           }                                                                   \
263         else                                                                  \
264           put16 (outptr, c);                                                  \
265       }                                                                       \
266     outptr += 2;                                                              \
267     inptr += 4;                                                               \
268   }
269 #define LOOP_NEED_FLAGS
270 #define EXTRA_LOOP_DECLS \
271         , int swap
272 #include <iconv/loop.c>
273
274
275 /* Convert from UTF-16 to the internal (UCS4-like) format.  */
276 #define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
277 #define MAX_NEEDED_INPUT        MAX_NEEDED_FROM
278 #define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
279 #define LOOPFCT                 FROM_LOOP
280 #define BODY \
281   {                                                                           \
282     uint16_t u1 = get16 (inptr);                                              \
283                                                                               \
284     if (swap)                                                                 \
285       {                                                                       \
286         u1 = bswap_16 (u1);                                                   \
287                                                                               \
288         if (__builtin_expect (u1 < 0xd800, 1) || u1 > 0xdfff)                 \
289           {                                                                   \
290             /* No surrogate.  */                                              \
291             put32 (outptr, u1);                                               \
292             inptr += 2;                                                       \
293           }                                                                   \
294         else                                                                  \
295           {                                                                   \
296             uint16_t u2;                                                      \
297                                                                               \
298             /* It's a surrogate character.  At least the first word says      \
299                it is.  */                                                     \
300             if (__glibc_unlikely (inptr + 4 > inend))                         \
301               {                                                               \
302                 /* We don't have enough input for another complete input      \
303                    character.  */                                             \
304                 result = __GCONV_INCOMPLETE_INPUT;                            \
305                 break;                                                        \
306               }                                                               \
307                                                                               \
308             inptr += 2;                                                       \
309             u2 = bswap_16 (get16 (inptr));                                    \
310             if (__builtin_expect (u2 < 0xdc00, 0)                             \
311                 || __builtin_expect (u2 > 0xdfff, 0))                         \
312               {                                                               \
313                 /* This is no valid second word for a surrogate.  */          \
314                 inptr -= 2;                                                   \
315                 STANDARD_FROM_LOOP_ERR_HANDLER (2);                           \
316               }                                                               \
317                                                                               \
318             put32 (outptr, ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00));            \
319             inptr += 2;                                                       \
320           }                                                                   \
321       }                                                                       \
322     else                                                                      \
323       {                                                                       \
324         if (__builtin_expect (u1 < 0xd800, 1) || u1 > 0xdfff)                 \
325           {                                                                   \
326             /* No surrogate.  */                                              \
327             put32 (outptr, u1);                                               \
328             inptr += 2;                                                       \
329           }                                                                   \
330         else                                                                  \
331           {                                                                   \
332             /* It's a surrogate character.  At least the first word says      \
333                it is.  */                                                     \
334             if (__glibc_unlikely (inptr + 4 > inend))                         \
335               {                                                               \
336                 /* We don't have enough input for another complete input      \
337                    character.  */                                             \
338                 result = __GCONV_INCOMPLETE_INPUT;                            \
339                 break;                                                        \
340               }                                                               \
341                                                                               \
342             inptr += 2;                                                       \
343             uint16_t u2 = get16 (inptr);                                      \
344             if (__builtin_expect (u2 < 0xdc00, 0)                             \
345                 || __builtin_expect (u2 > 0xdfff, 0))                         \
346               {                                                               \
347                 /* This is no valid second word for a surrogate.  */          \
348                 inptr -= 2;                                                   \
349                 STANDARD_FROM_LOOP_ERR_HANDLER (2);                           \
350               }                                                               \
351                                                                               \
352             put32 (outptr, ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00));            \
353             inptr += 2;                                                       \
354           }                                                                   \
355       }                                                                       \
356     outptr += 4;                                                              \
357   }
358 #define LOOP_NEED_FLAGS
359 #define EXTRA_LOOP_DECLS \
360         , int swap
361 #include <iconv/loop.c>
362
363
364 /* Now define the toplevel functions.  */
365 #include <iconv/skeleton.c>