1 /* Convert multibyte character to wide character.
2 Copyright (C) 1999-2002, 2005-2016 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2008.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
23 #if C_LOCALE_MAYBE_EILSEQ
24 # include "hard-locale.h"
28 #if GNULIB_defined_mbstate_t
29 /* Implement mbrtowc() on top of mbtowc(). */
34 # include "localcharset.h"
39 verify (sizeof (mbstate_t) >= 4);
41 static char internal_state[4];
44 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
46 char *pstate = (char *)ps;
61 pstate = internal_state;
64 size_t nstate = pstate[0];
100 # if __GLIBC__ || defined __UCLIBC__
101 /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
102 mbtowc (NULL, NULL, 0);
105 int res = mbtowc (pwc, p, m);
109 if (pwc != NULL && ((*pwc == 0) != (res == 0)))
111 if (nstate >= (res > 0 ? res : 1))
118 /* mbtowc does not distinguish between invalid and incomplete multibyte
119 sequences. But mbrtowc needs to make this distinction.
120 There are two possible approaches:
121 - Use iconv() and its return value.
122 - Use built-in knowledge about the possible encodings.
123 Given the low quality of implementation of iconv() on the systems that
124 lack mbrtowc(), we use the second approach.
125 The possible encodings are:
127 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS, UTF-8.
129 Use specialized code for each. */
130 if (m >= 4 || m >= MB_CUR_MAX)
132 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
134 const char *encoding = locale_charset ();
136 if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
138 /* Cf. unistr/u8-mblen.c. */
139 unsigned char c = (unsigned char) p[0];
154 unsigned char c2 = (unsigned char) p[1];
156 if ((c2 ^ 0x80) < 0x40
157 && (c >= 0xe1 || c2 >= 0xa0)
158 && (c != 0xed || c2 < 0xa0))
166 else /* m == 2 || m == 3 */
168 unsigned char c2 = (unsigned char) p[1];
170 if ((c2 ^ 0x80) < 0x40
171 && (c >= 0xf1 || c2 >= 0x90)
172 && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
178 unsigned char c3 = (unsigned char) p[2];
180 if ((c3 ^ 0x80) < 0x40)
190 /* As a reference for this code, you can use the GNU libiconv
191 implementation. Look for uses of the RET_TOOFEW macro. */
193 if (STREQ_OPT (encoding,
194 "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
198 unsigned char c = (unsigned char) p[0];
200 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
205 unsigned char c = (unsigned char) p[0];
209 unsigned char c2 = (unsigned char) p[1];
211 if (c2 >= 0xa1 && c2 < 0xff)
217 if (STREQ_OPT (encoding,
218 "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
219 || STREQ_OPT (encoding,
220 "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
221 || STREQ_OPT (encoding,
222 "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
226 unsigned char c = (unsigned char) p[0];
228 if (c >= 0xa1 && c < 0xff)
233 if (STREQ_OPT (encoding,
234 "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
238 unsigned char c = (unsigned char) p[0];
240 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
243 else /* m == 2 || m == 3 */
245 unsigned char c = (unsigned char) p[0];
252 if (STREQ_OPT (encoding,
253 "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
257 unsigned char c = (unsigned char) p[0];
259 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
262 else /* m == 2 || m == 3 */
264 unsigned char c = (unsigned char) p[0];
266 if (c >= 0x90 && c <= 0xe3)
268 unsigned char c2 = (unsigned char) p[1];
270 if (c2 >= 0x30 && c2 <= 0x39)
276 unsigned char c3 = (unsigned char) p[2];
278 if (c3 >= 0x81 && c3 <= 0xfe)
286 if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
290 unsigned char c = (unsigned char) p[0];
292 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
293 || (c >= 0xf0 && c <= 0xf9))
299 /* An unknown multibyte encoding. */
306 /* Here 0 <= k < m < 4. */
322 /* The conversion state is undefined, says POSIX. */
329 /* Override the system's mbrtowc() function. */
334 rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
339 # if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
348 # if MBRTOWC_EMPTY_INPUT_BUG
356 # if MBRTOWC_RETVAL_BUG
358 static mbstate_t internal_state;
360 /* Override mbrtowc's internal state. We cannot call mbsinit() on the
361 hidden internal state, but we can call it on our variable. */
363 ps = &internal_state;
367 /* Parse the rest of the multibyte character one byte at a time. */
369 for (; n > 0; s++, n--)
371 ret = mbrtowc (&wc, s, 1, ps);
373 if (ret == (size_t)(-1))
376 if (ret != (size_t)(-2))
378 /* The multibyte character has been completed. */
380 return (wc == 0 ? 0 : count);
388 ret = mbrtowc (pwc, s, n, ps);
390 # if MBRTOWC_NUL_RETVAL_BUG
391 if (ret < (size_t) -2 && !*pwc)
395 # if C_LOCALE_MAYBE_EILSEQ
396 if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE))
398 unsigned char uc = *s;