gdb/gnulib/import/mbrtowc.c

   1 /* Convert multibyte character to wide character.
   2    Copyright (C) 1999-2002, 2005-2016 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2008.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  17
  18 #include <config.h>
  19
  20 /* Specification.  */
  21 #include <wchar.h>
  22
  23 #if C_LOCALE_MAYBE_EILSEQ
  24 # include "hard-locale.h"
  25 # include <locale.h>
  26 #endif
  27
  28 #if GNULIB_defined_mbstate_t
  29 /* Implement mbrtowc() on top of mbtowc().  */
  30
  31 # include <errno.h>
  32 # include <stdlib.h>
  33
  34 # include "localcharset.h"
  35 # include "streq.h"
  36 # include "verify.h"
  37
  38
/* The emulation below keeps its conversion state in the first four bytes of
   the caller's mbstate_t (accessed through a char pointer), so check that
   the type provides at least that much room.  */
verify (sizeof (mbstate_t) >= 4);

/* Conversion state used by mbrtowc() when it is called with a null PS.
   Layout, as used by mbrtowc(): byte 0 holds the number of bytes (0..3) of
   an incomplete character that have been buffered so far, and bytes 1..3
   hold those buffered bytes.  */
static char internal_state[4];
  42
  43 size_t
  44 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
  45 {
  46   char *pstate = (char *)ps;
  47
  48   if (s == NULL)
  49     {
  50       pwc = NULL;
  51       s = "";
  52       n = 1;
  53     }
  54
  55   if (n == 0)
  56     return (size_t)(-2);
  57
  58   /* Here n > 0.  */
  59
  60   if (pstate == NULL)
  61     pstate = internal_state;
  62
  63   {
  64     size_t nstate = pstate[0];
  65     char buf[4];
  66     const char *p;
  67     size_t m;
  68
  69     switch (nstate)
  70       {
  71       case 0:
  72         p = s;
  73         m = n;
  74         break;
  75       case 3:
  76         buf[2] = pstate[3];
  77         /*FALLTHROUGH*/
  78       case 2:
  79         buf[1] = pstate[2];
  80         /*FALLTHROUGH*/
  81       case 1:
  82         buf[0] = pstate[1];
  83         p = buf;
  84         m = nstate;
  85         buf[m++] = s[0];
  86         if (n >= 2 && m < 4)
  87           {
  88             buf[m++] = s[1];
  89             if (n >= 3 && m < 4)
  90               buf[m++] = s[2];
  91           }
  92         break;
  93       default:
  94         errno = EINVAL;
  95         return (size_t)(-1);
  96       }
  97
  98     /* Here m > 0.  */
  99
 100 # if __GLIBC__ || defined __UCLIBC__
 101     /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
 102     mbtowc (NULL, NULL, 0);
 103 # endif
 104     {
 105       int res = mbtowc (pwc, p, m);
 106
 107       if (res >= 0)
 108         {
 109           if (pwc != NULL && ((*pwc == 0) != (res == 0)))
 110             abort ();
 111           if (nstate >= (res > 0 ? res : 1))
 112             abort ();
 113           res -= nstate;
 114           pstate[0] = 0;
 115           return res;
 116         }
 117
 118       /* mbtowc does not distinguish between invalid and incomplete multibyte
 119          sequences.  But mbrtowc needs to make this distinction.
 120          There are two possible approaches:
 121            - Use iconv() and its return value.
 122            - Use built-in knowledge about the possible encodings.
 123          Given the low quality of implementation of iconv() on the systems that
 124          lack mbrtowc(), we use the second approach.
 125          The possible encodings are:
 126            - 8-bit encodings,
 127            - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
 128            - UTF-8.
 129          Use specialized code for each.  */
 130       if (m >= 4 || m >= MB_CUR_MAX)
 131         goto invalid;
 132       /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
 133       {
 134         const char *encoding = locale_charset ();
 135
 136         if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
 137           {
 138             /* Cf. unistr/u8-mblen.c.  */
 139             unsigned char c = (unsigned char) p[0];
 140
 141             if (c >= 0xc2)
 142               {
 143                 if (c < 0xe0)
 144                   {
 145                     if (m == 1)
 146                       goto incomplete;
 147                   }
 148                 else if (c < 0xf0)
 149                   {
 150                     if (m == 1)
 151                       goto incomplete;
 152                     if (m == 2)
 153                       {
 154                         unsigned char c2 = (unsigned char) p[1];
 155
 156                         if ((c2 ^ 0x80) < 0x40
 157                             && (c >= 0xe1 || c2 >= 0xa0)
 158                             && (c != 0xed || c2 < 0xa0))
 159                           goto incomplete;
 160                       }
 161                   }
 162                 else if (c <= 0xf4)
 163                   {
 164                     if (m == 1)
 165                       goto incomplete;
 166                     else /* m == 2 || m == 3 */
 167                       {
 168                         unsigned char c2 = (unsigned char) p[1];
 169
 170                         if ((c2 ^ 0x80) < 0x40
 171                             && (c >= 0xf1 || c2 >= 0x90)
 172                             && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
 173                           {
 174                             if (m == 2)
 175                               goto incomplete;
 176                             else /* m == 3 */
 177                               {
 178                                 unsigned char c3 = (unsigned char) p[2];
 179
 180                                 if ((c3 ^ 0x80) < 0x40)
 181                                   goto incomplete;
 182                               }
 183                           }
 184                       }
 185                   }
 186               }
 187             goto invalid;
 188           }
 189
 190         /* As a reference for this code, you can use the GNU libiconv
 191            implementation.  Look for uses of the RET_TOOFEW macro.  */
 192
 193         if (STREQ_OPT (encoding,
 194                        "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
 195           {
 196             if (m == 1)
 197               {
 198                 unsigned char c = (unsigned char) p[0];
 199
 200                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
 201                   goto incomplete;
 202               }
 203             if (m == 2)
 204               {
 205                 unsigned char c = (unsigned char) p[0];
 206
 207                 if (c == 0x8f)
 208                   {
 209                     unsigned char c2 = (unsigned char) p[1];
 210
 211                     if (c2 >= 0xa1 && c2 < 0xff)
 212                       goto incomplete;
 213                   }
 214               }
 215             goto invalid;
 216           }
 217         if (STREQ_OPT (encoding,
 218                        "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
 219             || STREQ_OPT (encoding,
 220                           "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
 221             || STREQ_OPT (encoding,
 222                           "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
 223           {
 224             if (m == 1)
 225               {
 226                 unsigned char c = (unsigned char) p[0];
 227
 228                 if (c >= 0xa1 && c < 0xff)
 229                   goto incomplete;
 230               }
 231             goto invalid;
 232           }
 233         if (STREQ_OPT (encoding,
 234                        "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
 235           {
 236             if (m == 1)
 237               {
 238                 unsigned char c = (unsigned char) p[0];
 239
 240                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
 241                   goto incomplete;
 242               }
 243             else /* m == 2 || m == 3 */
 244               {
 245                 unsigned char c = (unsigned char) p[0];
 246
 247                 if (c == 0x8e)
 248                   goto incomplete;
 249               }
 250             goto invalid;
 251           }
 252         if (STREQ_OPT (encoding,
 253                        "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
 254           {
 255             if (m == 1)
 256               {
 257                 unsigned char c = (unsigned char) p[0];
 258
 259                 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
 260                   goto incomplete;
 261               }
 262             else /* m == 2 || m == 3 */
 263               {
 264                 unsigned char c = (unsigned char) p[0];
 265
 266                 if (c >= 0x90 && c <= 0xe3)
 267                   {
 268                     unsigned char c2 = (unsigned char) p[1];
 269
 270                     if (c2 >= 0x30 && c2 <= 0x39)
 271                       {
 272                         if (m == 2)
 273                           goto incomplete;
 274                         else /* m == 3 */
 275                           {
 276                             unsigned char c3 = (unsigned char) p[2];
 277
 278                             if (c3 >= 0x81 && c3 <= 0xfe)
 279                               goto incomplete;
 280                           }
 281                       }
 282                   }
 283               }
 284             goto invalid;
 285           }
 286         if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
 287           {
 288             if (m == 1)
 289               {
 290                 unsigned char c = (unsigned char) p[0];
 291
 292                 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
 293                     || (c >= 0xf0 && c <= 0xf9))
 294                   goto incomplete;
 295               }
 296             goto invalid;
 297           }
 298
 299         /* An unknown multibyte encoding.  */
 300         goto incomplete;
 301       }
 302
 303      incomplete:
 304       {
 305         size_t k = nstate;
 306         /* Here 0 <= k < m < 4.  */
 307         pstate[++k] = s[0];
 308         if (k < m)
 309           {
 310             pstate[++k] = s[1];
 311             if (k < m)
 312               pstate[++k] = s[2];
 313           }
 314         if (k != m)
 315           abort ();
 316       }
 317       pstate[0] = m;
 318       return (size_t)(-2);
 319
 320      invalid:
 321       errno = EILSEQ;
 322       /* The conversion state is undefined, says POSIX.  */
 323       return (size_t)(-1);
 324     }
 325   }
 326 }
 327
 328 #else
 329 /* Override the system's mbrtowc() function.  */
 330
 331 # undef mbrtowc
 332
 333 size_t
 334 rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
 335 {
 336   size_t ret;
 337   wchar_t wc;
 338
 339 # if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
 340   if (s == NULL)
 341     {
 342       pwc = NULL;
 343       s = "";
 344       n = 1;
 345     }
 346 # endif
 347
 348 # if MBRTOWC_EMPTY_INPUT_BUG
 349   if (n == 0)
 350     return (size_t) -2;
 351 # endif
 352
 353   if (! pwc)
 354     pwc = &wc;
 355
 356 # if MBRTOWC_RETVAL_BUG
 357   {
 358     static mbstate_t internal_state;
 359
 360     /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
 361        hidden internal state, but we can call it on our variable.  */
 362     if (ps == NULL)
 363       ps = &internal_state;
 364
 365     if (!mbsinit (ps))
 366       {
 367         /* Parse the rest of the multibyte character byte for byte.  */
 368         size_t count = 0;
 369         for (; n > 0; s++, n--)
 370           {
 371             ret = mbrtowc (&wc, s, 1, ps);
 372
 373             if (ret == (size_t)(-1))
 374               return (size_t)(-1);
 375             count++;
 376             if (ret != (size_t)(-2))
 377               {
 378                 /* The multibyte character has been completed.  */
 379                 *pwc = wc;
 380                 return (wc == 0 ? 0 : count);
 381               }
 382           }
 383         return (size_t)(-2);
 384       }
 385   }
 386 # endif
 387
 388   ret = mbrtowc (pwc, s, n, ps);
 389
 390 # if MBRTOWC_NUL_RETVAL_BUG
 391   if (ret < (size_t) -2 && !*pwc)
 392     return 0;
 393 # endif
 394
 395 # if C_LOCALE_MAYBE_EILSEQ
 396   if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE))
 397     {
 398       unsigned char uc = *s;
 399       *pwc = uc;
 400       return 1;
 401     }
 402 # endif
 403
 404   return ret;
 405 }
 406
 407 #endif