gettext-tools/src/po-charset.c

   1 /* Charset handling while reading PO files.
   2    Copyright (C) 2001-2007, 2010 Free Software Foundation, Inc.
   3    Written by Bruno Haible <haible@clisp.cons.org>, 2001.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  17
  18
  19 #ifdef HAVE_CONFIG_H
  20 # include "config.h"
  21 #endif
  22 #include <alloca.h>
  23
  24 /* Specification.  */
  25 #include "po-charset.h"
  26
  27 #include <stdlib.h>
  28 #include <string.h>
  29
  30 #include "xmalloca.h"
  31 #include "xvasprintf.h"
  32 #include "po-xerror.h"
  33 #include "basename.h"
  34 #include "progname.h"
  35 #include "c-strstr.h"
  36 #include "c-strcase.h"
  37 #include "gettext.h"
  38
  39 #define _(str) gettext (str)
  40
  41 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  42
/* Storage for the canonical ASCII name.  po_charset_canonicalize returns
   this very array for every ASCII alias.  */
static const char ascii[] = "ASCII";

/* The canonicalized encoding name for ASCII.  */
const char *po_charset_ascii = ascii;

/* Storage for the canonical UTF-8 name.  po_charset_character_iterator
   recognizes UTF-8 by comparing against this address, not by string
   comparison.  */
static const char utf8[] = "UTF-8";

/* The canonicalized encoding name for UTF-8.  */
const char *po_charset_utf8 = utf8;
  52
  53 /* Canonicalize an encoding name.  */
  54 const char *
  55 po_charset_canonicalize (const char *charset)
  56 {
  57   /* The list of charsets supported by glibc's iconv() and by the portable
  58      iconv() across platforms.  Taken from intl/config.charset.  */
  59   static const char *standard_charsets[] =
  60   {
  61     ascii, "ANSI_X3.4-1968", "US-ASCII",        /* i = 0..2 */
  62     "ISO-8859-1", "ISO_8859-1",                 /* i = 3, 4 */
  63     "ISO-8859-2", "ISO_8859-2",
  64     "ISO-8859-3", "ISO_8859-3",
  65     "ISO-8859-4", "ISO_8859-4",
  66     "ISO-8859-5", "ISO_8859-5",
  67     "ISO-8859-6", "ISO_8859-6",
  68     "ISO-8859-7", "ISO_8859-7",
  69     "ISO-8859-8", "ISO_8859-8",
  70     "ISO-8859-9", "ISO_8859-9",
  71     "ISO-8859-13", "ISO_8859-13",
  72     "ISO-8859-14", "ISO_8859-14",
  73     "ISO-8859-15", "ISO_8859-15",               /* i = 25, 26 */
  74     "KOI8-R",
  75     "KOI8-U",
  76     "KOI8-T",
  77     "CP850",
  78     "CP866",
  79     "CP874",
  80     "CP932",
  81     "CP949",
  82     "CP950",
  83     "CP1250",
  84     "CP1251",
  85     "CP1252",
  86     "CP1253",
  87     "CP1254",
  88     "CP1255",
  89     "CP1256",
  90     "CP1257",
  91     "GB2312",
  92     "EUC-JP",
  93     "EUC-KR",
  94     "EUC-TW",
  95     "BIG5",
  96     "BIG5-HKSCS",
  97     "GBK",
  98     "GB18030",
  99     "SHIFT_JIS",
 100     "JOHAB",
 101     "TIS-620",
 102     "VISCII",
 103     "GEORGIAN-PS",
 104     utf8
 105   };
 106   size_t i;
 107
 108   for (i = 0; i < SIZEOF (standard_charsets); i++)
 109     if (c_strcasecmp (charset, standard_charsets[i]) == 0)
 110       return standard_charsets[i < 3 ? 0 : i < 27 ? ((i - 3) & ~1) + 3 : i];
 111   return NULL;
 112 }
 113
 114 /* Test for ASCII compatibility.  */
 115 bool
 116 po_charset_ascii_compatible (const char *canon_charset)
 117 {
 118   /* There are only a few exceptions to ASCII compatibility.  */
 119   if (strcmp (canon_charset, "SHIFT_JIS") == 0
 120       || strcmp (canon_charset, "JOHAB") == 0
 121       || strcmp (canon_charset, "VISCII") == 0)
 122     return false;
 123   else
 124     return true;
 125 }
 126
 127 /* Test for a weird encoding, i.e. an encoding which has double-byte
 128    characters ending in 0x5C.  */
 129 bool po_is_charset_weird (const char *canon_charset)
 130 {
 131   static const char *weird_charsets[] =
 132   {
 133     "BIG5",
 134     "BIG5-HKSCS",
 135     "GBK",
 136     "GB18030",
 137     "SHIFT_JIS",
 138     "JOHAB"
 139   };
 140   size_t i;
 141
 142   for (i = 0; i < SIZEOF (weird_charsets); i++)
 143     if (strcmp (canon_charset, weird_charsets[i]) == 0)
 144       return true;
 145   return false;
 146 }
 147
 148 /* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
 149    An encoding has CJK structure if every valid character stream is composed
 150    of single bytes in the range 0x{00..7F} and of byte pairs in the range
 151    0x{80..FF}{30..FF}.  */
 152 bool po_is_charset_weird_cjk (const char *canon_charset)
 153 {
 154   static const char *weird_cjk_charsets[] =
 155   {                     /* single bytes   double bytes       */
 156     "BIG5",             /* 0x{00..7F},    0x{A1..F9}{40..FE} */
 157     "BIG5-HKSCS",       /* 0x{00..7F},    0x{88..FE}{40..FE} */
 158     "GBK",              /* 0x{00..7F},    0x{81..FE}{40..FE} */
 159     "GB18030",          /* 0x{00..7F},    0x{81..FE}{30..FE} */
 160     "SHIFT_JIS",        /* 0x{00..7F},    0x{81..F9}{40..FC} */
 161     "JOHAB"             /* 0x{00..7F},    0x{84..F9}{31..FE} */
 162   };
 163   size_t i;
 164
 165   for (i = 0; i < SIZEOF (weird_cjk_charsets); i++)
 166     if (strcmp (canon_charset, weird_cjk_charsets[i]) == 0)
 167       return true;
 168   return false;
 169 }
 170
 171 /* Hardcoded iterator functions for all kinds of encodings.
 172    We could also implement a general iterator function with iconv(),
 173    but we need a fast one.  */
 174
 175 /* Character iterator for 8-bit encodings.  */
 176 static size_t
 177 char_iterator (const char *s)
 178 {
 179   return 1;
 180 }
 181
 182 /* Character iterator for GB2312.  See libiconv/lib/euc_cn.h.  */
 183 /* Character iterator for EUC-KR.  See libiconv/lib/euc_kr.h.  */
 184 static size_t
 185 euc_character_iterator (const char *s)
 186 {
 187   unsigned char c = *s;
 188   if (c >= 0xa1 && c < 0xff)
 189     {
 190       unsigned char c2 = s[1];
 191       if (c2 >= 0xa1 && c2 < 0xff)
 192         return 2;
 193     }
 194   return 1;
 195 }
 196
 197 /* Character iterator for EUC-JP.  See libiconv/lib/euc_jp.h.  */
 198 static size_t
 199 euc_jp_character_iterator (const char *s)
 200 {
 201   unsigned char c = *s;
 202   if (c >= 0xa1 && c < 0xff)
 203     {
 204       unsigned char c2 = s[1];
 205       if (c2 >= 0xa1 && c2 < 0xff)
 206         return 2;
 207     }
 208   else if (c == 0x8e)
 209     {
 210       unsigned char c2 = s[1];
 211       if (c2 >= 0xa1 && c2 < 0xe0)
 212         return 2;
 213     }
 214   else if (c == 0x8f)
 215     {
 216       unsigned char c2 = s[1];
 217       if (c2 >= 0xa1 && c2 < 0xff)
 218         {
 219           unsigned char c3 = s[2];
 220           if (c3 >= 0xa1 && c3 < 0xff)
 221             return 3;
 222         }
 223     }
 224   return 1;
 225 }
 226
 227 /* Character iterator for EUC-TW.  See libiconv/lib/euc_tw.h.  */
 228 static size_t
 229 euc_tw_character_iterator (const char *s)
 230 {
 231   unsigned char c = *s;
 232   if (c >= 0xa1 && c < 0xff)
 233     {
 234       unsigned char c2 = s[1];
 235       if (c2 >= 0xa1 && c2 < 0xff)
 236         return 2;
 237     }
 238   else if (c == 0x8e)
 239     {
 240       unsigned char c2 = s[1];
 241       if (c2 >= 0xa1 && c2 <= 0xb0)
 242         {
 243           unsigned char c3 = s[2];
 244           if (c3 >= 0xa1 && c3 < 0xff)
 245             {
 246               unsigned char c4 = s[3];
 247               if (c4 >= 0xa1 && c4 < 0xff)
 248                 return 4;
 249             }
 250         }
 251     }
 252   return 1;
 253 }
 254
 255 /* Character iterator for BIG5.  See libiconv/lib/ces_big5.h.  */
 256 static size_t
 257 big5_character_iterator (const char *s)
 258 {
 259   unsigned char c = *s;
 260   if (c >= 0xa1 && c < 0xff)
 261     {
 262       unsigned char c2 = s[1];
 263       if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0xa1 && c2 < 0xff))
 264         return 2;
 265     }
 266   return 1;
 267 }
 268
 269 /* Character iterator for BIG5-HKSCS.  See libiconv/lib/big5hkscs.h.  */
 270 static size_t
 271 big5hkscs_character_iterator (const char *s)
 272 {
 273   unsigned char c = *s;
 274   if (c >= 0x88 && c < 0xff)
 275     {
 276       unsigned char c2 = s[1];
 277       if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0xa1 && c2 < 0xff))
 278         return 2;
 279     }
 280   return 1;
 281 }
 282
 283 /* Character iterator for GBK.  See libiconv/lib/ces_gbk.h and
 284    libiconv/lib/gbk.h.  */
 285 static size_t
 286 gbk_character_iterator (const char *s)
 287 {
 288   unsigned char c = *s;
 289   if (c >= 0x81 && c < 0xff)
 290     {
 291       unsigned char c2 = s[1];
 292       if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0x80 && c2 < 0xff))
 293         return 2;
 294     }
 295   return 1;
 296 }
 297
 298 /* Character iterator for GB18030.  See libiconv/lib/gb18030.h.  */
 299 static size_t
 300 gb18030_character_iterator (const char *s)
 301 {
 302   unsigned char c = *s;
 303   if (c >= 0x81 && c < 0xff)
 304     {
 305       unsigned char c2 = s[1];
 306       if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0x80 && c2 < 0xff))
 307         return 2;
 308     }
 309   if (c >= 0x81 && c <= 0x84)
 310     {
 311       unsigned char c2 = s[1];
 312       if (c2 >= 0x30 && c2 <= 0x39)
 313         {
 314           unsigned char c3 = s[2];
 315           if (c3 >= 0x81 && c3 < 0xff)
 316             {
 317               unsigned char c4 = s[3];
 318               if (c4 >= 0x30 && c4 <= 0x39)
 319                 return 4;
 320             }
 321         }
 322     }
 323   return 1;
 324 }
 325
 326 /* Character iterator for SHIFT_JIS.  See libiconv/lib/sjis.h.  */
 327 static size_t
 328 shift_jis_character_iterator (const char *s)
 329 {
 330   unsigned char c = *s;
 331   if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xf9))
 332     {
 333       unsigned char c2 = s[1];
 334       if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfc))
 335         return 2;
 336     }
 337   return 1;
 338 }
 339
 340 /* Character iterator for JOHAB.  See libiconv/lib/johab.h and
 341    libiconv/lib/johab_hangul.h.  */
 342 static size_t
 343 johab_character_iterator (const char *s)
 344 {
 345   unsigned char c = *s;
 346   if (c >= 0x84 && c <= 0xd3)
 347     {
 348       unsigned char c2 = s[1];
 349       if ((c2 >= 0x41 && c2 < 0x7f) || (c2 >= 0x81 && c2 < 0xff))
 350         return 2;
 351     }
 352   else if (c >= 0xd9 && c <= 0xf9)
 353     {
 354       unsigned char c2 = s[1];
 355       if ((c2 >= 0x31 && c2 <= 0x7e) || (c2 >= 0x91 && c2 <= 0xfe))
 356         return 2;
 357     }
 358   return 1;
 359 }
 360
 361 /* Character iterator for UTF-8.  See libiconv/lib/utf8.h.  */
 362 static size_t
 363 utf8_character_iterator (const char *s)
 364 {
 365   unsigned char c = *s;
 366   if (c >= 0xc2)
 367     {
 368       if (c < 0xe0)
 369         {
 370           unsigned char c2 = s[1];
 371           if (c2 >= 0x80 && c2 < 0xc0)
 372             return 2;
 373         }
 374       else if (c < 0xf0)
 375         {
 376           unsigned char c2 = s[1];
 377           if (c2 >= 0x80 && c2 < 0xc0)
 378             {
 379               unsigned char c3 = s[2];
 380               if (c3 >= 0x80 && c3 < 0xc0)
 381                 return 3;
 382             }
 383         }
 384       else if (c < 0xf8)
 385         {
 386           unsigned char c2 = s[1];
 387           if (c2 >= 0x80 && c2 < 0xc0)
 388             {
 389               unsigned char c3 = s[2];
 390               if (c3 >= 0x80 && c3 < 0xc0)
 391                 {
 392                   unsigned char c4 = s[3];
 393                   if (c4 >= 0x80 && c4 < 0xc0)
 394                     return 4;
 395                 }
 396             }
 397         }
 398     }
 399   return 1;
 400 }
 401
 402 /* Returns a character iterator for a given encoding.
 403    Given a pointer into a string, it returns the number occupied by the next
 404    single character.  If the piece of string is not valid or if the *s == '\0',
 405    it returns 1.  */
 406 character_iterator_t
 407 po_charset_character_iterator (const char *canon_charset)
 408 {
 409   if (canon_charset == utf8)
 410     return utf8_character_iterator;
 411   if (strcmp (canon_charset, "GB2312") == 0
 412       || strcmp (canon_charset, "EUC-KR") == 0)
 413     return euc_character_iterator;
 414   if (strcmp (canon_charset, "EUC-JP") == 0)
 415     return euc_jp_character_iterator;
 416   if (strcmp (canon_charset, "EUC-TW") == 0)
 417     return euc_tw_character_iterator;
 418   if (strcmp (canon_charset, "BIG5") == 0)
 419     return big5_character_iterator;
 420   if (strcmp (canon_charset, "BIG5-HKSCS") == 0)
 421     return big5hkscs_character_iterator;
 422   if (strcmp (canon_charset, "GBK") == 0)
 423     return gbk_character_iterator;
 424   if (strcmp (canon_charset, "GB18030") == 0)
 425     return gb18030_character_iterator;
 426   if (strcmp (canon_charset, "SHIFT_JIS") == 0)
 427     return shift_jis_character_iterator;
 428   if (strcmp (canon_charset, "JOHAB") == 0)
 429     return johab_character_iterator;
 430   return char_iterator;
 431 }
 432
 433
/* The PO file's encoding, as specified in the header entry.
   NULL after po_lex_charset_init and po_lex_charset_close.
   po_lex_charset_set stores here the canonical name returned by
   po_charset_canonicalize, and leaves the value untouched when the header's
   charset is missing or not recognized.  */
const char *po_lex_charset;

#if HAVE_ICONV
/* Converter from the PO file's encoding to UTF-8.
   (iconv_t)(-1) when no converter is open.  */
iconv_t po_lex_iconv;
#endif
/* If no converter is available, some information about the structure of the
   PO file's encoding.  Set from po_is_charset_weird_cjk by
   po_lex_charset_set; false otherwise.  */
bool po_lex_weird_cjk;
 444
 445 void
 446 po_lex_charset_init ()
 447 {
 448   po_lex_charset = NULL;
 449 #if HAVE_ICONV
 450   po_lex_iconv = (iconv_t)(-1);
 451 #endif
 452   po_lex_weird_cjk = false;
 453 }
 454
/* Determine the PO file's encoding from HEADER_ENTRY and set up
   po_lex_charset, po_lex_iconv and po_lex_weird_cjk accordingly.
   HEADER_ENTRY is searched for "charset="; the charset name is the text
   that follows, up to the first space, tab or newline.
   FILENAME is used for the warnings and to recognize POT files (names
   ending in ".pot"), for which some warnings are suppressed.
   If the charset is missing or not a portable name, a warning is emitted
   through po_xerror (except in the POT cases) and the lexer state is left
   unchanged.  */
void
po_lex_charset_set (const char *header_entry, const char *filename)
{
  /* Verify the validity of CHARSET.  It is necessary
     1. for the correct treatment of multibyte characters containing
        0x5C bytes in the PO lexer,
     2. so that at run time, gettext() can call iconv() to convert
        msgstr.  */
  const char *charsetstr = c_strstr (header_entry, "charset=");

  if (charsetstr != NULL)
    {
      size_t len;
      char *charset;
      const char *canon_charset;

      /* Make a NUL-terminated copy of the charset name.  It is released
         with freea at the end of this branch.  */
      charsetstr += strlen ("charset=");
      len = strcspn (charsetstr, " \t\n");
      charset = (char *) xmalloca (len + 1);
      memcpy (charset, charsetstr, len);
      charset[len] = '\0';

      canon_charset = po_charset_canonicalize (charset);
      if (canon_charset == NULL)
        {
          /* Don't warn for POT files, because POT files usually contain
             only ASCII msgids.  */
          size_t filenamelen = strlen (filename);

          /* The warning is skipped only when the file name ends in ".pot"
             AND the charset is the literal placeholder "CHARSET".  */
          if (!(filenamelen >= 4
                && memcmp (filename + filenamelen - 4, ".pot", 4) == 0
                && strcmp (charset, "CHARSET") == 0))
            {
              char *warning_message =
                xasprintf (_("\
Charset \"%s\" is not a portable encoding name.\n\
Message conversion to user's charset might not work.\n"),
                           charset);
              po_xerror (PO_SEVERITY_WARNING, NULL,
                         filename, (size_t)(-1), (size_t)(-1), true,
                         warning_message);
              free (warning_message);
            }
        }
      else
        {
          const char *envval;

          po_lex_charset = canon_charset;
#if HAVE_ICONV
          /* Close a converter left open by an earlier call; po_lex_iconv is
             reassigned on every path below.  */
          if (po_lex_iconv != (iconv_t)(-1))
            iconv_close (po_lex_iconv);
#endif

          /* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35
             don't know about multibyte encodings, and require a spurious
             backslash after every multibyte character whose last byte is
             0x5C.  Some programs, like vim, distribute PO files in this
             broken format.  GNU msgfmt must continue to support this old
             PO file format when the Makefile requests it.  */
          envval = getenv ("OLD_PO_FILE_INPUT");
          if (envval != NULL && *envval != '\0')
            {
              /* Assume the PO file is in old format, with extraneous
                 backslashes.  */
#if HAVE_ICONV
              po_lex_iconv = (iconv_t)(-1);
#endif
              po_lex_weird_cjk = false;
            }
          else
            {
              /* Use iconv() to parse multibyte characters.  */
#if HAVE_ICONV
              /* NOTE: each conditionally compiled block below ends in a
                 dangling 'else' that binds to the next statement, so the
                 blocks and the iconv_open assignment form one if/else
                 chain.  Do not insert statements between them.  */
              /* Avoid glibc-2.1 bug with EUC-KR.  */
# if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
     && !defined _LIBICONV_VERSION
              if (strcmp (po_lex_charset, "EUC-KR") == 0)
                po_lex_iconv = (iconv_t)(-1);
              else
# endif
              /* Avoid Solaris 2.9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS,
                 GBK, GB18030.  */
# if defined __sun && !defined _LIBICONV_VERSION
              if (   strcmp (po_lex_charset, "GB2312") == 0
                  || strcmp (po_lex_charset, "EUC-TW") == 0
                  || strcmp (po_lex_charset, "BIG5") == 0
                  || strcmp (po_lex_charset, "BIG5-HKSCS") == 0
                  || strcmp (po_lex_charset, "GBK") == 0
                  || strcmp (po_lex_charset, "GB18030") == 0)
                po_lex_iconv = (iconv_t)(-1);
              else
# endif
              po_lex_iconv = iconv_open ("UTF-8", po_lex_charset);
              /* No converter: either iconv_open failed or one of the
                 workarounds above refused to open one.  */
              if (po_lex_iconv == (iconv_t)(-1))
                {
                  char *warning_message;
                  const char *recommendation;
                  const char *note;
                  char *whole_message;

                  warning_message =
                    xasprintf (_("\
Charset \"%s\" is not supported. %s relies on iconv(),\n\
and iconv() does not support \"%s\".\n"),
                               po_lex_charset, basename (program_name),
                               po_lex_charset);

# if !defined _LIBICONV_VERSION
                  recommendation = _("\
Installing GNU libiconv and then reinstalling GNU gettext\n\
would fix this problem.\n");
# else
                  recommendation = "";
# endif

                  /* Test for a charset which has double-byte characters
                     ending in 0x5C.  For these encodings, the string parser
                     is likely to be confused if it can't see the character
                     boundaries.  */
                  po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
                  if (po_is_charset_weird (po_lex_charset)
                      && !po_lex_weird_cjk)
                    note = _("Continuing anyway, expect parse errors.");
                  else
                    note = _("Continuing anyway.");

                  /* recommendation and note are not heap-allocated; only the
                     two xasprintf results are freed below.  */
                  whole_message =
                    xasprintf ("%s%s%s\n",
                               warning_message, recommendation, note);

                  po_xerror (PO_SEVERITY_WARNING, NULL,
                             filename, (size_t)(-1), (size_t)(-1), true,
                             whole_message);

                  free (whole_message);
                  free (warning_message);
                }
#else
              /* Test for a charset which has double-byte characters
                 ending in 0x5C.  For these encodings, the string parser
                 is likely to be confused if it can't see the character
                 boundaries.  */
              po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
              /* Without iconv, warn only for weird charsets that lack the
                 CJK structure.  */
              if (po_is_charset_weird (po_lex_charset) && !po_lex_weird_cjk)
                {
                  char *warning_message;
                  const char *recommendation;
                  const char *note;
                  char *whole_message;

                  warning_message =
                    xasprintf (_("\
Charset \"%s\" is not supported. %s relies on iconv().\n\
This version was built without iconv().\n"),
                               po_lex_charset, basename (program_name));

                  recommendation = _("\
Installing GNU libiconv and then reinstalling GNU gettext\n\
would fix this problem.\n");

                  note = _("Continuing anyway, expect parse errors.");

                  whole_message =
                    xasprintf ("%s%s%s\n",
                               warning_message, recommendation, note);

                  po_xerror (PO_SEVERITY_WARNING, NULL,
                             filename, (size_t)(-1), (size_t)(-1), true,
                             whole_message);

                  free (whole_message);
                  free (warning_message);
                }
#endif
            }
        }
      freea (charset);
    }
  else
    {
      /* Don't warn for POT files, because POT files usually contain
         only ASCII msgids.  */
      size_t filenamelen = strlen (filename);

      if (!(filenamelen >= 4
            && memcmp (filename + filenamelen - 4, ".pot", 4) == 0))
        po_xerror (PO_SEVERITY_WARNING,
                   NULL, filename, (size_t)(-1), (size_t)(-1), true,
                   _("\
Charset missing in header.\n\
Message conversion to user's charset will not work.\n"));
    }
}
 649
 650 void
 651 po_lex_charset_close ()
 652 {
 653   po_lex_charset = NULL;
 654 #if HAVE_ICONV
 655   if (po_lex_iconv != (iconv_t)(-1))
 656     {
 657       iconv_close (po_lex_iconv);
 658       po_lex_iconv = (iconv_t)(-1);
 659     }
 660 #endif
 661   po_lex_weird_cjk = false;
 662 }