gettext-tools/src/po-charset.c

   1 /* Charset handling while reading PO files.
   2    Copyright (C) 2001-2007 Free Software Foundation, Inc.
   3    Written by Bruno Haible <haible@clisp.cons.org>, 2001.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  17
  18
  19 #ifdef HAVE_CONFIG_H
  20 # include "config.h"
  21 #endif
  22 #include <alloca.h>
  23
  24 /* Specification.  */
  25 #include "po-charset.h"
  26
  27 #include <stdlib.h>
  28 #include <string.h>
  29
  30 #include "xmalloca.h"
  31 #include "xvasprintf.h"
  32 #include "po-xerror.h"
  33 #include "basename.h"
  34 #include "progname.h"
  35 #include "c-strstr.h"
  36 #include "c-strcase.h"
  37 #include "gettext.h"
  38
  39 #define _(str) gettext (str)
  40
  /* Number of elements of the array A.  A must be a real array object, not a
     pointer to its first element.  */
  #define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
  42
  /* Storage for the two canonical names that are also exported.  The same
     arrays are used as entries of the canonicalization table, and the UTF-8
     one is recognized by address in po_charset_character_iterator.  */
  static const char ascii[] = "ASCII";
  static const char utf8[] = "UTF-8";

  /* The canonicalized encoding name for ASCII.  */
  const char *po_charset_ascii = ascii;

  /* The canonicalized encoding name for UTF-8.  */
  const char *po_charset_utf8 = utf8;
  52
  53 /* Canonicalize an encoding name.  */
  54 const char *
  55 po_charset_canonicalize (const char *charset)
  56 {
  57   /* The list of charsets supported by glibc's iconv() and by the portable
  58      iconv() across platforms.  Taken from intl/config.charset.  */
  59   static const char *standard_charsets[] =
  60   {
  61     ascii, "ANSI_X3.4-1968", "US-ASCII",        /* i = 0..2 */
  62     "ISO-8859-1", "ISO_8859-1",                 /* i = 3, 4 */
  63     "ISO-8859-2", "ISO_8859-2",
  64     "ISO-8859-3", "ISO_8859-3",
  65     "ISO-8859-4", "ISO_8859-4",
  66     "ISO-8859-5", "ISO_8859-5",
  67     "ISO-8859-6", "ISO_8859-6",
  68     "ISO-8859-7", "ISO_8859-7",
  69     "ISO-8859-8", "ISO_8859-8",
  70     "ISO-8859-9", "ISO_8859-9",
  71     "ISO-8859-13", "ISO_8859-13",
  72     "ISO-8859-14", "ISO_8859-14",
  73     "ISO-8859-15", "ISO_8859-15",               /* i = 25, 26 */
  74     "KOI8-R",
  75     "KOI8-U",
  76     "KOI8-T",
  77     "CP850",
  78     "CP866",
  79     "CP874",
  80     "CP932",
  81     "CP949",
  82     "CP950",
  83     "CP1250",
  84     "CP1251",
  85     "CP1252",
  86     "CP1253",
  87     "CP1254",
  88     "CP1255",
  89     "CP1256",
  90     "CP1257",
  91     "GB2312",
  92     "EUC-JP",
  93     "EUC-KR",
  94     "EUC-TW",
  95     "BIG5",
  96     "BIG5-HKSCS",
  97     "GBK",
  98     "GB18030",
  99     "SHIFT_JIS",
 100     "JOHAB",
 101     "TIS-620",
 102     "VISCII",
 103     "GEORGIAN-PS",
 104     utf8
 105   };
 106   size_t i;
 107
 108   for (i = 0; i < SIZEOF (standard_charsets); i++)
 109     if (c_strcasecmp (charset, standard_charsets[i]) == 0)
 110       return standard_charsets[i < 3 ? 0 : i < 27 ? ((i - 3) & ~1) + 3 : i];
 111   return NULL;
 112 }
 113
 114 /* Test for ASCII compatibility.  */
 115 bool
 116 po_charset_ascii_compatible (const char *canon_charset)
 117 {
 118   /* There are only a few exceptions to ASCII compatibility.  */
 119   if (strcmp (canon_charset, "SHIFT_JIS") == 0
 120       || strcmp (canon_charset, "JOHAB") == 0
 121       || strcmp (canon_charset, "VISCII") == 0)
 122     return false;
 123   else
 124     return true;
 125 }
 126
 127 /* Test for a weird encoding, i.e. an encoding which has double-byte
 128    characters ending in 0x5C.  */
 129 bool po_is_charset_weird (const char *canon_charset)
 130 {
 131   static const char *weird_charsets[] =
 132   {
 133     "BIG5",
 134     "BIG5-HKSCS",
 135     "GBK",
 136     "GB18030",
 137     "SHIFT_JIS",
 138     "JOHAB"
 139   };
 140   size_t i;
 141
 142   for (i = 0; i < SIZEOF (weird_charsets); i++)
 143     if (strcmp (canon_charset, weird_charsets[i]) == 0)
 144       return true;
 145   return false;
 146 }
 147
 148 /* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
 149    An encoding has CJK structure if every valid character stream is composed
 150    of single bytes in the range 0x{00..7F} and of byte pairs in the range
 151    0x{80..FF}{30..FF}.  */
 152 bool po_is_charset_weird_cjk (const char *canon_charset)
 153 {
 154   static const char *weird_cjk_charsets[] =
 155   {                     /* single bytes   double bytes       */
 156     "BIG5",             /* 0x{00..7F},    0x{A1..F9}{40..FE} */
 157     "BIG5-HKSCS",       /* 0x{00..7F},    0x{88..FE}{40..FE} */
 158     "GBK",              /* 0x{00..7F},    0x{81..FE}{40..FE} */
 159     "GB18030",          /* 0x{00..7F},    0x{81..FE}{30..FE} */
 160     "SHIFT_JIS",        /* 0x{00..7F},    0x{81..F9}{40..FC} */
 161     "JOHAB"             /* 0x{00..7F},    0x{84..F9}{31..FE} */
 162   };
 163   size_t i;
 164
 165   for (i = 0; i < SIZEOF (weird_cjk_charsets); i++)
 166     if (strcmp (canon_charset, weird_cjk_charsets[i]) == 0)
 167       return true;
 168   return false;
 169 }
 170
 171 /* Hardcoded iterator functions for all kinds of encodings.
 172    We could also implement a general iterator function with iconv(),
 173    but we need a fast one.  */
 174
 175 /* Character iterator for 8-bit encodings.  */
 176 static size_t
 177 char_iterator (const char *s)
 178 {
 179   return 1;
 180 }
 181
 182 /* Character iterator for GB2312.  See libiconv/lib/euc_cn.h.  */
 183 /* Character iterator for EUC-KR.  See libiconv/lib/euc_kr.h.  */
 184 static size_t
 185 euc_character_iterator (const char *s)
 186 {
 187   unsigned char c = *s;
 188   if (c >= 0xa1 && c < 0xff)
 189     {
 190       unsigned char c2 = s[1];
 191       if (c2 >= 0xa1 && c2 < 0xff)
 192         return 2;
 193     }
 194   return 1;
 195 }
 196
 197 /* Character iterator for EUC-JP.  See libiconv/lib/euc_jp.h.  */
 198 static size_t
 199 euc_jp_character_iterator (const char *s)
 200 {
 201   unsigned char c = *s;
 202   if (c >= 0xa1 && c < 0xff)
 203     {
 204       unsigned char c2 = s[1];
 205       if (c2 >= 0xa1 && c2 < 0xff)
 206         return 2;
 207     }
 208   else if (c == 0x8e)
 209     {
 210       unsigned char c2 = s[1];
 211       if (c2 >= 0xa1 && c2 < 0xe0)
 212         return 2;
 213     }
 214   else if (c == 0x8f)
 215     {
 216       unsigned char c2 = s[1];
 217       if (c2 >= 0xa1 && c2 < 0xff)
 218         {
 219           unsigned char c3 = s[2];
 220           if (c3 >= 0xa1 && c3 < 0xff)
 221             return 3;
 222         }
 223     }
 224   return 1;
 225 }
 226
 227 /* Character iterator for EUC-TW.  See libiconv/lib/euc_tw.h.  */
 228 static size_t
 229 euc_tw_character_iterator (const char *s)
 230 {
 231   unsigned char c = *s;
 232   if (c >= 0xa1 && c < 0xff)
 233     {
 234       unsigned char c2 = s[1];
 235       if (c2 >= 0xa1 && c2 < 0xff)
 236         return 2;
 237     }
 238   else if (c == 0x8e)
 239     {
 240       unsigned char c2 = s[1];
 241       if (c2 >= 0xa1 && c2 <= 0xb0)
 242         {
 243           unsigned char c3 = s[2];
 244           if (c3 >= 0xa1 && c3 < 0xff)
 245             {
 246               unsigned char c4 = s[3];
 247               if (c4 >= 0xa1 && c4 < 0xff)
 248                 return 4;
 249             }
 250         }
 251     }
 252   return 1;
 253 }
 254
 255 /* Character iterator for BIG5.  See libiconv/lib/ces_big5.h.  */
 256 static size_t
 257 big5_character_iterator (const char *s)
 258 {
 259   unsigned char c = *s;
 260   if (c >= 0xa1 && c < 0xff)
 261     {
 262       unsigned char c2 = s[1];
 263       if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0xa1 && c2 < 0xff))
 264         return 2;
 265     }
 266   return 1;
 267 }
 268
 269 /* Character iterator for BIG5-HKSCS.  See libiconv/lib/big5hkscs.h.  */
 270 static size_t
 271 big5hkscs_character_iterator (const char *s)
 272 {
 273   unsigned char c = *s;
 274   if (c >= 0x88 && c < 0xff)
 275     {
 276       unsigned char c2 = s[1];
 277       if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0xa1 && c2 < 0xff))
 278         return 2;
 279     }
 280   return 1;
 281 }
 282
 283 /* Character iterator for GBK.  See libiconv/lib/ces_gbk.h and
 284    libiconv/lib/gbk.h.  */
 285 static size_t
 286 gbk_character_iterator (const char *s)
 287 {
 288   unsigned char c = *s;
 289   if (c >= 0x81 && c < 0xff)
 290     {
 291       unsigned char c2 = s[1];
 292       if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0x80 && c2 < 0xff))
 293         return 2;
 294     }
 295   return 1;
 296 }
 297
 298 /* Character iterator for GB18030.  See libiconv/lib/gb18030.h.  */
 299 static size_t
 300 gb18030_character_iterator (const char *s)
 301 {
 302   unsigned char c = *s;
 303   if (c >= 0x81 && c < 0xff)
 304     {
 305       unsigned char c2 = s[1];
 306       if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0x80 && c2 < 0xff))
 307         return 2;
 308     }
 309   if (c >= 0x81 && c <= 0x84)
 310     {
 311       unsigned char c2 = s[1];
 312       if (c2 >= 0x30 && c2 <= 0x39)
 313         {
 314           unsigned char c3 = s[2];
 315           if (c3 >= 0x81 && c3 < 0xff)
 316             {
 317               unsigned char c4 = s[3];
 318               if (c4 >= 0x30 && c4 <= 0x39)
 319                 return 4;
 320             }
 321         }
 322     }
 323   return 1;
 324 }
 325
 326 /* Character iterator for SHIFT_JIS.  See libiconv/lib/sjis.h.  */
 327 static size_t
 328 shift_jis_character_iterator (const char *s)
 329 {
 330   unsigned char c = *s;
 331   if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xf9))
 332     {
 333       unsigned char c2 = s[1];
 334       if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfc))
 335         return 2;
 336     }
 337   return 1;
 338 }
 339
 340 /* Character iterator for JOHAB.  See libiconv/lib/johab.h and
 341    libiconv/lib/johab_hangul.h.  */
 342 static size_t
 343 johab_character_iterator (const char *s)
 344 {
 345   unsigned char c = *s;
 346   if (c >= 0x84 && c <= 0xd3)
 347     {
 348       unsigned char c2 = s[1];
 349       if ((c2 >= 0x41 && c2 < 0x7f) || (c2 >= 0x81 && c2 < 0xff))
 350         return 2;
 351     }
 352   else if (c >= 0xd9 && c <= 0xf9)
 353     {
 354       unsigned char c2 = s[1];
 355       if ((c2 >= 0x31 && c2 <= 0x7e) || (c2 >= 0x91 && c2 <= 0xfe))
 356         return 2;
 357     }
 358   return 1;
 359 }
 360
 361 /* Character iterator for UTF-8.  See libiconv/lib/utf8.h.  */
 362 static size_t
 363 utf8_character_iterator (const char *s)
 364 {
 365   unsigned char c = *s;
 366   if (c >= 0xc2)
 367     {
 368       if (c < 0xe0)
 369         {
 370           unsigned char c2 = s[1];
 371           if (c2 >= 0x80 && c2 < 0xc0)
 372             return 2;
 373         }
 374       else if (c < 0xf0)
 375         {
 376           unsigned char c2 = s[1];
 377           if (c2 >= 0x80 && c2 < 0xc0)
 378             {
 379               unsigned char c3 = s[2];
 380               if (c3 >= 0x80 && c3 < 0xc0)
 381                 return 3;
 382             }
 383         }
 384       else if (c < 0xf8)
 385         {
 386           unsigned char c2 = s[1];
 387           if (c2 >= 0x80 && c2 < 0xc0)
 388             {
 389               unsigned char c3 = s[2];
 390               if (c3 >= 0x80 && c3 < 0xc0)
 391                 {
 392                   unsigned char c4 = s[3];
 393                   if (c4 >= 0x80 && c4 < 0xc0)
 394                     return 4;
 395                 }
 396             }
 397         }
 398     }
 399   return 1;
 400 }
 401
 402 /* Returns a character iterator for a given encoding.
 403    Given a pointer into a string, it returns the number occupied by the next
 404    single character.  If the piece of string is not valid or if the *s == '\0',
 405    it returns 1.  */
 406 character_iterator_t
 407 po_charset_character_iterator (const char *canon_charset)
 408 {
 409   if (canon_charset == utf8)
 410     return utf8_character_iterator;
 411   if (strcmp (canon_charset, "GB2312") == 0
 412       || strcmp (canon_charset, "EUC-KR") == 0)
 413     return euc_character_iterator;
 414   if (strcmp (canon_charset, "EUC-JP") == 0)
 415     return euc_jp_character_iterator;
 416   if (strcmp (canon_charset, "EUC-TW") == 0)
 417     return euc_tw_character_iterator;
 418   if (strcmp (canon_charset, "BIG5") == 0)
 419     return big5_character_iterator;
 420   if (strcmp (canon_charset, "BIG5-HKSCS") == 0)
 421     return big5hkscs_character_iterator;
 422   if (strcmp (canon_charset, "GBK") == 0)
 423     return gbk_character_iterator;
 424   if (strcmp (canon_charset, "GB18030") == 0)
 425     return gb18030_character_iterator;
 426   if (strcmp (canon_charset, "SHIFT_JIS") == 0)
 427     return shift_jis_character_iterator;
 428   if (strcmp (canon_charset, "JOHAB") == 0)
 429     return johab_character_iterator;
 430   return char_iterator;
 431 }
 432
 433
 434 /* The PO file's encoding, as specified in the header entry.  */
 435 const char *po_lex_charset;
 436
 437 #if HAVE_ICONV
 438 /* Converter from the PO file's encoding to UTF-8.  */
 439 iconv_t po_lex_iconv;
 440 #endif
 441 /* If no converter is available, some information about the structure of the
 442    PO file's encoding.  */
 443 bool po_lex_weird_cjk;
 444
 445 void
 446 po_lex_charset_init ()
 447 {
 448   po_lex_charset = NULL;
 449 #if HAVE_ICONV
 450   po_lex_iconv = (iconv_t)(-1);
 451 #endif
 452   po_lex_weird_cjk = false;
 453 }
 454
 455 void
 456 po_lex_charset_set (const char *header_entry, const char *filename)
 457 {
 458   /* Verify the validity of CHARSET.  It is necessary
 459      1. for the correct treatment of multibyte characters containing
 460         0x5C bytes in the PO lexer,
 461      2. so that at run time, gettext() can call iconv() to convert
 462         msgstr.  */
 463   const char *charsetstr = c_strstr (header_entry, "charset=");
 464
 465   if (charsetstr != NULL)
 466     {
 467       size_t len;
 468       char *charset;
 469       const char *canon_charset;
 470
 471       charsetstr += strlen ("charset=");
 472       len = strcspn (charsetstr, " \t\n");
 473       charset = (char *) xmalloca (len + 1);
 474       memcpy (charset, charsetstr, len);
 475       charset[len] = '\0';
 476
 477       canon_charset = po_charset_canonicalize (charset);
 478       if (canon_charset == NULL)
 479         {
 480           /* Don't warn for POT files, because POT files usually contain
 481              only ASCII msgids.  */
 482           size_t filenamelen = strlen (filename);
 483
 484           if (!(filenamelen >= 4
 485                 && memcmp (filename + filenamelen - 4, ".pot", 4) == 0
 486                 && strcmp (charset, "CHARSET") == 0))
 487             {
 488               char *warning_message =
 489                 xasprintf (_("\
 490 Charset \"%s\" is not a portable encoding name.\n\
 491 Message conversion to user's charset might not work.\n"),
 492                            charset);
 493               po_xerror (PO_SEVERITY_WARNING, NULL,
 494                          filename, (size_t)(-1), (size_t)(-1), true,
 495                          warning_message);
 496               free (warning_message);
 497             }
 498         }
 499       else
 500         {
 501           const char *envval;
 502
 503           po_lex_charset = canon_charset;
 504 #if HAVE_ICONV
 505           if (po_lex_iconv != (iconv_t)(-1))
 506             iconv_close (po_lex_iconv);
 507 #endif
 508
 509           /* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35
 510              don't know about multibyte encodings, and require a spurious
 511              backslash after every multibyte character whose last byte is
 512              0x5C.  Some programs, like vim, distribute PO files in this
 513              broken format.  GNU msgfmt must continue to support this old
 514              PO file format when the Makefile requests it.  */
 515           envval = getenv ("OLD_PO_FILE_INPUT");
 516           if (envval != NULL && *envval != '\0')
 517             {
 518               /* Assume the PO file is in old format, with extraneous
 519                  backslashes.  */
 520 #if HAVE_ICONV
 521               po_lex_iconv = (iconv_t)(-1);
 522 #endif
 523               po_lex_weird_cjk = false;
 524             }
 525           else
 526             {
 527               /* Use iconv() to parse multibyte characters.  */
 528 #if HAVE_ICONV
 529               /* Avoid glibc-2.1 bug with EUC-KR.  */
 530 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
 531               if (strcmp (po_lex_charset, "EUC-KR") == 0)
 532                 po_lex_iconv = (iconv_t)(-1);
 533               else
 534 # endif
 535               /* Avoid Solaris 2.9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS,
 536                  GBK, GB18030.  */
 537 # if defined __sun && !defined _LIBICONV_VERSION
 538               if (   strcmp (po_lex_charset, "GB2312") == 0
 539                   || strcmp (po_lex_charset, "EUC-TW") == 0
 540                   || strcmp (po_lex_charset, "BIG5") == 0
 541                   || strcmp (po_lex_charset, "BIG5-HKSCS") == 0
 542                   || strcmp (po_lex_charset, "GBK") == 0
 543                   || strcmp (po_lex_charset, "GB18030") == 0)
 544                 po_lex_iconv = (iconv_t)(-1);
 545               else
 546 # endif
 547               po_lex_iconv = iconv_open ("UTF-8", po_lex_charset);
 548               if (po_lex_iconv == (iconv_t)(-1))
 549                 {
 550                   char *warning_message;
 551                   const char *recommendation;
 552                   const char *note;
 553                   char *whole_message;
 554
 555                   warning_message =
 556                     xasprintf (_("\
 557 Charset \"%s\" is not supported. %s relies on iconv(),\n\
 558 and iconv() does not support \"%s\".\n"),
 559                                po_lex_charset, basename (program_name),
 560                                po_lex_charset);
 561
 562 # if !defined _LIBICONV_VERSION
 563                   recommendation = _("\
 564 Installing GNU libiconv and then reinstalling GNU gettext\n\
 565 would fix this problem.\n");
 566 # else
 567                   recommendation = "";
 568 # endif
 569
 570                   /* Test for a charset which has double-byte characters
 571                      ending in 0x5C.  For these encodings, the string parser
 572                      is likely to be confused if it can't see the character
 573                      boundaries.  */
 574                   po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
 575                   if (po_is_charset_weird (po_lex_charset)
 576                       && !po_lex_weird_cjk)
 577                     note = _("Continuing anyway, expect parse errors.");
 578                   else
 579                     note = _("Continuing anyway.");
 580
 581                   whole_message =
 582                     xasprintf ("%s%s%s\n",
 583                                warning_message, recommendation, note);
 584
 585                   po_xerror (PO_SEVERITY_WARNING, NULL,
 586                              filename, (size_t)(-1), (size_t)(-1), true,
 587                              whole_message);
 588
 589                   free (whole_message);
 590                   free (warning_message);
 591                 }
 592 #else
 593               /* Test for a charset which has double-byte characters
 594                  ending in 0x5C.  For these encodings, the string parser
 595                  is likely to be confused if it can't see the character
 596                  boundaries.  */
 597               po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
 598               if (po_is_charset_weird (po_lex_charset) && !po_lex_weird_cjk)
 599                 {
 600                   char *warning_message;
 601                   const char *recommendation;
 602                   const char *note;
 603                   char *whole_message;
 604
 605                   warning_message =
 606                     xasprintf (_("\
 607 Charset \"%s\" is not supported. %s relies on iconv().\n\
 608 This version was built without iconv().\n"),
 609                                po_lex_charset, basename (program_name));
 610
 611                   recommendation = _("\
 612 Installing GNU libiconv and then reinstalling GNU gettext\n\
 613 would fix this problem.\n");
 614
 615                   note = _("Continuing anyway, expect parse errors.");
 616
 617                   whole_message =
 618                     xasprintf ("%s%s%s\n",
 619                                warning_message, recommendation, note);
 620
 621                   po_xerror (PO_SEVERITY_WARNING, NULL,
 622                              filename, (size_t)(-1), (size_t)(-1), true,
 623                              whole_message);
 624
 625                   free (whole_message);
 626                   free (warning_message);
 627                 }
 628 #endif
 629             }
 630         }
 631       freea (charset);
 632     }
 633   else
 634     {
 635       /* Don't warn for POT files, because POT files usually contain
 636          only ASCII msgids.  */
 637       size_t filenamelen = strlen (filename);
 638
 639       if (!(filenamelen >= 4
 640             && memcmp (filename + filenamelen - 4, ".pot", 4) == 0))
 641         po_xerror (PO_SEVERITY_WARNING,
 642                    NULL, filename, (size_t)(-1), (size_t)(-1), true,
 643                    _("\
 644 Charset missing in header.\n\
 645 Message conversion to user's charset will not work.\n"));
 646     }
 647 }
 648
 649 void
 650 po_lex_charset_close ()
 651 {
 652   po_lex_charset = NULL;
 653 #if HAVE_ICONV
 654   if (po_lex_iconv != (iconv_t)(-1))
 655     {
 656       iconv_close (po_lex_iconv);
 657       po_lex_iconv = (iconv_t)(-1);
 658     }
 659 #endif
 660   po_lex_weird_cjk = false;
 661 }