1 /* Charset handling while reading PO files.
2 Copyright (C) 2001-2007 Free Software Foundation, Inc.
3 Written by Bruno Haible <haible@clisp.cons.org>, 2001.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
25 #include "po-charset.h"
31 #include "xvasprintf.h"
32 #include "po-xerror.h"
36 #include "c-strcase.h"
39 #define _(str) gettext (str)
41 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
43 static const char ascii[] = "ASCII";
45 /* The canonicalized encoding name for ASCII. */
46 const char *po_charset_ascii = ascii;
48 static const char utf8[] = "UTF-8";
50 /* The canonicalized encoding name for UTF-8. */
51 const char *po_charset_utf8 = utf8;
53 /* Canonicalize an encoding name. */
55 po_charset_canonicalize (const char *charset)
57 /* The list of charsets supported by glibc's iconv() and by the portable
58 iconv() across platforms. Taken from intl/config.charset. */
59 static const char *standard_charsets[] =
61 ascii, "ANSI_X3.4-1968", "US-ASCII", /* i = 0..2 */
62 "ISO-8859-1", "ISO_8859-1", /* i = 3, 4 */
63 "ISO-8859-2", "ISO_8859-2",
64 "ISO-8859-3", "ISO_8859-3",
65 "ISO-8859-4", "ISO_8859-4",
66 "ISO-8859-5", "ISO_8859-5",
67 "ISO-8859-6", "ISO_8859-6",
68 "ISO-8859-7", "ISO_8859-7",
69 "ISO-8859-8", "ISO_8859-8",
70 "ISO-8859-9", "ISO_8859-9",
71 "ISO-8859-13", "ISO_8859-13",
72 "ISO-8859-14", "ISO_8859-14",
73 "ISO-8859-15", "ISO_8859-15", /* i = 25, 26 */
108 for (i = 0; i < SIZEOF (standard_charsets); i++)
109 if (c_strcasecmp (charset, standard_charsets[i]) == 0)
110 return standard_charsets[i < 3 ? 0 : i < 27 ? ((i - 3) & ~1) + 3 : i];
114 /* Test for ASCII compatibility. */
116 po_charset_ascii_compatible (const char *canon_charset)
118 /* There are only a few exceptions to ASCII compatibility. */
119 if (strcmp (canon_charset, "SHIFT_JIS") == 0
120 || strcmp (canon_charset, "JOHAB") == 0
121 || strcmp (canon_charset, "VISCII") == 0)
127 /* Test for a weird encoding, i.e. an encoding which has double-byte
128 characters ending in 0x5C. */
129 bool po_is_charset_weird (const char *canon_charset)
131 static const char *weird_charsets[] =
142 for (i = 0; i < SIZEOF (weird_charsets); i++)
143 if (strcmp (canon_charset, weird_charsets[i]) == 0)
148 /* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
149 An encoding has CJK structure if every valid character stream is composed
150 of single bytes in the range 0x{00..7F} and of byte pairs in the range
151 0x{80..FF}{30..FF}. */
152 bool po_is_charset_weird_cjk (const char *canon_charset)
154 static const char *weird_cjk_charsets[] =
155 { /* single bytes double bytes */
156 "BIG5", /* 0x{00..7F}, 0x{A1..F9}{40..FE} */
157 "BIG5-HKSCS", /* 0x{00..7F}, 0x{88..FE}{40..FE} */
158 "GBK", /* 0x{00..7F}, 0x{81..FE}{40..FE} */
159 "GB18030", /* 0x{00..7F}, 0x{81..FE}{30..FE} */
160 "SHIFT_JIS", /* 0x{00..7F}, 0x{81..F9}{40..FC} */
161 "JOHAB" /* 0x{00..7F}, 0x{84..F9}{31..FE} */
165 for (i = 0; i < SIZEOF (weird_cjk_charsets); i++)
166 if (strcmp (canon_charset, weird_cjk_charsets[i]) == 0)
171 /* Hardcoded iterator functions for all kinds of encodings.
172 We could also implement a general iterator function with iconv(),
173 but we need a fast one. */
175 /* Character iterator for 8-bit encodings. */
177 char_iterator (const char *s)
182 /* Character iterator for GB2312. See libiconv/lib/euc_cn.h. */
183 /* Character iterator for EUC-KR. See libiconv/lib/euc_kr.h. */
185 euc_character_iterator (const char *s)
187 unsigned char c = *s;
188 if (c >= 0xa1 && c < 0xff)
190 unsigned char c2 = s[1];
191 if (c2 >= 0xa1 && c2 < 0xff)
197 /* Character iterator for EUC-JP. See libiconv/lib/euc_jp.h. */
199 euc_jp_character_iterator (const char *s)
201 unsigned char c = *s;
202 if (c >= 0xa1 && c < 0xff)
204 unsigned char c2 = s[1];
205 if (c2 >= 0xa1 && c2 < 0xff)
210 unsigned char c2 = s[1];
211 if (c2 >= 0xa1 && c2 < 0xe0)
216 unsigned char c2 = s[1];
217 if (c2 >= 0xa1 && c2 < 0xff)
219 unsigned char c3 = s[2];
220 if (c3 >= 0xa1 && c3 < 0xff)
227 /* Character iterator for EUC-TW. See libiconv/lib/euc_tw.h. */
229 euc_tw_character_iterator (const char *s)
231 unsigned char c = *s;
232 if (c >= 0xa1 && c < 0xff)
234 unsigned char c2 = s[1];
235 if (c2 >= 0xa1 && c2 < 0xff)
240 unsigned char c2 = s[1];
241 if (c2 >= 0xa1 && c2 <= 0xb0)
243 unsigned char c3 = s[2];
244 if (c3 >= 0xa1 && c3 < 0xff)
246 unsigned char c4 = s[3];
247 if (c4 >= 0xa1 && c4 < 0xff)
255 /* Character iterator for BIG5. See libiconv/lib/ces_big5.h. */
257 big5_character_iterator (const char *s)
259 unsigned char c = *s;
260 if (c >= 0xa1 && c < 0xff)
262 unsigned char c2 = s[1];
263 if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0xa1 && c2 < 0xff))
269 /* Character iterator for BIG5-HKSCS. See libiconv/lib/big5hkscs.h. */
271 big5hkscs_character_iterator (const char *s)
273 unsigned char c = *s;
274 if (c >= 0x88 && c < 0xff)
276 unsigned char c2 = s[1];
277 if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0xa1 && c2 < 0xff))
283 /* Character iterator for GBK. See libiconv/lib/ces_gbk.h and
284 libiconv/lib/gbk.h. */
286 gbk_character_iterator (const char *s)
288 unsigned char c = *s;
289 if (c >= 0x81 && c < 0xff)
291 unsigned char c2 = s[1];
292 if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0x80 && c2 < 0xff))
298 /* Character iterator for GB18030. See libiconv/lib/gb18030.h. */
300 gb18030_character_iterator (const char *s)
302 unsigned char c = *s;
303 if (c >= 0x81 && c < 0xff)
305 unsigned char c2 = s[1];
306 if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0x80 && c2 < 0xff))
309 if (c >= 0x81 && c <= 0x84)
311 unsigned char c2 = s[1];
312 if (c2 >= 0x30 && c2 <= 0x39)
314 unsigned char c3 = s[2];
315 if (c3 >= 0x81 && c3 < 0xff)
317 unsigned char c4 = s[3];
318 if (c4 >= 0x30 && c4 <= 0x39)
326 /* Character iterator for SHIFT_JIS. See libiconv/lib/sjis.h. */
328 shift_jis_character_iterator (const char *s)
330 unsigned char c = *s;
331 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xf9))
333 unsigned char c2 = s[1];
334 if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfc))
340 /* Character iterator for JOHAB. See libiconv/lib/johab.h and
341 libiconv/lib/johab_hangul.h. */
343 johab_character_iterator (const char *s)
345 unsigned char c = *s;
346 if (c >= 0x84 && c <= 0xd3)
348 unsigned char c2 = s[1];
349 if ((c2 >= 0x41 && c2 < 0x7f) || (c2 >= 0x81 && c2 < 0xff))
352 else if (c >= 0xd9 && c <= 0xf9)
354 unsigned char c2 = s[1];
355 if ((c2 >= 0x31 && c2 <= 0x7e) || (c2 >= 0x91 && c2 <= 0xfe))
361 /* Character iterator for UTF-8. See libiconv/lib/utf8.h. */
363 utf8_character_iterator (const char *s)
365 unsigned char c = *s;
370 unsigned char c2 = s[1];
371 if (c2 >= 0x80 && c2 < 0xc0)
376 unsigned char c2 = s[1];
377 if (c2 >= 0x80 && c2 < 0xc0)
379 unsigned char c3 = s[2];
380 if (c3 >= 0x80 && c3 < 0xc0)
386 unsigned char c2 = s[1];
387 if (c2 >= 0x80 && c2 < 0xc0)
389 unsigned char c3 = s[2];
390 if (c3 >= 0x80 && c3 < 0xc0)
392 unsigned char c4 = s[3];
393 if (c4 >= 0x80 && c4 < 0xc0)
402 /* Returns a character iterator for a given encoding.
403 Given a pointer into a string, it returns the number of bytes occupied by the next
404 single character. If the piece of string is not valid or if *s == '\0',
407 po_charset_character_iterator (const char *canon_charset)
409 if (canon_charset == utf8)
410 return utf8_character_iterator;
411 if (strcmp (canon_charset, "GB2312") == 0
412 || strcmp (canon_charset, "EUC-KR") == 0)
413 return euc_character_iterator;
414 if (strcmp (canon_charset, "EUC-JP") == 0)
415 return euc_jp_character_iterator;
416 if (strcmp (canon_charset, "EUC-TW") == 0)
417 return euc_tw_character_iterator;
418 if (strcmp (canon_charset, "BIG5") == 0)
419 return big5_character_iterator;
420 if (strcmp (canon_charset, "BIG5-HKSCS") == 0)
421 return big5hkscs_character_iterator;
422 if (strcmp (canon_charset, "GBK") == 0)
423 return gbk_character_iterator;
424 if (strcmp (canon_charset, "GB18030") == 0)
425 return gb18030_character_iterator;
426 if (strcmp (canon_charset, "SHIFT_JIS") == 0)
427 return shift_jis_character_iterator;
428 if (strcmp (canon_charset, "JOHAB") == 0)
429 return johab_character_iterator;
430 return char_iterator;
434 /* The PO file's encoding, as specified in the header entry. */
435 const char *po_lex_charset;
438 /* Converter from the PO file's encoding to UTF-8. */
439 iconv_t po_lex_iconv;
441 /* If no converter is available, some information about the structure of the
442 PO file's encoding. */
443 bool po_lex_weird_cjk;
446 po_lex_charset_init ()
448 po_lex_charset = NULL;
450 po_lex_iconv = (iconv_t)(-1);
452 po_lex_weird_cjk = false;
456 po_lex_charset_set (const char *header_entry, const char *filename)
458 /* Verify the validity of CHARSET. It is necessary
459 1. for the correct treatment of multibyte characters containing
460 0x5C bytes in the PO lexer,
461 2. so that at run time, gettext() can call iconv() to convert
463 const char *charsetstr = c_strstr (header_entry, "charset=");
465 if (charsetstr != NULL)
469 const char *canon_charset;
471 charsetstr += strlen ("charset=");
472 len = strcspn (charsetstr, " \t\n");
473 charset = (char *) xmalloca (len + 1);
474 memcpy (charset, charsetstr, len);
477 canon_charset = po_charset_canonicalize (charset);
478 if (canon_charset == NULL)
480 /* Don't warn for POT files, because POT files usually contain
481 only ASCII msgids. */
482 size_t filenamelen = strlen (filename);
484 if (!(filenamelen >= 4
485 && memcmp (filename + filenamelen - 4, ".pot", 4) == 0
486 && strcmp (charset, "CHARSET") == 0))
488 char *warning_message =
490 Charset \"%s\" is not a portable encoding name.\n\
491 Message conversion to user's charset might not work.\n"),
493 po_xerror (PO_SEVERITY_WARNING, NULL,
494 filename, (size_t)(-1), (size_t)(-1), true,
496 free (warning_message);
503 po_lex_charset = canon_charset;
505 if (po_lex_iconv != (iconv_t)(-1))
506 iconv_close (po_lex_iconv);
509 /* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35
510 don't know about multibyte encodings, and require a spurious
511 backslash after every multibyte character whose last byte is
512 0x5C. Some programs, like vim, distribute PO files in this
513 broken format. GNU msgfmt must continue to support this old
514 PO file format when the Makefile requests it. */
515 envval = getenv ("OLD_PO_FILE_INPUT");
516 if (envval != NULL && *envval != '\0')
518 /* Assume the PO file is in old format, with extraneous
521 po_lex_iconv = (iconv_t)(-1);
523 po_lex_weird_cjk = false;
527 /* Use iconv() to parse multibyte characters. */
529 /* Avoid glibc-2.1 bug with EUC-KR. */
530 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
531 if (strcmp (po_lex_charset, "EUC-KR") == 0)
532 po_lex_iconv = (iconv_t)(-1);
535 /* Avoid Solaris 2.9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS,
537 # if defined __sun && !defined _LIBICONV_VERSION
538 if ( strcmp (po_lex_charset, "GB2312") == 0
539 || strcmp (po_lex_charset, "EUC-TW") == 0
540 || strcmp (po_lex_charset, "BIG5") == 0
541 || strcmp (po_lex_charset, "BIG5-HKSCS") == 0
542 || strcmp (po_lex_charset, "GBK") == 0
543 || strcmp (po_lex_charset, "GB18030") == 0)
544 po_lex_iconv = (iconv_t)(-1);
547 po_lex_iconv = iconv_open ("UTF-8", po_lex_charset);
548 if (po_lex_iconv == (iconv_t)(-1))
550 char *warning_message;
551 const char *recommendation;
557 Charset \"%s\" is not supported. %s relies on iconv(),\n\
558 and iconv() does not support \"%s\".\n"),
559 po_lex_charset, basename (program_name),
562 # if !defined _LIBICONV_VERSION
563 recommendation = _("\
564 Installing GNU libiconv and then reinstalling GNU gettext\n\
565 would fix this problem.\n");
570 /* Test for a charset which has double-byte characters
571 ending in 0x5C. For these encodings, the string parser
572 is likely to be confused if it can't see the character
574 po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
575 if (po_is_charset_weird (po_lex_charset)
576 && !po_lex_weird_cjk)
577 note = _("Continuing anyway, expect parse errors.");
579 note = _("Continuing anyway.");
582 xasprintf ("%s%s%s\n",
583 warning_message, recommendation, note);
585 po_xerror (PO_SEVERITY_WARNING, NULL,
586 filename, (size_t)(-1), (size_t)(-1), true,
589 free (whole_message);
590 free (warning_message);
593 /* Test for a charset which has double-byte characters
594 ending in 0x5C. For these encodings, the string parser
595 is likely to be confused if it can't see the character
597 po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
598 if (po_is_charset_weird (po_lex_charset) && !po_lex_weird_cjk)
600 char *warning_message;
601 const char *recommendation;
607 Charset \"%s\" is not supported. %s relies on iconv().\n\
608 This version was built without iconv().\n"),
609 po_lex_charset, basename (program_name));
611 recommendation = _("\
612 Installing GNU libiconv and then reinstalling GNU gettext\n\
613 would fix this problem.\n");
615 note = _("Continuing anyway, expect parse errors.");
618 xasprintf ("%s%s%s\n",
619 warning_message, recommendation, note);
621 po_xerror (PO_SEVERITY_WARNING, NULL,
622 filename, (size_t)(-1), (size_t)(-1), true,
625 free (whole_message);
626 free (warning_message);
635 /* Don't warn for POT files, because POT files usually contain
636 only ASCII msgids. */
637 size_t filenamelen = strlen (filename);
639 if (!(filenamelen >= 4
640 && memcmp (filename + filenamelen - 4, ".pot", 4) == 0))
641 po_xerror (PO_SEVERITY_WARNING,
642 NULL, filename, (size_t)(-1), (size_t)(-1), true,
644 Charset missing in header.\n\
645 Message conversion to user's charset will not work.\n"));
650 po_lex_charset_close ()
652 po_lex_charset = NULL;
654 if (po_lex_iconv != (iconv_t)(-1))
656 iconv_close (po_lex_iconv);
657 po_lex_iconv = (iconv_t)(-1);
660 po_lex_weird_cjk = false;