1 /* Charset handling while reading PO files.
2 Copyright (C) 2001-2007, 2010 Free Software Foundation, Inc.
3 Written by Bruno Haible <haible@clisp.cons.org>, 2001.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
25 #include "po-charset.h"
31 #include "xvasprintf.h"
32 #include "po-xerror.h"
36 #include "c-strcase.h"
/* Shorthand that expands to a plain gettext() call; it wraps the warning
   texts further down in this file. */
39 #define _(str) gettext (str)
/* Number of elements of the array A. The argument must be an actual array,
   not a pointer, because the count is derived from sizeof. */
41 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
/* Storage for the canonical ASCII name. It is also the first entry of the
   table in po_charset_canonicalize. */
43 static const char ascii[] = "ASCII";
45 /* The canonicalized encoding name for ASCII. */
46 const char *po_charset_ascii = ascii;
/* Storage for the canonical UTF-8 name. po_charset_character_iterator
   compares its argument with this array by address, not by content. */
48 static const char utf8[] = "UTF-8";
50 /* The canonicalized encoding name for UTF-8. */
51 const char *po_charset_utf8 = utf8;
53 /* Canonicalize an encoding name.
   CHARSET is compared against every entry of standard_charsets with
   c_strcasecmp (presumably a case-insensitive comparison, see c-strcase.h).
   On a match the preferred spelling stored in the table is returned, not
   the string that was passed in. po_lex_charset_set treats a NULL result
   as a name that is not portable. */
55 po_charset_canonicalize (const char *charset)
57 /* The list of charsets supported by glibc's iconv() and by the portable
58 iconv() across platforms. Taken from intl/config.charset. */
59 static const char *standard_charsets[] =
61 ascii, "ANSI_X3.4-1968", "US-ASCII", /* i = 0..2 */
62 "ISO-8859-1", "ISO_8859-1", /* i = 3, 4 */
63 "ISO-8859-2", "ISO_8859-2",
64 "ISO-8859-3", "ISO_8859-3",
65 "ISO-8859-4", "ISO_8859-4",
66 "ISO-8859-5", "ISO_8859-5",
67 "ISO-8859-6", "ISO_8859-6",
68 "ISO-8859-7", "ISO_8859-7",
69 "ISO-8859-8", "ISO_8859-8",
70 "ISO-8859-9", "ISO_8859-9",
71 "ISO-8859-13", "ISO_8859-13",
72 "ISO-8859-14", "ISO_8859-14",
73 "ISO-8859-15", "ISO_8859-15", /* i = 25, 26 */
/* Aliases collapse onto one canonical entry: indices 0..2 all map to
   entry 0 (ASCII); indices 3..26 come in pairs, and ((i - 3) & ~1) + 3
   rounds down to the first name of the pair (the ISO-8859-N spelling);
   every later entry is returned as is. */
108 for (i = 0; i < SIZEOF (standard_charsets); i++)
109 if (c_strcasecmp (charset, standard_charsets[i]) == 0)
110 return standard_charsets[i < 3 ? 0 : i < 27 ? ((i - 3) & ~1) + 3 : i];
114 /* Test for ASCII compatibility.
   CANON_CHARSET is compared with strcmp, so it has to be in its canonical
   spelling already (see po_charset_canonicalize); an alias or a name in a
   different letter case does not match the exceptions listed below. */
116 po_charset_ascii_compatible (const char *canon_charset)
118 /* There are only a few exceptions to ASCII compatibility. */
119 if (strcmp (canon_charset, "SHIFT_JIS") == 0
120 || strcmp (canon_charset, "JOHAB") == 0
121 || strcmp (canon_charset, "VISCII") == 0)
127 /* Test for a weird encoding, i.e. an encoding which has double-byte
128 characters ending in 0x5C.
   0x5C is the ASCII backslash. CANON_CHARSET is looked up in
   weird_charsets with strcmp, i.e. by exact, case-sensitive match, so it
   has to be a canonical name. */
129 bool po_is_charset_weird (const char *canon_charset)
131 static const char *weird_charsets[] =
142 for (i = 0; i < SIZEOF (weird_charsets); i++)
143 if (strcmp (canon_charset, weird_charsets[i]) == 0)
148 /* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
149 An encoding has CJK structure if every valid character stream is composed
150 of single bytes in the range 0x{00..7F} and of byte pairs in the range
151 0x{80..FF}{30..FF}.
   CANON_CHARSET is matched exactly (strcmp) against the table below. The
   comment next to each entry gives that encoding's single-byte range and
   the lead-byte and trail-byte ranges of its double-byte characters. */
152 bool po_is_charset_weird_cjk (const char *canon_charset)
154 static const char *weird_cjk_charsets[] =
155 { /* single bytes double bytes */
156 "BIG5", /* 0x{00..7F}, 0x{A1..F9}{40..FE} */
157 "BIG5-HKSCS", /* 0x{00..7F}, 0x{88..FE}{40..FE} */
158 "GBK", /* 0x{00..7F}, 0x{81..FE}{40..FE} */
159 "GB18030", /* 0x{00..7F}, 0x{81..FE}{30..FE} */
160 "SHIFT_JIS", /* 0x{00..7F}, 0x{81..F9}{40..FC} */
161 "JOHAB" /* 0x{00..7F}, 0x{84..F9}{31..FE} */
165 for (i = 0; i < SIZEOF (weird_cjk_charsets); i++)
166 if (strcmp (canon_charset, weird_cjk_charsets[i]) == 0)
171 /* Hardcoded iterator functions for all kinds of encodings.
172 We could also implement a general iterator function with iconv(),
173 but we need a fast one. */
175 /* Character iterator for 8-bit encodings.
   This is the iterator po_charset_character_iterator falls back to when
   the charset matches none of the multibyte encodings it tests for. */
177 char_iterator (const char *s)
182 /* Character iterator for GB2312. See libiconv/lib/euc_cn.h. */
183 /* Character iterator for EUC-KR. See libiconv/lib/euc_kr.h. */
/* Both encodings share this iterator (see po_charset_character_iterator).
   The tests below look for a lead byte and a following byte that both lie
   in 0xa1..0xfe. s[1] is read only under the lead-byte test, i.e. once *s
   is known not to be the terminating NUL. */
185 euc_character_iterator (const char *s)
187 unsigned char c = *s;
188 if (c >= 0xa1 && c < 0xff)
190 unsigned char c2 = s[1];
191 if (c2 >= 0xa1 && c2 < 0xff)
197 /* Character iterator for EUC-JP. See libiconv/lib/euc_jp.h.
   Three multibyte shapes are tested: a lead byte in 0xa1..0xfe followed by
   a byte in 0xa1..0xfe; a form whose second byte lies in 0xa1..0xdf; and a
   three-byte form whose second and third bytes both lie in 0xa1..0xfe.
   NOTE(review): the last two presumably correspond to the 0x8e and 0x8f
   lead bytes of EUC-JP - confirm against libiconv/lib/euc_jp.h. */
199 euc_jp_character_iterator (const char *s)
201 unsigned char c = *s;
202 if (c >= 0xa1 && c < 0xff)
204 unsigned char c2 = s[1];
205 if (c2 >= 0xa1 && c2 < 0xff)
210 unsigned char c2 = s[1];
211 if (c2 >= 0xa1 && c2 < 0xe0)
216 unsigned char c2 = s[1];
217 if (c2 >= 0xa1 && c2 < 0xff)
219 unsigned char c3 = s[2];
220 if (c3 >= 0xa1 && c3 < 0xff)
227 /* Character iterator for EUC-TW. See libiconv/lib/euc_tw.h.
   Two multibyte shapes are tested: a lead byte in 0xa1..0xfe followed by a
   byte in 0xa1..0xfe; and a four-byte form whose second byte lies in
   0xa1..0xb0 and whose third and fourth bytes both lie in 0xa1..0xfe.
   Each further byte is read only after the byte before it passed its
   range test. */
229 euc_tw_character_iterator (const char *s)
231 unsigned char c = *s;
232 if (c >= 0xa1 && c < 0xff)
234 unsigned char c2 = s[1];
235 if (c2 >= 0xa1 && c2 < 0xff)
240 unsigned char c2 = s[1];
241 if (c2 >= 0xa1 && c2 <= 0xb0)
243 unsigned char c3 = s[2];
244 if (c3 >= 0xa1 && c3 < 0xff)
246 unsigned char c4 = s[3];
247 if (c4 >= 0xa1 && c4 < 0xff)
255 /* Character iterator for BIG5. See libiconv/lib/ces_big5.h.
   Tests for a lead byte in 0xa1..0xfe followed by a trail byte in
   0x40..0x7e or 0xa1..0xfe. The first trail range contains 0x5C, which is
   why BIG5 appears in weird_cjk_charsets. s[1] is read only once *s is
   known not to be the terminating NUL. */
257 big5_character_iterator (const char *s)
259 unsigned char c = *s;
260 if (c >= 0xa1 && c < 0xff)
262 unsigned char c2 = s[1];
263 if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0xa1 && c2 < 0xff))
269 /* Character iterator for BIG5-HKSCS. See libiconv/lib/big5hkscs.h.
   Same trail-byte ranges as the BIG5 iterator (0x40..0x7e or 0xa1..0xfe),
   but the lead-byte range starts lower, at 0x88. The first trail range
   contains 0x5C. */
271 big5hkscs_character_iterator (const char *s)
273 unsigned char c = *s;
274 if (c >= 0x88 && c < 0xff)
276 unsigned char c2 = s[1];
277 if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0xa1 && c2 < 0xff))
283 /* Character iterator for GBK. See libiconv/lib/ces_gbk.h and
284 libiconv/lib/gbk.h.
   Tests for a lead byte in 0x81..0xfe followed by a trail byte in
   0x40..0x7e or 0x80..0xfe, i.e. any byte from 0x40 to 0xfe except 0x7f.
   The trail range contains 0x5C. */
286 gbk_character_iterator (const char *s)
288 unsigned char c = *s;
289 if (c >= 0x81 && c < 0xff)
291 unsigned char c2 = s[1];
292 if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0x80 && c2 < 0xff))
298 /* Character iterator for GB18030. See libiconv/lib/gb18030.h.
   The two-byte test comes first: a lead byte in 0x81..0xfe with a second
   byte in 0x40..0x7e or 0x80..0xfe. The four-byte test is applied only to
   lead bytes 0x81..0x84 and requires bytes two to four to lie in
   0x30..0x39, 0x81..0xfe and 0x30..0x39 respectively.
   NOTE(review): GB18030 also defines four-byte sequences with lead bytes
   above 0x84 (e.g. 0x90..0xe3 for the supplementary planes); the test
   below does not cover them - confirm whether that is intended. */
300 gb18030_character_iterator (const char *s)
302 unsigned char c = *s;
303 if (c >= 0x81 && c < 0xff)
305 unsigned char c2 = s[1];
306 if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0x80 && c2 < 0xff))
309 if (c >= 0x81 && c <= 0x84)
311 unsigned char c2 = s[1];
312 if (c2 >= 0x30 && c2 <= 0x39)
314 unsigned char c3 = s[2];
315 if (c3 >= 0x81 && c3 < 0xff)
317 unsigned char c4 = s[3];
318 if (c4 >= 0x30 && c4 <= 0x39)
326 /* Character iterator for SHIFT_JIS. See libiconv/lib/sjis.h.
   Tests for a lead byte in 0x81..0x9f or 0xe0..0xf9 followed by a trail
   byte in 0x40..0x7e or 0x80..0xfc. The first trail range contains 0x5C,
   which is why SHIFT_JIS appears in weird_cjk_charsets. */
328 shift_jis_character_iterator (const char *s)
330 unsigned char c = *s;
331 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xf9))
333 unsigned char c2 = s[1];
334 if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfc))
340 /* Character iterator for JOHAB. See libiconv/lib/johab.h and
341 libiconv/lib/johab_hangul.h.
   Two lead-byte ranges are distinguished, each with its own trail-byte
   ranges: lead 0x84..0xd3 with trail 0x41..0x7e or 0x81..0xfe, and lead
   0xd9..0xf9 with trail 0x31..0x7e or 0x91..0xfe. Both low trail ranges
   contain 0x5C. */
343 johab_character_iterator (const char *s)
345 unsigned char c = *s;
346 if (c >= 0x84 && c <= 0xd3)
348 unsigned char c2 = s[1];
349 if ((c2 >= 0x41 && c2 < 0x7f) || (c2 >= 0x81 && c2 < 0xff))
352 else if (c >= 0xd9 && c <= 0xf9)
354 unsigned char c2 = s[1];
355 if ((c2 >= 0x31 && c2 <= 0x7e) || (c2 >= 0x91 && c2 <= 0xfe))
361 /* Character iterator for UTF-8. See libiconv/lib/utf8.h.
   Every byte after the lead byte is required to be a continuation byte,
   i.e. to lie in 0x80..0xbf. Forms with one, two and three continuation
   bytes are examined; s[2] and s[3] are read only after the byte before
   them passed that test. */
363 utf8_character_iterator (const char *s)
365 unsigned char c = *s;
370 unsigned char c2 = s[1];
371 if (c2 >= 0x80 && c2 < 0xc0)
376 unsigned char c2 = s[1];
377 if (c2 >= 0x80 && c2 < 0xc0)
379 unsigned char c3 = s[2];
380 if (c3 >= 0x80 && c3 < 0xc0)
386 unsigned char c2 = s[1];
387 if (c2 >= 0x80 && c2 < 0xc0)
389 unsigned char c3 = s[2];
390 if (c3 >= 0x80 && c3 < 0xc0)
392 unsigned char c4 = s[3];
393 if (c4 >= 0x80 && c4 < 0xc0)
402 /* Returns a character iterator for a given encoding.
403 Given a pointer into a string, the iterator returns the number of bytes occupied by the next
404 single character. If the piece of string is not valid or if *s == '\0',
407 po_charset_character_iterator (const char *canon_charset)
/* Dispatch on the canonical charset name. UTF-8 is recognised by pointer
   identity with the file-local utf8 array, every other encoding by strcmp
   against its canonical spelling. GB2312 and EUC-KR share one iterator.
   Any name that matches none of the tests gets char_iterator. */
409 if (canon_charset == utf8)
410 return utf8_character_iterator;
411 if (strcmp (canon_charset, "GB2312") == 0
412 || strcmp (canon_charset, "EUC-KR") == 0)
413 return euc_character_iterator;
414 if (strcmp (canon_charset, "EUC-JP") == 0)
415 return euc_jp_character_iterator;
416 if (strcmp (canon_charset, "EUC-TW") == 0)
417 return euc_tw_character_iterator;
418 if (strcmp (canon_charset, "BIG5") == 0)
419 return big5_character_iterator;
420 if (strcmp (canon_charset, "BIG5-HKSCS") == 0)
421 return big5hkscs_character_iterator;
422 if (strcmp (canon_charset, "GBK") == 0)
423 return gbk_character_iterator;
424 if (strcmp (canon_charset, "GB18030") == 0)
425 return gb18030_character_iterator;
426 if (strcmp (canon_charset, "SHIFT_JIS") == 0)
427 return shift_jis_character_iterator;
428 if (strcmp (canon_charset, "JOHAB") == 0)
429 return johab_character_iterator;
430 return char_iterator;
434 /* The PO file's encoding, as specified in the header entry.
   NULL after po_lex_charset_init and po_lex_charset_close; set by
   po_lex_charset_set to the canonical name it obtained from
   po_charset_canonicalize. */
435 const char *po_lex_charset;
438 /* Converter from the PO file's encoding to UTF-8.
   The value (iconv_t)(-1) means that no converter is open. */
439 iconv_t po_lex_iconv;
441 /* If no converter is available, some information about the structure of the
442 PO file's encoding.
   Set from po_is_charset_weird_cjk in po_lex_charset_set. */
443 bool po_lex_weird_cjk;
/* Reset the lexer's charset state: no charset, no open converter, and the
   weird-CJK flag cleared. Unlike po_lex_charset_close, this overwrites
   po_lex_iconv without closing it, so it is presumably meant to run before
   any converter has been opened. */
446 po_lex_charset_init ()
448 po_lex_charset = NULL;
450 po_lex_iconv = (iconv_t)(-1);
452 po_lex_weird_cjk = false;
/* Pick up the charset named in a PO header entry and set up the lexer for
   it.
   HEADER_ENTRY is searched for the text charset= and the value that
   follows is taken up to the first space, tab or newline. FILENAME is used
   in warnings and to recognise .pot files, for which some warnings are
   suppressed.
   If the value is not a name known to po_charset_canonicalize, a warning
   is issued through po_xerror. Otherwise the canonical name is stored in
   po_lex_charset, and a converter to UTF-8 is opened with iconv_open
   unless the environment variable OLD_PO_FILE_INPUT is set to a non-empty
   value. All diagnostics use PO_SEVERITY_WARNING. */
456 po_lex_charset_set (const char *header_entry, const char *filename)
458 /* Verify the validity of CHARSET. It is necessary
459 1. for the correct treatment of multibyte characters containing
460 0x5C bytes in the PO lexer,
461 2. so that at run time, gettext() can call iconv() to convert
463 const char *charsetstr = c_strstr (header_entry, "charset=");
465 if (charsetstr != NULL)
469 const char *canon_charset;
471 charsetstr += strlen ("charset=");
472 len = strcspn (charsetstr, " \t\n");
473 charset = (char *) xmalloca (len + 1);
474 memcpy (charset, charsetstr, len);
477 canon_charset = po_charset_canonicalize (charset);
478 if (canon_charset == NULL)
480 /* Don't warn for POT files, because POT files usually contain
481 only ASCII msgids. */
482 size_t filenamelen = strlen (filename);
484 if (!(filenamelen >= 4
485 && memcmp (filename + filenamelen - 4, ".pot", 4) == 0
486 && strcmp (charset, "CHARSET") == 0))
488 char *warning_message =
490 Charset \"%s\" is not a portable encoding name.\n\
491 Message conversion to user's charset might not work.\n"),
493 po_xerror (PO_SEVERITY_WARNING, NULL,
494 filename, (size_t)(-1), (size_t)(-1), true,
496 free (warning_message);
503 po_lex_charset = canon_charset;
/* Release a converter left open by an earlier call before a new one is
   chosen below. */
505 if (po_lex_iconv != (iconv_t)(-1))
506 iconv_close (po_lex_iconv);
509 /* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35
510 don't know about multibyte encodings, and require a spurious
511 backslash after every multibyte character whose last byte is
512 0x5C. Some programs, like vim, distribute PO files in this
513 broken format. GNU msgfmt must continue to support this old
514 PO file format when the Makefile requests it. */
515 envval = getenv ("OLD_PO_FILE_INPUT");
516 if (envval != NULL && *envval != '\0')
518 /* Assume the PO file is in old format, with extraneous
521 po_lex_iconv = (iconv_t)(-1);
523 po_lex_weird_cjk = false;
527 /* Use iconv() to parse multibyte characters. */
529 /* Avoid glibc-2.1 bug with EUC-KR. */
530 # if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
531 && !defined _LIBICONV_VERSION
532 if (strcmp (po_lex_charset, "EUC-KR") == 0)
533 po_lex_iconv = (iconv_t)(-1);
536 /* Avoid Solaris 2.9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS,
538 # if defined __sun && !defined _LIBICONV_VERSION
539 if ( strcmp (po_lex_charset, "GB2312") == 0
540 || strcmp (po_lex_charset, "EUC-TW") == 0
541 || strcmp (po_lex_charset, "BIG5") == 0
542 || strcmp (po_lex_charset, "BIG5-HKSCS") == 0
543 || strcmp (po_lex_charset, "GBK") == 0
544 || strcmp (po_lex_charset, "GB18030") == 0)
545 po_lex_iconv = (iconv_t)(-1);
548 po_lex_iconv = iconv_open ("UTF-8", po_lex_charset);
549 if (po_lex_iconv == (iconv_t)(-1))
551 char *warning_message;
552 const char *recommendation;
558 Charset \"%s\" is not supported. %s relies on iconv(),\n\
559 and iconv() does not support \"%s\".\n"),
560 po_lex_charset, basename (program_name),
563 # if !defined _LIBICONV_VERSION
564 recommendation = _("\
565 Installing GNU libiconv and then reinstalling GNU gettext\n\
566 would fix this problem.\n");
571 /* Test for a charset which has double-byte characters
572 ending in 0x5C. For these encodings, the string parser
573 is likely to be confused if it can't see the character
575 po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
576 if (po_is_charset_weird (po_lex_charset)
577 && !po_lex_weird_cjk)
578 note = _("Continuing anyway, expect parse errors.");
580 note = _("Continuing anyway.");
583 xasprintf ("%s%s%s\n",
584 warning_message, recommendation, note);
586 po_xerror (PO_SEVERITY_WARNING, NULL,
587 filename, (size_t)(-1), (size_t)(-1), true,
590 free (whole_message);
591 free (warning_message);
594 /* Test for a charset which has double-byte characters
595 ending in 0x5C. For these encodings, the string parser
596 is likely to be confused if it can't see the character
598 po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
599 if (po_is_charset_weird (po_lex_charset) && !po_lex_weird_cjk)
601 char *warning_message;
602 const char *recommendation;
608 Charset \"%s\" is not supported. %s relies on iconv().\n\
609 This version was built without iconv().\n"),
610 po_lex_charset, basename (program_name));
612 recommendation = _("\
613 Installing GNU libiconv and then reinstalling GNU gettext\n\
614 would fix this problem.\n");
616 note = _("Continuing anyway, expect parse errors.");
619 xasprintf ("%s%s%s\n",
620 warning_message, recommendation, note);
622 po_xerror (PO_SEVERITY_WARNING, NULL,
623 filename, (size_t)(-1), (size_t)(-1), true,
626 free (whole_message);
627 free (warning_message);
636 /* Don't warn for POT files, because POT files usually contain
637 only ASCII msgids. */
638 size_t filenamelen = strlen (filename);
640 if (!(filenamelen >= 4
641 && memcmp (filename + filenamelen - 4, ".pot", 4) == 0))
642 po_xerror (PO_SEVERITY_WARNING,
643 NULL, filename, (size_t)(-1), (size_t)(-1), true,
645 Charset missing in header.\n\
646 Message conversion to user's charset will not work.\n"));
/* Drop the lexer's charset state. An open converter is closed and
   po_lex_iconv is put back to (iconv_t)(-1), so a repeated call does not
   close the same descriptor twice. */
651 po_lex_charset_close ()
653 po_lex_charset = NULL;
655 if (po_lex_iconv != (iconv_t)(-1))
657 iconv_close (po_lex_iconv);
658 po_lex_iconv = (iconv_t)(-1);
661 po_lex_weird_cjk = false;