1 /* GNU gettext - internationalization aids
2 Copyright (C) 1995-1999, 2000-2009, 2011 Free Software Foundation, Inc.
4 This file was written by Peter Miller <millerp@canb.auug.org.au>.
5 Multibyte character handling by Bruno Haible <haible@clisp.cons.org>.
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
42 #include "po-charset.h"
45 #include "error-progname.h"
46 #include "xvasprintf.h"
48 #include "po-xerror.h"
52 #include "po-gram-gen2.h"
54 #define _(str) gettext(str)
60 #if HAVE_DECL_GETC_UNLOCKED
62 # define getc getc_unlocked
66 /* Current position within the PO file. */
71 /* Error handling during the parsing of a PO file.
72 These functions can access gram_pos and gram_pos_column. */
/* Report a parse error at the current lexer position.
   FMT and the variadic arguments form a printf-style message, which is
   formatted into a freshly allocated buffer with vasprintf.  The message is
   reported through po_xerror with severity PO_SEVERITY_ERROR, using
   gram_pos.file_name, gram_pos.line_number and gram_pos_column + 1 as the
   location.  Once error_message_count has reached gram_max_allowed_errors,
   po_error is called with EXIT_FAILURE.  */
76 po_gram_error (const char *fmt, ...)
/* A negative vasprintf result is reported as "memory exhausted".  */
82 if (vasprintf (&buffer, fmt, ap) < 0)
83 error (EXIT_FAILURE, 0, _("memory exhausted"));
85 po_xerror (PO_SEVERITY_ERROR, NULL, gram_pos.file_name, gram_pos.line_number,
86 gram_pos_column + 1, false, buffer);
/* Give up once the configured error limit has been reached.  */
89 if (error_message_count >= gram_max_allowed_errors)
90 po_error (EXIT_FAILURE, 0, _("too many errors, aborting"));
/* Report a parse error at the position PP instead of the current lexer
   position.  FMT and the variadic arguments form a printf-style message.
   The location passed to po_xerror is pp->file_name and pp->line_number;
   the column argument is (size_t)(-1), i.e. no real column is supplied.
   Once error_message_count has reached gram_max_allowed_errors, po_error is
   called with EXIT_FAILURE.  */
95 po_gram_error_at_line (const lex_pos_ty *pp, const char *fmt, ...)
/* A negative vasprintf result is reported as "memory exhausted".  */
101 if (vasprintf (&buffer, fmt, ap) < 0)
102 error (EXIT_FAILURE, 0, _("memory exhausted"));
104 po_xerror (PO_SEVERITY_ERROR, NULL, pp->file_name, pp->line_number,
105 (size_t)(-1), false, buffer);
/* Give up once the configured error limit has been reached.  */
108 if (error_message_count >= gram_max_allowed_errors)
109 po_error (EXIT_FAILURE, 0, _("too many errors, aborting"));
113 /* The lowest level of PO file parsing converts bytes to multibyte characters.
This is needed
115 1. for C compatibility: ISO C 99 section 5.1.1.2 says that the first
116 translation phase maps bytes to characters.
117 2. to keep track of the current column, for the sake of precise error
118 location. Emacs compile.el interprets the column in error messages
119 by default as a screen column number, not as character number.
120 3. to avoid skipping backslash-newline in the midst of a multibyte
121 character. If XY is a multibyte character, X \ newline Y is invalid. */
124 /* Multibyte character data type. */
125 /* Note this depends on po_lex_charset and po_lex_iconv, which get set
126 while the file is being parsed. */
/* Capacity, in bytes, of the buf arrays that hold the raw bytes of one
   multibyte character (used both for a character and for the input stream's
   read-ahead buffer).  */
128 #define MBCHAR_BUF_SIZE 24
132 size_t bytes; /* number of bytes of current character, > 0; the value 0 is reserved for the EOF marker */
134 bool uc_valid; /* true if uc is a valid Unicode character */
135 ucs4_t uc; /* if uc_valid: the current character */
137 char buf[MBCHAR_BUF_SIZE]; /* room for the bytes */
140 /* We want to pass multibyte characters by reference automatically,
141 therefore we use an array type. */
142 typedef struct mbchar mbchar_t[1];
144 /* A version of memcpy optimized for the case n <= 1. */
/* Copies N bytes from SRC to DST through char pointers.
   NOTE(review): only the trailing copy loop is visible in this excerpt; the
   guard for n == 0 and the copy of the first byte are presumably on the
   lines that are not shown -- confirm before relying on this.  */
146 memcpy_small (void *dst, const void *src, size_t n)
150 char *q = (char *) dst;
151 const char *p = (const char *) src;
/* Both pointers are pre-incremented, so this loop copies the bytes that
   follow the one q and p currently point at.  */
155 do *++q = *++p; while (--n > 0);
159 /* EOF (not a real character) is represented with bytes = 0 and uc_valid = false. */
162 mb_iseof (const mbchar_t mbc)
164 return (mbc->bytes == 0);
167 /* Access the current character. */
168 static inline const char *
169 mb_ptr (const mbchar_t mbc)
174 mb_len (const mbchar_t mbc)
179 /* Comparison of characters. */
182 mb_iseq (const mbchar_t mbc, char sc)
184 /* Note: It is wrong to compare only mbc->uc, because when the encoding is
185 SHIFT_JIS, mbc->buf[0] == '\\' corresponds to mbc->uc == 0x00A5, but we
186 want to treat it as an escape character, although it looks like a Yen sign. */
190 return (mbc->uc == sc); /* wrong! */
193 return (mbc->bytes == 1 && mbc->buf[0] == sc);
/* Test whether MBC is the NUL character.
   Two tests are present: one on the Unicode value (uc == 0) and one on the
   raw bytes (exactly one byte, equal to 0).  NOTE(review): the condition
   that selects between them is not visible in this excerpt.  */
197 mb_isnul (const mbchar_t mbc)
201 return (mbc->uc == 0);
204 return (mbc->bytes == 1 && mbc->buf[0] == 0);
/* Three-way comparison of two multibyte characters; the result is negative,
   zero or positive.
   When both characters carry a valid Unicode value, the result is the
   difference of the two code points.
   The byte-wise comparison works as follows: characters of equal length are
   compared with memcmp; characters of different length are compared over the
   length of the shorter one, and if that common prefix is equal the shorter
   character sorts first.  */
208 mb_cmp (const mbchar_t mbc1, const mbchar_t mbc2)
211 if (mbc1->uc_valid && mbc2->uc_valid)
212 return (int) mbc1->uc - (int) mbc2->uc;
215 return (mbc1->bytes == mbc2->bytes
216 ? memcmp (mbc1->buf, mbc2->buf, mbc1->bytes)
217 : mbc1->bytes < mbc2->bytes
218 ? (memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) > 0 ? 1 : -1)
219 : (memcmp (mbc1->buf, mbc2->buf, mbc2->bytes) >= 0 ? 1 : -1));
/* Test two multibyte characters for equality.
   When both carry a valid Unicode value, the code points are compared.
   The byte-wise test requires the same number of bytes and identical byte
   contents.  */
223 mb_equal (const mbchar_t mbc1, const mbchar_t mbc2)
226 if (mbc1->uc_valid && mbc2->uc_valid)
227 return mbc1->uc == mbc2->uc;
230 return (mbc1->bytes == mbc2->bytes
231 && memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) == 0);
234 /* <ctype.h>, <wctype.h> classification. */
/* Test whether MBC is an ASCII character.
   The Unicode test accepts code points 0x0000..0x007F.  The byte-wise test
   accepts a single byte in the range 0x00..0x7F; each bound is only compiled
   in when the range of plain char makes the comparison meaningful, which
   avoids compiler warnings about always-true comparisons.  */
237 mb_isascii (const mbchar_t mbc)
241 return (mbc->uc >= 0x0000 && mbc->uc <= 0x007F);
244 return (mbc->bytes == 1
245 #if CHAR_MIN < 0x00 /* to avoid gcc warning */
246 && mbc->buf[0] >= 0x00
248 #if CHAR_MAX > 0x7F /* to avoid gcc warning */
249 && mbc->buf[0] <= 0x7F
254 /* Extra <wchar.h> function. */
256 /* Unprintable characters appear as a small box of width 1. */
257 #define MB_UNPRINTABLE_WIDTH 1
/* Compute the width of MBC in screen columns, for column tracking.
   For a character with a Unicode value, uc_width is consulted; the encoding
   name passed to it is po_lex_charset when an iconv descriptor is in use
   (po_lex_iconv != (iconv_t)(-1)) and the empty string otherwise.
   A tab advances to the next multiple of 8 columns, i.e. its width is
   8 - (gram_pos_column & 7), so the result depends on the current column.
   Unprintable characters that are not control characters get
   MB_UNPRINTABLE_WIDTH.
   NOTE(review): several branches and their return statements are not
   visible in this excerpt.  */
260 mb_width (const mbchar_t mbc)
266 const char *encoding =
267 (po_lex_iconv != (iconv_t)(-1) ? po_lex_charset : "");
268 int w = uc_width (uc, encoding);
269 /* For unprintable characters, arbitrarily return 0 for control
270 characters (except tab) and MB_UNPRINTABLE_WIDTH otherwise. */
273 if (uc >= 0x0000 && uc <= 0x001F)
276 return 8 - (gram_pos_column & 7);
279 if ((uc >= 0x007F && uc <= 0x009F) || (uc >= 0x2028 && uc <= 0x2029))
281 return MB_UNPRINTABLE_WIDTH;
289 #if CHAR_MIN < 0x00 /* to avoid gcc warning */
290 mbc->buf[0] >= 0x00 &&
/* 0x09 is the tab character: same tab-stop rule as in the Unicode case.  */
294 if (mbc->buf[0] == 0x09)
295 return 8 - (gram_pos_column & 7);
298 if (mbc->buf[0] == 0x7F)
301 return MB_UNPRINTABLE_WIDTH;
/* Write the raw bytes of MBC (mbc->bytes bytes from mbc->buf) to STREAM.
   The return value of fwrite is not examined here.  */
307 mb_putc (const mbchar_t mbc, FILE *stream)
309 fwrite (mbc->buf, 1, mbc->bytes, stream);
314 mb_setascii (mbchar_t mbc, char sc)
324 /* Copying a character. */
/* Copy OLD_MBC into NEW_MBC: the raw bytes, the byte count and the
   uc_valid flag.  The Unicode value uc is copied only when it is valid.  */
326 mb_copy (mbchar_t new_mbc, const mbchar_t old_mbc)
328 memcpy_small (&new_mbc->buf[0], &old_mbc->buf[0], old_mbc->bytes);
329 new_mbc->bytes = old_mbc->bytes;
/* Intentional assignment inside the condition: copy the flag, then test it.  */
331 if ((new_mbc->uc_valid = old_mbc->uc_valid))
332 new_mbc->uc = old_mbc->uc;
337 /* Multibyte character input. */
339 /* Number of characters that can be pushed back.
340 We need 1 for lex_getc, plus 1 for lex_ungetc. */
343 /* Data type of a multibyte character input stream. */
349 unsigned int bufcount; /* number of bytes currently held in buf */
350 char buf[MBCHAR_BUF_SIZE]; /* bytes read from the stream but not yet returned as a character */
351 struct mbchar pushback[NPUSHBACK]; /* pushed-back characters; have_pushback counts the used slots */
354 /* We want to pass multibyte streams by reference automatically,
355 therefore we use an array type. */
356 typedef struct mbfile mbfile_t[1];
358 /* Whether invalid multibyte sequences in the input shall be signalled
359 or silently tolerated. */
360 static bool signal_eilseq;
/* Initialize the multibyte input stream MBF for reading from STREAM:
   no end-of-file seen yet and no characters pushed back.
   NOTE(review): the remaining field initializations are not visible in
   this excerpt.  */
363 mbfile_init (mbfile_t mbf, FILE *stream)
366 mbf->eof_seen = false;
367 mbf->have_pushback = 0;
371 /* Read the next multibyte character from mbf and put it into mbc.
372 If a read error occurs, errno is set and ferror (mbf->fp) becomes true. */
374 mbfile_getc (mbchar_t mbc, mbfile_t mbf)
378 /* If EOF has already been seen, don't use getc. This matters if
379 mbf->fp is connected to an interactive tty. */
383 /* Return character pushed back, if there is one. */
384 if (mbf->have_pushback > 0)
386 mbf->have_pushback--;
387 mb_copy (mbc, &mbf->pushback[mbf->have_pushback]);
391 /* Before using iconv, we need at least one byte. */
392 if (mbf->bufcount == 0)
394 int c = getc (mbf->fp);
397 mbf->eof_seen = true;
400 mbf->buf[0] = (unsigned char) c;
405 if (po_lex_iconv != (iconv_t)(-1))
407 /* Use iconv on an increasing number of bytes. Read only as many
408 bytes from mbf->fp as needed. This is needed to give reasonable
409 interactive behaviour when mbf->fp is connected to an interactive tty. */
413 unsigned char scratchbuf[64];
414 const char *inptr = &mbf->buf[0];
415 size_t insize = mbf->bufcount;
416 char *outptr = (char *) &scratchbuf[0];
417 size_t outsize = sizeof (scratchbuf);
419 size_t res = iconv (po_lex_iconv,
420 (ICONV_CONST char **) &inptr, &insize,
422 /* We expect that a character has been produced if and only if
423 some input bytes have been consumed. */
424 if ((insize < mbf->bufcount) != (outsize < sizeof (scratchbuf)))
426 if (outsize == sizeof (scratchbuf))
428 /* No character has been produced. Must be an error. */
429 if (res != (size_t)(-1))
434 /* An invalid multibyte sequence was encountered. */
435 /* Return a single byte. */
437 po_gram_error (_("invalid multibyte sequence"));
439 mbc->uc_valid = false;
442 else if (errno == EINVAL)
444 /* An incomplete multibyte character. */
447 if (mbf->bufcount == MBCHAR_BUF_SIZE)
449 /* An overlong incomplete multibyte sequence was encountered. */
451 /* Return a single byte. */
453 mbc->uc_valid = false;
457 /* Read one more byte and retry iconv. */
461 mbf->eof_seen = true;
462 if (ferror (mbf->fp))
466 incomplete multibyte sequence at end of file"));
467 bytes = mbf->bufcount;
468 mbc->uc_valid = false;
471 mbf->buf[mbf->bufcount++] = (unsigned char) c;
476 incomplete multibyte sequence at end of line"));
477 bytes = mbf->bufcount - 1;
478 mbc->uc_valid = false;
484 const char *errno_description = strerror (errno);
485 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
493 size_t outbytes = sizeof (scratchbuf) - outsize;
494 bytes = mbf->bufcount - insize;
496 /* We expect that one character has been produced. */
501 /* Convert it from UTF-8 to UCS-4. */
502 if (u8_mbtoucr (&mbc->uc, scratchbuf, outbytes) < (int) outbytes)
504 /* scratchbuf contains an out-of-range Unicode character. */
507 po_gram_error (_("invalid multibyte sequence"));
508 mbc->uc_valid = false;
511 mbc->uc_valid = true;
520 /* Special handling of encodings with CJK structure. */
521 && (unsigned char) mbf->buf[0] >= 0x80)
523 if (mbf->bufcount == 1)
525 /* Read one more byte. */
526 int c = getc (mbf->fp);
529 if (ferror (mbf->fp))
531 mbf->eof_seen = true;
537 mbf->buf[1] = (unsigned char) c;
541 if (mbf->bufcount >= 2 && (unsigned char) mbf->buf[1] >= 0x30)
542 /* Return a double byte. */
545 /* Return a single byte. */
550 /* Return a single byte. */
554 mbc->uc_valid = false;
558 /* Return the multibyte sequence mbf->buf[0..bytes-1]. */
559 memcpy_small (&mbc->buf[0], &mbf->buf[0], bytes);
562 mbf->bufcount -= bytes;
563 if (mbf->bufcount > 0)
565 /* It's not worth calling memmove() for so few bytes. */
566 unsigned int count = mbf->bufcount;
567 char *p = &mbf->buf[0];
579 /* An mbchar_t with bytes == 0 is used to indicate EOF. */
582 mbc->uc_valid = false;
/* Push the character MBC back onto MBF, so that the next read from MBF
   returns it.  The character is copied into the next free pushback slot and
   have_pushback is incremented.
   NOTE(review): the action taken when all NPUSHBACK slots are already in
   use is not visible in this excerpt.  */
588 mbfile_ungetc (const mbchar_t mbc, mbfile_t mbf)
590 if (mbf->have_pushback >= NPUSHBACK)
592 mb_copy (&mbf->pushback[mbf->have_pushback], mbc);
593 mbf->have_pushback++;
597 /* Lexer variables. */
600 unsigned int gram_max_allowed_errors = 20;
601 static bool po_lex_obsolete;
602 static bool po_lex_previous;
603 static bool pass_comments = false;
604 bool pass_obsolete_entries = false;
607 /* Prepare lexical analysis. */
/* Set up the lexer state for reading a PO file from FP.
   REAL_FILENAME is duplicated and stored in gram_pos.file_name for use in
   error messages; LOGICAL_FILENAME is ignored.  The line number starts at 1,
   invalid multibyte sequences are signalled (signal_eilseq), the
   obsolete/previous flags are cleared, and the charset state is initialized
   through po_lex_charset_init.  */
609 lex_start (FILE *fp, const char *real_filename, const char *logical_filename)
611 /* Ignore the logical_filename, because PO file entries already have
612 their file names attached. But use real_filename for error messages. */
/* NOTE(review): no matching free of this copy is visible in this excerpt;
   presumably recorded positions keep pointing at it -- confirm.  */
613 gram_pos.file_name = xstrdup (real_filename);
615 mbfile_init (mbf, fp);
617 gram_pos.line_number = 1;
619 signal_eilseq = true;
620 po_lex_obsolete = false;
621 po_lex_previous = false;
622 po_lex_charset_init ();
625 /* Terminate lexical analysis. */
630 gram_pos.file_name = NULL;
631 gram_pos.line_number = 0;
633 signal_eilseq = false;
634 po_lex_obsolete = false;
635 po_lex_previous = false;
636 po_lex_charset_close ();
640 /* Read a single character, dealing with backslash-newline.
641 Also keep track of the current line number and column number. */
/* Store the next input character in MBC.
   Characters come from mbfile_getc.  A read error on the underlying stream
   is reported through po_xerror with severity PO_SEVERITY_FATAL_ERROR and an
   "error while reading" message.  A newline increments
   gram_pos.line_number; otherwise the character's width is added to
   gram_pos_column.  After a backslash a second character is read: if it is
   not a newline it is pushed back, otherwise the line number is incremented
   for the backslash-newline pair.
   NOTE(review): loop structure, EOF tests and column resets are on lines
   not visible in this excerpt.  */
643 lex_getc (mbchar_t mbc)
647 mbfile_getc (mbc, mbf);
651 if (ferror (mbf->fp))
654 const char *errno_description = strerror (errno);
655 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
657 xasprintf (_("error while reading \"%s\""),
664 if (mb_iseq (mbc, '\n'))
666 gram_pos.line_number++;
671 gram_pos_column += mb_width (mbc);
673 if (mb_iseq (mbc, '\\'))
/* Look one character ahead to detect backslash-newline.  */
677 mbfile_getc (mbc2, mbf);
681 if (ferror (mbf->fp))
686 if (!mb_iseq (mbc2, '\n'))
688 mbfile_ungetc (mbc2, mbf);
692 gram_pos.line_number++;
/* Push MBC back onto the input and undo the position bookkeeping that
   reading it caused: a newline decrements gram_pos.line_number, other
   characters give their width back to gram_pos_column.  The character
   itself is handed to mbfile_ungetc.
   NOTE(review): the surrounding conditions (such as the else branch) are
   on lines not visible in this excerpt.  */
702 lex_ungetc (const mbchar_t mbc)
706 if (mb_iseq (mbc, '\n'))
707 /* Decrement the line number, but don't care about the column. */
708 gram_pos.line_number--;
710 /* Decrement the column number. Also works well enough for tabs. */
711 gram_pos_column -= mb_width (mbc);
713 mbfile_ungetc (mbc, mbf);
/* Classify the identifier S as a PO keyword.
   Outside a "#|" context (po_lex_previous false) the words "domain",
   "msgid", "msgid_plural", "msgstr" and "msgctxt" are recognized.  Inside a
   "#|" context only "msgid", "msgid_plural" and "msgctxt" are recognized,
   and they map to different token codes (for example PREV_MSGID_PLURAL).
   An unrecognized word is reported with po_gram_error_at_line at gram_pos.
   NOTE(review): most of the returned token codes are on lines not visible
   in this excerpt.  */
719 keyword_p (const char *s)
721 if (!po_lex_previous)
723 if (!strcmp (s, "domain"))
725 if (!strcmp (s, "msgid"))
727 if (!strcmp (s, "msgid_plural"))
729 if (!strcmp (s, "msgstr"))
731 if (!strcmp (s, "msgctxt"))
736 /* Inside a "#|" context, the keywords have a different meaning. */
737 if (!strcmp (s, "msgid"))
739 if (!strcmp (s, "msgid_plural"))
740 return PREV_MSGID_PLURAL;
741 if (!strcmp (s, "msgctxt"))
744 po_gram_error_at_line (&gram_pos, _("keyword \"%s\" unknown"), s);
757 if (mb_len (mbc) == 1)
758 switch (mb_ptr (mbc) [0])
783 return mb_ptr (mbc) [0];
785 case '0': case '1': case '2': case '3':
786 case '4': case '5': case '6': case '7':
791 char c = mb_ptr (mbc) [0];
792 /* Warning: not portable, can't depend on '0'..'7' ordering. */
793 val = val * 8 + (c - '0');
797 if (mb_len (mbc) == 1)
798 switch (mb_ptr (mbc) [0])
800 case '0': case '1': case '2': case '3':
801 case '4': case '5': case '6': case '7':
814 if (mb_iseof (mbc) || mb_len (mbc) != 1
815 || !c_isxdigit (mb_ptr (mbc) [0]))
821 char c = mb_ptr (mbc) [0];
824 /* Warning: not portable, can't depend on '0'..'9' ordering */
826 else if (c_isupper (c))
827 /* Warning: not portable, can't depend on 'A'..'F' ordering */
830 /* Warning: not portable, can't depend on 'a'..'f' ordering */
834 if (mb_len (mbc) == 1)
835 switch (mb_ptr (mbc) [0])
837 case '0': case '1': case '2': case '3': case '4':
838 case '5': case '6': case '7': case '8': case '9':
839 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
840 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
851 /* FIXME: \u and \U are not handled. */
854 po_gram_error (_("invalid control sequence"));
859 /* Return the next token in the PO file. The return codes are defined
860 in "po-gram-gen2.h". Associated data is put in 'po_gram_lval'. */
865 static size_t bufmax;
874 /* Yacc wants this for end of file. */
877 if (mb_len (mbc) == 1)
878 switch (mb_ptr (mbc) [0])
881 po_lex_obsolete = false;
882 po_lex_previous = false;
883 /* Ignore whitespace, not relevant for the grammar. */
891 /* Ignore whitespace, not relevant for the grammar. */
896 if (mb_iseq (mbc, '~'))
897 /* A pseudo-comment beginning with #~ is found. This is
898 not a comment. It is the format for obsolete entries.
899 We simply discard the "#~" prefix. The following
900 characters are expected to be well formed. */
902 po_lex_obsolete = true;
903 /* A pseudo-comment beginning with #~| denotes a previous
904 untranslated string in an obsolete entry. This does not
905 make much sense semantically, and is implemented here
906 for completeness only. */
908 if (mb_iseq (mbc, '|'))
909 po_lex_previous = true;
914 if (mb_iseq (mbc, '|'))
915 /* A pseudo-comment beginning with #| is found. This is
916 the previous untranslated string. We discard the "#|"
917 prefix, but change the keywords and string returns accordingly. */
920 po_lex_previous = true;
924 /* Accumulate comments into a buffer. If we have been asked
925 to pass comments, generate a COMMENT token, otherwise discard them. */
927 signal_eilseq = false;
933 while (bufpos + mb_len (mbc) >= bufmax)
936 buf = xrealloc (buf, bufmax);
938 if (mb_iseof (mbc) || mb_iseq (mbc, '\n'))
941 memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc));
942 bufpos += mb_len (mbc);
948 po_gram_lval.string.string = buf;
949 po_gram_lval.string.pos = gram_pos;
950 po_gram_lval.string.obsolete = po_lex_obsolete;
951 po_lex_obsolete = false;
952 signal_eilseq = true;
957 /* We do this in a separate loop because collecting large
958 comments when they are not passed to the upper layers
959 is not very efficient. */
960 while (!mb_iseof (mbc) && !mb_iseq (mbc, '\n'))
962 po_lex_obsolete = false;
963 signal_eilseq = true;
968 /* Accumulate a string. */
973 while (bufpos + mb_len (mbc) >= bufmax)
976 buf = xrealloc (buf, bufmax);
980 po_gram_error_at_line (&gram_pos,
981 _("end-of-file within string"));
984 if (mb_iseq (mbc, '\n'))
986 po_gram_error_at_line (&gram_pos,
987 _("end-of-line within string"));
990 if (mb_iseq (mbc, '"'))
992 if (mb_iseq (mbc, '\\'))
994 buf[bufpos++] = control_sequence ();
998 /* Add mbc to the accumulator. */
999 memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc));
1000 bufpos += mb_len (mbc);
1004 /* Strings cannot contain the msgctxt separator, because it cannot
1005 be faithfully represented in the msgid of a .mo file. */
1006 if (strchr (buf, MSGCTXT_SEPARATOR) != NULL)
1007 po_gram_error_at_line (&gram_pos,
1008 _("context separator <EOT> within string"));
1010 /* FIXME: Treatment of embedded \000 chars is incorrect. */
1011 po_gram_lval.string.string = xstrdup (buf);
1012 po_gram_lval.string.pos = gram_pos;
1013 po_gram_lval.string.obsolete = po_lex_obsolete;
1014 return (po_lex_previous ? PREV_STRING : STRING);
1016 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1017 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1018 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1019 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1021 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1022 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1023 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1024 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1030 char c = mb_ptr (mbc) [0];
1031 if (bufpos + 1 >= bufmax)
1034 buf = xrealloc (buf, bufmax);
1038 if (mb_len (mbc) == 1)
1039 switch (mb_ptr (mbc) [0])
1043 case 'a': case 'b': case 'c': case 'd': case 'e':
1044 case 'f': case 'g': case 'h': case 'i': case 'j':
1045 case 'k': case 'l': case 'm': case 'n': case 'o':
1046 case 'p': case 'q': case 'r': case 's': case 't':
1047 case 'u': case 'v': case 'w': case 'x': case 'y':
1049 case 'A': case 'B': case 'C': case 'D': case 'E':
1050 case 'F': case 'G': case 'H': case 'I': case 'J':
1051 case 'K': case 'L': case 'M': case 'N': case 'O':
1052 case 'P': case 'Q': case 'R': case 'S': case 'T':
1053 case 'U': case 'V': case 'W': case 'X': case 'Y':
1056 case '0': case '1': case '2': case '3': case '4':
1057 case '5': case '6': case '7': case '8': case '9':
1067 int k = keyword_p (buf);
1070 po_gram_lval.string.string = xstrdup (buf);
1071 po_gram_lval.string.pos = gram_pos;
1072 po_gram_lval.string.obsolete = po_lex_obsolete;
1076 po_gram_lval.pos.pos = gram_pos;
1077 po_gram_lval.pos.obsolete = po_lex_obsolete;
1082 case '0': case '1': case '2': case '3': case '4':
1083 case '5': case '6': case '7': case '8': case '9':
1087 char c = mb_ptr (mbc) [0];
1088 if (bufpos + 1 >= bufmax)
1091 buf = xrealloc (buf, bufmax + 1);
1095 if (mb_len (mbc) == 1)
1096 switch (mb_ptr (mbc) [0])
1101 case '0': case '1': case '2': case '3': case '4':
1102 case '5': case '6': case '7': case '8': case '9':
1111 po_gram_lval.number.number = atol (buf);
1112 po_gram_lval.number.pos = gram_pos;
1113 po_gram_lval.number.obsolete = po_lex_obsolete;
1117 po_gram_lval.pos.pos = gram_pos;
1118 po_gram_lval.pos.obsolete = po_lex_obsolete;
1122 po_gram_lval.pos.pos = gram_pos;
1123 po_gram_lval.pos.obsolete = po_lex_obsolete;
1127 /* This will cause a syntax error. */
1131 /* This will cause a syntax error. */
1137 /* po_gram_lex() can return comments as COMMENT. Switch this on or off. */
/* Store FLAG in the file-local pass_comments setting.  */
1139 po_lex_pass_comments (bool flag)
1141 pass_comments = flag;
1145 /* po_gram_lex() can return obsolete entries as if they were normal entries.
1146 Switch this on or off. */
/* Store FLAG in the global pass_obsolete_entries setting.  */
1148 po_lex_pass_obsolete_entries (bool flag)
1150 pass_obsolete_entries = flag;