1 /* Reading NeXTstep/GNUstep .strings files.
2 Copyright (C) 2003, 2005-2007, 2009, 2015 Free Software Foundation,
4 Written by Bruno Haible <bruno@clisp.org>, 2003.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
24 #include "read-stringtable.h"
34 #include "error-progname.h"
35 #include "read-catalog-abstract.h"
37 #include "xvasprintf.h"
38 #include "po-xerror.h"
42 #define _(str) gettext (str)
44 /* The format of NeXTstep/GNUstep .strings files is documented in
45 gnustep-base-1.8.0/Tools/make_strings/Using.txt
46 and in the comments of method propertyListFromStringsFileFormat in
47 gnustep-base-1.8.0/Source/NSString.m
48 In summary, it's an Objective-C like file with pseudo-assignments of the form "key" = "value";
50 where the key is the msgid and the value is the msgstr.
52 The implementation of the parser of .strings files is in
53 gnustep-base-1.8.0/Source/NSString.m
54 function GSPropertyListFromStringsFormat
55 (indirectly called from NSBundle's method localizedStringForKey).
58 gnustep-base-1.8.0/Testing/English.lproj/NXStringTable.example
61 /* Handling of comments: We copy all comments from the .strings file to
62 the PO file. This is not really needed; it's a service for translators
63 who don't like PO files and prefer to maintain the .strings file. */
66 /* Real filename, used in error messages about the input file. */
67 static const char *real_file_name;
69 /* File name and line number. */
70 extern lex_pos_ty gram_pos;
72 /* The input file stream. */
76 /* Phase 1: Read a byte.
77 Max. 4 pushback characters. */
79 static unsigned char phase1_pushback[4];
80 static int phase1_pushback_length;
87 if (phase1_pushback_length)
88 return phase1_pushback[--phase1_pushback_length];
96 const char *errno_description = strerror (errno);
97 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
99 xasprintf (_("error while reading \"%s\""),
110 phase1_ungetc (int c)
113 phase1_pushback[phase1_pushback_length++] = c;
117 /* Phase 2: Read a UCS-4 character.
118 Max. 2 pushback characters. */
120 /* End-of-file indicator for functions returning a UCS-4 character. */
123 static int phase2_pushback[4];
124 static int phase2_pushback_length;
126 /* The input file can be in Unicode encoding (UCS-2BE, UCS-2LE, UTF-8, each
127 with a BOM!), or otherwise the locale-dependent default encoding is used.
128 Since we don't want to depend on the locale here, we use ISO-8859-1
138 static enum enc encoding;
143 if (phase2_pushback_length)
144 return phase2_pushback[--phase2_pushback_length];
146 if (encoding == enc_undetermined)
148 /* Determine the input file's encoding. */
158 encoding = enc_iso8859_1;
160 else if (c0 == 0xfe && c1 == 0xff)
161 encoding = enc_ucs2be;
162 else if (c0 == 0xff && c1 == 0xfe)
163 encoding = enc_ucs2le;
173 encoding = enc_iso8859_1;
175 else if (c0 == 0xef && c1 == 0xbb && c2 == 0xbf)
182 encoding = enc_iso8859_1;
190 /* Read a UCS-2BE encoded character. */
200 return (c0 << 8) + c1;
204 /* Read a UCS-2LE encoded character. */
214 return c0 + (c1 << 8);
218 /* Read a UTF-8 encoded character. */
220 unsigned char buf[6];
240 && ((buf[1] ^ 0x80) < 0x40))
249 && ((buf[2] ^ 0x80) < 0x40))
258 && ((buf[3] ^ 0x80) < 0x40))
267 && ((buf[4] ^ 0x80) < 0x40))
280 u8_mbtouc (&uc, buf, count);
285 /* Read an ISO-8859-1 encoded character. */
287 int c = phase1_getc ();
300 phase2_ungetc (int c)
303 phase2_pushback[phase2_pushback_length++] = c;
307 /* Phase 3: Read a UCS-4 character, with line number handling. */
312 int c = phase2_getc ();
315 gram_pos.line_number++;
321 phase3_ungetc (int c)
324 --gram_pos.line_number;
329 /* Convert from UCS-4 to UTF-8. */
331 conv_from_ucs4 (const int *buffer, size_t buflen)
333 unsigned char *utf8_string;
337 /* Each UCS-4 word needs 6 bytes at worst. */
338 utf8_string = XNMALLOC (6 * buflen + 1, unsigned char);
340 for (pos = 0, q = utf8_string; pos < buflen; )
346 n = u8_uctomb (q, uc, 6);
351 assert (q - utf8_string <= 6 * buflen);
353 return (char *) utf8_string;
357 /* Parse a string enclosed in double-quotes. Input is UCS-4 encoded.
358 Return the string in UTF-8 encoding, or NULL if the input doesn't represent
359 a valid string enclosed in double-quotes. */
361 parse_escaped_string (const int *string, size_t length)
364 static size_t bufmax;
365 static size_t buflen;
366 const int *string_limit = string + length;
369 if (string == string_limit)
377 if (string == string_limit)
384 if (string == string_limit)
387 if (c >= '0' && c <= '7')
393 n = n * 8 + (c - '0');
396 if (string == string_limit)
399 if (!(c >= '0' && c <= '7'))
405 else if (c == 'u' || c == 'U')
409 for (j = 0; j < 4; j++)
411 if (string == string_limit)
414 if (c >= '0' && c <= '9')
415 n = n * 16 + (c - '0');
416 else if (c >= 'A' && c <= 'F')
417 n = n * 16 + (c - 'A' + 10);
418 else if (c >= 'a' && c <= 'f')
419 n = n * 16 + (c - 'a' + 10);
429 case 'a': c = '\a'; break;
430 case 'b': c = '\b'; break;
431 case 't': c = '\t'; break;
432 case 'r': c = '\r'; break;
433 case 'n': c = '\n'; break;
434 case 'v': c = '\v'; break;
435 case 'f': c = '\f'; break;
438 if (buflen >= bufmax)
440 bufmax = 2 * bufmax + 10;
441 buffer = xrealloc (buffer, bufmax * sizeof (int));
443 buffer[buflen++] = c;
446 return conv_from_ucs4 (buffer, buflen);
450 /* Accumulating flag comments. */
452 static char *special_comment;
455 special_comment_reset ()
457 if (special_comment != NULL)
458 free (special_comment);
459 special_comment = NULL;
463 special_comment_add (const char *flag)
465 if (special_comment == NULL)
466 special_comment = xstrdup (flag);
469 size_t total_len = strlen (special_comment) + 2 + strlen (flag) + 1;
470 special_comment = xrealloc (special_comment, total_len);
471 strcat (special_comment, ", ");
472 strcat (special_comment, flag);
477 special_comment_finish ()
479 if (special_comment != NULL)
481 po_callback_comment_special (special_comment);
482 free (special_comment);
483 special_comment = NULL;
488 /* Accumulating comments. */
491 static size_t bufmax;
492 static size_t buflen;
493 static bool next_is_obsolete;
494 static bool next_is_fuzzy;
495 static char *fuzzy_msgstr;
496 static bool expect_fuzzy_msgstr_as_c_comment;
497 static bool expect_fuzzy_msgstr_as_cxx_comment;
508 if (buflen >= bufmax)
510 bufmax = 2 * bufmax + 10;
511 buffer = xrealloc (buffer, bufmax * sizeof (int));
513 buffer[buflen++] = c;
517 comment_line_end (size_t chars_to_remove, bool test_for_fuzzy_msgstr)
521 buflen -= chars_to_remove;
522 /* Drop trailing white space, but not EOLs. */
524 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
527 /* At special positions we interpret a comment of the form = "escaped string"
529 with an optional trailing semicolon as being the fuzzy msgstr, not a
531 if (test_for_fuzzy_msgstr
532 && buflen > 2 && buffer[0] == '=' && buffer[1] == ' '
534 parse_escaped_string (buffer + 2,
535 buflen - (buffer[buflen - 1] == ';') - 2)))
538 line = conv_from_ucs4 (buffer, buflen);
540 if (strcmp (line, "Flag: untranslated") == 0)
542 special_comment_add ("fuzzy");
543 next_is_fuzzy = true;
545 else if (strcmp (line, "Flag: unmatched") == 0)
546 next_is_obsolete = true;
547 else if (strlen (line) >= 6 && memcmp (line, "Flag: ", 6) == 0)
548 special_comment_add (line + 6);
549 else if (strlen (line) >= 9 && memcmp (line, "Comment: ", 9) == 0)
550 /* A comment extracted from the source. */
551 po_callback_comment_dot (line + 9);
555 unsigned long number;
558 if (strlen (line) >= 6 && memcmp (line, "File: ", 6) == 0
559 && (last_colon = strrchr (line + 6, ':')) != NULL
560 && *(last_colon + 1) != '\0'
561 && (number = strtoul (last_colon + 1, &endp, 10), *endp == '\0'))
563 /* A "File: <filename>:<number>" type comment. */
565 po_callback_comment_filepos (line + 6, number);
568 po_callback_comment (line);
573 /* Phase 4: Replace each comment that is not inside a string with a space
592 /* C style comment. */
595 size_t trailing_stars;
599 last_was_star = false;
601 seen_newline = false;
602 /* Drop additional stars at the beginning of the comment. */
608 last_was_star = true;
616 /* We skip all leading white space, but not EOLs. */
617 if (!(buflen == 0 && (c == ' ' || c == '\t')))
623 comment_line_end (1, false);
625 last_was_star = false;
630 last_was_star = true;
637 /* Drop additional stars at the end of the comment. */
638 comment_line_end (trailing_stars + 1,
639 expect_fuzzy_msgstr_as_c_comment
646 last_was_star = false;
656 /* C++ style comment. */
661 if (c == '\n' || c == UEOF)
663 /* We skip all leading white space, but not EOLs. */
664 if (!(buflen == 0 && (c == ' ' || c == '\t')))
667 comment_line_end (0, expect_fuzzy_msgstr_as_cxx_comment);
673 phase4_ungetc (int c)
679 /* Return true if a character is considered as whitespace. */
681 is_whitespace (int c)
683 return (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f'
687 /* Return true if a character needs quoting, i.e. cannot be used in unquoted
692 if ((c >= '0' && c <= '9')
693 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
697 case '!': case '#': case '$': case '%': case '&': case '*':
698 case '+': case '-': case '.': case '/': case ':': case '?':
699 case '@': case '|': case '~': case '_': case '^':
707 /* Read a key or value string.
708 Return the string in UTF-8 encoding, or NULL if no string is seen.
709 Return the start position of the string in *pos. */
711 read_string (lex_pos_ty *pos)
714 static size_t bufmax;
715 static size_t buflen;
718 /* Skip whitespace before the string. */
721 while (is_whitespace (c));
724 /* No more string. */
731 /* Read a string enclosed in double-quotes. */
735 if (c == UEOF || c == '"')
742 if (c >= '0' && c <= '7')
748 n = n * 8 + (c - '0');
752 if (!(c >= '0' && c <= '7'))
760 else if (c == 'u' || c == 'U')
764 for (j = 0; j < 4; j++)
767 if (c >= '0' && c <= '9')
768 n = n * 16 + (c - '0');
769 else if (c >= 'A' && c <= 'F')
770 n = n * 16 + (c - 'A' + 10);
771 else if (c >= 'a' && c <= 'f')
772 n = n * 16 + (c - 'a' + 10);
784 case 'a': c = '\a'; break;
785 case 'b': c = '\b'; break;
786 case 't': c = '\t'; break;
787 case 'r': c = '\r'; break;
788 case 'n': c = '\n'; break;
789 case 'v': c = '\v'; break;
790 case 'f': c = '\f'; break;
793 if (buflen >= bufmax)
795 bufmax = 2 * bufmax + 10;
796 buffer = xrealloc (buffer, bufmax * sizeof (int));
798 buffer[buflen++] = c;
801 po_xerror (PO_SEVERITY_ERROR, NULL,
802 real_file_name, gram_pos.line_number, (size_t)(-1), false,
803 _("warning: unterminated string"));
807 /* Read a token outside quotes. */
809 po_xerror (PO_SEVERITY_ERROR, NULL,
810 real_file_name, gram_pos.line_number, (size_t)(-1), false,
811 _("warning: syntax error"));
812 for (; c != UEOF && !is_quotable (c); c = phase4_getc ())
814 if (buflen >= bufmax)
816 bufmax = 2 * bufmax + 10;
817 buffer = xrealloc (buffer, bufmax * sizeof (int));
819 buffer[buflen++] = c;
823 return conv_from_ucs4 (buffer, buflen);
827 /* Read a .strings file from a stream, and dispatch to the various
828 abstract_catalog_reader_class_ty methods. */
830 stringtable_parse (abstract_catalog_reader_ty *pop, FILE *file,
831 const char *real_filename, const char *logical_filename)
834 real_file_name = real_filename;
835 gram_pos.file_name = xstrdup (real_file_name);
836 gram_pos.line_number = 1;
837 encoding = enc_undetermined;
838 expect_fuzzy_msgstr_as_c_comment = false;
839 expect_fuzzy_msgstr_as_cxx_comment = false;
844 lex_pos_ty msgid_pos;
846 lex_pos_ty msgstr_pos;
849 /* Prepare for next msgid/msgstr pair. */
850 special_comment_reset ();
851 next_is_obsolete = false;
852 next_is_fuzzy = false;
855 /* Read the key and all the comments preceding it. */
856 msgid = read_string (&msgid_pos);
860 special_comment_finish ();
862 /* Skip whitespace. */
865 while (is_whitespace (c));
867 /* Expect a '=' or ';'. */
870 po_xerror (PO_SEVERITY_ERROR, NULL,
871 real_file_name, gram_pos.line_number, (size_t)(-1), false,
872 _("warning: unterminated key/value pair"));
877 /* "key"; is an abbreviation for "key"=""; and does not
878 necessarily designate an untranslated entry. */
879 msgstr = xstrdup ("");
880 msgstr_pos = msgid_pos;
881 po_callback_message (NULL, msgid, &msgid_pos, NULL,
882 msgstr, strlen (msgstr) + 1, &msgstr_pos,
884 false, next_is_obsolete);
888 /* Read the value. */
889 msgstr = read_string (&msgstr_pos);
892 po_xerror (PO_SEVERITY_ERROR, NULL,
893 real_file_name, gram_pos.line_number, (size_t)(-1),
894 false, _("warning: unterminated key/value pair"));
898 /* Skip whitespace. But for fuzzy key/value pairs, look for the
899 tentative msgstr in the form of a C style comment. */
900 expect_fuzzy_msgstr_as_c_comment = next_is_fuzzy;
904 if (fuzzy_msgstr != NULL)
905 expect_fuzzy_msgstr_as_c_comment = false;
907 while (is_whitespace (c));
908 expect_fuzzy_msgstr_as_c_comment = false;
913 /* But for fuzzy key/value pairs, look for the tentative msgstr
914 in the form of a C++ style comment. */
915 if (fuzzy_msgstr == NULL && next_is_fuzzy)
922 expect_fuzzy_msgstr_as_cxx_comment = true;
925 expect_fuzzy_msgstr_as_cxx_comment = false;
927 if (fuzzy_msgstr != NULL && strcmp (msgstr, msgid) == 0)
928 msgstr = fuzzy_msgstr;
930 /* A key/value pair. */
931 po_callback_message (NULL, msgid, &msgid_pos, NULL,
932 msgstr, strlen (msgstr) + 1, &msgstr_pos,
934 false, next_is_obsolete);
938 po_xerror (PO_SEVERITY_ERROR, NULL,
939 real_file_name, gram_pos.line_number, (size_t)(-1),
941 warning: syntax error, expected ';' after string"));
947 po_xerror (PO_SEVERITY_ERROR, NULL,
948 real_file_name, gram_pos.line_number, (size_t)(-1), false,
950 warning: syntax error, expected '=' or ';' after string"));
956 real_file_name = NULL;
957 gram_pos.line_number = 0;
960 const struct catalog_input_format input_format_stringtable =
962 stringtable_parse, /* parse */
963 true /* produces_utf8 */