1 /* Reading NeXTstep/GNUstep .strings files.
2 Copyright (C) 2003, 2005-2007, 2009 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2003.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
23 #include "read-stringtable.h"
33 #include "error-progname.h"
34 #include "read-catalog-abstract.h"
36 #include "xvasprintf.h"
37 #include "po-xerror.h"
41 #define _(str) gettext (str)
43 /* The format of NeXTstep/GNUstep .strings files is documented in
44 gnustep-base-1.8.0/Tools/make_strings/Using.txt
45 and in the comments of method propertyListFromStringsFileFormat in
46 gnustep-base-1.8.0/Source/NSString.m
47 In summary, it's an Objective-C-like file with pseudo-assignments of the form
49 where the key is the msgid and the value is the msgstr.
51 The implementation of the parser of .strings files is in
52 gnustep-base-1.8.0/Source/NSString.m
53 function GSPropertyListFromStringsFormat
54 (indirectly called from NSBundle's method localizedStringForKey).
57 gnustep-base-1.8.0/Testing/English.lproj/NXStringTable.example
60 /* Handling of comments: We copy all comments from the .strings file to
61 the PO file. This is not really needed; it's a service for translators
62 who don't like PO files and prefer to maintain the .strings file. */
65 /* Real filename, used in error messages about the input file. */
66 static const char *real_file_name;
68 /* File name and line number. */
69 extern lex_pos_ty gram_pos;
71 /* The input file stream. */
75 /* Phase 1: Read a byte.
76 Max. 4 pushback characters. */
78 static unsigned char phase1_pushback[4];
79 static int phase1_pushback_length;
86 if (phase1_pushback_length)
87 return phase1_pushback[--phase1_pushback_length];
95 const char *errno_description = strerror (errno);
96 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
98 xasprintf (_("error while reading \"%s\""),
109 phase1_ungetc (int c)
112 phase1_pushback[phase1_pushback_length++] = c;
116 /* Phase 2: Read a UCS-4 character.
117 Max. 2 pushback characters. */
119 /* End-of-file indicator for functions returning a UCS-4 character. */
122 static int phase2_pushback[4];
123 static int phase2_pushback_length;
125 /* The input file can be in Unicode encoding (UCS-2BE, UCS-2LE, UTF-8, each
126 with a BOM!), or otherwise the locale-dependent default encoding is used.
127 Since we don't want to depend on the locale here, we use ISO-8859-1
137 static enum enc encoding;
142 if (phase2_pushback_length)
143 return phase2_pushback[--phase2_pushback_length];
145 if (encoding == enc_undetermined)
147 /* Determine the input file's encoding. */
157 encoding = enc_iso8859_1;
159 else if (c0 == 0xfe && c1 == 0xff)
160 encoding = enc_ucs2be;
161 else if (c0 == 0xff && c1 == 0xfe)
162 encoding = enc_ucs2le;
172 encoding = enc_iso8859_1;
174 else if (c0 == 0xef && c1 == 0xbb && c2 == 0xbf)
181 encoding = enc_iso8859_1;
189 /* Read an UCS-2BE encoded character. */
199 return (c0 << 8) + c1;
203 /* Read an UCS-2LE encoded character. */
213 return c0 + (c1 << 8);
217 /* Read an UTF-8 encoded character. */
219 unsigned char buf[6];
239 && ((buf[1] ^ 0x80) < 0x40))
248 && ((buf[2] ^ 0x80) < 0x40))
257 && ((buf[3] ^ 0x80) < 0x40))
266 && ((buf[4] ^ 0x80) < 0x40))
279 u8_mbtouc (&uc, buf, count);
284 /* Read an ISO-8859-1 encoded character. */
286 int c = phase1_getc ();
299 phase2_ungetc (int c)
302 phase2_pushback[phase2_pushback_length++] = c;
306 /* Phase 3: Read a UCS-4 character, with line number handling. */
311 int c = phase2_getc ();
314 gram_pos.line_number++;
320 phase3_ungetc (int c)
323 --gram_pos.line_number;
328 /* Convert from UCS-4 to UTF-8. */
330 conv_from_ucs4 (const int *buffer, size_t buflen)
332 unsigned char *utf8_string;
336 /* Each UCS-4 word needs 6 bytes at worst. */
337 utf8_string = XNMALLOC (6 * buflen + 1, unsigned char);
339 for (pos = 0, q = utf8_string; pos < buflen; )
345 n = u8_uctomb (q, uc, 6);
350 assert (q - utf8_string <= 6 * buflen);
352 return (char *) utf8_string;
356 /* Parse a string enclosed in double-quotes. Input is UCS-4 encoded.
357 Return the string in UTF-8 encoding, or NULL if the input doesn't represent
358 a valid string enclosed in double-quotes. */
360 parse_escaped_string (const int *string, size_t length)
363 static size_t bufmax;
364 static size_t buflen;
365 const int *string_limit = string + length;
368 if (string == string_limit)
376 if (string == string_limit)
383 if (string == string_limit)
386 if (c >= '0' && c <= '7')
392 n = n * 8 + (c - '0');
395 if (string == string_limit)
398 if (!(c >= '0' && c <= '7'))
404 else if (c == 'u' || c == 'U')
408 for (j = 0; j < 4; j++)
410 if (string == string_limit)
413 if (c >= '0' && c <= '9')
414 n = n * 16 + (c - '0');
415 else if (c >= 'A' && c <= 'F')
416 n = n * 16 + (c - 'A' + 10);
417 else if (c >= 'a' && c <= 'f')
418 n = n * 16 + (c - 'a' + 10);
428 case 'a': c = '\a'; break;
429 case 'b': c = '\b'; break;
430 case 't': c = '\t'; break;
431 case 'r': c = '\r'; break;
432 case 'n': c = '\n'; break;
433 case 'v': c = '\v'; break;
434 case 'f': c = '\f'; break;
437 if (buflen >= bufmax)
439 bufmax = 2 * bufmax + 10;
440 buffer = xrealloc (buffer, bufmax * sizeof (int));
442 buffer[buflen++] = c;
445 return conv_from_ucs4 (buffer, buflen);
449 /* Accumulating flag comments. */
451 static char *special_comment;
454 special_comment_reset ()
456 if (special_comment != NULL)
457 free (special_comment);
458 special_comment = NULL;
462 special_comment_add (const char *flag)
464 if (special_comment == NULL)
465 special_comment = xstrdup (flag);
468 size_t total_len = strlen (special_comment) + 2 + strlen (flag) + 1;
469 special_comment = xrealloc (special_comment, total_len);
470 strcat (special_comment, ", ");
471 strcat (special_comment, flag);
476 special_comment_finish ()
478 if (special_comment != NULL)
480 po_callback_comment_special (special_comment);
481 free (special_comment);
482 special_comment = NULL;
487 /* Accumulating comments. */
/* Capacity, in elements, of the buffer in which the current comment line
   is accumulated (the buffer's own declaration is not visible in this
   excerpt -- presumably it directly precedes this line).  */
490 static size_t bufmax;
/* Number of elements currently stored in that buffer.  */
491 static size_t buflen;
/* Set when a "Flag: unmatched" comment line is seen.  Cleared before each
   msgid/msgstr pair and passed on to po_callback_message.  */
492 static bool next_is_obsolete;
/* Set when a "Flag: untranslated" comment line is seen.  Cleared before each
   msgid/msgstr pair.  While set, the parser looks for a tentative msgstr in
   a comment that follows the value.  */
493 static bool next_is_fuzzy;
/* Tentative msgstr taken from a comment, or NULL.  When it is non-NULL and
   the value that was read equals the msgid, it is used as the msgstr.  */
494 static char *fuzzy_msgstr;
/* While true, the end of a C style comment may be tested for the
   fuzzy-msgstr form (forwarded to comment_line_end, possibly combined with
   further conditions not visible here).  */
495 static bool expect_fuzzy_msgstr_as_c_comment;
/* While true, the end of a C++ style comment is tested for the
   fuzzy-msgstr form (forwarded to comment_line_end).  */
496 static bool expect_fuzzy_msgstr_as_cxx_comment;
507 if (buflen >= bufmax)
509 bufmax = 2 * bufmax + 10;
510 buffer = xrealloc (buffer, bufmax * sizeof (int));
512 buffer[buflen++] = c;
516 comment_line_end (size_t chars_to_remove, bool test_for_fuzzy_msgstr)
520 buflen -= chars_to_remove;
521 /* Drop trailing white space, but not EOLs. */
523 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
526 /* At special positions we interpret a comment of the form
528 with an optional trailing semicolon as being the fuzzy msgstr, not a
530 if (test_for_fuzzy_msgstr
531 && buflen > 2 && buffer[0] == '=' && buffer[1] == ' '
533 parse_escaped_string (buffer + 2,
534 buflen - (buffer[buflen - 1] == ';') - 2)))
537 line = conv_from_ucs4 (buffer, buflen);
539 if (strcmp (line, "Flag: untranslated") == 0)
541 special_comment_add ("fuzzy");
542 next_is_fuzzy = true;
544 else if (strcmp (line, "Flag: unmatched") == 0)
545 next_is_obsolete = true;
546 else if (strlen (line) >= 6 && memcmp (line, "Flag: ", 6) == 0)
547 special_comment_add (line + 6);
548 else if (strlen (line) >= 9 && memcmp (line, "Comment: ", 9) == 0)
549 /* A comment extracted from the source. */
550 po_callback_comment_dot (line + 9);
554 unsigned long number;
557 if (strlen (line) >= 6 && memcmp (line, "File: ", 6) == 0
558 && (last_colon = strrchr (line + 6, ':')) != NULL
559 && *(last_colon + 1) != '\0'
560 && (number = strtoul (last_colon + 1, &endp, 10), *endp == '\0'))
562 /* A "File: <filename>:<number>" type comment. */
564 po_callback_comment_filepos (line + 6, number);
567 po_callback_comment (line);
572 /* Phase 4: Replace each comment that is not inside a string with a space
591 /* C style comment. */
594 size_t trailing_stars;
598 last_was_star = false;
600 seen_newline = false;
601 /* Drop additional stars at the beginning of the comment. */
607 last_was_star = true;
615 /* We skip all leading white space, but not EOLs. */
616 if (!(buflen == 0 && (c == ' ' || c == '\t')))
622 comment_line_end (1, false);
624 last_was_star = false;
629 last_was_star = true;
636 /* Drop additional stars at the end of the comment. */
637 comment_line_end (trailing_stars + 1,
638 expect_fuzzy_msgstr_as_c_comment
645 last_was_star = false;
655 /* C++ style comment. */
660 if (c == '\n' || c == UEOF)
662 /* We skip all leading white space, but not EOLs. */
663 if (!(buflen == 0 && (c == ' ' || c == '\t')))
666 comment_line_end (0, expect_fuzzy_msgstr_as_cxx_comment);
672 phase4_ungetc (int c)
678 /* Return true if a character is considered as whitespace. */
680 is_whitespace (int c)
682 return (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f'
686 /* Return true if a character needs quoting, i.e. cannot be used in unquoted
691 if ((c >= '0' && c <= '9')
692 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
696 case '!': case '#': case '$': case '%': case '&': case '*':
697 case '+': case '-': case '.': case '/': case ':': case '?':
698 case '@': case '|': case '~': case '_': case '^':
706 /* Read a key or value string.
707 Return the string in UTF-8 encoding, or NULL if no string is seen.
708 Return the start position of the string in *pos. */
710 read_string (lex_pos_ty *pos)
713 static size_t bufmax;
714 static size_t buflen;
717 /* Skip whitespace before the string. */
720 while (is_whitespace (c));
723 /* No more string. */
730 /* Read a string enclosed in double-quotes. */
734 if (c == UEOF || c == '"')
741 if (c >= '0' && c <= '7')
747 n = n * 8 + (c - '0');
751 if (!(c >= '0' && c <= '7'))
759 else if (c == 'u' || c == 'U')
763 for (j = 0; j < 4; j++)
766 if (c >= '0' && c <= '9')
767 n = n * 16 + (c - '0');
768 else if (c >= 'A' && c <= 'F')
769 n = n * 16 + (c - 'A' + 10);
770 else if (c >= 'a' && c <= 'f')
771 n = n * 16 + (c - 'a' + 10);
783 case 'a': c = '\a'; break;
784 case 'b': c = '\b'; break;
785 case 't': c = '\t'; break;
786 case 'r': c = '\r'; break;
787 case 'n': c = '\n'; break;
788 case 'v': c = '\v'; break;
789 case 'f': c = '\f'; break;
792 if (buflen >= bufmax)
794 bufmax = 2 * bufmax + 10;
795 buffer = xrealloc (buffer, bufmax * sizeof (int));
797 buffer[buflen++] = c;
800 po_xerror (PO_SEVERITY_ERROR, NULL,
801 real_file_name, gram_pos.line_number, (size_t)(-1), false,
802 _("warning: unterminated string"));
806 /* Read a token outside quotes. */
808 po_xerror (PO_SEVERITY_ERROR, NULL,
809 real_file_name, gram_pos.line_number, (size_t)(-1), false,
810 _("warning: syntax error"));
811 for (; c != UEOF && !is_quotable (c); c = phase4_getc ())
813 if (buflen >= bufmax)
815 bufmax = 2 * bufmax + 10;
816 buffer = xrealloc (buffer, bufmax * sizeof (int));
818 buffer[buflen++] = c;
822 return conv_from_ucs4 (buffer, buflen);
826 /* Read a .strings file from a stream, and dispatch to the various
827 abstract_catalog_reader_class_ty methods. */
829 stringtable_parse (abstract_catalog_reader_ty *pop, FILE *file,
830 const char *real_filename, const char *logical_filename)
833 real_file_name = real_filename;
834 gram_pos.file_name = xstrdup (real_file_name);
835 gram_pos.line_number = 1;
836 encoding = enc_undetermined;
837 expect_fuzzy_msgstr_as_c_comment = false;
838 expect_fuzzy_msgstr_as_cxx_comment = false;
843 lex_pos_ty msgid_pos;
845 lex_pos_ty msgstr_pos;
848 /* Prepare for next msgid/msgstr pair. */
849 special_comment_reset ();
850 next_is_obsolete = false;
851 next_is_fuzzy = false;
854 /* Read the key and all the comments preceding it. */
855 msgid = read_string (&msgid_pos);
859 special_comment_finish ();
861 /* Skip whitespace. */
864 while (is_whitespace (c));
866 /* Expect a '=' or ';'. */
869 po_xerror (PO_SEVERITY_ERROR, NULL,
870 real_file_name, gram_pos.line_number, (size_t)(-1), false,
871 _("warning: unterminated key/value pair"));
876 /* "key"; is an abbreviation for "key"=""; and does not
877 necessarily designate an untranslated entry. */
878 msgstr = xstrdup ("");
879 msgstr_pos = msgid_pos;
880 po_callback_message (NULL, msgid, &msgid_pos, NULL,
881 msgstr, strlen (msgstr) + 1, &msgstr_pos,
883 false, next_is_obsolete);
887 /* Read the value. */
888 msgstr = read_string (&msgstr_pos);
891 po_xerror (PO_SEVERITY_ERROR, NULL,
892 real_file_name, gram_pos.line_number, (size_t)(-1),
893 false, _("warning: unterminated key/value pair"));
897 /* Skip whitespace. But for fuzzy key/value pairs, look for the
898 tentative msgstr in the form of a C style comment. */
899 expect_fuzzy_msgstr_as_c_comment = next_is_fuzzy;
903 if (fuzzy_msgstr != NULL)
904 expect_fuzzy_msgstr_as_c_comment = false;
906 while (is_whitespace (c));
907 expect_fuzzy_msgstr_as_c_comment = false;
912 /* But for fuzzy key/value pairs, look for the tentative msgstr
913 in the form of a C++ style comment. */
914 if (fuzzy_msgstr == NULL && next_is_fuzzy)
921 expect_fuzzy_msgstr_as_cxx_comment = true;
924 expect_fuzzy_msgstr_as_cxx_comment = false;
926 if (fuzzy_msgstr != NULL && strcmp (msgstr, msgid) == 0)
927 msgstr = fuzzy_msgstr;
929 /* A key/value pair. */
930 po_callback_message (NULL, msgid, &msgid_pos, NULL,
931 msgstr, strlen (msgstr) + 1, &msgstr_pos,
933 false, next_is_obsolete);
937 po_xerror (PO_SEVERITY_ERROR, NULL,
938 real_file_name, gram_pos.line_number, (size_t)(-1),
940 warning: syntax error, expected ';' after string"));
946 po_xerror (PO_SEVERITY_ERROR, NULL,
947 real_file_name, gram_pos.line_number, (size_t)(-1), false,
949 warning: syntax error, expected '=' or ';' after string"));
955 real_file_name = NULL;
956 gram_pos.line_number = 0;
959 const struct catalog_input_format input_format_stringtable =
961 stringtable_parse, /* parse */
962 true /* produces_utf8 */