1 /* Reading Java .properties files.
2 Copyright (C) 2003, 2005-2007, 2009, 2015 Free Software Foundation,
4 Written by Bruno Haible <bruno@clisp.org>, 2003.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
24 #include "read-properties.h"
34 #include "error-progname.h"
36 #include "read-catalog-abstract.h"
38 #include "xvasprintf.h"
39 #include "po-xerror.h"
40 #include "msgl-ascii.h"
44 #define _(str) gettext (str)
46 /* For compiling this file in C++ mode. */
52 /* The format of the Java .properties files is documented in the JDK
53 documentation for class java.util.Properties. In the case of .properties
54 files for PropertyResourceBundle, each non-comment line contains a
55 key/value pair in the form "key = value" or "key : value" or "key value",
56 where the key is the msgid and the value is the msgstr. Messages with
57 plurals are not supported in this format. */
59 /* Handling of comments: We copy all comments from the .properties file to
60 the PO file. This is not really needed; it's a service for translators
61 who don't like PO files and prefer to maintain the .properties file. */
63 /* Real filename, used in error messages about the input file. */
64 static const char *real_file_name;
66 /* File name and line number. */
67 extern lex_pos_ty gram_pos;
69 /* The input file stream. */
73 /* Phase 1: Read an ISO-8859-1 character.
74 Max. 1 pushback character. */
87 const char *errno_description = strerror (errno);
88 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
90 xasprintf (_("error while reading \"%s\""),
101 phase1_ungetc (int c)
108 /* Phase 2: Read an ISO-8859-1 character, treating CR/LF like a single LF.
109 Max. 2 pushback characters. */
111 static unsigned char phase2_pushback[2];
112 static int phase2_pushback_length;
119 if (phase2_pushback_length)
120 c = phase2_pushback[--phase2_pushback_length];
127 int c2 = phase1_getc ();
136 gram_pos.line_number++;
142 phase2_ungetc (int c)
145 --gram_pos.line_number;
147 phase2_pushback[phase2_pushback_length++] = c;
151 /* Phase 3: Read an ISO-8859-1 character, treating CR/LF like a single LF,
152 with handling of continuation lines.
153 Max. 1 pushback character. */
158 int c = phase2_getc ();
172 /* Skip the backslash-newline and all whitespace that follows it. */
175 while (c == ' ' || c == '\t' || c == '\r' || c == '\f');
180 phase3_ungetc (int c)
186 /* Phase 4: Read a UTF-16 code unit, treating CR/LF like a single LF,
187 with handling of continuation lines and of \uxxxx sequences. */
192 int c = phase3_getc ();
198 int c2 = phase3_getc ();
213 for (i = 0; i < 4; i++)
215 int c1 = phase3_getc ();
217 if (c1 >= '0' && c1 <= '9')
218 n = (n << 4) + (c1 - '0');
219 else if (c1 >= 'A' && c1 <= 'F')
220 n = (n << 4) + (c1 - 'A' + 10);
221 else if (c1 >= 'a' && c1 <= 'f')
222 n = (n << 4) + (c1 - 'a' + 10);
226 po_xerror (PO_SEVERITY_ERROR, NULL,
227 real_file_name, gram_pos.line_number, (size_t)(-1),
228 false, _("warning: invalid \\uxxxx syntax for Unicode character"));
242 /* Converts a string from ISO-8859-1 encoding to UTF-8 encoding. */
244 conv_from_iso_8859_1 (char *string)
246 if (is_ascii_string (string))
250 size_t length = strlen (string);
251 /* Each ISO-8859-1 character needs 2 bytes at worst. */
252 unsigned char *utf8_string = XNMALLOC (2 * length + 1, unsigned char);
253 unsigned char *q = utf8_string;
254 const char *str = string;
255 const char *str_limit = str + length;
257 while (str < str_limit)
259 unsigned int uc = (unsigned char) *str++;
260 int n = u8_uctomb (q, uc, 6);
265 assert (q - utf8_string <= 2 * length);
267 return (char *) utf8_string;
272 /* Converts a string from JAVA encoding (with \uxxxx sequences) to UTF-8
273 encoding. May destructively modify the argument string. */
275 conv_from_java (char *string)
277 /* This conversion can only shrink the string, never increase its size.
278 So there is no need to xmalloc the result freshly. */
279 const char *p = string;
280 unsigned char *q = (unsigned char *) string;
284 if (p[0] == '\\' && p[1] == 'u')
289 for (i = 0; i < 4; i++)
291 int c1 = (unsigned char) p[2 + i];
293 if (c1 >= '0' && c1 <= '9')
294 n = (n << 4) + (c1 - '0');
295 else if (c1 >= 'A' && c1 <= 'F')
296 n = (n << 4) + (c1 - 'A' + 10);
297 else if (c1 >= 'a' && c1 <= 'f')
298 n = (n << 4) + (c1 - 'a' + 10);
307 if (n >= 0xd800 && n < 0xdc00)
309 if (p[6] == '\\' && p[7] == 'u')
313 for (i = 0; i < 4; i++)
315 int c1 = (unsigned char) p[8 + i];
317 if (c1 >= '0' && c1 <= '9')
318 m = (m << 4) + (c1 - '0');
319 else if (c1 >= 'A' && c1 <= 'F')
320 m = (m << 4) + (c1 - 'A' + 10);
321 else if (c1 >= 'a' && c1 <= 'f')
322 m = (m << 4) + (c1 - 'a' + 10);
327 if (i == 4 && (m >= 0xdc00 && m < 0xe000))
329 /* Combine two UTF-16 words to a character. */
330 uc = 0x10000 + ((n - 0xd800) << 10) + (m - 0xdc00);
345 q += u8_uctomb (q, uc, 6);
350 *q++ = (unsigned char) *p++;
357 /* Reads a key or value string.
358 Returns the string in UTF-8 encoding, or NULL if the end of the logical line is reached. Parsing ends:
361 - when returning NULL, after the end of the logical line,
362 - otherwise, if in_key is true, after the whitespace and possibly the
363 separator that follows after the string,
364 - otherwise, if in_key is false, after the end of the logical line. */
367 read_escaped_string (bool in_key)
369 static unsigned short *buffer;
370 static size_t bufmax;
371 static size_t buflen;
374 /* Skip whitespace before the string. */
377 while (c == ' ' || c == '\t' || c == '\r' || c == '\f');
379 if (c == EOF || c == '\n')
383 /* Start accumulating the string. We store the string in UTF-16 before
384 converting it to UTF-8. Why not convert every character directly to
385 UTF-8? Because a string can contain surrogates like \uD800\uDF00, and
386 we must combine them to a single UTF-8 character. */
390 if (in_key && (c == '=' || c == ':'
391 || c == ' ' || c == '\t' || c == '\r' || c == '\f'))
393 /* Skip whitespace after the string. */
394 while (c == ' ' || c == '\t' || c == '\r' || c == '\f')
396 /* Skip '=' or ':' separator. */
397 if (!(c == '=' || c == ':'))
404 /* Read the next UTF-16 code unit. */
408 /* Append it to the buffer. */
409 if (buflen >= bufmax)
412 buffer = xrealloc (buffer, bufmax * sizeof (unsigned short));
414 buffer[buflen++] = c;
417 if (c == EOF || c == '\n')
425 /* Now convert from UTF-16 to UTF-8. */
428 unsigned char *utf8_string;
431 /* Each UTF-16 word needs 3 bytes at worst. */
432 utf8_string = XNMALLOC (3 * buflen + 1, unsigned char);
433 for (pos = 0, q = utf8_string; pos < buflen; )
438 pos += u16_mbtouc (&uc, buffer + pos, buflen - pos);
439 n = u8_uctomb (q, uc, 6);
444 assert (q - utf8_string <= 3 * buflen);
446 return (char *) utf8_string;
451 /* Read a .properties file from a stream, and dispatch to the various
452 abstract_catalog_reader_class_ty methods. */
454 properties_parse (abstract_catalog_reader_ty *this, FILE *file,
455 const char *real_filename, const char *logical_filename)
458 real_file_name = real_filename;
459 gram_pos.file_name = xstrdup (real_file_name);
460 gram_pos.line_number = 1;
479 /* For compatibility with write-properties.c, we treat '!' not
480 followed by space as a fuzzy or untranslated message. */
481 int c2 = phase2_getc ();
482 if (c2 == ' ' || c2 == '\n' || c2 == EOF)
493 /* A comment line. */
495 static size_t bufmax;
496 static size_t buflen;
503 if (buflen >= bufmax)
506 buffer = xrealloc (buffer, bufmax);
509 if (c == EOF || c == '\n')
512 buffer[buflen++] = c;
514 buffer[buflen] = '\0';
516 po_callback_comment_dispatcher (conv_from_java (conv_from_iso_8859_1 (buffer)));
520 /* A key/value pair. */
522 lex_pos_ty msgid_pos;
524 msgid_pos = gram_pos;
525 msgid = read_escaped_string (true);
527 /* Skip blank line. */
532 lex_pos_ty msgstr_pos;
535 msgstr_pos = gram_pos;
536 msgstr = read_escaped_string (false);
538 msgstr = xstrdup ("");
540 /* Be sure to make the message fuzzy if it was commented out
541 and if it is not already header/fuzzy/untranslated. */
542 force_fuzzy = (hidden && msgid[0] != '\0' && msgstr[0] != '\0');
544 po_callback_message (NULL, msgid, &msgid_pos, NULL,
545 msgstr, strlen (msgstr) + 1, &msgstr_pos,
553 real_file_name = NULL;
554 gram_pos.line_number = 0;
557 const struct catalog_input_format input_format_properties =
559 properties_parse, /* parse */
560 true /* produces_utf8 */