1 /* Reading Java .properties files.
2 Copyright (C) 2003, 2005-2007, 2009 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2003.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
23 #include "read-properties.h"
33 #include "error-progname.h"
35 #include "read-catalog-abstract.h"
37 #include "xvasprintf.h"
38 #include "po-xerror.h"
39 #include "msgl-ascii.h"
43 #define _(str) gettext (str)
45 /* For compiling this file in C++ mode. */
51 /* The format of the Java .properties files is documented in the JDK
52 documentation for class java.util.Properties. In the case of .properties
53 files for PropertyResourceBundle, each non-comment line contains a
54 key/value pair in the form "key = value" or "key : value" or "key value",
55 where the key is the msgid and the value is the msgstr. Messages with
56 plurals are not supported in this format. */
58 /* Handling of comments: We copy all comments from the .properties file to
59 the PO file. This is not really needed; it's a service for translators
60 who don't like PO files and prefer to maintain the .properties file. */
62 /* Real filename, used in error messages about the input file. */
63 static const char *real_file_name;
65 /* File name and line number. */
66 extern lex_pos_ty gram_pos;
68 /* The input file stream. */
72 /* Phase 1: Read an ISO-8859-1 character.
73 Max. 1 pushback character. */
86 const char *errno_description = strerror (errno);
87 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
89 xasprintf (_("error while reading \"%s\""),
100 phase1_ungetc (int c)
107 /* Phase 2: Read an ISO-8859-1 character, treating CR/LF like a single LF.
108 Max. 2 pushback characters. */
110 static unsigned char phase2_pushback[2];
111 static int phase2_pushback_length;
118 if (phase2_pushback_length)
119 c = phase2_pushback[--phase2_pushback_length];
126 int c2 = phase1_getc ();
135 gram_pos.line_number++;
141 phase2_ungetc (int c)
144 --gram_pos.line_number;
146 phase2_pushback[phase2_pushback_length++] = c;
150 /* Phase 3: Read an ISO-8859-1 character, treating CR/LF like a single LF,
151 with handling of continuation lines.
152 Max. 1 pushback character. */
157 int c = phase2_getc ();
171 /* Skip the backslash-newline and all whitespace that follows it. */
174 while (c == ' ' || c == '\t' || c == '\r' || c == '\f');
179 phase3_ungetc (int c)
185 /* Phase 4: Read a UTF-16 code unit, treating CR/LF like a single LF,
186 with handling of continuation lines and of \uxxxx sequences. */
191 int c = phase3_getc ();
197 int c2 = phase3_getc ();
212 for (i = 0; i < 4; i++)
214 int c1 = phase3_getc ();
216 if (c1 >= '0' && c1 <= '9')
217 n = (n << 4) + (c1 - '0');
218 else if (c1 >= 'A' && c1 <= 'F')
219 n = (n << 4) + (c1 - 'A' + 10);
220 else if (c1 >= 'a' && c1 <= 'f')
221 n = (n << 4) + (c1 - 'a' + 10);
225 po_xerror (PO_SEVERITY_ERROR, NULL,
226 real_file_name, gram_pos.line_number, (size_t)(-1),
227 false, _("warning: invalid \\uxxxx syntax for Unicode character"));
241 /* Converts a string from ISO-8859-1 encoding to UTF-8 encoding. */
243 conv_from_iso_8859_1 (char *string)
245 if (is_ascii_string (string))
249 size_t length = strlen (string);
250 /* Each ISO-8859-1 character needs 2 bytes at worst. */
251 unsigned char *utf8_string = XNMALLOC (2 * length + 1, unsigned char);
252 unsigned char *q = utf8_string;
253 const char *str = string;
254 const char *str_limit = str + length;
256 while (str < str_limit)
258 unsigned int uc = (unsigned char) *str++;
259 int n = u8_uctomb (q, uc, 6);
264 assert (q - utf8_string <= 2 * length);
266 return (char *) utf8_string;
271 /* Converts a string from JAVA encoding (with \uxxxx sequences) to UTF-8
272 encoding. May destructively modify the argument string. */
274 conv_from_java (char *string)
276 /* This conversion can only shrink the string, never increase its size.
277 So there is no need to xmalloc the result freshly. */
278 const char *p = string;
279 unsigned char *q = (unsigned char *) string;
283 if (p[0] == '\\' && p[1] == 'u')
288 for (i = 0; i < 4; i++)
290 int c1 = (unsigned char) p[2 + i];
292 if (c1 >= '0' && c1 <= '9')
293 n = (n << 4) + (c1 - '0');
294 else if (c1 >= 'A' && c1 <= 'F')
295 n = (n << 4) + (c1 - 'A' + 10);
296 else if (c1 >= 'a' && c1 <= 'f')
297 n = (n << 4) + (c1 - 'a' + 10);
306 if (n >= 0xd800 && n < 0xdc00)
308 if (p[6] == '\\' && p[7] == 'u')
312 for (i = 0; i < 4; i++)
314 int c1 = (unsigned char) p[8 + i];
316 if (c1 >= '0' && c1 <= '9')
317 m = (m << 4) + (c1 - '0');
318 else if (c1 >= 'A' && c1 <= 'F')
319 m = (m << 4) + (c1 - 'A' + 10);
320 else if (c1 >= 'a' && c1 <= 'f')
321 m = (m << 4) + (c1 - 'a' + 10);
326 if (i == 4 && (m >= 0xdc00 && m < 0xe000))
328 /* Combine the two UTF-16 surrogates into a single character. */
329 uc = 0x10000 + ((n - 0xd800) << 10) + (m - 0xdc00);
344 q += u8_uctomb (q, uc, 6);
349 *q++ = (unsigned char) *p++;
356 /* Reads a key or value string.
357 Returns the string in UTF-8 encoding, or NULL if the end of the logical
360 - when returning NULL, after the end of the logical line,
361 - otherwise, if in_key is true, after the whitespace and possibly the
362 separator that follows after the string,
363 - otherwise, if in_key is false, after the end of the logical line. */
366 read_escaped_string (bool in_key)
368 static unsigned short *buffer;
369 static size_t bufmax;
370 static size_t buflen;
373 /* Skip whitespace before the string. */
376 while (c == ' ' || c == '\t' || c == '\r' || c == '\f');
378 if (c == EOF || c == '\n')
382 /* Start accumulating the string. We store the string in UTF-16 before
383 converting it to UTF-8. Why not convert every character directly to
384 UTF-8? Because a string can contain surrogates like \uD800\uDF00, and
385 we must combine them into a single UTF-8 character. */
389 if (in_key && (c == '=' || c == ':'
390 || c == ' ' || c == '\t' || c == '\r' || c == '\f'))
392 /* Skip whitespace after the string. */
393 while (c == ' ' || c == '\t' || c == '\r' || c == '\f')
395 /* Skip '=' or ':' separator. */
396 if (!(c == '=' || c == ':'))
403 /* Read the next UTF-16 code unit. */
407 /* Append it to the buffer. */
408 if (buflen >= bufmax)
411 buffer = xrealloc (buffer, bufmax * sizeof (unsigned short));
413 buffer[buflen++] = c;
416 if (c == EOF || c == '\n')
424 /* Now convert from UTF-16 to UTF-8. */
427 unsigned char *utf8_string;
430 /* Each UTF-16 word needs 3 bytes at worst. */
431 utf8_string = XNMALLOC (3 * buflen + 1, unsigned char);
432 for (pos = 0, q = utf8_string; pos < buflen; )
437 pos += u16_mbtouc (&uc, buffer + pos, buflen - pos);
438 n = u8_uctomb (q, uc, 6);
443 assert (q - utf8_string <= 3 * buflen);
445 return (char *) utf8_string;
450 /* Read a .properties file from a stream, and dispatch to the various
451 abstract_catalog_reader_class_ty methods. */
453 properties_parse (abstract_catalog_reader_ty *this, FILE *file,
454 const char *real_filename, const char *logical_filename)
457 real_file_name = real_filename;
458 gram_pos.file_name = xstrdup (real_file_name);
459 gram_pos.line_number = 1;
478 /* For compatibility with write-properties.c, we treat '!' not
479 followed by space as a fuzzy or untranslated message. */
480 int c2 = phase2_getc ();
481 if (c2 == ' ' || c2 == '\n' || c2 == EOF)
492 /* A comment line. */
494 static size_t bufmax;
495 static size_t buflen;
502 if (buflen >= bufmax)
505 buffer = xrealloc (buffer, bufmax);
508 if (c == EOF || c == '\n')
511 buffer[buflen++] = c;
513 buffer[buflen] = '\0';
515 po_callback_comment_dispatcher (conv_from_java (conv_from_iso_8859_1 (buffer)));
519 /* A key/value pair. */
521 lex_pos_ty msgid_pos;
523 msgid_pos = gram_pos;
524 msgid = read_escaped_string (true);
526 /* Skip blank line. */
531 lex_pos_ty msgstr_pos;
534 msgstr_pos = gram_pos;
535 msgstr = read_escaped_string (false);
537 msgstr = xstrdup ("");
539 /* Be sure to make the message fuzzy if it was commented out
540 and if it is not already header/fuzzy/untranslated. */
541 force_fuzzy = (hidden && msgid[0] != '\0' && msgstr[0] != '\0');
543 po_callback_message (NULL, msgid, &msgid_pos, NULL,
544 msgstr, strlen (msgstr) + 1, &msgstr_pos,
552 real_file_name = NULL;
553 gram_pos.line_number = 0;
556 const struct catalog_input_format input_format_properties =
558 properties_parse, /* parse */
559 true /* produces_utf8 */