gettext-tools/src/read-properties.c

   1 /* Reading Java .properties files.
   2    Copyright (C) 2003, 2005-2007, 2009 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2003.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  17
  18 #ifdef HAVE_CONFIG_H
  19 # include <config.h>
  20 #endif
  21
  22 /* Specification.  */
  23 #include "read-properties.h"
  24
  25 #include <assert.h>
  26 #include <errno.h>
  27 #include <stdbool.h>
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30 #include <string.h>
  31
  32 #include "error.h"
  33 #include "error-progname.h"
  34 #include "message.h"
  35 #include "read-catalog-abstract.h"
  36 #include "xalloc.h"
  37 #include "xvasprintf.h"
  38 #include "po-xerror.h"
  39 #include "msgl-ascii.h"
  40 #include "unistr.h"
  41 #include "gettext.h"
  42
  43 #define _(str) gettext (str)
  44
  45 /* For compiling this file in C++ mode.  */
  46 #ifdef __cplusplus
  47 # define this thiss
  48 #endif
  49
  50
  51 /* The format of the Java .properties files is documented in the JDK
  52    documentation for class java.util.Properties.  In the case of .properties
  53    files for PropertyResourceBundle, each non-comment line contains a
  54    key/value pair in the form "key = value" or "key : value" or "key value",
  55    where the key is the msgid and the value is the msgstr.  Messages with
  56    plurals are not supported in this format.  */
  57
  58 /* Handling of comments: We copy all comments from the .properties file to
  59    the PO file. This is not really needed; it's a service for translators
  60    who don't like PO files and prefer to maintain the .properties file.  */
  61
  62 /* Real filename, used in error messages about the input file.  */
  63 static const char *real_file_name;
  64
  65 /* File name and line number.  */
  66 extern lex_pos_ty gram_pos;
  67
  68 /* The input file stream.  */
  69 static FILE *fp;
  70
  71
  72 /* Phase 1: Read an ISO-8859-1 character.
  73    Max. 1 pushback character.  */
  74
  75 static int
  76 phase1_getc ()
  77 {
  78   int c;
  79
  80   c = getc (fp);
  81
  82   if (c == EOF)
  83     {
  84       if (ferror (fp))
  85         {
  86           const char *errno_description = strerror (errno);
  87           po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
  88                      xasprintf ("%s: %s",
  89                                 xasprintf (_("error while reading \"%s\""),
  90                                            real_file_name),
  91                                 errno_description));
  92         }
  93       return EOF;
  94     }
  95
  96   return c;
  97 }
  98
  99 static inline void
 100 phase1_ungetc (int c)
 101 {
 102   if (c != EOF)
 103     ungetc (c, fp);
 104 }
 105
 106
 107 /* Phase 2: Read an ISO-8859-1 character, treating CR/LF like a single LF.
 108    Max. 2 pushback characters.  */
 109
 110 static unsigned char phase2_pushback[2];
 111 static int phase2_pushback_length;
 112
 113 static int
 114 phase2_getc ()
 115 {
 116   int c;
 117
 118   if (phase2_pushback_length)
 119     c = phase2_pushback[--phase2_pushback_length];
 120   else
 121     {
 122       c = phase1_getc ();
 123
 124       if (c == '\r')
 125         {
 126           int c2 = phase1_getc ();
 127           if (c2 == '\n')
 128             c = c2;
 129           else
 130             phase1_ungetc (c2);
 131         }
 132     }
 133
 134   if (c == '\n')
 135     gram_pos.line_number++;
 136
 137   return c;
 138 }
 139
 140 static void
 141 phase2_ungetc (int c)
 142 {
 143   if (c == '\n')
 144     --gram_pos.line_number;
 145   if (c != EOF)
 146     phase2_pushback[phase2_pushback_length++] = c;
 147 }
 148
 149
 150 /* Phase 3: Read an ISO-8859-1 character, treating CR/LF like a single LF,
 151    with handling of continuation lines.
 152    Max. 1 pushback character.  */
 153
 154 static int
 155 phase3_getc ()
 156 {
 157   int c = phase2_getc ();
 158
 159   for (;;)
 160     {
 161       if (c != '\\')
 162         return c;
 163
 164       c = phase2_getc ();
 165       if (c != '\n')
 166         {
 167           phase2_ungetc (c);
 168           return '\\';
 169         }
 170
 171       /* Skip the backslash-newline and all whitespace that follows it.  */
 172       do
 173         c = phase2_getc ();
 174       while (c == ' ' || c == '\t' || c == '\r' || c == '\f');
 175     }
 176 }
 177
 178 static inline void
 179 phase3_ungetc (int c)
 180 {
 181   phase2_ungetc (c);
 182 }
 183
 184
 185 /* Phase 4: Read an UTF-16 codepoint, treating CR/LF like a single LF,
 186    with handling of continuation lines and of \uxxxx sequences.  */
 187
 188 static int
 189 phase4_getuc ()
 190 {
 191   int c = phase3_getc ();
 192
 193   if (c == EOF)
 194     return -1;
 195   if (c == '\\')
 196     {
 197       int c2 = phase3_getc ();
 198
 199       if (c2 == 't')
 200         return '\t';
 201       if (c2 == 'n')
 202         return '\n';
 203       if (c2 == 'r')
 204         return '\r';
 205       if (c2 == 'f')
 206         return '\f';
 207       if (c2 == 'u')
 208         {
 209           unsigned int n = 0;
 210           int i;
 211
 212           for (i = 0; i < 4; i++)
 213             {
 214               int c1 = phase3_getc ();
 215
 216               if (c1 >= '0' && c1 <= '9')
 217                 n = (n << 4) + (c1 - '0');
 218               else if (c1 >= 'A' && c1 <= 'F')
 219                 n = (n << 4) + (c1 - 'A' + 10);
 220               else if (c1 >= 'a' && c1 <= 'f')
 221                 n = (n << 4) + (c1 - 'a' + 10);
 222               else
 223                 {
 224                   phase3_ungetc (c1);
 225                   po_xerror (PO_SEVERITY_ERROR, NULL,
 226                              real_file_name, gram_pos.line_number, (size_t)(-1),
 227                              false, _("warning: invalid \\uxxxx syntax for Unicode character"));
 228                   return 'u';
 229                 }
 230             }
 231           return n;
 232         }
 233
 234       return c2;
 235     }
 236   else
 237     return c;
 238 }
 239
 240
 241 /* Converts a string from ISO-8859-1 encoding to UTF-8 encoding.  */
 242 static char *
 243 conv_from_iso_8859_1 (char *string)
 244 {
 245   if (is_ascii_string (string))
 246     return string;
 247   else
 248     {
 249       size_t length = strlen (string);
 250       /* Each ISO-8859-1 character needs 2 bytes at worst.  */
 251       unsigned char *utf8_string = XNMALLOC (2 * length + 1, unsigned char);
 252       unsigned char *q = utf8_string;
 253       const char *str = string;
 254       const char *str_limit = str + length;
 255
 256       while (str < str_limit)
 257         {
 258           unsigned int uc = (unsigned char) *str++;
 259           int n = u8_uctomb (q, uc, 6);
 260           assert (n > 0);
 261           q += n;
 262         }
 263       *q = '\0';
 264       assert (q - utf8_string <= 2 * length);
 265
 266       return (char *) utf8_string;
 267     }
 268 }
 269
 270
 271 /* Converts a string from JAVA encoding (with \uxxxx sequences) to UTF-8
 272    encoding.  May destructively modify the argument string.  */
 273 static char *
 274 conv_from_java (char *string)
 275 {
 276   /* This conversion can only shrink the string, never increase its size.
 277      So there is no need to xmalloc the result freshly.  */
 278   const char *p = string;
 279   unsigned char *q = (unsigned char *) string;
 280
 281   while (*p != '\0')
 282     {
 283       if (p[0] == '\\' && p[1] == 'u')
 284         {
 285           unsigned int n = 0;
 286           int i;
 287
 288           for (i = 0; i < 4; i++)
 289             {
 290               int c1 = (unsigned char) p[2 + i];
 291
 292               if (c1 >= '0' && c1 <= '9')
 293                 n = (n << 4) + (c1 - '0');
 294               else if (c1 >= 'A' && c1 <= 'F')
 295                 n = (n << 4) + (c1 - 'A' + 10);
 296               else if (c1 >= 'a' && c1 <= 'f')
 297                 n = (n << 4) + (c1 - 'a' + 10);
 298               else
 299                 goto just_one_byte;
 300             }
 301
 302           if (i == 4)
 303             {
 304               unsigned int uc;
 305
 306               if (n >= 0xd800 && n < 0xdc00)
 307                 {
 308                   if (p[6] == '\\' && p[7] == 'u')
 309                     {
 310                       unsigned int m = 0;
 311
 312                       for (i = 0; i < 4; i++)
 313                         {
 314                           int c1 = (unsigned char) p[8 + i];
 315
 316                           if (c1 >= '0' && c1 <= '9')
 317                             m = (m << 4) + (c1 - '0');
 318                           else if (c1 >= 'A' && c1 <= 'F')
 319                             m = (m << 4) + (c1 - 'A' + 10);
 320                           else if (c1 >= 'a' && c1 <= 'f')
 321                             m = (m << 4) + (c1 - 'a' + 10);
 322                           else
 323                             goto just_one_byte;
 324                         }
 325
 326                       if (i == 4 && (m >= 0xdc00 && m < 0xe000))
 327                         {
 328                           /* Combine two UTF-16 words to a character.  */
 329                           uc = 0x10000 + ((n - 0xd800) << 10) + (m - 0xdc00);
 330                           p += 12;
 331                         }
 332                       else
 333                         goto just_one_byte;
 334                     }
 335                   else
 336                     goto just_one_byte;
 337                 }
 338               else
 339                 {
 340                   uc = n;
 341                   p += 6;
 342                 }
 343
 344               q += u8_uctomb (q, uc, 6);
 345               continue;
 346             }
 347         }
 348       just_one_byte:
 349         *q++ = (unsigned char) *p++;
 350     }
 351   *q = '\0';
 352   return string;
 353 }
 354
 355
 356 /* Reads a key or value string.
 357    Returns the string in UTF-8 encoding, or NULL if the end of the logical
 358    line is reached.
 359    Parsing ends:
 360      - when returning NULL, after the end of the logical line,
 361      - otherwise, if in_key is true, after the whitespace and possibly the
 362        separator that follows after the string,
 363      - otherwise, if in_key is false, after the end of the logical line. */
 364
 365 static char *
 366 read_escaped_string (bool in_key)
 367 {
 368   static unsigned short *buffer;
 369   static size_t bufmax;
 370   static size_t buflen;
 371   int c;
 372
 373   /* Skip whitespace before the string.  */
 374   do
 375     c = phase3_getc ();
 376   while (c == ' ' || c == '\t' || c == '\r' || c == '\f');
 377
 378   if (c == EOF || c == '\n')
 379     /* Empty string.  */
 380     return NULL;
 381
 382   /* Start accumulating the string.  We store the string in UTF-16 before
 383      converting it to UTF-8.  Why not converting every character directly to
 384      UTF-8? Because a string can contain surrogates like \uD800\uDF00, and
 385      we must combine them to a single UTF-8 character.  */
 386   buflen = 0;
 387   for (;;)
 388     {
 389       if (in_key && (c == '=' || c == ':'
 390                      || c == ' ' || c == '\t' || c == '\r' || c == '\f'))
 391         {
 392           /* Skip whitespace after the string.  */
 393           while (c == ' ' || c == '\t' || c == '\r' || c == '\f')
 394             c = phase3_getc ();
 395           /* Skip '=' or ':' separator.  */
 396           if (!(c == '=' || c == ':'))
 397             phase3_ungetc (c);
 398           break;
 399         }
 400
 401       phase3_ungetc (c);
 402
 403       /* Read the next UTF-16 codepoint.  */
 404       c = phase4_getuc ();
 405       if (c < 0)
 406         break;
 407       /* Append it to the buffer.  */
 408       if (buflen >= bufmax)
 409         {
 410           bufmax += 100;
 411           buffer = xrealloc (buffer, bufmax * sizeof (unsigned short));
 412         }
 413       buffer[buflen++] = c;
 414
 415       c = phase3_getc ();
 416       if (c == EOF || c == '\n')
 417         {
 418           if (in_key)
 419             phase3_ungetc (c);
 420           break;
 421         }
 422     }
 423
 424   /* Now convert from UTF-16 to UTF-8.  */
 425   {
 426     size_t pos;
 427     unsigned char *utf8_string;
 428     unsigned char *q;
 429
 430     /* Each UTF-16 word needs 3 bytes at worst.  */
 431     utf8_string = XNMALLOC (3 * buflen + 1, unsigned char);
 432     for (pos = 0, q = utf8_string; pos < buflen; )
 433       {
 434         ucs4_t uc;
 435         int n;
 436
 437         pos += u16_mbtouc (&uc, buffer + pos, buflen - pos);
 438         n = u8_uctomb (q, uc, 6);
 439         assert (n > 0);
 440         q += n;
 441       }
 442     *q = '\0';
 443     assert (q - utf8_string <= 3 * buflen);
 444
 445     return (char *) utf8_string;
 446   }
 447 }
 448
 449
 450 /* Read a .properties file from a stream, and dispatch to the various
 451    abstract_catalog_reader_class_ty methods.  */
 452 static void
 453 properties_parse (abstract_catalog_reader_ty *this, FILE *file,
 454                   const char *real_filename, const char *logical_filename)
 455 {
 456   fp = file;
 457   real_file_name = real_filename;
 458   gram_pos.file_name = xstrdup (real_file_name);
 459   gram_pos.line_number = 1;
 460
 461   for (;;)
 462     {
 463       int c;
 464       bool comment;
 465       bool hidden;
 466
 467       c = phase2_getc ();
 468
 469       if (c == EOF)
 470         break;
 471
 472       comment = false;
 473       hidden = false;
 474       if (c == '#')
 475         comment = true;
 476       else if (c == '!')
 477         {
 478           /* For compatibility with write-properties.c, we treat '!' not
 479              followed by space as a fuzzy or untranslated message.  */
 480           int c2 = phase2_getc ();
 481           if (c2 == ' ' || c2 == '\n' || c2 == EOF)
 482             comment = true;
 483           else
 484             hidden = true;
 485           phase2_ungetc (c2);
 486         }
 487       else
 488         phase2_ungetc (c);
 489
 490       if (comment)
 491         {
 492           /* A comment line.  */
 493           static char *buffer;
 494           static size_t bufmax;
 495           static size_t buflen;
 496
 497           buflen = 0;
 498           for (;;)
 499             {
 500               c = phase2_getc ();
 501
 502               if (buflen >= bufmax)
 503                 {
 504                   bufmax += 100;
 505                   buffer = xrealloc (buffer, bufmax);
 506                 }
 507
 508               if (c == EOF || c == '\n')
 509                 break;
 510
 511               buffer[buflen++] = c;
 512             }
 513           buffer[buflen] = '\0';
 514
 515           po_callback_comment_dispatcher (conv_from_java (conv_from_iso_8859_1 (buffer)));
 516         }
 517       else
 518         {
 519           /* A key/value pair.  */
 520           char *msgid;
 521           lex_pos_ty msgid_pos;
 522
 523           msgid_pos = gram_pos;
 524           msgid = read_escaped_string (true);
 525           if (msgid == NULL)
 526             /* Skip blank line.  */
 527             ;
 528           else
 529             {
 530               char *msgstr;
 531               lex_pos_ty msgstr_pos;
 532               bool force_fuzzy;
 533
 534               msgstr_pos = gram_pos;
 535               msgstr = read_escaped_string (false);
 536               if (msgstr == NULL)
 537                 msgstr = xstrdup ("");
 538
 539               /* Be sure to make the message fuzzy if it was commented out
 540                  and if it is not already header/fuzzy/untranslated.  */
 541               force_fuzzy = (hidden && msgid[0] != '\0' && msgstr[0] != '\0');
 542
 543               po_callback_message (NULL, msgid, &msgid_pos, NULL,
 544                                    msgstr, strlen (msgstr) + 1, &msgstr_pos,
 545                                    NULL, NULL, NULL,
 546                                    force_fuzzy, false);
 547             }
 548         }
 549     }
 550
 551   fp = NULL;
 552   real_file_name = NULL;
 553   gram_pos.line_number = 0;
 554 }
 555
 556 const struct catalog_input_format input_format_properties =
 557 {
 558   properties_parse,                     /* parse */
 559   true                                  /* produces_utf8 */
 560 };