gettext-tools/src/read-properties.c

   1 /* Reading Java .properties files.
   2    Copyright (C) 2003, 2005-2007, 2009, 2015 Free Software Foundation,
   3    Inc.
   4    Written by Bruno Haible <bruno@clisp.org>, 2003.
   5
   6    This program is free software: you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 # include <config.h>
  21 #endif
  22
  23 /* Specification.  */
  24 #include "read-properties.h"
  25
  26 #include <assert.h>
  27 #include <errno.h>
  28 #include <stdbool.h>
  29 #include <stdio.h>
  30 #include <stdlib.h>
  31 #include <string.h>
  32
  33 #include "error.h"
  34 #include "error-progname.h"
  35 #include "message.h"
  36 #include "read-catalog-abstract.h"
  37 #include "xalloc.h"
  38 #include "xvasprintf.h"
  39 #include "po-xerror.h"
  40 #include "msgl-ascii.h"
  41 #include "unistr.h"
  42 #include "gettext.h"
  43
  44 #define _(str) gettext (str)
  45
  46 /* For compiling this file in C++ mode.  */
  47 #ifdef __cplusplus
  48 # define this thiss
  49 #endif
  50
  51
  52 /* The format of the Java .properties files is documented in the JDK
  53    documentation for class java.util.Properties.  In the case of .properties
  54    files for PropertyResourceBundle, each non-comment line contains a
  55    key/value pair in the form "key = value" or "key : value" or "key value",
  56    where the key is the msgid and the value is the msgstr.  Messages with
  57    plurals are not supported in this format.  */
  58
  59 /* Handling of comments: We copy all comments from the .properties file to
  60    the PO file. This is not really needed; it's a service for translators
  61    who don't like PO files and prefer to maintain the .properties file.  */
  62
  63 /* Real filename, used in error messages about the input file.  */
  64 static const char *real_file_name;
  65
  66 /* File name and line number.  */
  67 extern lex_pos_ty gram_pos;
  68
  69 /* The input file stream.  */
  70 static FILE *fp;
  71
  72
  73 /* Phase 1: Read an ISO-8859-1 character.
  74    Max. 1 pushback character.  */
  75
  76 static int
  77 phase1_getc ()
  78 {
  79   int c;
  80
  81   c = getc (fp);
  82
  83   if (c == EOF)
  84     {
  85       if (ferror (fp))
  86         {
  87           const char *errno_description = strerror (errno);
  88           po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
  89                      xasprintf ("%s: %s",
  90                                 xasprintf (_("error while reading \"%s\""),
  91                                            real_file_name),
  92                                 errno_description));
  93         }
  94       return EOF;
  95     }
  96
  97   return c;
  98 }
  99
 100 static inline void
 101 phase1_ungetc (int c)
 102 {
 103   if (c != EOF)
 104     ungetc (c, fp);
 105 }
 106
 107
 108 /* Phase 2: Read an ISO-8859-1 character, treating CR/LF like a single LF.
 109    Max. 2 pushback characters.  */
 110
 111 static unsigned char phase2_pushback[2];
 112 static int phase2_pushback_length;
 113
 114 static int
 115 phase2_getc ()
 116 {
 117   int c;
 118
 119   if (phase2_pushback_length)
 120     c = phase2_pushback[--phase2_pushback_length];
 121   else
 122     {
 123       c = phase1_getc ();
 124
 125       if (c == '\r')
 126         {
 127           int c2 = phase1_getc ();
 128           if (c2 == '\n')
 129             c = c2;
 130           else
 131             phase1_ungetc (c2);
 132         }
 133     }
 134
 135   if (c == '\n')
 136     gram_pos.line_number++;
 137
 138   return c;
 139 }
 140
 141 static void
 142 phase2_ungetc (int c)
 143 {
 144   if (c == '\n')
 145     --gram_pos.line_number;
 146   if (c != EOF)
 147     phase2_pushback[phase2_pushback_length++] = c;
 148 }
 149
 150
 151 /* Phase 3: Read an ISO-8859-1 character, treating CR/LF like a single LF,
 152    with handling of continuation lines.
 153    Max. 1 pushback character.  */
 154
 155 static int
 156 phase3_getc ()
 157 {
 158   int c = phase2_getc ();
 159
 160   for (;;)
 161     {
 162       if (c != '\\')
 163         return c;
 164
 165       c = phase2_getc ();
 166       if (c != '\n')
 167         {
 168           phase2_ungetc (c);
 169           return '\\';
 170         }
 171
 172       /* Skip the backslash-newline and all whitespace that follows it.  */
 173       do
 174         c = phase2_getc ();
 175       while (c == ' ' || c == '\t' || c == '\r' || c == '\f');
 176     }
 177 }
 178
/* Push the character C back at the phase 3 level.  Phase 3 keeps no
   pushback storage of its own; the character is simply forwarded to
   phase2_ungetc.  */
static inline void
phase3_ungetc (int c)
{
  phase2_ungetc (c);
}
 184
 185
 186 /* Phase 4: Read an UTF-16 codepoint, treating CR/LF like a single LF,
 187    with handling of continuation lines and of \uxxxx sequences.  */
 188
 189 static int
 190 phase4_getuc ()
 191 {
 192   int c = phase3_getc ();
 193
 194   if (c == EOF)
 195     return -1;
 196   if (c == '\\')
 197     {
 198       int c2 = phase3_getc ();
 199
 200       if (c2 == 't')
 201         return '\t';
 202       if (c2 == 'n')
 203         return '\n';
 204       if (c2 == 'r')
 205         return '\r';
 206       if (c2 == 'f')
 207         return '\f';
 208       if (c2 == 'u')
 209         {
 210           unsigned int n = 0;
 211           int i;
 212
 213           for (i = 0; i < 4; i++)
 214             {
 215               int c1 = phase3_getc ();
 216
 217               if (c1 >= '0' && c1 <= '9')
 218                 n = (n << 4) + (c1 - '0');
 219               else if (c1 >= 'A' && c1 <= 'F')
 220                 n = (n << 4) + (c1 - 'A' + 10);
 221               else if (c1 >= 'a' && c1 <= 'f')
 222                 n = (n << 4) + (c1 - 'a' + 10);
 223               else
 224                 {
 225                   phase3_ungetc (c1);
 226                   po_xerror (PO_SEVERITY_ERROR, NULL,
 227                              real_file_name, gram_pos.line_number, (size_t)(-1),
 228                              false, _("warning: invalid \\uxxxx syntax for Unicode character"));
 229                   return 'u';
 230                 }
 231             }
 232           return n;
 233         }
 234
 235       return c2;
 236     }
 237   else
 238     return c;
 239 }
 240
 241
 242 /* Converts a string from ISO-8859-1 encoding to UTF-8 encoding.  */
 243 static char *
 244 conv_from_iso_8859_1 (char *string)
 245 {
 246   if (is_ascii_string (string))
 247     return string;
 248   else
 249     {
 250       size_t length = strlen (string);
 251       /* Each ISO-8859-1 character needs 2 bytes at worst.  */
 252       unsigned char *utf8_string = XNMALLOC (2 * length + 1, unsigned char);
 253       unsigned char *q = utf8_string;
 254       const char *str = string;
 255       const char *str_limit = str + length;
 256
 257       while (str < str_limit)
 258         {
 259           unsigned int uc = (unsigned char) *str++;
 260           int n = u8_uctomb (q, uc, 6);
 261           assert (n > 0);
 262           q += n;
 263         }
 264       *q = '\0';
 265       assert (q - utf8_string <= 2 * length);
 266
 267       return (char *) utf8_string;
 268     }
 269 }
 270
 271
 272 /* Converts a string from JAVA encoding (with \uxxxx sequences) to UTF-8
 273    encoding.  May destructively modify the argument string.  */
 274 static char *
 275 conv_from_java (char *string)
 276 {
 277   /* This conversion can only shrink the string, never increase its size.
 278      So there is no need to xmalloc the result freshly.  */
 279   const char *p = string;
 280   unsigned char *q = (unsigned char *) string;
 281
 282   while (*p != '\0')
 283     {
 284       if (p[0] == '\\' && p[1] == 'u')
 285         {
 286           unsigned int n = 0;
 287           int i;
 288
 289           for (i = 0; i < 4; i++)
 290             {
 291               int c1 = (unsigned char) p[2 + i];
 292
 293               if (c1 >= '0' && c1 <= '9')
 294                 n = (n << 4) + (c1 - '0');
 295               else if (c1 >= 'A' && c1 <= 'F')
 296                 n = (n << 4) + (c1 - 'A' + 10);
 297               else if (c1 >= 'a' && c1 <= 'f')
 298                 n = (n << 4) + (c1 - 'a' + 10);
 299               else
 300                 goto just_one_byte;
 301             }
 302
 303           if (i == 4)
 304             {
 305               unsigned int uc;
 306
 307               if (n >= 0xd800 && n < 0xdc00)
 308                 {
 309                   if (p[6] == '\\' && p[7] == 'u')
 310                     {
 311                       unsigned int m = 0;
 312
 313                       for (i = 0; i < 4; i++)
 314                         {
 315                           int c1 = (unsigned char) p[8 + i];
 316
 317                           if (c1 >= '0' && c1 <= '9')
 318                             m = (m << 4) + (c1 - '0');
 319                           else if (c1 >= 'A' && c1 <= 'F')
 320                             m = (m << 4) + (c1 - 'A' + 10);
 321                           else if (c1 >= 'a' && c1 <= 'f')
 322                             m = (m << 4) + (c1 - 'a' + 10);
 323                           else
 324                             goto just_one_byte;
 325                         }
 326
 327                       if (i == 4 && (m >= 0xdc00 && m < 0xe000))
 328                         {
 329                           /* Combine two UTF-16 words to a character.  */
 330                           uc = 0x10000 + ((n - 0xd800) << 10) + (m - 0xdc00);
 331                           p += 12;
 332                         }
 333                       else
 334                         goto just_one_byte;
 335                     }
 336                   else
 337                     goto just_one_byte;
 338                 }
 339               else
 340                 {
 341                   uc = n;
 342                   p += 6;
 343                 }
 344
 345               q += u8_uctomb (q, uc, 6);
 346               continue;
 347             }
 348         }
 349       just_one_byte:
 350         *q++ = (unsigned char) *p++;
 351     }
 352   *q = '\0';
 353   return string;
 354 }
 355
 356
 357 /* Reads a key or value string.
 358    Returns the string in UTF-8 encoding, or NULL if the end of the logical
 359    line is reached.
 360    Parsing ends:
 361      - when returning NULL, after the end of the logical line,
 362      - otherwise, if in_key is true, after the whitespace and possibly the
 363        separator that follows after the string,
 364      - otherwise, if in_key is false, after the end of the logical line. */
 365
 366 static char *
 367 read_escaped_string (bool in_key)
 368 {
 369   static unsigned short *buffer;
 370   static size_t bufmax;
 371   static size_t buflen;
 372   int c;
 373
 374   /* Skip whitespace before the string.  */
 375   do
 376     c = phase3_getc ();
 377   while (c == ' ' || c == '\t' || c == '\r' || c == '\f');
 378
 379   if (c == EOF || c == '\n')
 380     /* Empty string.  */
 381     return NULL;
 382
 383   /* Start accumulating the string.  We store the string in UTF-16 before
 384      converting it to UTF-8.  Why not converting every character directly to
 385      UTF-8? Because a string can contain surrogates like \uD800\uDF00, and
 386      we must combine them to a single UTF-8 character.  */
 387   buflen = 0;
 388   for (;;)
 389     {
 390       if (in_key && (c == '=' || c == ':'
 391                      || c == ' ' || c == '\t' || c == '\r' || c == '\f'))
 392         {
 393           /* Skip whitespace after the string.  */
 394           while (c == ' ' || c == '\t' || c == '\r' || c == '\f')
 395             c = phase3_getc ();
 396           /* Skip '=' or ':' separator.  */
 397           if (!(c == '=' || c == ':'))
 398             phase3_ungetc (c);
 399           break;
 400         }
 401
 402       phase3_ungetc (c);
 403
 404       /* Read the next UTF-16 codepoint.  */
 405       c = phase4_getuc ();
 406       if (c < 0)
 407         break;
 408       /* Append it to the buffer.  */
 409       if (buflen >= bufmax)
 410         {
 411           bufmax += 100;
 412           buffer = xrealloc (buffer, bufmax * sizeof (unsigned short));
 413         }
 414       buffer[buflen++] = c;
 415
 416       c = phase3_getc ();
 417       if (c == EOF || c == '\n')
 418         {
 419           if (in_key)
 420             phase3_ungetc (c);
 421           break;
 422         }
 423     }
 424
 425   /* Now convert from UTF-16 to UTF-8.  */
 426   {
 427     size_t pos;
 428     unsigned char *utf8_string;
 429     unsigned char *q;
 430
 431     /* Each UTF-16 word needs 3 bytes at worst.  */
 432     utf8_string = XNMALLOC (3 * buflen + 1, unsigned char);
 433     for (pos = 0, q = utf8_string; pos < buflen; )
 434       {
 435         ucs4_t uc;
 436         int n;
 437
 438         pos += u16_mbtouc (&uc, buffer + pos, buflen - pos);
 439         n = u8_uctomb (q, uc, 6);
 440         assert (n > 0);
 441         q += n;
 442       }
 443     *q = '\0';
 444     assert (q - utf8_string <= 3 * buflen);
 445
 446     return (char *) utf8_string;
 447   }
 448 }
 449
 450
 451 /* Read a .properties file from a stream, and dispatch to the various
 452    abstract_catalog_reader_class_ty methods.  */
 453 static void
 454 properties_parse (abstract_catalog_reader_ty *this, FILE *file,
 455                   const char *real_filename, const char *logical_filename)
 456 {
 457   fp = file;
 458   real_file_name = real_filename;
 459   gram_pos.file_name = xstrdup (real_file_name);
 460   gram_pos.line_number = 1;
 461
 462   for (;;)
 463     {
 464       int c;
 465       bool comment;
 466       bool hidden;
 467
 468       c = phase2_getc ();
 469
 470       if (c == EOF)
 471         break;
 472
 473       comment = false;
 474       hidden = false;
 475       if (c == '#')
 476         comment = true;
 477       else if (c == '!')
 478         {
 479           /* For compatibility with write-properties.c, we treat '!' not
 480              followed by space as a fuzzy or untranslated message.  */
 481           int c2 = phase2_getc ();
 482           if (c2 == ' ' || c2 == '\n' || c2 == EOF)
 483             comment = true;
 484           else
 485             hidden = true;
 486           phase2_ungetc (c2);
 487         }
 488       else
 489         phase2_ungetc (c);
 490
 491       if (comment)
 492         {
 493           /* A comment line.  */
 494           static char *buffer;
 495           static size_t bufmax;
 496           static size_t buflen;
 497
 498           buflen = 0;
 499           for (;;)
 500             {
 501               c = phase2_getc ();
 502
 503               if (buflen >= bufmax)
 504                 {
 505                   bufmax += 100;
 506                   buffer = xrealloc (buffer, bufmax);
 507                 }
 508
 509               if (c == EOF || c == '\n')
 510                 break;
 511
 512               buffer[buflen++] = c;
 513             }
 514           buffer[buflen] = '\0';
 515
 516           po_callback_comment_dispatcher (conv_from_java (conv_from_iso_8859_1 (buffer)));
 517         }
 518       else
 519         {
 520           /* A key/value pair.  */
 521           char *msgid;
 522           lex_pos_ty msgid_pos;
 523
 524           msgid_pos = gram_pos;
 525           msgid = read_escaped_string (true);
 526           if (msgid == NULL)
 527             /* Skip blank line.  */
 528             ;
 529           else
 530             {
 531               char *msgstr;
 532               lex_pos_ty msgstr_pos;
 533               bool force_fuzzy;
 534
 535               msgstr_pos = gram_pos;
 536               msgstr = read_escaped_string (false);
 537               if (msgstr == NULL)
 538                 msgstr = xstrdup ("");
 539
 540               /* Be sure to make the message fuzzy if it was commented out
 541                  and if it is not already header/fuzzy/untranslated.  */
 542               force_fuzzy = (hidden && msgid[0] != '\0' && msgstr[0] != '\0');
 543
 544               po_callback_message (NULL, msgid, &msgid_pos, NULL,
 545                                    msgstr, strlen (msgstr) + 1, &msgstr_pos,
 546                                    NULL, NULL, NULL,
 547                                    force_fuzzy, false);
 548             }
 549         }
 550     }
 551
 552   fp = NULL;
 553   real_file_name = NULL;
 554   gram_pos.line_number = 0;
 555 }
 556
/* Descriptor of the Java .properties input format: its parse function is
   properties_parse.  */
const struct catalog_input_format input_format_properties =
{
  properties_parse,                     /* parse */
  true                                  /* produces_utf8 */
};