source/tools/genrb/read.c

   1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 *
   6 *   Copyright (C) 1998-2012, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 *******************************************************************************
  10 *
  11 * File read.c
  12 *
  13 * Modification History:
  14 *
  15 *   Date        Name        Description
  16 *   05/26/99    stephen     Creation.
  17 *   5/10/01     Ram         removed ustdio dependency
  18 *******************************************************************************
  19 */
  20
  21 #include "read.h"
  22 #include "errmsg.h"
  23 #include "unicode/ustring.h"
  24 #include "unicode/utf16.h"
  25
  26 #define OPENBRACE    0x007B
  27 #define CLOSEBRACE   0x007D
  28 #define COMMA        0x002C
  29 #define QUOTE        0x0022
  30 #define ESCAPE       0x005C
  31 #define SLASH        0x002F
  32 #define ASTERISK     0x002A
  33 #define SPACE        0x0020
  34 #define COLON        0x003A
  35 #define BADBOM       0xFFFE
  36 #define CR           0x000D
  37 #define LF           0x000A
  38
  39 static int32_t lineCount;
  40
  41 /* Protos */
  42 static enum ETokenType getStringToken(UCHARBUF *buf,
  43                                       UChar32 initialChar,
  44                                       struct UString *token,
  45                                       UErrorCode *status);
  46
  47 static UChar32 getNextChar           (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
  48 static void    seekUntilNewline      (UCHARBUF *buf, struct UString *token, UErrorCode *status);
  49 static void    seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
  50 static UBool   isWhitespace          (UChar32 c);
  51 static UBool   isNewline             (UChar32 c);
  52
  53 U_CFUNC void resetLineNumber() {
  54     lineCount = 1;
  55 }
  56
  57 /* Read and return the next token from the stream.  If the token is of
  58    type eString, fill in the token parameter with the token.  If the
  59    token is eError, then the status parameter will contain the
  60    specific error.  This will be eItemNotFound at the end of file,
  61    indicating that all tokens have been returned.  This method will
  62    never return eString twice in a row; instead, multiple adjacent
  63    string tokens will be merged into one, with no intervening
  64    space. */
  65 U_CFUNC enum ETokenType
  66 getNextToken(UCHARBUF* buf,
  67              struct UString *token,
  68              uint32_t *linenumber, /* out: linenumber of token */
  69              struct UString *comment,
  70              UErrorCode *status) {
  71     enum ETokenType result;
  72     UChar32         c;
  73
  74     if (U_FAILURE(*status)) {
  75         return TOK_ERROR;
  76     }
  77
  78     /* Skip whitespace */
  79     c = getNextChar(buf, TRUE, comment, status);
  80
  81     if (U_FAILURE(*status)) {
  82         return TOK_ERROR;
  83     }
  84
  85     *linenumber = lineCount;
  86
  87     switch(c) {
  88     case BADBOM:
  89         return TOK_ERROR;
  90     case OPENBRACE:
  91         return TOK_OPEN_BRACE;
  92     case CLOSEBRACE:
  93         return TOK_CLOSE_BRACE;
  94     case COMMA:
  95         return TOK_COMMA;
  96     case U_EOF:
  97         return TOK_EOF;
  98     case COLON:
  99         return TOK_COLON;
 100
 101     default:
 102         result = getStringToken(buf, c, token, status);
 103     }
 104
 105     *linenumber = lineCount;
 106     return result;
 107 }
 108
 109 /* Copy a string token into the given UnicodeString.  Upon entry, we
 110    have already read the first character of the string token, which is
 111    not a whitespace character (but may be a QUOTE or ESCAPE). This
 112    function reads all subsequent characters that belong with this
 113    string, and copy them into the token parameter. The other
 114    important, and slightly convoluted purpose of this function is to
 115    merge adjacent strings.  It looks forward a bit, and if the next
 116    non comment, non whitespace item is a string, it reads it in as
 117    well.  If two adjacent strings are quoted, they are merged without
 118    intervening space.  Otherwise a single SPACE character is
 119    inserted. */
 120 static enum ETokenType getStringToken(UCHARBUF* buf,
 121                                       UChar32 initialChar,
 122                                       struct UString *token,
 123                                       UErrorCode *status) {
 124     UBool    lastStringWasQuoted;
 125     UChar32  c;
 126     UChar    target[3] = { '\0' };
 127     UChar    *pTarget   = target;
 128     int      len=0;
 129     UBool    isFollowingCharEscaped=FALSE;
 130     UBool    isNLUnescaped = FALSE;
 131     UChar32  prevC=0;
 132
 133     /* We are guaranteed on entry that initialChar is not a whitespace
 134        character. If we are at the EOF, or have some other problem, it
 135        doesn't matter; we still want to validly return the initialChar
 136        (if nothing else) as a string token. */
 137
 138     if (U_FAILURE(*status)) {
 139         return TOK_ERROR;
 140     }
 141
 142     /* setup */
 143     lastStringWasQuoted = FALSE;
 144     c = initialChar;
 145     ustr_setlen(token, 0, status);
 146
 147     if (U_FAILURE(*status)) {
 148         return TOK_ERROR;
 149     }
 150
 151     for (;;) {
 152         if (c == QUOTE) {
 153             if (!lastStringWasQuoted && token->fLength > 0) {
 154                 ustr_ucat(token, SPACE, status);
 155
 156                 if (U_FAILURE(*status)) {
 157                     return TOK_ERROR;
 158                 }
 159             }
 160
 161             lastStringWasQuoted = TRUE;
 162
 163             for (;;) {
 164                 c = ucbuf_getc(buf,status);
 165
 166                 /* EOF reached */
 167                 if (c == U_EOF) {
 168                     return TOK_EOF;
 169                 }
 170
 171                 /* Unterminated quoted strings */
 172                 if (U_FAILURE(*status)) {
 173                     return TOK_ERROR;
 174                 }
 175
 176                 if (c == QUOTE && !isFollowingCharEscaped) {
 177                     break;
 178                 }
 179
 180                 if (c == ESCAPE  && !isFollowingCharEscaped) {
 181                     pTarget = target;
 182                     c       = unescape(buf, status);
 183
 184                     if (c == U_ERR) {
 185                         return TOK_ERROR;
 186                     }
 187                     if(c == CR || c == LF){
 188                         isNLUnescaped = TRUE;
 189                     }
 190                 }
 191
 192                 if(c==ESCAPE && !isFollowingCharEscaped){
 193                     isFollowingCharEscaped = TRUE;
 194                 }else{
 195                     U_APPEND_CHAR32(c, pTarget,len);
 196                     pTarget = target;
 197                     ustr_uscat(token, pTarget,len, status);
 198                     isFollowingCharEscaped = FALSE;
 199                     len=0;
 200                     if(c == CR || c == LF){
 201                         if(isNLUnescaped == FALSE && prevC!=CR){
 202                             lineCount++;
 203                         }
 204                         isNLUnescaped = FALSE;
 205                     }
 206                 }
 207
 208                 if (U_FAILURE(*status)) {
 209                     return TOK_ERROR;
 210                 }
 211                 prevC = c;
 212             }
 213         } else {
 214             if (token->fLength > 0) {
 215                 ustr_ucat(token, SPACE, status);
 216
 217                 if (U_FAILURE(*status)) {
 218                     return TOK_ERROR;
 219                 }
 220             }
 221
 222             if(lastStringWasQuoted){
 223                 if(getShowWarning()){
 224                     warning(lineCount, "Mixing quoted and unquoted strings");
 225                 }
 226                 if(isStrict()){
 227                     return TOK_ERROR;
 228                 }
 229
 230             }
 231
 232             lastStringWasQuoted = FALSE;
 233
 234             /* if we reach here we are mixing
 235              * quoted and unquoted strings
 236              * warn in normal mode and error in
 237              * pedantic mode
 238              */
 239
 240             if (c == ESCAPE) {
 241                 pTarget = target;
 242                 c       = unescape(buf, status);
 243
 244                 /* EOF reached */
 245                 if (c == U_EOF) {
 246                     return TOK_ERROR;
 247                 }
 248             }
 249
 250             U_APPEND_CHAR32(c, pTarget,len);
 251             pTarget = target;
 252             ustr_uscat(token, pTarget,len, status);
 253             len=0;
 254
 255             if (U_FAILURE(*status)) {
 256                 return TOK_ERROR;
 257             }
 258
 259             for (;;) {
 260                 /* DON'T skip whitespace */
 261                 c = getNextChar(buf, FALSE, NULL, status);
 262
 263                 /* EOF reached */
 264                 if (c == U_EOF) {
 265                     ucbuf_ungetc(c, buf);
 266                     return TOK_STRING;
 267                 }
 268
 269                 if (U_FAILURE(*status)) {
 270                     return TOK_STRING;
 271                 }
 272
 273                 if (c == QUOTE
 274                         || c == OPENBRACE
 275                         || c == CLOSEBRACE
 276                         || c == COMMA
 277                         || c == COLON) {
 278                     ucbuf_ungetc(c, buf);
 279                     break;
 280                 }
 281
 282                 if (isWhitespace(c)) {
 283                     break;
 284                 }
 285
 286                 if (c == ESCAPE) {
 287                     pTarget = target;
 288                     c       = unescape(buf, status);
 289
 290                     if (c == U_ERR) {
 291                         return TOK_ERROR;
 292                     }
 293                 }
 294
 295                 U_APPEND_CHAR32(c, pTarget,len);
 296                 pTarget = target;
 297                 ustr_uscat(token, pTarget,len, status);
 298                 len=0;
 299                 if (U_FAILURE(*status)) {
 300                     return TOK_ERROR;
 301                 }
 302             }
 303         }
 304
 305         /* DO skip whitespace */
 306         c = getNextChar(buf, TRUE, NULL, status);
 307
 308         if (U_FAILURE(*status)) {
 309             return TOK_STRING;
 310         }
 311
 312         if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
 313             ucbuf_ungetc(c, buf);
 314             return TOK_STRING;
 315         }
 316     }
 317 }
 318
 319 /* Retrieve the next character.  If skipwhite is
 320    true, whitespace is skipped as well. */
 321 static UChar32 getNextChar(UCHARBUF* buf,
 322                            UBool skipwhite,
 323                            struct UString *token,
 324                            UErrorCode *status) {
 325     UChar32 c, c2;
 326
 327     if (U_FAILURE(*status)) {
 328         return U_EOF;
 329     }
 330
 331     for (;;) {
 332         c = ucbuf_getc(buf,status);
 333
 334         if (c == U_EOF) {
 335             return U_EOF;
 336         }
 337
 338         if (skipwhite && isWhitespace(c)) {
 339             continue;
 340         }
 341
 342         /* This also handles the get() failing case */
 343         if (c != SLASH) {
 344             return c;
 345         }
 346
 347         c = ucbuf_getc(buf,status); /* "/c" */
 348
 349         if (c == U_EOF) {
 350             return U_EOF;
 351         }
 352
 353         switch (c) {
 354         case SLASH:  /* "//" */
 355             seekUntilNewline(buf, NULL, status);
 356             break;
 357
 358         case ASTERISK:  /* " / * " */
 359             c2 = ucbuf_getc(buf, status); /* "/ * c" */
 360             if(c2 == ASTERISK){  /* "/ * *" */
 361                 /* parse multi-line comment and store it in token*/
 362                 seekUntilEndOfComment(buf, token, status);
 363             } else {
 364                 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *".  Include c2  back in buffer.  */
 365                 seekUntilEndOfComment(buf, NULL, status);
 366             }
 367             break;
 368
 369         default:
 370             ucbuf_ungetc(c, buf); /* "/c" - put back the c */
 371             /* If get() failed this is a NOP */
 372             return SLASH;
 373         }
 374
 375     }
 376 }
 377
 378 static void seekUntilNewline(UCHARBUF* buf,
 379                              struct UString *token,
 380                              UErrorCode *status) {
 381     UChar32 c;
 382
 383     if (U_FAILURE(*status)) {
 384         return;
 385     }
 386
 387     do {
 388         c = ucbuf_getc(buf,status);
 389         /* add the char to token */
 390         if(token!=NULL){
 391             ustr_u32cat(token, c, status);
 392         }
 393     } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR);
 394 }
 395
 396 static void seekUntilEndOfComment(UCHARBUF *buf,
 397                                   struct UString *token,
 398                                   UErrorCode *status) {
 399     UChar32  c, d;
 400     uint32_t line;
 401
 402     if (U_FAILURE(*status)) {
 403         return;
 404     }
 405
 406     line = lineCount;
 407
 408     do {
 409         c = ucbuf_getc(buf, status);
 410
 411         if (c == ASTERISK) {
 412             d = ucbuf_getc(buf, status);
 413
 414             if (d != SLASH) {
 415                 ucbuf_ungetc(d, buf);
 416             } else {
 417                 break;
 418             }
 419         }
 420         /* add the char to token */
 421         if(token!=NULL){
 422             ustr_u32cat(token, c, status);
 423         }
 424         /* increment the lineCount */
 425         isNewline(c);
 426
 427     } while (c != U_EOF && *status == U_ZERO_ERROR);
 428
 429     if (c == U_EOF) {
 430         *status = U_INVALID_FORMAT_ERROR;
 431         error(line, "unterminated comment detected");
 432     }
 433 }
 434
 435 U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) {
 436     if (U_FAILURE(*status)) {
 437         return U_EOF;
 438     }
 439
 440     /* We expect to be called after the ESCAPE has been seen, but
 441      * u_fgetcx needs an ESCAPE to do its magic. */
 442     ucbuf_ungetc(ESCAPE, buf);
 443
 444     return ucbuf_getcx32(buf, status);
 445 }
 446
 447 static UBool isWhitespace(UChar32 c) {
 448     switch (c) {
 449         /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
 450     case 0x000A:
 451     case 0x2029:
 452         lineCount++;
 453     case 0x000D:
 454     case 0x0020:
 455     case 0x0009:
 456     case 0xFEFF:
 457         return TRUE;
 458
 459     default:
 460         return FALSE;
 461     }
 462 }
 463
 464 static UBool isNewline(UChar32 c) {
 465     switch (c) {
 466         /* '\n', '\r', 0x2029 */
 467     case 0x000A:
 468     case 0x2029:
 469         lineCount++;
 470     case 0x000D:
 471         return TRUE;
 472
 473     default:
 474         return FALSE;
 475     }
 476 }