From: Paul Eggert Date: Sun, 3 Nov 2002 08:42:32 +0000 (+0000) Subject: Revamp to fix POSIX incompatibilities, to count columns correctly, and X-Git-Tag: BISON-1_875~359 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d8d3f94a993ce890baae68bf9da7ded29f9f8d76;p=platform%2Fupstream%2Fbison.git Revamp to fix POSIX incompatibilities, to count columns correctly, and to check for invalid inputs. Use mbsnwidth to count columns correctly. Account for tabs, too. Include mbswidth.h. (YY_USER_ACTION): Invoke extend_location rather than LOCATION_COLUMNS. (extend_location): New function. (YY_LINES): Remove. Handle CRLF in C code rather than in Lex code. (YY_INPUT): New macro. (no_cr_read): New function. Scan UCNs, even though we don't fully handle them yet. (convert_ucn_to_byte): New function. Handle backslash-newline correctly in C code. (SC_LINE_COMMENT, SC_YACC_COMMENT): New states. (eols, blanks): Remove. YY_USER_ACTION now counts newlines etc.; all uses changed. (tag, splice): New EREs. Do not allow NUL or newline in tags. Use {splice} wherever C allows backslash-newline. YY_STEP after space, newline, vertical-tab. ("/*"): BEGIN SC_YACC_COMMENT, not yy_push_state (SC_COMMENT). (letter, id): Don't assume ASCII; e.g., spell out a-z. ({int}, handle_action_dollar, handle_action_at): Check for integer overflow. (YY_STEP): Omit trailing semicolon, so that it's more like C. (<SC_ESCAPED_STRING,SC_ESCAPED_CHARACTER>): Allow \0 and \00 as well as \000. Check for UCHAR_MAX, not 255. Allow \x with an arbitrary positive number of digits, as in C. Check for overflow here. Allow \? and UCNs, for compatibility with C. (handle_symbol_code_dollar): Use quote_n slot 1 to avoid collision with quote slot used by complain_at. 
--- diff --git a/src/scan-gram.l b/src/scan-gram.l index fa24a1a..200e56e 100644 --- a/src/scan-gram.l +++ b/src/scan-gram.l @@ -24,6 +24,7 @@ %{ #include "system.h" +#include "mbswidth.h" #include "complain.h" #include "quote.h" #include "getargs.h" @@ -39,9 +40,95 @@ do { \ if (yycontrol) {;}; \ } while (0) -#define YY_USER_ACTION LOCATION_COLUMNS (*yylloc, yyleng); -#define YY_LINES LOCATION_LINES (*yylloc, yyleng); -#define YY_STEP LOCATION_STEP (*yylloc); +#define YY_USER_ACTION extend_location (yylloc, yytext, yyleng); +#define YY_STEP LOCATION_STEP (*yylloc) + +#define YY_INPUT(buf, result, size) ((result) = no_cr_read (yyin, buf, size)) + + +/* Read bytes from FP into buffer BUF of size SIZE. Return the + number of bytes read. Remove '\r' from input, treating \r\n + and isolated \r as \n. */ + +static size_t +no_cr_read (FILE *fp, char *buf, size_t size) +{ + size_t s = fread (buf, 1, size, fp); + if (s) + { + char *w = memchr (buf, '\r', s); + if (w) + { + char const *r = ++w; + char const *lim = buf + s; + + for (;;) + { + /* Found an '\r'. Treat it like '\n', but ignore any + '\n' that immediately follows. */ + w[-1] = '\n'; + if (r == lim) + { + int ch = getc (fp); + if (ch != '\n' && ungetc (ch, fp) != ch) + break; + } + else if (*r == '\n') + r++; + + /* Copy until the next '\r'. */ + do + { + if (r == lim) + return w - buf; + } + while ((*w++ = *r++) != '\r'); + } + + return w - buf; + } + } + + return s; +} + + +/* Extend *LOC to account for token TOKEN of size SIZE. */ + +static void +extend_location (location_t *loc, char const *token, int size) +{ + int line = loc->last_line; + int column = loc->last_column; + char const *p0 = token; + char const *p = token; + char const *lim = token + size; + + for (p = token; p < lim; p++) + switch (*p) + { + case '\r': + /* \r shouldn't survive no_cr_read. 
*/ + abort (); + + case '\n': + line++; + column = 1; + p0 = p + 1; + break; + + case '\t': + column += mbsnwidth (p0, p - p0, 0); + column += 8 - ((column - 1) & 7); + p0 = p + 1; + break; + } + + loc->last_line = line; + loc->last_column = column + mbsnwidth (p0, p - p0, 0); +} + + /* STRING_OBSTACK -- Used to store all the characters that we need to keep (to construct ID, STRINGS etc.). Use the following macros to @@ -91,17 +178,26 @@ static void handle_dollar (braced_code_t code_kind, char *cp, location_t location); static void handle_at (braced_code_t code_kind, char *cp, location_t location); +static int convert_ucn_to_byte (char const *hex_text); %} -%x SC_COMMENT +%x SC_COMMENT SC_LINE_COMMENT SC_YACC_COMMENT %x SC_STRING SC_CHARACTER %x SC_ESCAPED_STRING SC_ESCAPED_CHARACTER %x SC_BRACED_CODE SC_PROLOGUE SC_EPILOGUE -id [.a-zA-Z_][.a-zA-Z_0-9]* +letter [.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_] +id {letter}({letter}|[0-9])* int [0-9]+ -eols (\n|\r|\n\r|\r\n)+ -blanks [ \t\f]+ + +/* POSIX says that a tag must be both an id and a C union member, but + historically almost any character is allowed in a tag. We disallow + NUL and newline, as this simplifies our implementation. */ +tag [^\0\n>]+ + +/* Zero or more instances of backslash-newline. Following GCC, allow + white space between the backslash and the newline. 
*/ +splice (\\[ \f\t\v]*\n)* %% %{ @@ -136,7 +232,7 @@ blanks [ \t\f]+ "%nterm" return PERCENT_NTERM; "%output" return PERCENT_OUTPUT; "%parse-param" return PERCENT_PARSE_PARAM; - "%prec" { rule_length--; return PERCENT_PREC; } + "%prec" rule_length--; return PERCENT_PREC; "%printer" return PERCENT_PRINTER; "%pure"[-_]"parser" return PERCENT_PURE_PARSER; "%right" return PERCENT_RIGHT; @@ -152,20 +248,31 @@ blanks [ \t\f]+ "%yacc" return PERCENT_YACC; "=" return EQUAL; - ":" { rule_length = 0; return COLON; } - "|" { rule_length = 0; return PIPE; } + ":" rule_length = 0; return COLON; + "|" rule_length = 0; return PIPE; "," return COMMA; ";" return SEMICOLON; - {eols} YY_LINES; YY_STEP; - {blanks} YY_STEP; + [ \f\n\t\v]+ YY_STEP; + {id} { yylval->symbol = symbol_get (yytext, *yylloc); rule_length++; return ID; } - {int} yylval->integer = strtol (yytext, 0, 10); return INT; + {int} { + unsigned long num; + errno = 0; + num = strtoul (yytext, 0, 10); + if (INT_MAX < num || errno) + { + complain_at (*yylloc, _("%s is invalid"), yytext); + num = INT_MAX; + } + yylval->integer = num; + return INT; + } /* Characters. We don't check there is only one. */ "'" YY_OBS_GROW; yy_push_state (SC_ESCAPED_CHARACTER); @@ -174,7 +281,7 @@ blanks [ \t\f]+ "\"" YY_OBS_GROW; yy_push_state (SC_ESCAPED_STRING); /* Comments. */ - "/*" yy_push_state (SC_COMMENT); + "/*" BEGIN SC_YACC_COMMENT; "//".* YY_STEP; /* Prologue. */ @@ -184,7 +291,7 @@ blanks [ \t\f]+ "{" YY_OBS_GROW; ++braces_level; yy_push_state (SC_BRACED_CODE); /* A type. */ - "<"[^>]+">" { + "<"{tag}">" { obstack_grow (&string_obstack, yytext + 1, yyleng - 2); YY_OBS_FINISH; yylval->string = last_string; @@ -206,41 +313,48 @@ blanks [ \t\f]+ } - /*------------------------------------------------------------. - | Whatever the start condition (but those which correspond to | - | entity `swallowed' by Bison: SC_ESCAPED_STRING and | - | SC_ESCAPED_CHARACTER), no M4 character must escape as is. 
| - `------------------------------------------------------------*/ + /*-------------------------------------------------------------------. + | Whatever the start condition (but those which correspond to | + | entities `swallowed' by Bison: SC_YACC_COMMENT, SC_ESCAPED_STRING, | + | and SC_ESCAPED_CHARACTER), no M4 character must escape as is. | + `-------------------------------------------------------------------*/ - + { - \[ if (YY_START != SC_COMMENT) obstack_sgrow (&string_obstack, "@<:@"); - \] if (YY_START != SC_COMMENT) obstack_sgrow (&string_obstack, "@:>@"); + \[ obstack_sgrow (&string_obstack, "@<:@"); + \] obstack_sgrow (&string_obstack, "@:>@"); } + /*---------------------------------------------------------------. + | Scanning a Yacc comment. The initial `/ *' is already eaten. | + `---------------------------------------------------------------*/ - /*-----------------------------------------------------------. - | Scanning a C comment. The initial `/ *' is already eaten. | - `-----------------------------------------------------------*/ - - + { - "*/" { /* End of the comment. */ - if (yy_top_state () == INITIAL) - { - YY_STEP; - } - else - { - YY_OBS_GROW; - } - yy_pop_state (); + "*/" { + YY_STEP; + BEGIN INITIAL; } - [^\[\]*\n\r]+ if (yy_top_state () != INITIAL) YY_OBS_GROW; - {eols} if (yy_top_state () != INITIAL) YY_OBS_GROW; YY_LINES; - . /* Stray `*'. */if (yy_top_state () != INITIAL) YY_OBS_GROW; + [^*]+|"*" ; + + <> { + LOCATION_PRINT (stderr, *yylloc); + fprintf (stderr, _(": unexpected end of file in a comment\n")); + BEGIN INITIAL; + } +} + + + /*------------------------------------------------------------. + | Scanning a C comment. The initial `/ *' is already eaten. 
| + `------------------------------------------------------------*/ + + +{ + "*"{splice}"/" YY_OBS_GROW; yy_pop_state (); + [^*\[\]]+|"*" YY_OBS_GROW; <> { LOCATION_PRINT (stderr, *yylloc); @@ -250,6 +364,18 @@ blanks [ \t\f]+ } + /*--------------------------------------------------------------. + | Scanning a line comment. The initial `//' is already eaten. | + `--------------------------------------------------------------*/ + + +{ + "\n" YY_OBS_GROW; yy_pop_state (); + ([^\n\[\]]|{splice})+ YY_OBS_GROW; + <> yy_pop_state (); +} + + /*----------------------------------------------------------------. | Scanning a C string, including its escapes. The initial `"' is | | already eaten. | @@ -267,9 +393,7 @@ blanks [ \t\f]+ return STRING; } - [^\"\n\r\\]+ YY_OBS_GROW; - - {eols} obstack_1grow (&string_obstack, '\n'); YY_LINES; + [^\"\\]+ YY_OBS_GROW; <> { LOCATION_PRINT (stderr, *yylloc); @@ -305,9 +429,7 @@ blanks [ \t\f]+ } } - [^\n\r\\] YY_OBS_GROW; - - {eols} obstack_1grow (&string_obstack, '\n'); YY_LINES; + [^'\\]+ YY_OBS_GROW; <> { LOCATION_PRINT (stderr, *yylloc); @@ -327,9 +449,9 @@ blanks [ \t\f]+ { - \\[0-7]{3} { - long c = strtol (yytext + 1, 0, 8); - if (c > 255) + \\[0-7]{1,3} { + unsigned long c = strtoul (yytext + 1, 0, 8); + if (UCHAR_MAX < c) { LOCATION_PRINT (stderr, *yylloc); fprintf (stderr, _(": invalid escape: %s\n"), quote (yytext)); @@ -339,8 +461,18 @@ blanks [ \t\f]+ obstack_1grow (&string_obstack, c); } - \\x[0-9a-fA-F]{2} { - obstack_1grow (&string_obstack, strtol (yytext + 2, 0, 16)); + \\x[0-9a-fA-F]+ { + unsigned long c; + errno = 0; + c = strtoul (yytext + 2, 0, 16); + if (UCHAR_MAX < c || errno) + { + LOCATION_PRINT (stderr, *yylloc); + fprintf (stderr, _(": invalid escape: %s\n"), quote (yytext)); + YY_STEP; + } + else + obstack_1grow (&string_obstack, c); } \\a obstack_1grow (&string_obstack, '\a'); @@ -350,7 +482,18 @@ blanks [ \t\f]+ \\r obstack_1grow (&string_obstack, '\r'); \\t obstack_1grow (&string_obstack, '\t'); \\v 
obstack_1grow (&string_obstack, '\v'); - \\[\\""''] obstack_1grow (&string_obstack, yytext[1]); + \\[\"'?\\] obstack_1grow (&string_obstack, yytext[1]); + \\(u|U[0-9a-fA-F]{4})[0-9a-fA-F]{4} { + int c = convert_ucn_to_byte (yytext); + if (c < 0) + { + LOCATION_PRINT (stderr, *yylloc); + fprintf (stderr, _(": invalid escape: %s\n"), quote (yytext)); + YY_STEP; + } + else + obstack_1grow (&string_obstack, c); + } \\(.|\n) { LOCATION_PRINT (stderr, *yylloc); fprintf (stderr, _(": unrecognized escape: %s\n"), quote (yytext)); @@ -374,13 +517,12 @@ blanks [ \t\f]+ yy_pop_state (); } - [^\[\]\'\n\r\\]+ YY_OBS_GROW; - \\(.|\n) YY_OBS_GROW; - /* FLex wants this rule, in case of a `\<>'. */ + [^'\[\]\\]+ YY_OBS_GROW; + \\{splice}[^\[\]] YY_OBS_GROW; + {splice} YY_OBS_GROW; + /* Needed for `\<>', `\\<>[', and `\\<>]'. */ \\ YY_OBS_GROW; - {eols} YY_OBS_GROW; YY_LINES; - <> { LOCATION_PRINT (stderr, *yylloc); fprintf (stderr, _(": unexpected end of file in a character\n")); @@ -403,13 +545,12 @@ blanks [ \t\f]+ yy_pop_state (); } - [^\[\]\"\n\r\\]+ YY_OBS_GROW; - \\(.|\n) YY_OBS_GROW; - /* FLex wants this rule, in case of a `\<>'. */ + [^\"\[\]\\]+ YY_OBS_GROW; + \\{splice}[^\[\]] YY_OBS_GROW; + {splice} YY_OBS_GROW; + /* Needed for `\<>', `\\<>[', and `\\<>]'. */ \\ YY_OBS_GROW; - {eols} YY_OBS_GROW; YY_LINES; - <> { LOCATION_PRINT (stderr, *yylloc); fprintf (stderr, _(": unexpected end of file in a string\n")); @@ -432,8 +573,8 @@ blanks [ \t\f]+ "\"" YY_OBS_GROW; yy_push_state (SC_STRING); /* Comments. */ - "/*" YY_OBS_GROW; yy_push_state (SC_COMMENT); - "//".* YY_OBS_GROW; + "/"{splice}"*" YY_OBS_GROW; yy_push_state (SC_COMMENT); + "/"{splice}"/" YY_OBS_GROW; yy_push_state (SC_LINE_COMMENT); /* Not comments. 
*/ "/" YY_OBS_GROW; @@ -461,15 +602,14 @@ blanks [ \t\f]+ "{" YY_OBS_GROW; braces_level++; - "$"("<"[^>]+">")?(-?[0-9]+|"$") { handle_dollar (current_braced_code, + "$"("<"{tag}">")?(-?[0-9]+|"$") { handle_dollar (current_braced_code, yytext, *yylloc); } "@"(-?[0-9]+|"$") { handle_at (current_braced_code, yytext, *yylloc); } - [^$@\[\]/\'\"\{\}\n\r]+ YY_OBS_GROW; - {eols} YY_OBS_GROW; YY_LINES; + [^$@\[\]/'\"\{\}]+ YY_OBS_GROW; - /* A lose $, or /, or etc. */ + /* A stray $, or /, or etc. */ . YY_OBS_GROW; <> { @@ -497,9 +637,8 @@ blanks [ \t\f]+ return PROLOGUE; } - [^%\[\]/\'\"\n\r]+ YY_OBS_GROW; + [^%\[\]/'\"]+ YY_OBS_GROW; "%" YY_OBS_GROW; - {eols} YY_OBS_GROW; YY_LINES; <> { LOCATION_PRINT (stderr, *yylloc); @@ -514,12 +653,12 @@ blanks [ \t\f]+ /*---------------------------------------------------------------. | Scanning the epilogue (everything after the second "%%", which | - | has already been eaten. | + | has already been eaten). | `---------------------------------------------------------------*/ { - ([^\[\]]|{eols})+ YY_OBS_GROW; + [^\[\]]+ YY_OBS_GROW; <> { yy_pop_state (); @@ -568,14 +707,15 @@ handle_action_dollar (char *text, location_t location) obstack_fgrow1 (&string_obstack, "]b4_lhs_value([%s])[", type_name); } - else if (('0' <= *cp && *cp <= '9') || *cp == '-') + else { - int n = strtol (cp, &cp, 10); + long num; + errno = 0; + num = strtol (cp, 0, 10); - if (n > rule_length) - complain_at (location, _("invalid value: %s%d"), "$", n); - else + if (INT_MIN <= num && num <= rule_length && ! 
errno) { + int n = num; if (!type_name && n > 0) type_name = symbol_list_n_type_name_get (current_rule, location, n); @@ -588,16 +728,14 @@ handle_action_dollar (char *text, location_t location) "]b4_rhs_value([%d], [%d], [%s])[", rule_length, n, type_name); } - } - else - { - complain_at (location, _("%s is invalid"), quote (text)); + else + complain_at (location, _("invalid value: %s"), text); } } /*---------------------------------------------------------------. -| TEXT is expexted tp be $$ in some code associated to a symbol: | +| TEXT is expected to be $$ in some code associated to a symbol: | | destructor or printer. | `---------------------------------------------------------------*/ @@ -608,7 +746,7 @@ handle_symbol_code_dollar (char *text, location_t location) if (*cp == '$') obstack_sgrow (&string_obstack, "]b4_dollar_dollar["); else - complain_at (location, _("%s is invalid"), quote (text)); + complain_at (location, _("%s is invalid"), quote_n (1, text)); } @@ -650,25 +788,26 @@ handle_action_at (char *text, location_t location) { obstack_sgrow (&string_obstack, "]b4_lhs_location["); } - else if (('0' <= *cp && *cp <= '9') || *cp == '-') + else { - int n = strtol (cp, &cp, 10); + long num; + errno = 0; + num = strtol (cp, 0, 10); - if (n > rule_length) - complain_at (location, _("invalid value: %s%d"), "@", n); + if (INT_MIN <= num && num <= rule_length && ! errno) + { + int n = num; + obstack_fgrow2 (&string_obstack, "]b4_rhs_location([%d], [%d])[", + rule_length, n); + } else - obstack_fgrow2 (&string_obstack, "]b4_rhs_location([%d], [%d])[", - rule_length, n); - } - else - { - complain_at (location, _("%s is invalid"), quote (text)); + complain_at (location, _("invalid value: %s"), text); } } /*---------------------------------------------------------------. -| TEXT is expexted tp be @$ in some code associated to a symbol: | +| TEXT is expected to be @$ in some code associated to a symbol: | | destructor or printer. 
| `---------------------------------------------------------------*/ @@ -679,7 +818,7 @@ handle_symbol_code_at (char *text, location_t location) if (*cp == '$') obstack_sgrow (&string_obstack, "]b4_at_dollar["); else - complain_at (location, _("%s is invalid"), quote (text)); + complain_at (location, _("%s is invalid"), quote_n (1, text)); } @@ -706,6 +845,62 @@ handle_at (braced_code_t braced_code_kind, } +/*------------------------------------------------------------------. +| Convert universal character name UCN to a single-byte character, | +| and return that character. Return -1 if UCN does not correspond | +| to a single-byte character. | +`------------------------------------------------------------------*/ + +static int +convert_ucn_to_byte (char const *ucn) +{ + unsigned long code = strtoul (ucn + 2, 0, 16); + + /* FIXME: Currently we assume Unicode-compatible unibyte characters + on ASCII hosts (i.e., Latin-1 on hosts with 8-bit bytes). On + non-ASCII hosts we support only the portable C character set. + These limitations should be removed once we add support for + multibyte characters. */ + + if (UCHAR_MAX < code) + return -1; + +#if ! ('$' == 0x24 && '@' == 0x40 && '`' == 0x60 && '~' == 0x7e) + { + /* A non-ASCII host. Use CODE to index into a table of the C + basic execution character set, which is guaranteed to exist on + all Standard C platforms. This table also includes '$', '@', + and '`', which not in the basic execution character set but + which are unibyte characters on all the platforms that we know + about. 
*/ + static signed char const table[] = + { + '\0', -1, -1, -1, -1, -1, -1, '\a', + '\b', '\t', '\n', '\v', '\f', '\r', -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + ' ', '!', '"', '#', '$', '%', '&', '\'', + '(', ')', '*', '+', ',', '-', '.', '/', + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', ':', ';', '<', '=', '>', '?', + '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', + 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', + 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', + 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', + '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', + 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', + 'x', 'y', 'z', '{', '|', '}', '~' + }; + + code = code < sizeof table ? table[code] : -1; + } +#endif + + return code; +} + + /*-------------------------. | Initialize the scanner. | `-------------------------*/