From 07f7264624e0307ed32e3b140ef2a0ea9d86a07f Mon Sep 17 00:00:00 2001 From: Brian Fraser Date: Tue, 5 Mar 2013 17:46:52 -0300 Subject: [PATCH] Fix several differences in the parsing of $.. and ${...} Namely: * The first character in ${...} used to have no restrictions * ${foo:bar} used to be legal * ${foo::bar} worked, but ${foo'bar} didn't And possibly other subtle, so far undiscovered bugs. This was resolved by simply using the same code for both things. Note that this commit is not entirely useful on its own; While tests pass, it requires changes from the following commit to work entirely. --- MANIFEST | 1 + embed.fnc | 3 ++ embed.h | 1 + pod/perldelta.pod | 10 ++-- proto.h | 7 +++ t/uni/labels.t | 12 +++-- t/uni/variables.t | 54 +++++++++++++++++++++ toke.c | 139 +++++++++++++++++++++--------------------------------- 8 files changed, 132 insertions(+), 95 deletions(-) create mode 100644 t/uni/variables.t diff --git a/MANIFEST b/MANIFEST index 4a49acd..832dc9c 100644 --- a/MANIFEST +++ b/MANIFEST @@ -5609,6 +5609,7 @@ t/uni/tr_sjis.t See if Unicode tr/// in sjis works t/uni/tr_utf8.t See if Unicode tr/// in utf8 works t/uni/universal.t See if Unicode in calls to UNIVERSAL works t/uni/upper.t See if Unicode casing works +t/uni/variables.t See that the rules for variable names work t/uni/write.t See if Unicode formats work t/win32/fs.t Test Win32 link for compatibility t/win32/runenv.t Test if Win* perl honors its env variables diff --git a/embed.fnc b/embed.fnc index c9832d4..2f5e089 100644 --- a/embed.fnc +++ b/embed.fnc @@ -2239,6 +2239,9 @@ so |SV* |new_constant |NULLOK const char *s|STRLEN len \ |STRLEN typelen s |int |deprecate_commaless_var_list s |int |ao |int toketype +s |void|parse_ident|NN char **s|NN char **d \ + |NN char * const e|int allow_package \ + |bool is_utf8 # if defined(PERL_CR_FILTER) s |I32 |cr_textfilter |int idx|NULLOK SV *sv|int maxlen s |void |strip_return |NN SV *sv diff --git a/embed.h b/embed.h index 9654979..248ed50 100644 --- a/embed.h +++ b/embed.h @@ -1609,6 +1609,7 @@ #define lop(a,b,c) S_lop(aTHX_ a,b,c) #define missingterm(a) S_missingterm(aTHX_ a) #define no_op(a,b) S_no_op(aTHX_ a,b) +#define parse_ident(a,b,c,d,e) S_parse_ident(aTHX_ a,b,c,d,e) #define pending_ident() S_pending_ident(aTHX) #define readpipe_override() S_readpipe_override(aTHX) #define scan_const(a) S_scan_const(aTHX_ a) diff --git a/pod/perldelta.pod b/pod/perldelta.pod index c692139..ea49f94 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -37,11 +37,11 @@ L section. =head1 Incompatible Changes -XXX For a release on a stable branch, this section aspires to be: - - There are no changes intentionally incompatible with 5.XXX.XXX - If any exist, they are bugs, and we request that you submit a - report. See L below. +There are no longer any differences in the parsing of identifiers specified +as $... or ${...}; previously, they were dealt with in different parts of +the core, and so had slightly different behavior. For instance, +C<${foo:bar}> was a legal variable name. Since they are now both parsed +by the same code, that is no longer the case. [ List each incompatible change as a =head2 entry ] diff --git a/proto.h b/proto.h index 9192960..35d49db 100644 --- a/proto.h +++ b/proto.h @@ -7272,6 +7272,13 @@ STATIC void S_no_op(pTHX_ const char *const what, char *s) #define PERL_ARGS_ASSERT_NO_OP \ assert(what) +STATIC void S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package, bool is_utf8) + __attribute__nonnull__(pTHX_1) + __attribute__nonnull__(pTHX_2) + __attribute__nonnull__(pTHX_3); +#define PERL_ARGS_ASSERT_PARSE_IDENT \ + assert(s); assert(d); assert(e) + STATIC int S_pending_ident(pTHX); STATIC void S_readpipe_override(pTHX); STATIC char* S_scan_const(pTHX_ char *start) diff --git a/t/uni/labels.t b/t/uni/labels.t index 3d7d476..3fa9d38 100644 --- a/t/uni/labels.t +++ b/t/uni/labels.t @@ -15,7 +15,7 @@ use feature qw 'unicode_strings evalbytes'; use charnames qw( :full ); -plan(9); +plan(10); LABEL: { pass("Sanity check, UTF-8 labels don't throw a syntax error."); @@ -54,11 +54,13 @@ SKIP: { like $@, qr/Label not found for "next E" at/u, "next's error is UTF-8 clean"; } -my $d = 4; +my $d = 2; LÁBEL: { + my $e = $@; my $prog = "redo L\N{LATIN CAPITAL LETTER A WITH ACUTE}BEL"; - if ($d % 2) { + if ($d == 1) { + is $e, '', "redo UTF8 works"; utf8::downgrade($prog); } if ($d--) { @@ -68,8 +70,8 @@ LÁBEL: { } } -is $@, '', "redo to downgradeable labels works"; -is $d, -1, "Latin-1 labels reachable regardless of UTF-8ness"; +like $@, qr/Unrecognized character/, "redo to downgradeable labels"; +is $d, 0, "Latin-1 labels are reachable"; { no warnings; diff --git a/t/uni/variables.t b/t/uni/variables.t new file mode 100644 index 0000000..14f4c2b --- /dev/null +++ b/t/uni/variables.t @@ -0,0 +1,54 @@ +#!./perl + +# Checks if the parser behaves correctly in edge case +# (including weird syntax errors) + +BEGIN { + require './test.pl'; +} + +use 5.016; +use utf8; +use open qw( :utf8 :std ); + +plan (tests => 5); + +# ${single:colon} should not be valid syntax +{ + no strict; + + local $@; + eval "\${\x{30cd}single:\x{30cd}colon} = 1"; + like($@, + qr/syntax error .* near "\x{30cd}single:/, + '${\x{30cd}single:\x{30cd}colon} should not be valid syntax' + ); + + local $@; + no utf8; + evalbytes '${single:colon} = 1'; + like($@, + qr/syntax error .* near "single:/, + '...same with ${single:colon}' + ); +} + +# ${yadda'etc} and ${yadda::etc} should both work under strict +{ + local $@; + eval q; + is($@, '', q<${package::var} works>); + + local $@; + eval q; + is($@, '', q<...as does ${package'var}>); +} + +# The first character in ${...} should respect the rules +TODO: { + local $::TODO = "Fixed by the next commit"; + local $@; + use utf8; + eval '${☭asd} = 1'; + like($@, qr/\QUnrecognized character/, q(the first character in ${...} isn't special)) +} diff --git a/toke.c b/toke.c index 006f885..2748546 100644 --- a/toke.c +++ b/toke.c @@ -9188,6 +9188,46 @@ now_ok: return res; } +PERL_STATIC_INLINE void +S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package, bool is_utf8) { + dVAR; + PERL_ARGS_ASSERT_PARSE_IDENT; + + for (;;) { + if (*d >= e) + Perl_croak(aTHX_ "%s", ident_too_long); + if (isWORDCHAR(**s)) /* UTF handled below */ + *(*d)++ = *(*s)++; + else if (is_utf8 && UTF8_IS_START(**s) && isWORDCHAR_utf8((U8*)*s)) { + char *t = *s + UTF8SKIP(*s); + while (UTF8_IS_CONTINUED(*t) && _is_utf8_mark((U8*)t)) + t += UTF8SKIP(t); + if (*d + (t - *s) > e) + Perl_croak(aTHX_ "%s", ident_too_long); + Copy(*s, *d, t - *s, char); + *d += t - *s; + *s = t; + } + else if (allow_package && **s == '\'' && isIDFIRST_lazy_if(*s+1,is_utf8)) { + *(*d)++ = ':'; + *(*d)++ = ':'; + (*s)++; + } + else if (allow_package && **s == ':' && (*s)[1] == ':' + /* Disallow things like Foo::$bar. For the curious, this is + * the code path that triggers the "Bad name after" warning + * when looking for barewords. + */ + && (*s)[2] != '$') { + *(*d)++ = *(*s)++; + *(*d)++ = *(*s)++; + } + else + break; + } + return; +} + /* Returns a NUL terminated string, with the length of the string written to *slp */ @@ -9197,44 +9237,14 @@ S_scan_word(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STRLEN dVAR; char *d = dest; char * const e = d + destlen - 3; /* two-character token, ending NUL */ + bool is_utf8 = cBOOL(UTF); PERL_ARGS_ASSERT_SCAN_WORD; - for (;;) { - if (d >= e) - Perl_croak(aTHX_ "%s", ident_too_long); - if (isWORDCHAR(*s) - || (!UTF && isALPHANUMERIC_L1(*s))) /* UTF handled below */ - { - *d++ = *s++; - } - else if (allow_package && (*s == '\'') && isIDFIRST_lazy_if(s+1,UTF)) { - *d++ = ':'; - *d++ = ':'; - s++; - } - else if (allow_package && (s[0] == ':') && (s[1] == ':') && (s[2] != '$')) { - *d++ = *s++; - *d++ = *s++; - } - else if (UTF && UTF8_IS_START(*s) && isWORDCHAR_utf8((U8*)s)) { - char *t = s + UTF8SKIP(s); - size_t len; - while (UTF8_IS_CONTINUED(*t) && _is_utf8_mark((U8*)t)) - t += UTF8SKIP(t); - len = t - s; - if (d + len > e) - Perl_croak(aTHX_ "%s", ident_too_long); - Copy(s, d, len, char); - d += len; - s = t; - } - else { - *d = '\0'; - *slp = d - dest; - return s; - } - } + parse_ident(&s, &d, e, allow_package, is_utf8); + *d = '\0'; + *slp = d - dest; + return s; } STATIC char * @@ -9245,6 +9255,7 @@ S_scan_ident(pTHX_ char *s, const char *send, char *dest, STRLEN destlen, I32 ck char funny = *s++; char *d = dest; char * const e = d + destlen - 3; /* two-character token, ending NUL */ + bool is_utf8 = cBOOL(UTF); PERL_ARGS_ASSERT_SCAN_IDENT; @@ -9258,33 +9269,7 @@ S_scan_ident(pTHX_ char *s, const char *send, char *dest, STRLEN destlen, I32 ck } } else { - for (;;) { - if (d >= e) - Perl_croak(aTHX_ "%s", ident_too_long); - if (isWORDCHAR(*s)) /* UTF handled below */ - *d++ = *s++; - else if (*s == '\'' && isIDFIRST_lazy_if(s+1,UTF)) { - *d++ = ':'; - *d++ = ':'; - s++; - } - else if (*s == ':' && s[1] == ':') { - *d++ = *s++; - *d++ = *s++; - } - else if (UTF && UTF8_IS_START(*s) && isWORDCHAR_utf8((U8*)s)) { - char *t = s + UTF8SKIP(s); - while (UTF8_IS_CONTINUED(*t) && _is_utf8_mark((U8*)t)) - t += UTF8SKIP(t); - if (d + (t - s) > e) - Perl_croak(aTHX_ "%s", ident_too_long); - Copy(s, d, t - s, char); - d += t - s; - s = t; - } - else - break; - } + parse_ident(&s, &d, e, 1, is_utf8); } *d = '\0'; d = dest; @@ -9294,7 +9279,7 @@ S_scan_ident(pTHX_ char *s, const char *send, char *dest, STRLEN destlen, I32 ck return s; } if (*s == '$' && s[1] && - (isWORDCHAR_lazy_if(s+1,UTF) || s[1] == '$' || s[1] == '{' || strnEQ(s+1,"::",2)) ) + (isWORDCHAR_lazy_if(s+1,is_utf8) || s[1] == '$' || s[1] == '{' || strnEQ(s+1,"::",2)) ) { return s; } @@ -9303,7 +9288,7 @@ S_scan_ident(pTHX_ char *s, const char *send, char *dest, STRLEN destlen, I32 ck s++; } if (s < send) { - if (UTF) { + if (is_utf8) { const STRLEN skip = UTF8SKIP(s); STRLEN i; d[skip] = '\0'; @@ -9331,25 +9316,9 @@ S_scan_ident(pTHX_ char *s, const char *send, char *dest, STRLEN destlen, I32 ck } } } - if (isIDFIRST_lazy_if(d,UTF)) { - d += UTF8SKIP(d); - if (UTF) { - char *end = s; - while ((end < send && isWORDCHAR_lazy_if(end,UTF)) || *end == ':') { - end += UTF8SKIP(end); - while (end < send && UTF8_IS_CONTINUED(*end) && _is_utf8_mark((U8*)end)) - end += UTF8SKIP(end); - } - Copy(s, d, end - s, char); - d += end - s; - s = end; - } - else { - while ((isWORDCHAR(*s) || *s == ':') && d < e) - *d++ = *s++; - if (d >= e) - Perl_croak(aTHX_ "%s", ident_too_long); - } + if (isIDFIRST_lazy_if(d,is_utf8)) { + d += is_utf8 ? UTF8SKIP(d) : 1; + parse_ident(&s, &d, e, 1, is_utf8); *d = '\0'; while (s < send && SPACE_OR_TAB(*s)) s++; @@ -9391,10 +9360,10 @@ S_scan_ident(pTHX_ char *s, const char *send, char *dest, STRLEN destlen, I32 ck if (PL_lex_state == LEX_NORMAL) { if (ckWARN(WARN_AMBIGUOUS) && (keyword(dest, d - dest, 0) - || get_cvn_flags(dest, d - dest, UTF ? SVf_UTF8 : 0))) + || get_cvn_flags(dest, d - dest, is_utf8 ? SVf_UTF8 : 0))) { SV *tmp = newSVpvn_flags( dest, d - dest, - SVs_TEMP | (UTF ? SVf_UTF8 : 0) ); + SVs_TEMP | (is_utf8 ? SVf_UTF8 : 0) ); if (funny == '#') funny = '@'; Perl_warner(aTHX_ packWARN(WARN_AMBIGUOUS), -- 2.7.4