Fix several differences in the parsing of $.. and ${...}

author Brian Fraser <fraserbn@gmail.com>

Tue, 5 Mar 2013 20:46:52 +0000 (17:46 -0300)

committer Karl Williamson <public@khwilliamson.com>

Wed, 6 Mar 2013 22:02:40 +0000 (15:02 -0700)
author Brian Fraser <fraserbn@gmail.com>
Tue, 5 Mar 2013 20:46:52 +0000 (17:46 -0300)
committer Karl Williamson <public@khwilliamson.com>
Wed, 6 Mar 2013 22:02:40 +0000 (15:02 -0700)
diff --git a/MANIFEST b/MANIFEST

index 4a49acd..832dc9c 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -5609,6 +5609,7 @@ t/uni/tr_sjis.t                   See if Unicode tr/// in sjis works
  t/uni/tr_utf8.t                        See if Unicode tr/// in utf8 works
  t/uni/universal.t              See if Unicode in calls to UNIVERSAL works
  t/uni/upper.t                  See if Unicode casing works
+t/uni/variables.t              See that the rules for variable names work
  t/uni/write.t                  See if Unicode formats work
  t/win32/fs.t                   Test Win32 link for compatibility
  t/win32/runenv.t               Test if Win* perl honors its env variables
diff --git a/embed.fnc b/embed.fnc

index c9832d4..2f5e089 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -2239,6 +2239,9 @@ so        |SV*    |new_constant   |NULLOK const char *s|STRLEN len \
                                 |STRLEN typelen
  s      |int    |deprecate_commaless_var_list
  s      |int    |ao             |int toketype
+s  |void|parse_ident|NN char **s|NN char **d \
+                     |NN char * const e|int allow_package \
+                     |bool is_utf8
  #  if defined(PERL_CR_FILTER)
  s      |I32    |cr_textfilter  |int idx|NULLOK SV *sv|int maxlen
  s      |void   |strip_return   |NN SV *sv
diff --git a/embed.h b/embed.h

index 9654979..248ed50 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -1609,6 +1609,7 @@
  #define lop(a,b,c)             S_lop(aTHX_ a,b,c)
  #define missingterm(a)         S_missingterm(aTHX_ a)
  #define no_op(a,b)             S_no_op(aTHX_ a,b)
+#define parse_ident(a,b,c,d,e) S_parse_ident(aTHX_ a,b,c,d,e)
  #define pending_ident()                S_pending_ident(aTHX)
  #define readpipe_override()    S_readpipe_override(aTHX)
  #define scan_const(a)          S_scan_const(aTHX_ a)
diff --git a/pod/perldelta.pod b/pod/perldelta.pod

index c692139..ea49f94 100644 (file)
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -37,11 +37,11 @@ L</Selected Bug Fixes> section.
  
  =head1 Incompatible Changes
  
-XXX For a release on a stable branch, this section aspires to be:
-
-    There are no changes intentionally incompatible with 5.XXX.XXX
-    If any exist, they are bugs, and we request that you submit a
-    report.  See L</Reporting Bugs> below.
+There are no longer any differences in the parsing of identifiers specified
+as $... or ${...}; previously, they were dealt with in different parts of
+the core, and so had slightly different behavior. For instance,
+C<${foo:bar}> was a legal variable name.  Since they are now both parsed
+by the same code, that is no longer the case.
  
  [ List each incompatible change as a =head2 entry ]
  
diff --git a/proto.h b/proto.h

index 9192960..35d49db 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -7272,6 +7272,13 @@ STATIC void      S_no_op(pTHX_ const char *const what, char *s)
  #define PERL_ARGS_ASSERT_NO_OP \
         assert(what)
  
+STATIC void    S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package, bool is_utf8)
+                       __attribute__nonnull__(pTHX_1)
+                       __attribute__nonnull__(pTHX_2)
+                       __attribute__nonnull__(pTHX_3);
+#define PERL_ARGS_ASSERT_PARSE_IDENT   \
+       assert(s); assert(d); assert(e)
+
  STATIC int     S_pending_ident(pTHX);
  STATIC void    S_readpipe_override(pTHX);
  STATIC char*   S_scan_const(pTHX_ char *start)
diff --git a/t/uni/labels.t b/t/uni/labels.t

index 3d7d476..3fa9d38 100644 (file)
--- a/t/uni/labels.t
+++ b/t/uni/labels.t
@@ -15,7 +15,7 @@ use feature qw 'unicode_strings evalbytes';
  
  use charnames qw( :full );
  
-plan(9);
+plan(10);
  
  ＬＡＢＥＬ: {
      pass("Sanity check, UTF-8 labels don't throw a syntax error.");
@@ -54,11 +54,13 @@ SKIP: {
      like $@, qr/Label not found for "next Ｅ" at/u, "next's error is UTF-8 clean";
  }
  
-my $d = 4;
+my $d = 2;
  LÁBEL: {
+    my $e = $@;
      my $prog = "redo L\N{LATIN CAPITAL LETTER A WITH ACUTE}BEL";
  
-    if ($d % 2) {
+    if ($d == 1) {
+        is $e, '', "redo UTF8 works";
          utf8::downgrade($prog);
      }
      if ($d--) {
@@ -68,8 +70,8 @@ LÁBEL: {
      }
  }
  
-is $@, '', "redo to downgradeable labels works";
-is $d, -1, "Latin-1 labels reachable regardless of UTF-8ness";
+like $@, qr/Unrecognized character/, "redo to downgradeable labels";
+is $d, 0, "Latin-1 labels are reachable";
  
  {
      no warnings;
diff --git a/t/uni/variables.t b/t/uni/variables.t

new file mode 100644 (file)

index 0000000..14f4c2b
--- /dev/null
+++ b/t/uni/variables.t
@@ -0,0 +1,54 @@
+#!./perl
+
+# Checks if the parser behaves correctly in edge case
+# (including weird syntax errors)
+
+BEGIN {
+    require './test.pl';
+}
+
+use 5.016;
+use utf8;
+use open qw( :utf8 :std );
+
+plan (tests => 5);
+
+# ${single:colon} should not be valid syntax
+{
+    no strict;
+
+    local $@;
+    eval "\${\x{30cd}single:\x{30cd}colon} = 1";
+    like($@,
+         qr/syntax error .* near "\x{30cd}single:/,
+         '${\x{30cd}single:\x{30cd}colon} should not be valid syntax'
+        );
+
+    local $@;
+    no utf8;
+    evalbytes '${single:colon} = 1';
+    like($@,
+         qr/syntax error .* near "single:/,
+         '...same with ${single:colon}'
+        );
+}
+
+# ${yadda'etc} and ${yadda::etc} should both work under strict
+{
+    local $@;
+    eval q<use strict; ${flark::fleem}>;
+    is($@, '', q<${package::var} works>);
+
+    local $@;
+    eval q<use strict; ${fleem'flark}>;
+    is($@, '', q<...as does ${package'var}>);
+}
+
+# The first character in ${...} should respect the rules
+TODO: {
+   local $::TODO = "Fixed by the next commit";
+   local $@;
+   use utf8;
+   eval '${☭asd} = 1';
+   like($@, qr/\QUnrecognized character/, q(the first character in ${...} isn't special))
+}
diff --git a/toke.c b/toke.c

index 006f885..2748546 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -9188,6 +9188,46 @@ now_ok:
      return res;
  }
  
+PERL_STATIC_INLINE void
+S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package, bool is_utf8) {
+    dVAR;
+    PERL_ARGS_ASSERT_PARSE_IDENT;
+
+    for (;;) {
+        if (*d >= e)
+            Perl_croak(aTHX_ "%s", ident_too_long);
+        if (isWORDCHAR(**s)) /* UTF handled below */
+            *(*d)++ = *(*s)++;
+        else if (is_utf8 && UTF8_IS_START(**s) && isWORDCHAR_utf8((U8*)*s)) {
+            char *t = *s + UTF8SKIP(*s);
+            while (UTF8_IS_CONTINUED(*t) && _is_utf8_mark((U8*)t))
+                t += UTF8SKIP(t);
+            if (*d + (t - *s) > e)
+                Perl_croak(aTHX_ "%s", ident_too_long);
+            Copy(*s, *d, t - *s, char);
+            *d += t - *s;
+            *s = t;
+        }
+        else if (allow_package && **s == '\'' && isIDFIRST_lazy_if(*s+1,is_utf8)) {
+            *(*d)++ = ':';
+            *(*d)++ = ':';
+            (*s)++;
+        }
+        else if (allow_package && **s == ':' && (*s)[1] == ':'
+           /* Disallow things like Foo::$bar. For the curious, this is
+            * the code path that triggers the "Bad name after" warning
+            * when looking for barewords.
+            */
+           && (*s)[2] != '$') {
+            *(*d)++ = *(*s)++;
+            *(*d)++ = *(*s)++;
+        }
+        else
+            break;
+    }
+    return;
+}
+
  /* Returns a NUL terminated string, with the length of the string written to
     *slp
     */
@@ -9197,44 +9237,14 @@ S_scan_word(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STRLEN
      dVAR;
      char *d = dest;
      char * const e = d + destlen - 3;  /* two-character token, ending NUL */
+    bool is_utf8 = cBOOL(UTF);
  
      PERL_ARGS_ASSERT_SCAN_WORD;
  
-    for (;;) {
-       if (d >= e)
-           Perl_croak(aTHX_ "%s", ident_too_long);
-       if (isWORDCHAR(*s)
-            || (!UTF && isALPHANUMERIC_L1(*s))) /* UTF handled below */
-        {
-           *d++ = *s++;
-        }
-       else if (allow_package && (*s == '\'') && isIDFIRST_lazy_if(s+1,UTF)) {
-           *d++ = ':';
-           *d++ = ':';
-           s++;
-       }
-       else if (allow_package && (s[0] == ':') && (s[1] == ':') && (s[2] != '$')) {
-           *d++ = *s++;
-           *d++ = *s++;
-       }
-       else if (UTF && UTF8_IS_START(*s) && isWORDCHAR_utf8((U8*)s)) {
-           char *t = s + UTF8SKIP(s);
-           size_t len;
-           while (UTF8_IS_CONTINUED(*t) && _is_utf8_mark((U8*)t))
-               t += UTF8SKIP(t);
-           len = t - s;
-           if (d + len > e)
-               Perl_croak(aTHX_ "%s", ident_too_long);
-           Copy(s, d, len, char);
-           d += len;
-           s = t;
-       }
-       else {
-           *d = '\0';
-           *slp = d - dest;
-           return s;
-       }
-    }
+    parse_ident(&s, &d, e, allow_package, is_utf8);
+    *d = '\0';
+    *slp = d - dest;
+    return s;
  }
  
  STATIC char *
@@ -9245,6 +9255,7 @@ S_scan_ident(pTHX_ char *s, const char *send, char *dest, STRLEN destlen, I32 ck
      char funny = *s++;
      char *d = dest;
      char * const e = d + destlen - 3;    /* two-character token, ending NUL */
+    bool is_utf8 = cBOOL(UTF);
  
      PERL_ARGS_ASSERT_SCAN_IDENT;
  
@@ -9258,33 +9269,7 @@ S_scan_ident(pTHX_ char *s, const char *send, char *dest, STRLEN destlen, I32 ck
         }
      }
      else {
-       for (;;) {
-           if (d >= e)
-               Perl_croak(aTHX_ "%s", ident_too_long);
-           if (isWORDCHAR(*s)) /* UTF handled below */
-               *d++ = *s++;
-           else if (*s == '\'' && isIDFIRST_lazy_if(s+1,UTF)) {
-               *d++ = ':';
-               *d++ = ':';
-               s++;
-           }
-           else if (*s == ':' && s[1] == ':') {
-               *d++ = *s++;
-               *d++ = *s++;
-           }
-           else if (UTF && UTF8_IS_START(*s) && isWORDCHAR_utf8((U8*)s)) {
-               char *t = s + UTF8SKIP(s);
-               while (UTF8_IS_CONTINUED(*t) && _is_utf8_mark((U8*)t))
-                   t += UTF8SKIP(t);
-               if (d + (t - s) > e)
-                   Perl_croak(aTHX_ "%s", ident_too_long);
-               Copy(s, d, t - s, char);
-               d += t - s;
-               s = t;
-           }
-           else
-               break;
-       }
+        parse_ident(&s, &d, e, 1, is_utf8);
      }
      *d = '\0';
      d = dest;
@@ -9294,7 +9279,7 @@ S_scan_ident(pTHX_ char *s, const char *send, char *dest, STRLEN destlen, I32 ck
         return s;
      }
      if (*s == '$' && s[1] &&
-       (isWORDCHAR_lazy_if(s+1,UTF) || s[1] == '$' || s[1] == '{' || strnEQ(s+1,"::",2)) )
+       (isWORDCHAR_lazy_if(s+1,is_utf8) || s[1] == '$' || s[1] == '{' || strnEQ(s+1,"::",2)) )
      {
         return s;
      }
@@ -9303,7 +9288,7 @@ S_scan_ident(pTHX_ char *s, const char *send, char *dest, STRLEN destlen, I32 ck
         s++;
      }
      if (s < send) {
-        if (UTF) {
+        if (is_utf8) {
              const STRLEN skip = UTF8SKIP(s);
              STRLEN i;
              d[skip] = '\0';
@@ -9331,25 +9316,9 @@ S_scan_ident(pTHX_ char *s, const char *send, char *dest, STRLEN destlen, I32 ck
                 }
             }
         }
-       if (isIDFIRST_lazy_if(d,UTF)) {
-           d += UTF8SKIP(d);
-           if (UTF) {
-               char *end = s;
-               while ((end < send && isWORDCHAR_lazy_if(end,UTF)) || *end == ':') {
-                   end += UTF8SKIP(end);
-                   while (end < send && UTF8_IS_CONTINUED(*end) && _is_utf8_mark((U8*)end))
-                       end += UTF8SKIP(end);
-               }
-               Copy(s, d, end - s, char);
-               d += end - s;
-               s = end;
-           }
-           else {
-               while ((isWORDCHAR(*s) || *s == ':') && d < e)
-                   *d++ = *s++;
-               if (d >= e)
-                   Perl_croak(aTHX_ "%s", ident_too_long);
-           }
+       if (isIDFIRST_lazy_if(d,is_utf8)) {
+        d += is_utf8 ? UTF8SKIP(d) : 1;
+        parse_ident(&s, &d, e, 1, is_utf8);
             *d = '\0';
             while (s < send && SPACE_OR_TAB(*s))
                 s++;
@@ -9391,10 +9360,10 @@ S_scan_ident(pTHX_ char *s, const char *send, char *dest, STRLEN destlen, I32 ck
             if (PL_lex_state == LEX_NORMAL) {
                 if (ckWARN(WARN_AMBIGUOUS) &&
                     (keyword(dest, d - dest, 0)
-                    || get_cvn_flags(dest, d - dest, UTF ? SVf_UTF8 : 0)))
+                    || get_cvn_flags(dest, d - dest, is_utf8 ? SVf_UTF8 : 0)))
                 {
                      SV *tmp = newSVpvn_flags( dest, d - dest,
-                                            SVs_TEMP | (UTF ? SVf_UTF8 : 0) );
+                                            SVs_TEMP | (is_utf8 ? SVf_UTF8 : 0) );
                     if (funny == '#')
                         funny = '@';
                     Perl_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
author	Brian Fraser <fraserbn@gmail.com>
	Tue, 5 Mar 2013 20:46:52 +0000 (17:46 -0300)
committer	Karl Williamson <public@khwilliamson.com>
	Wed, 6 Mar 2013 22:02:40 +0000 (15:02 -0700)
MANIFEST		patch \| blob \| history
embed.fnc		patch \| blob \| history
embed.h		patch \| blob \| history
pod/perldelta.pod		patch \| blob \| history
proto.h		patch \| blob \| history
t/uni/labels.t		patch \| blob \| history
t/uni/variables.t	[new file with mode: 0644]	patch \| blob
toke.c		patch \| blob \| history