PATCH: deprecation warnings for unreasonable charnames

author Karl Williamson <khw@khw-desktop.(none)>

Sat, 20 Feb 2010 06:53:36 +0000 (23:53 -0700)

committer Steve Hay <steve.m.hay@googlemail.com>

Sat, 20 Feb 2010 11:03:53 +0000 (11:03 +0000)
author Karl Williamson <khw@khw-desktop.(none)>
Sat, 20 Feb 2010 06:53:36 +0000 (23:53 -0700)
committer Steve Hay <steve.m.hay@googlemail.com>
Sat, 20 Feb 2010 11:03:53 +0000 (11:03 +0000)
diff --git a/handy.h b/handy.h

index 911deca..d12972d 100644 (file)
--- a/handy.h
+++ b/handy.h
@@ -462,6 +462,18 @@ US-ASCII (Basic Latin) range are viewed as not having any case.
  #define isALNUM(c)     (isALPHA(c) || isDIGIT(c) || (c) == '_')
  #define isIDFIRST(c)   (isALPHA(c) || (c) == '_')
  #define isALPHA(c)     (isUPPER(c) || isLOWER(c))
+/* ALPHAU includes Unicode semantics for latin1 characters.  It has an extra
+ * >= AA test to speed up ASCII-only tests at the expense of the others */
+#define isALPHAU(c)    (isALPHA(c) || (NATIVE_TO_UNI((U8) c) >= 0xAA \
+    && ((NATIVE_TO_UNI((U8) c) >= 0xC0 \
+           && NATIVE_TO_UNI((U8) c) != 0xD7 && NATIVE_TO_UNI((U8) c) != 0xF7) \
+       || NATIVE_TO_UNI((U8) c) == 0xAA \
+       || NATIVE_TO_UNI((U8) c) == 0xB5 \
+       || NATIVE_TO_UNI((U8) c) == 0xBA)))
+#define isALNUMU(c)    (isDIGIT(c) || isALPHAU(c) || (c) == '_')
+
+/* continuation character for legal NAME in \N{NAME} */
+#define isCHARNAME_CONT(c) (isALNUMU(c) || (c) == ' ' || (c) == '-' || (c) == '(' || (c) == ')' || (c) == ':' || NATIVE_TO_UNI((U8) c) == 0xA0)
  #define isSPACE(c) \
         ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) =='\r' || (c) == '\f')
  #define isPSXSPC(c)    (isSPACE(c) || (c) == '\v')
diff --git a/pod/perldiag.pod b/pod/perldiag.pod

index 95b45f7..e1f02db 100644 (file)
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -1457,6 +1457,14 @@ there are neither package declarations nor a C<$VERSION>.
  long for Perl to handle.  You have to be seriously twisted to write code
  that triggers this error.
  
+=item Deprecated character(s) in \\N{...} starting at '%s'
+
+(D deprecated) Just about anything is legal for the C<...> in C<\N{...}>.
+But starting in 5.12, non-reasonable ones that don't look like names are
+deprecated.  A reasonable name begins with an alphabetic character and
+continues with any combination of alphanumerics, dashes, spaces, parentheses or
+colons.
+
  =item Deprecated use of my() in false conditional
  
  (D deprecated) You used a declaration similar to C<my $x if 0>.
diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t

index 86735ec..881fd9e 100644 (file)
--- a/t/re/pat_advanced.t
+++ b/t/re/pat_advanced.t
@@ -21,7 +21,7 @@ BEGIN {
  }
  
  
-plan tests => 1155;  # Update this when adding/deleting tests.
+plan tests => 1159;  # Update this when adding/deleting tests.
  
  run_tests() unless caller;
  
@@ -1065,6 +1065,20 @@ sub run_tests {
          undef $w;
          eval 'q(syntax error) =~ /\N{MALFORMED}/';
          ok $@ && $@ =~ /Malformed/, 'Verify that malformed utf8 gives an error';
+        undef $w;
+        eval 'q() =~ /\N{4F}/';
+        ok $w && $w =~ /Deprecated/, 'Verify that leading digit in name gives warning';
+        undef $w;
+        eval 'q() =~ /\N{COM,MA}/';
+        ok $w && $w =~ /Deprecated/, 'Verify that comma in name gives warning';
+        undef $w;
+        my $name = "A\x{D7}O";
+        eval "q(W) =~ /\\N{$name}/";
+        ok $w && $w =~ /Deprecated/, 'Verify that latin1 symbol in name gives warning';
+        undef $w;
+        $name = "A\x{D1}O";
+        eval "q(W) =~ /\\N{$name}/";
+        ok ! $w, 'Verify that latin1 letter in name doesnt give warning';
  
      }
  
diff --git a/toke.c b/toke.c

index 361d7d2..7167004 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -3206,7 +3206,70 @@ S_scan_const(pTHX_ char *start)
                         d += len;
                     }
                     SvREFCNT_dec(res);
-               }
+
+                   /* Deprecate non-approved name syntax */
+                   if (ckWARN_d(WARN_DEPRECATED)) {
+                       bool problematic = FALSE;
+                       char* i = s;
+
+                       /* For non-ut8 input, look to see that the first
+                        * character is an alpha, then loop through the rest
+                        * checking that each is a continuation */
+                       if (! this_utf8) {
+                           if (! isALPHAU(*i)) problematic = TRUE;
+                           else for (i = s + 1; i < e; i++) {
+                               if (isCHARNAME_CONT(*i)) continue;
+                               problematic = TRUE;
+                               break;
+                           }
+                       }
+                       else {
+                           /* Similarly for utf8.  For invariants can check
+                            * directly.  We accept anything above the latin1
+                            * range because it is immaterial to Perl if it is
+                            * correct or not, and is expensive to check.  But
+                            * it is fairly easy in the latin1 range to convert
+                            * the variants into a single character and check
+                            * those */
+                           if (UTF8_IS_INVARIANT(*i)) {
+                               if (! isALPHAU(*i)) problematic = TRUE;
+                           } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
+                               if (! isALPHAU(UNI_TO_NATIVE(UTF8_ACCUMULATE(*i,
+                                                                           *(i+1)))))
+                               {
+                                   problematic = TRUE;
+                               }
+                           }
+                           if (! problematic) for (i = s + UTF8SKIP(s);
+                                                   i < e;
+                                                   i+= UTF8SKIP(i))
+                           {
+                               if (UTF8_IS_INVARIANT(*i)) {
+                                   if (isCHARNAME_CONT(*i)) continue;
+                               } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
+                                   continue;
+                               } else if (isCHARNAME_CONT(
+                                           UNI_TO_NATIVE(
+                                           UTF8_ACCUMULATE(*i, *(i+1)))))
+                               {
+                                   continue;
+                               }
+                               problematic = TRUE;
+                               break;
+                           }
+                       }
+                       if (problematic) {
+                           char *string;
+                           Newx(string, e - i + 1, char);
+                           Copy(i, string, e - i, char);
+                           string[e - i] = '\0';
+                           Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
+                               "Deprecated character(s) in \\N{...} starting at '%s'",
+                               string);
+                           Safefree(string);
+                       }
+                   }
+               } /* End \N{NAME} */
  #ifdef EBCDIC
                 if (!dorange) 
                     native_range = FALSE; /* \N{} is defined to be Unicode */
author	Karl Williamson <khw@khw-desktop.(none)>
	Sat, 20 Feb 2010 06:53:36 +0000 (23:53 -0700)
committer	Steve Hay <steve.m.hay@googlemail.com>
	Sat, 20 Feb 2010 11:03:53 +0000 (11:03 +0000)
handy.h		patch \| blob \| history
pod/perldiag.pod		patch \| blob \| history
t/re/pat_advanced.t		patch \| blob \| history
toke.c		patch \| blob \| history