Improve handling of qq(\N{...}); and /x

author Karl Williamson <khw@khw-desktop.(none)>

Fri, 19 Feb 2010 21:42:16 +0000 (14:42 -0700)

committer Steve Hay <steve.m.hay@googlemail.com>

Sat, 20 Feb 2010 10:59:24 +0000 (10:59 +0000)
author Karl Williamson <khw@khw-desktop.(none)>
Fri, 19 Feb 2010 21:42:16 +0000 (14:42 -0700)
committer Steve Hay <steve.m.hay@googlemail.com>
Sat, 20 Feb 2010 10:59:24 +0000 (10:59 +0000)
diff --git a/pod/perldiag.pod b/pod/perldiag.pod

index 4a12889..95b45f7 100644 (file)
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -1912,7 +1912,7 @@ about 250 characters for simple names, and somewhat more for compound
  names (like C<$A::B>).  You've exceeded Perl's limits.  Future versions
  of Perl are likely to eliminate these arbitrary limitations.
  
-=item Ignoring zero length \N{} in character class"
+=item Ignoring zero length \N{} in character class
  
  (W) Named Unicode character escapes (\N{...}) may return a
  zero length sequence.  When such an escape is used in a character class
@@ -2474,7 +2474,10 @@ immediately after the switch, without intervening spaces.
  =item Missing braces on \N{}
  
  (F) Wrong syntax of character name literal C<\N{charname}> within
-double-quotish context.
+double-quotish context.  This can also happen when there is a space (or
+comment) between the C<\N> and the C<{> in a regex with the C</x> modifier.
+This modifier does not change the requirement that the brace immediately follow
+the C<\N>.
  
  =item Missing comma after first argument to %s function
  
@@ -2524,18 +2527,18 @@ double-quoted strings and regular expression patterns.  In patterns, it doesn't
  have the meaning an unescaped C<*> does.
  
  Starting in Perl 5.12.0, C<\N> also can have an additional meaning (only) in
-patterns, namely to match a non-newline character.  (This is like C<.> but is
-not affected by the C</s> modifier.)
+patterns, namely to match a non-newline character.  (This is short for
+C<[^\n]>, and like C<.> but is not affected by the C</s> regex modifier.)
  
  This can lead to some ambiguities.  When C<\N> is not followed immediately by a
-left brace, Perl assumes the "match non-newline character" meaning.  Also, if
+left brace, Perl assumes the C<[^\n]> meaning.  Also, if
  the braces form a valid quantifier such as C<\N{3}> or C<\N{5,}>, Perl assumes
  that this means to match the given quantity of non-newlines (in these examples,
  3; and 5 or more, respectively).  In all other case, where there is a C<\N{>
  and a matching C<}>, Perl assumes that a character name is desired.
  
  However, if there is no matching C<}>, Perl doesn't know if it was mistakenly
-omitted, or if "match non-newline" followed by "match a C<{>" was desired, and
+omitted, or if C<[^\n]{> was desired, and
  raises this error.  If you meant the former, add the right brace; if you meant
  the latter, escape the brace with a backslash, like so: C<\N\{>
  
@@ -2626,10 +2629,38 @@ local() if you want to localize a package variable.
  
  =item \\N in a character class must be a named character: \\N{...}
  
-The new (5.12) meaning of C<\N> to match non-newlines is not valid in a
-bracketed character class, for the same reason that C<.> in a character class
-loses its specialness: it matches almost everything, which is probably not what
-you want.
+(F) The new (5.12) meaning of C<\N> as C<[^\n]> is not valid in a bracketed
+character class, for the same reason that C<.> in a character class loses its
+specialness: it matches almost everything, which is probably not what you want.
+
+=item \\N{NAME} must be resolved by the lexer
+
+(F) When compiling a regex pattern, an unresolved named character or sequence
+was encountered.  This can happen in any of several ways that bypass the lexer,
+such as using single-quotish context:
+
+    $re = '\N{SPACE}'; # Wrong!
+    /$re/;
+
+Instead, use double-quotes:
+
+    $re = "\N{SPACE}"; # ok
+    /$re/;
+
+The lexer can be bypassed as well by creating the pattern from smaller
+components:
+
+    $re = '\N';
+    /${re}{SPACE}/;    # Wrong!
+
+It's not a good idea to split a construct in the middle like this, and it
+doesn't work here.  Instead use the solution above.
+
+Finally, the message also can happen under the C</x> regex modifier when the
+C<\N> is separated by spaces from the C<{>, in which case, remove the spaces.
+
+    /\N {SPACE}/x;     # Wrong!
+    /\N{SPACE}/x;      # ok
  
  =item Name "%s::%s" used only once: possible typo
  
@@ -2646,7 +2677,8 @@ will not trigger this warning.
  =item Invalid hexadecimal number in \\N{U+...}
  
  (F) The character constant represented by C<...> is not a valid hexadecimal
-number.
+number.  Either it is empty, or you tried to use a character other than 0 - 9
+or A - F, a - f in a hexadecimal number.
  
  =item Negative '/' count in unpack
  
diff --git a/regcomp.c b/regcomp.c

index ce4104a..b5c685c 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -6625,26 +6625,29 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
  STATIC regnode *
  S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
  {
-    char * endbrace;    /* endbrace following the name */
+    char * endbrace;    /* '}' following the name */
      regnode *ret = NULL;
  #ifdef DEBUGGING
      char* parse_start = RExC_parse - 2;            /* points to the '\N' */
  #endif
+    char* p;
  
      GET_RE_DEBUG_FLAGS_DECL;
   
      PERL_ARGS_ASSERT_REG_NAMEDSEQ;
  
      GET_RE_DEBUG_FLAGS;
+
+    /* The [^\n] meaning of \N ignores spaces and comments under the /x
+     * modifier.  The other meaning does not */
+    p = (RExC_flags & RXf_PMf_EXTENDED)
+       ? regwhite( pRExC_state, RExC_parse )
+       : RExC_parse;
     
      /* Disambiguate between \N meaning a named character versus \N meaning
-     * don't match a newline. */
-    if (*RExC_parse != '{'
-       || (! (endbrace = strchr(RExC_parse, '}'))) /* no trailing brace */
-       || ! (endbrace == RExC_parse + 1        /* nothing between the {} */
-             || (endbrace - RExC_parse > 3     /* U+ and at least one hex */
-                 && strnEQ(RExC_parse + 1, "U+", 2))))
-    {
+     * [^\n].  The former is assumed when it can't be the latter. */
+    if (*p != '{' || regcurly(p)) {
+       RExC_parse = p;
         if (valuep) {
             /* no bare \N in a charclass */
             vFAIL("\\N in a character class must be a named character: \\N{...}");
@@ -6658,8 +6661,27 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
         return ret;
      }
  
-    /* Here, we have decided it is a named sequence */
+    /* Here, we have decided it should be a named sequence */
+
+    /* The test above made sure that the next real character is a '{', but
+     * under the /x modifier, it could be separated by space (or a comment and
+     * \n) and this is not allowed (for consistency with \x{...} and the
+     * tokenizer handling of \N{NAME}). */
+    if (*RExC_parse != '{') {
+       vFAIL("Missing braces on \\N{}");
+    }
+
      RExC_parse++;      /* Skip past the '{' */
+
+    if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
+       || ! (endbrace == RExC_parse            /* nothing between the {} */
+             || (endbrace - RExC_parse >= 2    /* U+ (bad hex is checked below */
+                 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
+    {
+       if (endbrace) RExC_parse = endbrace;    /* position msg's '<--HERE' */
+       vFAIL("\\N{NAME} must be resolved by the lexer");
+    }
+
      if (endbrace == RExC_parse) {   /* empty: \N{} */
         if (! valuep) {
             RExC_parse = endbrace + 1;  
@@ -6703,8 +6725,16 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
  
         /* The tokenizer should have guaranteed validity, but it's possible to
          * bypass it by using single quoting, so check */
-       if ( length_of_hex != (STRLEN)(endchar - RExC_parse) ) {
-           *valuep = UNICODE_REPLACEMENT;
+       if (length_of_hex == 0
+           || length_of_hex != (STRLEN)(endchar - RExC_parse) )
+       {
+           RExC_parse += length_of_hex;        /* Includes all the valid */
+           RExC_parse += (RExC_orig_utf8)      /* point to after 1st invalid */
+                           ? UTF8SKIP(RExC_parse)
+                           : 1;
+           /* Guard against malformed utf8 */
+           if (RExC_parse >= endchar) RExC_parse = endchar;
+           vFAIL("Invalid hexadecimal number in \\N{U+...}");
         }    
  
         RExC_parse = endbrace + 1;
@@ -6731,7 +6761,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
          * is primarily a named character, and not intended to be a huge long
          * string, so 255 bytes should be good enough */
         while (1) {
-           STRLEN this_char_length;
+           STRLEN length_of_hex;
             I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
                             | PERL_SCAN_DISALLOW_PREFIX
                             | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
@@ -6743,12 +6773,18 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
             if (! endchar) endchar = endbrace;
  
             /* The values are Unicode even on EBCDIC machines */
-           this_char_length = (STRLEN)(endchar - RExC_parse);
-           cp = grok_hex(RExC_parse, &this_char_length, &grok_flags, NULL);
-           if ( this_char_length == 0 
-               || this_char_length != (STRLEN)(endchar - RExC_parse) )
+           length_of_hex = (STRLEN)(endchar - RExC_parse);
+           cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
+           if ( length_of_hex == 0 
+               || length_of_hex != (STRLEN)(endchar - RExC_parse) )
             {
-               cp = UNICODE_REPLACEMENT;   /* Substitute a valid character */
+               RExC_parse += length_of_hex;        /* Includes all the valid */
+               RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
+                               ? UTF8SKIP(RExC_parse)
+                               : 1;
+               /* Guard against malformed utf8 */
+               if (RExC_parse >= endchar) RExC_parse = endchar;
+               vFAIL("Invalid hexadecimal number in \\N{U+...}");
             }    
  
             if (! FOLD) {       /* Not folding, just append to the string */
diff --git a/t/re/re_tests b/t/re/re_tests

index 6304fe6..1807ffc 100644 (file)
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -34,9 +34,15 @@ ab*bc        abbbbc  y       $+[0]   6
  \N{1}  abbbbc  y       $&      a
  \N{1}  abbbbc  y       $-[0]   0
  \N{1}  abbbbc  y       $+[0]   1
+/\N {1}/x      abbbbc  y       $&      a
+/\N {1}/x      abbbbc  y       $-[0]   0
+/\N {1}/x      abbbbc  y       $+[0]   1
  \N{3,4}        abbbbc  y       $&      abbb
  \N{3,4}        abbbbc  y       $-[0]   0
  \N{3,4}        abbbbc  y       $+[0]   4
+/\N {3,4}/x    abbbbc  y       $&      abbb
+/\N {3,4}/x    abbbbc  y       $-[0]   0
+/\N {3,4}/x    abbbbc  y       $+[0]   4
  ab{0,}bc       abbbbc  y       $&      abbbbc
  ab{0,}bc       abbbbc  y       $-[0]   0
  ab{0,}bc       abbbbc  y       $+[0]   6
@@ -76,10 +82,13 @@ $   abc     y       $&
  a.c    abc     y       $&      abc
  a.c    axc     y       $&      axc
  a\Nc   abc     y       $&      abc
+/a\N c/x       abc     y       $&      abc
  a.*c   axyzc   y       $&      axyzc
  a\N*c  axyzc   y       $&      axyzc
+/a\N *c/x      axyzc   y       $&      axyzc
  a.*c   axyzd   n       -       -
  a\N*c  axyzd   n       -       -
+/a\N *c/x      axyzd   n       -       -
  a[bc]d abc     n       -       -
  a[bc]d abd     y       $&      abd
  a[b]d  abd     y       $&      abd
@@ -1412,12 +1421,32 @@ abc\N   abcd    y       $&      abcd
  abc\N  abc\n   n               
  
  # Verify get errors.  For these, we need // or else puts it in single quotes,
-# and doesn't expand.
+# and bypasses the lexer.
  /\N{U+}/       -       c       -       Invalid hexadecimal number
+# Below currently gives a misleading message
+/[\N{U+}]/     -       c       -       Unmatched
  /abc\N{def/    -       c       -       Missing right brace
+/\N{U+4AG3}/   -       c       -       Illegal hexadecimal digit
+/[\N{U+4AG3}]/ -       c       -       Illegal hexadecimal digit
+
+# And verify that in single quotes which bypasses the lexer, the regex compiler
+# figures it out.
+\N{U+} -       c       -       Invalid hexadecimal number
+[\N{U+}]       -       c       -       Invalid hexadecimal number
+\N{U+4AG3}     -       c       -       Invalid hexadecimal number
+[\N{U+4AG3}]   -       c       -       Invalid hexadecimal number
+abc\N{def      -       c       -       \\N{NAME} must be resolved by the lexer
+
+# Verify that under /x that still cant have space before left brace
+/abc\N {U+41}/x        -       c       -       Missing braces
+/abc\N {SPACE}/x       -       c       -       Missing braces
  
  # Verifies catches hex errors, and doesn't expose our . notation to the outside
  /\N{U+0xBEEF}/ -       c       -       Illegal hexadecimal digit
  /\N{U+BEEF.BEAD}/      -       c       -       Illegal hexadecimal digit
  
+# Verify works in single quotish context; regex compiler delivers slightly different msg
+# \N{U+BEEF.BEAD} succeeds here, because can't completely hide it from the outside.
+\N{U+0xBEEF}   -       c       -       Invalid hexadecimal number
+
  # vim: set noexpandtab
author	Karl Williamson <khw@khw-desktop.(none)>
	Fri, 19 Feb 2010 21:42:16 +0000 (14:42 -0700)
committer	Steve Hay <steve.m.hay@googlemail.com>
	Sat, 20 Feb 2010 10:59:24 +0000 (10:59 +0000)
pod/perldiag.pod		patch \| blob \| history
regcomp.c		patch \| blob \| history
t/re/re_tests		patch \| blob \| history