reg_namedseq: Restructure so doesn't duplicate code
authorKarl Williamson <public@khwilliamson.com>
Sun, 20 Mar 2011 16:25:17 +0000 (10:25 -0600)
committerKarl Williamson <public@khwilliamson.com>
Sun, 20 Mar 2011 18:16:12 +0000 (12:16 -0600)
This routine now calls reg() recursively after converting the parse
to something the rest of the code understands.  This eliminates
duplicated code, and allows for uniform treatment of code points, as
things were getting out of sync.  It also eliminates the restrction on
how many characters a named sequence can expand to.

toke now converts its input (which is in Unicode terms) to native on
EBCDIC platforms, so the rest of the code can can continue to ignore
that.

The restriction on the length of the number of characters a named
sequence is hereby removed, because reg() handles that.

pod/perldiag.pod
regcomp.c
t/re/pat_advanced.t
t/re/re_tests
toke.c

index a461d7b..235c39c 100644 (file)
@@ -5296,14 +5296,6 @@ removed in a future version.
 Currently all but the first one are discarded when used in a regular
 expression pattern bracketed character class.
 
-=item Using just the first characters returned by \N{}
-
-(W) A charnames handler may return a sequence of characters.  There is a
-finite limit as to the number of characters that can be used, which this
-sequence exceeded.  In the message, the characters in the sequence are
-separated by dots, and each is shown by its ordinal in hex.  Anything to
-the left of the C<HERE> was retained; anything to the right was discarded.
-
 =item Using !~ with %s doesn't make sense
 
 (F) Using the C<!~> operator with C<s///r>, C<tr///r> or C<y///r> is
index 796cefa..cf0f3db 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -4613,6 +4613,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
     RExC_seen_evals = 0;
     RExC_extralen = 0;
+    RExC_override_recoding = 0;
 
     /* First pass: determine size, legality. */
     RExC_parse = exp;
@@ -7794,168 +7795,55 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept
         ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
     }
     else {     /* Not a char class */
-       char *s;            /* String to put in generated EXACT node */
-       STRLEN len = 0;     /* Its current byte length */
+
+       /* What is done here is to convert this to a sub-pattern of the form
+        * (?:\x{char1}\x{char2}...)
+        * and then call reg recursively.  That way, it retains its atomicness,
+        * while not having to worry about special handling that some code
+        * points may have.  toke.c has converted the original Unicode values
+        * to native, so that we can just pass on the hex values unchanged.  We
+        * do have to set a flag to keep recoding from happening in the
+        * recursion */
+
+       SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
+       STRLEN len;
        char *endchar;      /* Points to '.' or '}' ending cur char in the input
                               stream */
-       ret = reg_node(pRExC_state,
-                          (U8) ((! FOLD) ? EXACT
-                                         : (LOC)
-                                            ? EXACTFL
-                                            : (MORE_ASCII_RESTRICTED)
-                                              ? EXACTFA
-                                              : (AT_LEAST_UNI_SEMANTICS)
-                                                ? EXACTFU
-                                                : EXACTF));
-       s= STRING(ret);
-
-       /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
-        * the input which is of the form now 'c1.c2.c3...}' until find the
-        * ending brace or exceed length 255.  The characters that exceed this
-        * limit are dropped.  The limit could be relaxed should it become
-        * desirable by reparsing this as (?:\N{NAME}), so could generate
-        * multiple EXACT nodes, as is done for just regular input.  But this
-        * is primarily a named character, and not intended to be a huge long
-        * string, so 255 bytes should be good enough */
-       while (1) {
-           STRLEN length_of_hex;
-           I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
-                           | PERL_SCAN_DISALLOW_PREFIX
-                           | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
-           UV cp;  /* Ord of current character */
-           bool use_this_char_fold = FOLD;
+       char *orig_end = RExC_end;
+
+       while (RExC_parse < endbrace) {
 
            /* Code points are separated by dots.  If none, there is only one
             * code point, and is terminated by the brace */
            endchar = RExC_parse + strcspn(RExC_parse, ".}");
 
-           /* The values are Unicode even on EBCDIC machines */
-           length_of_hex = (STRLEN)(endchar - RExC_parse);
-           cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
-           if ( length_of_hex == 0 
-               || length_of_hex != (STRLEN)(endchar - RExC_parse) )
-           {
-               RExC_parse += length_of_hex;        /* Includes all the valid */
-               RExC_parse += (RExC_orig_utf8)  /* point to after 1st invalid */
-                               ? UTF8SKIP(RExC_parse)
-                               : 1;
-               /* Guard against malformed utf8 */
-               if (RExC_parse >= endchar) RExC_parse = endchar;
-               vFAIL("Invalid hexadecimal number in \\N{U+...}");
-           }    
-
-           /* XXX ? Change to ANYOF node
-           if (FOLD
-               && (cp > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
-               && is_TRICKYFOLD_cp(cp))
-           {
-           }
-           */
-
-           /* Under /aa, we can't mix ASCII with non- in a fold.  If we are
-            * folding, and the source isn't ASCII, look through all the
-            * characters it folds to.  If any one of them is ASCII, forbid
-            * this fold.  (cp is uni, so the 127 below is correct even for
-            * EBCDIC).  Similarly under locale rules, we don't mix under 256
-            * with above 255.  XXX It really doesn't make sense to have \N{}
-            * which means a Unicode rules under locale.  I (khw) think this
-            * should be warned about, but the counter argument is that people
-            * who have programmed around Perl's earlier lack of specifying the
-            * rules and used \N{} to force Unicode things in a local
-            * environment shouldn't get suddenly a warning */
-           if (use_this_char_fold) {
-               if (LOC && cp < 256) {  /* Fold not known until run-time */
-                   use_this_char_fold = FALSE;
-               }
-               else if ((cp > 127 && MORE_ASCII_RESTRICTED)
-                        || (cp > 255 && LOC))
-               {
-               U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
-               U8* s = tmpbuf;
-               U8* e;
-               STRLEN foldlen;
-
-               (void) toFOLD_uni(cp, tmpbuf, &foldlen);
-               e = s + foldlen;
-
-               while (s < e) {
-                   if (isASCII(*s)
-                       || (LOC && (UTF8_IS_INVARIANT(*s)
-                                   || UTF8_IS_DOWNGRADEABLE_START(*s))))
-                   {
-                       use_this_char_fold = FALSE;
-                       break;
-                   }
-                   s += UTF8SKIP(s);
-               }
-               }
-           }
-
-           if (! use_this_char_fold) { /* Not folding, just append to the
-                                          string */
-               STRLEN unilen;
-
-               /* Quit before adding this character if would exceed limit */
-               if (len + UNISKIP(cp) > U8_MAX) break;
-
-               unilen = reguni(pRExC_state, cp, s);
-               if (unilen > 0) {
-                   s   += unilen;
-                   len += unilen;
-               }
-           } else {    /* Folding, output the folded equivalent */
-               STRLEN foldlen,numlen;
-               U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
-               cp = toFOLD_uni(cp, tmpbuf, &foldlen);
-
-               /* Quit before exceeding size limit */
-               if (len + foldlen > U8_MAX) break;
-               
-               for (foldbuf = tmpbuf;
-                   foldlen;
-                   foldlen -= numlen) 
-               {
-                   cp = utf8_to_uvchr(foldbuf, &numlen);
-                   if (numlen > 0) {
-                       const STRLEN unilen = reguni(pRExC_state, cp, s);
-                       s       += unilen;
-                       len     += unilen;
-                       /* In EBCDIC the numlen and unilen can differ. */
-                       foldbuf += numlen;
-                       if (numlen >= foldlen)
-                           break;
-                   }
-                   else
-                       break; /* "Can't happen." */
-               }                          
-           }
+           /* Convert to notation the rest of the code understands */
+           sv_catpv(substitute_parse, "\\x{");
+           sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
+           sv_catpv(substitute_parse, "}");
 
            /* Point to the beginning of the next character in the sequence. */
            RExC_parse = endchar + 1;
-
-           /* Quit if no more characters */
-           if (RExC_parse >= endbrace) break;
        }
+       sv_catpv(substitute_parse, ")");
 
+       RExC_parse = SvPV(substitute_parse, len);
 
-       if (SIZE_ONLY) {
-           if (RExC_parse < endbrace) {
-               ckWARNreg(RExC_parse - 1,
-                         "Using just the first characters returned by \\N{}");
-           }
-
-           RExC_size += STR_SZ(len);
-       } else {
-           STR_LEN(ret) = len;
-           RExC_emit += STR_SZ(len);
+       /* Don't allow empty number */
+       if (len < 8) {
+           vFAIL("Invalid hexadecimal number in \\N{U+...}");
        }
+       RExC_end = RExC_parse + len;
 
-       RExC_parse = endbrace + 1;
+       /* The values are Unicode, and therefore not subject to recoding */
+       RExC_override_recoding = 1;
+
+       ret = reg(pRExC_state, 1, flagp, depth+1);
+
+       RExC_parse = endbrace;
+       RExC_end = orig_end;
+       RExC_override_recoding = 0;
 
-       *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
-                              with malformed in t/re/pat_advanced.t */
-       RExC_parse --;
-       Set_Node_Cur_Length(ret); /* MJD */
        nextchar(pRExC_state);
     }
 
@@ -8775,7 +8663,7 @@ tryagain:
                            goto recode_encoding;
                        break;
                    recode_encoding:
-                       {
+                       if (! RExC_override_recoding) {
                            SV* enc = PL_encoding;
                            ender = reg_recode((const char)(U8)ender, &enc);
                            if (!enc && SIZE_ONLY)
@@ -9765,7 +9653,7 @@ parseit:
                    break;
                }
            recode_encoding:
-               {
+               if (! RExC_override_recoding) {
                    SV* enc = PL_encoding;
                    value = reg_recode((const char)(U8)value, &enc);
                    if (!enc && SIZE_ONLY)
index b102188..6d7624d 100644 (file)
@@ -1007,18 +1007,7 @@ sub run_tests {
         # If remove the limitation in regcomp code these should work
         # differently
         undef $w;
-        eval q [ok "\N{LONG-STR}" =~ /^\N{TOO-LONG-STR}$/, 'Verify that too long a string fails gracefully'];
-        ok $w && $w =~ /Using just the first characters returned/, 'Verify that got too-long string warning in \N{} that exceeds the limit';
-        undef $w;
-        eval q [ok "\N{LONG-STR}" =~ /^\N{TOO-LONG-STR}$/i, 'Verify under folding that too long a string fails gracefully'];
-        ok $w && $w =~ /Using just the first characters returned/, 'Verify under folding that got too-long string warning in \N{} that exceeds the limit';
-        undef $w;
-        eval q [ok "\N{TOO-LONG-STR}" !~ /^\N{TOO-LONG-STR}$/, 'Verify that too long a string doesnt work'];
-        ok $w && $w =~ /Using just the first characters returned/, 'Verify that got too-long string warning in \N{} that exceeds the limit';
-        undef $w;
-        eval q [ok "\N{TOO-LONG-STR}" !~ /^\N{TOO-LONG-STR}$/i, 'Verify under folding that too long a string doesnt work'];
-        ok $w && $w =~ /Using just the first characters returned/i, 'Verify under folding that got too-long string warning in \N{} that exceeds the limit';
-        undef $w;
+        eval q [ok "\N{TOO-LONG-STR}" =~ /^\N{TOO-LONG-STR}$/, 'Verify that what once was too long a string works'];
         eval 'q(syntax error) =~ /\N{MALFORMED}/';
         ok $@ && $@ =~ /Malformed/, 'Verify that malformed utf8 gives an error';
         undef $w;
index b381529..af24677 100644 (file)
@@ -1439,7 +1439,7 @@ abc\N     abc\n   n
 # figures it out.
 \N{U+} -       c       -       Invalid hexadecimal number
 [\N{U+}]       -       c       -       Invalid hexadecimal number
-\N{U+4AG3}     -       c       -       Invalid hexadecimal number
+\N{U+4AG3}     -       c       -       Illegal hexadecimal digit
 [\N{U+4AG3}]   -       c       -       Invalid hexadecimal number
 abc\N{def      -       c       -       \\N{NAME} must be resolved by the lexer
 
@@ -1453,7 +1453,7 @@ abc\N{def -       c       -       \\N{NAME} must be resolved by the lexer
 
 # Verify works in single quotish context; regex compiler delivers slightly different msg
 # \N{U+BEEF.BEAD} succeeds here, because can't completely hide it from the outside.
-\N{U+0xBEEF}   -       c       -       Invalid hexadecimal number
+\N{U+0xBEEF}   -       c       -       Illegal hexadecimal digit
 \c`    -       c       -       \"\\c`\" is more clearly written simply as \"\\ \"
 \c1    -       c       -       \"\\c1\" is more clearly written simply as \"q\"
 \cA    \001    y       $&      \1
diff --git a/toke.c b/toke.c
index 2dbe7f7..6933e62 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -3140,12 +3140,22 @@ S_scan_const(pTHX_ char *start)
 
                    if (PL_lex_inpat) {
 
-                       /* Pass through to the regex compiler unchanged.  The
-                        * reason we evaluated the number above is to make sure
-                        * there wasn't a syntax error. */
+                       /* On non-EBCDIC platforms, pass through to the regex
+                        * compiler unchanged.  The reason we evaluated the
+                        * number above is to make sure there wasn't a syntax
+                        * error.  But on EBCDIC we convert to native so
+                        * downstream code can continue to assume it's native
+                        */
                        s -= 5;     /* Include the '\N{U+' */
+#ifdef EBCDIC
+                       d += my_snprintf(d, e - s + 1 + 1,  /* includes the }
+                                                              and the \0 */
+                                   "\\N{U+%X}",
+                                   (unsigned int) UNI_TO_NATIVE(uv));
+#else
                        Copy(s, d, e - s + 1, char);    /* 1 = include the } */
                        d += e - s + 1;
+#endif
                    }
                    else {  /* Not a pattern: convert the hex to string */
 
@@ -3239,10 +3249,13 @@ S_scan_const(pTHX_ char *start)
                            }
 
                            /* Convert first code point to hex, including the
-                            * boiler plate before it */
+                            * boiler plate before it.  For all these, we
+                            * convert to native format so that downstream code
+                            * can continue to assume the input is native */
                            output_length =
                                my_snprintf(hex_string, sizeof(hex_string),
-                                           "\\N{U+%X", (unsigned int) uv);
+                                           "\\N{U+%X",
+                                           (unsigned int) UNI_TO_NATIVE(uv));
 
                            /* Make sure there is enough space to hold it */
                            d = off + SvGROW(sv, off
@@ -3267,7 +3280,8 @@ S_scan_const(pTHX_ char *start)
 
                                output_length =
                                    my_snprintf(hex_string, sizeof(hex_string),
-                                               ".%X", (unsigned int) uv);
+                                           ".%X",
+                                           (unsigned int) UNI_TO_NATIVE(uv));
 
                                d = off + SvGROW(sv, off
                                                     + output_length