From fc8cd66c26827f6c2ee1aa00ab2d3b3c320a4a28 Mon Sep 17 00:00:00 2001 From: Yves Orton Date: Tue, 19 Sep 2006 03:37:19 +0200 Subject: [PATCH] Re: \N{...} in regular expression [PATCH] Message-ID: <9b18b3110609181637m796d6c16o1b2741edc5f09eb2@mail.gmail.com> p4raw-id: //depot/perl@28868 --- MANIFEST | 1 + embed.fnc | 1 + embed.h | 2 + pod/perldiag.pod | 15 +++ proto.h | 3 + regcomp.c | 301 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- regexec.c | 2 +- t/lib/Cname.pm | 22 ++++ t/op/pat.t | 139 +++++++++++++++++-------- toke.c | 10 +- 10 files changed, 448 insertions(+), 48 deletions(-) create mode 100644 t/lib/Cname.pm diff --git a/MANIFEST b/MANIFEST index d2a854e..7463370 100644 --- a/MANIFEST +++ b/MANIFEST @@ -3177,6 +3177,7 @@ t/io/through.t See if pipe passes data intact t/io/utf8.t See if file seeking works t/japh/abigail.t Obscure tests t/lib/1_compile.t See if the various libraries and extensions compile +t/lib/Cname.pm Test charnames in regexes (op/pat.t) t/lib/common.pl Helper for lib/{warnings,feature}.t t/lib/commonsense.t See if configuration meets basic needs t/lib/compmod.pl Helper for 1_compile.t diff --git a/embed.fnc b/embed.fnc index 082a7c5..63e9e8f 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1308,6 +1308,7 @@ Es |regnode*|regclass |NN struct RExC_state_t *state|U32 depth ERsn |I32 |regcurly |NN const char * Es |regnode*|reg_node |NN struct RExC_state_t *state|U8 op Es |regnode*|regpiece |NN struct RExC_state_t *state|NN I32 *flagp|U32 depth +Es |regnode*|reg_namedseq |NN struct RExC_state_t *state|NULLOK UV *valuep Es |void |reginsert |NN struct RExC_state_t *state|U8 op|NN regnode *opnd Es |void |regtail |NN struct RExC_state_t *state|NN regnode *p|NN const regnode *val|U32 depth Es |U32 |join_exact |NN struct RExC_state_t *state|NN regnode *scan|NN I32 *min|U32 flags|NULLOK regnode *val|U32 depth diff --git a/embed.h b/embed.h index dbb6ca3..fa43f4b 100644 --- a/embed.h +++ b/embed.h @@ -1312,6 +1312,7 @@ #define regcurly S_regcurly #define reg_node S_reg_node #define regpiece S_regpiece +#define reg_namedseq S_reg_namedseq #define reginsert S_reginsert #define regtail S_regtail #define join_exact S_join_exact @@ -3500,6 +3501,7 @@ #define regcurly S_regcurly #define reg_node(a,b) S_reg_node(aTHX_ a,b) #define regpiece(a,b,c) S_regpiece(aTHX_ a,b,c) +#define reg_namedseq(a,b) S_reg_namedseq(aTHX_ a,b) #define reginsert(a,b,c) S_reginsert(aTHX_ a,b,c) #define regtail(a,b,c,d) S_regtail(aTHX_ a,b,c,d) #define join_exact(a,b,c,d,e,f) S_join_exact(aTHX_ a,b,c,d,e,f) diff --git a/pod/perldiag.pod b/pod/perldiag.pod index cda8945..f3a5eed 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -1334,6 +1334,14 @@ specified in the C<\N{...}> escape. Perhaps you forgot to load the corresponding C or C pragma? See L and L. +=item Constant(%s)%s: %s in regex; marked by <-- HERE in m/%s/ + +(F) The parser found inconsistencies while attempting to find +the character name specified in the C<\N{...}> escape. Perhaps you +forgot to load the corresponding C pragma? +See L. + + =item Constant is not %s reference (F) A constant value (perhaps declared using the C pragma) @@ -1841,6 +1849,13 @@ about 250 characters for simple names, and somewhat more for compound names (like C<$A::B>). You've exceeded Perl's limits. Future versions of Perl are likely to eliminate these arbitrary limitations. +=item Ignoring %s in character class in regex; marked by <-- HERE in m/%s/ + +(W) Named unicode character escapes (\N{...}) may return multi-char +or zero length sequences. When such an escape is used in a character class +its behaviour is not well defined. Check that the correct escape has +been used, and the correct charname handler is in scope. + =item Illegal binary digit %s (F) You used a digit other than 0 or 1 in a binary number. diff --git a/proto.h b/proto.h index 48a66bb..a957f32 100644 --- a/proto.h +++ b/proto.h @@ -3565,6 +3565,9 @@ STATIC regnode* S_regpiece(pTHX_ struct RExC_state_t *state, I32 *flagp, U32 dep __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2); +STATIC regnode* S_reg_namedseq(pTHX_ struct RExC_state_t *state, UV *valuep) + __attribute__nonnull__(pTHX_1); + STATIC void S_reginsert(pTHX_ struct RExC_state_t *state, U8 op, regnode *opnd) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_3); diff --git a/regcomp.c b/regcomp.c index db73dfb..4cf4775 100644 --- a/regcomp.c +++ b/regcomp.c @@ -118,6 +118,7 @@ typedef struct RExC_state_t { I32 seen_zerolen; I32 seen_evals; I32 utf8; + HV *charnames; /* cache of named sequences */ #if ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) @@ -149,6 +150,7 @@ typedef struct RExC_state_t { #define RExC_seen_zerolen (pRExC_state->seen_zerolen) #define RExC_seen_evals (pRExC_state->seen_evals) #define RExC_utf8 (pRExC_state->utf8) +#define RExC_charnames (pRExC_state->charnames) #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ @@ -3734,6 +3736,8 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) RExC_size = 0L; RExC_emit = &PL_regdummy; RExC_whilem_seen = 0; + RExC_charnames = NULL; + #if 0 /* REGC() is (currently) a NOP at the first pass. * Clever compilers notice this and complain. --jhi */ REGC((U8)REG_MAGIC, (char*)RExC_emit); @@ -3833,6 +3837,7 @@ reStudy: copyRExC_state=RExC_state; } #endif + /* Dig out information for optimizations. */ r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */ pm->op_pmflags = RExC_flags; @@ -4204,6 +4209,8 @@ reStudy: r->reganch |= ROPT_CANY_SEEN; Newxz(r->startp, RExC_npar, I32); Newxz(r->endp, RExC_npar, I32); + if (RExC_charnames) + SvREFCNT_dec((SV*)(RExC_charnames)); DEBUG_r( RX_DEBUG_on(r) ); DEBUG_DUMP_r({ @@ -4948,6 +4955,274 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) return(ret); } + +/* reg_namedseq(pRExC_state,UVp) + + This is expected to be called by a parser routine that has + recognized'\N' and needs to handle the rest. RExC_parse is + expected to point at the first char following the N at the time + of the call. + + If valuep is non-null then it is assumed that we are parsing inside + of a charclass definition and the first codepoint in the resolved + string is returned via *valuep and the routine will return NULL. + In this mode if a multichar string is returned from the charnames + handler a warning will be issued, and only the first char in the + sequence will be examined. If the string returned is zero length + then the value of *valuep is undefined and NON-NULL will + be returned to indicate failure. (This will NOT be a valid pointer + to a regnode.) + + If value is null then it is assumed that we are parsing normal text + and inserts a new EXACT node into the program containing the resolved + string and returns a pointer to the new node. If the string is + zerolength a NOTHING node is emitted. + + On success RExC_parse is set to the char following the endbrace. + Parsing failures will generate a fatal errorvia vFAIL(...) + + NOTE: We cache all results from the charnames handler locally in + the RExC_charnames hash (created on first use) to prevent a charnames + handler from playing silly-buggers and returning a short string and + then a long string for a given pattern. Since the regexp program + size is calculated during an initial parse this would result + in a buffer overrun so we cache to prevent the charname result from + changing during the course of the parse. + + */ +STATIC regnode * +S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) +{ + char * name; /* start of the content of the name */ + char * endbrace; /* endbrace following the name */ + SV *sv_str = NULL; + SV *sv_name = NULL; + STRLEN len; /* this has various purposes throughout the code */ + bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */ + regnode *ret = NULL; + + if (*RExC_parse != '{') { + vFAIL("Missing braces on \\N{}"); + } + name = RExC_parse+1; + endbrace = strchr(RExC_parse, '}'); + if ( ! endbrace ) { + RExC_parse++; + vFAIL("Missing right brace on \\N{}"); + } + RExC_parse = endbrace + 1; + + + /* RExC_parse points at the beginning brace, + endbrace points at the last */ + if ( name[0]=='U' && name[1]=='+' ) { + /* its a "unicode hex" notation {U+89AB} */ + I32 fl = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX + | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); + UV cp; + len = endbrace - name - 2; + cp = grok_hex(name + 2, &len, &fl, NULL); + if ( len != endbrace - name - 2 ) { + cp = 0xFFFD; + } + if (cp > 0xff) + RExC_utf8 = 1; + if ( valuep ) { + *valuep = cp; + return NULL; + } + sv_str= Perl_newSVpvf_nocontext("%c",(int)cp); + } else { + /* fetch the charnames handler for this scope */ + HV * const table = GvHV(PL_hintgv); + SV **cvp= table ? + hv_fetchs(table, "charnames", FALSE) : + NULL; + SV *cv= cvp ? *cvp : NULL; + HE *he_str; + int count; + /* create an SV with the name as argument */ + sv_name = newSVpvn(name, endbrace - name); + + if (!table || !(PL_hints & HINT_LOCALIZE_HH)) { + vFAIL2("Constant(\\N{%s}) unknown: " + "(possibly a missing \"use charnames ...\")", + SvPVX(sv_name)); + } + if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */ + vFAIL2("Constant(\\N{%s}): " + "$^H{charnames} is not defined",SvPVX(sv_name)); + } + + + + if (!RExC_charnames) { + /* make sure our cache is allocated */ + RExC_charnames = newHV(); + } + /* see if we have looked this one up before */ + he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 ); + if ( he_str ) { + sv_str = HeVAL(he_str); + cached = 1; + } else { + dSP ; + + ENTER ; + SAVETMPS ; + PUSHMARK(SP) ; + + XPUSHs(sv_name); + + PUTBACK ; + + count= call_sv(cv, G_SCALAR); + + if (count == 1) { /* XXXX is this right? dmq */ + sv_str = POPs; + SvREFCNT_inc_simple_void(sv_str); + } + + SPAGAIN ; + PUTBACK ; + FREETMPS ; + LEAVE ; + + if ( !sv_str || !SvOK(sv_str) ) { + vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} " + "did not return a defined value",SvPVX(sv_name)); + } + if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0)) + cached = 1; + } + } + if (valuep) { + char *p = SvPV(sv_str, len); + if (len) { + STRLEN numlen = 1; + if ( SvUTF8(sv_str) ) { + *valuep = utf8_to_uvchr(p, &numlen); + if (*valuep > 0x7F) + RExC_utf8 = 1; + /* XXXX + We have to turn on utf8 for high bit chars otherwise + we get failures with + + "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i + "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i + + This is different from what \x{} would do with the same + codepoint, where the condition is > 0xFF. + - dmq + */ + + + } else { + *valuep = (UV)*p; + /* warn if we havent used the whole string? */ + } + if (numlen 0) { + const STRLEN unilen = reguni(pRExC_state, uvc, s); + s += unilen; + len += unilen; + /* In EBCDIC the numlen + * and unilen can differ. */ + foldbuf += numlen; + if (numlen >= foldlen) + break; + } + else + break; /* "Can't happen." */ + } + } else { + const STRLEN unilen = reguni(pRExC_state, uvc, s); + if (unilen > 0) { + s += unilen; + len += unilen; + } + } + } else { + len++; + REGC(*p, s++); + } + } + if (SIZE_ONLY) { + RExC_size += STR_SZ(len); + } else { + STR_LEN(ret) = len; + RExC_emit += STR_SZ(len); + } + Set_Node_Cur_Length(ret); /* MJD */ + RExC_parse--; + nextchar(pRExC_state); + } else { + ret = reg_node(pRExC_state,NOTHING); + } + if (!cached) { + SvREFCNT_dec(sv_str); + } + if (sv_name) { + SvREFCNT_dec(sv_name); + } + return ret; + +} + + + /* - regatom - the lowest level * @@ -5184,6 +5459,14 @@ tryagain: *flagp |= HASWIDTH|SIMPLE; } break; + case 'N': + /* Handle \N{NAME} here and not below because it can be + multicharacter. join_exact() will join them up later on. + Also this makes sure that things like /\N{BLAH}+/ and + \N{BLAH} being multi char Just Happen. dmq*/ + ++RExC_parse; + ret= reg_namedseq(pRExC_state, NULL); + break; case 'n': case 'r': case 't': @@ -5295,6 +5578,7 @@ tryagain: case 'D': case 'p': case 'P': + case 'N': --p; goto loopdone; case 'n': @@ -5493,7 +5777,7 @@ tryagain: /* If the encoding pragma is in effect recode the text of * any EXACT-kind nodes. */ - if (PL_encoding && PL_regkind[OP(ret)] == EXACT) { + if (ret && PL_encoding && PL_regkind[OP(ret)] == EXACT) { const STRLEN oldlen = STR_LEN(ret); SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen)); @@ -5766,6 +6050,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) if (UCHARAT(RExC_parse) == ']') goto charclassloop; +parseit: while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') { charclassloop: @@ -5807,6 +6092,20 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) case 'S': namedclass = ANYOF_NSPACE; break; case 'd': namedclass = ANYOF_DIGIT; break; case 'D': namedclass = ANYOF_NDIGIT; break; + case 'N': /* Handle \N{NAME} in class */ + { + /* We only pay attention to the first char of + multichar strings being returned. I kinda wonder + if this makes sense as it does change the behaviour + from earlier versions, OTOH that behaviour was broken + as well. */ + UV v; /* value is register so we cant & it /grrr */ + if (reg_namedseq(pRExC_state, &v)) { + goto parseit; + } + value= v; + } + break; case 'p': case 'P': { diff --git a/regexec.c b/regexec.c index 1a86e49..679e31f 100644 --- a/regexec.c +++ b/regexec.c @@ -527,7 +527,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, DEBUG_OPTIMISE_r({ PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n", (int)(end_point - start_point), - (int)(end_point - start_point), + (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point), start_point); }); diff --git a/t/lib/Cname.pm b/t/lib/Cname.pm new file mode 100644 index 0000000..d4b8a9e --- /dev/null +++ b/t/lib/Cname.pm @@ -0,0 +1,22 @@ +package Cname; +our $Evil='A'; + +sub translator { + my $str = shift; + if ( $str eq 'EVIL' ) { + (my $c=substr("A".$Evil,-1))++; + my $r=$Evil; + $Evil.=$c; + return $r; + } + if ( $str eq 'EMPTY-STR') { + return ""; + } + return $str; +} + +sub import { + shift; + $^H{charnames} = \&translator; +} +1; diff --git a/t/op/pat.t b/t/op/pat.t index 4ff133b..97bad61 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -6,8 +6,7 @@ $| = 1; -# please update note at bottom of file when you change this -print "1..1232\n"; +# Test counter output is generated by a BEGIN block at bottom of file BEGIN { chdir 't' if -d 't'; @@ -1286,7 +1285,7 @@ print "ok 247\n"; { # bug id 20001008.001 - my $test = 248; + $test = 248; my @x = ("stra\337e 138","stra\337e 138"); for (@x) { s/(\d+)\s*([\w\-]+)/$1 . uc $2/e; @@ -1376,7 +1375,7 @@ print "ok 247\n"; } SKIP: { - my $test = 264; # till 575 + $test = 264; # till 575 use charnames ":full"; @@ -2032,13 +2031,13 @@ print "ok 683\n" if @a == 9 && "@a" eq "f o o \n $a $b b a r"; } -my $test = 687; +$test = 687; # Force scalar context on the patern match -sub ok ($$) { +sub ok ($;$) { my($ok, $name) = @_; - printf "%sok %d - %s\n", ($ok ? "" : "not "), $test, $name; + printf "%sok %d - %s\n", ($ok ? "" : "not "), $test, $name||'unnamed'; printf "# Failed test at line %d\n", (caller)[2] unless $ok; @@ -2604,35 +2603,21 @@ print "# some Unicode properties\n"; use charnames ':full'; - print "\N{LATIN SMALL LETTER SHARP S}" =~ - /\N{LATIN SMALL LETTER SHARP S}/ ? "ok 835\n" : "not ok 835\n"; + $test= 835; - print "\N{LATIN SMALL LETTER SHARP S}" =~ - /\N{LATIN SMALL LETTER SHARP S}/i ? "ok 836\n" : "not ok 836\n"; + ok("\N{LATIN SMALL LETTER SHARP S}" =~ /\N{LATIN SMALL LETTER SHARP S}/); + ok("\N{LATIN SMALL LETTER SHARP S}" =~ /\N{LATIN SMALL LETTER SHARP S}/i); - print "\N{LATIN SMALL LETTER SHARP S}" =~ - /[\N{LATIN SMALL LETTER SHARP S}]/ ? "ok 837\n" : "not ok 837\n"; + ok("\N{LATIN SMALL LETTER SHARP S}" =~ /[\N{LATIN SMALL LETTER SHARP S}]/); + ok("\N{LATIN SMALL LETTER SHARP S}" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i); - print "\N{LATIN SMALL LETTER SHARP S}" =~ - /[\N{LATIN SMALL LETTER SHARP S}]/i ? "ok 838\n" : "not ok 838\n"; + ok("ss" =~ /\N{LATIN SMALL LETTER SHARP S}/i); + ok("SS" =~ /\N{LATIN SMALL LETTER SHARP S}/i); + ok("ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i); + ok("SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i); - print "ss" =~ - /\N{LATIN SMALL LETTER SHARP S}/i ? "ok 839\n" : "not ok 839\n"; - - print "SS" =~ - /\N{LATIN SMALL LETTER SHARP S}/i ? "ok 840\n" : "not ok 840\n"; - - print "ss" =~ - /[\N{LATIN SMALL LETTER SHARP S}]/i ? "ok 841\n" : "not ok 841\n"; - - print "SS" =~ - /[\N{LATIN SMALL LETTER SHARP S}]/i ? "ok 842\n" : "not ok 842\n"; - - print "\N{LATIN SMALL LETTER SHARP S}" =~ /ss/i ? - "ok 843\n" : "not ok 843\n"; - - print "\N{LATIN SMALL LETTER SHARP S}" =~ /SS/i ? - "ok 844\n" : "not ok 844\n"; + ok("\N{LATIN SMALL LETTER SHARP S}" =~ /ss/i); + ok("\N{LATIN SMALL LETTER SHARP S}" =~ /SS/i); } { @@ -2751,7 +2736,7 @@ print "# some Unicode properties\n"; # check utf8/non-utf8 mixtures # try to force all float/anchored check combinations my $c = "\x{100}"; - my $test = 865; + $test = 865; my $subst; for my $re ( "xx.*$c", "x.*$c$c", "$c.*xx", "$c$c.*x", "xx.*(?=$c)", "(?=$c).*xx", @@ -2790,7 +2775,7 @@ print "# some Unicode properties\n"; { print "# qr/.../x\n"; - my $test = 893; + $test = 893; my $R = qr/ A B C # D E/x; @@ -2806,7 +2791,7 @@ print "# some Unicode properties\n"; { print "# illegal Unicode properties\n"; - my $test = 896; + $test = 896; print eval { "a" =~ /\pq / } ? "not ok $test\n" : "ok $test\n"; $test++; @@ -2818,7 +2803,7 @@ print "# some Unicode properties\n"; { print "# [ID 20020412.005] wrong pmop flags checked when empty pattern\n"; # requires reuse of last successful pattern - my $test = 898; + $test = 898; $test =~ /\d/; for (0 .. 1) { my $match = ?? + 0; @@ -3039,7 +3024,7 @@ ok("A" =~ /\p{AsciiHexAndDash}/, "'A' is AsciiHexAndDash"); my $ok = $s =~ /(\x{100}{4})/; my($ord, $len) = (ord $1, length $1); print +($ok && $ord == 0x100 && $len == 4) - ? "ok $test\n" : "not ok $test\t# $ok/$ord/$len\n"; + ? "ok $test\n" : "not ok $test\t# [#18179] $ok/$ord/$len\n"; ++$test; } @@ -3404,10 +3389,12 @@ ok(("foba ba${s}pxySS$s$s" =~ qr/(b(?:a${s}t|a${s}f|a${s}p)[xy]+$s*)/i) -{ +if (!$ENV{PERL_SKIP_PSYCHO_TEST}){ my @normal=qw(these are some normal words); my $psycho=join "|",@normal,map chr $_,255..20000; ok(('these'=~/($psycho)/) && $1 eq 'these','Pyscho'); +} else { + ok(1,'Skipped Psycho'); } # [perl #36207] mixed utf8 / latin-1 and case folding @@ -3533,22 +3520,22 @@ if ($ordA == 193) { my @chars = ("A".."Z"); my $delim = ","; my $size = 32771 - 4; - my $test = ''; + my $str = ''; # create some random junk. Inefficient, but it works. for ($i = 0 ; $i < $size ; $i++) { - $test .= $chars[int(rand(@chars))]; + $str .= $chars[int(rand(@chars))]; } - $test .= ($delim x 4); + $str .= ($delim x 4); my $res; my $matched; - if ($test =~ s/^(.*?)${delim}{4}//s) { + if ($str =~ s/^(.*?)${delim}{4}//s) { $res = $1; $matched=1; } ok($matched,'pattern matches'); - ok(length($test)==0,"Empty string"); + ok(length($str)==0,"Empty string"); ok(defined($res) && length($res)==$size,"\$1 is correct size"); } @@ -3578,9 +3565,73 @@ if ($ordA == 193) { ok("A@-B" =~ /A@{-}B/x, 'interpolation of @- in /@{-}/x'); } +{ + use lib 'lib'; + use Cname; + + ok('fooB'=~/\N{foo}[\N{B}\N{b}]/,"Passthrough charname"); + $test=1233; my $handle=make_must_warn('Ignoring excess chars from'); + $handle->('q(xxWxx) =~ /[\N{WARN}]/'); + { + my $code; + my $w=""; + local $SIG{__WARN__} = sub { $w.=shift }; + eval($code=<<'EOFTEST') or die "$@\n$code\n"; + { + use warnings; + + #1234 + ok("\0" !~ /[\N{EMPTY-STR}XY]/, + "Zerolength charname in charclass doesnt match \0"); + 1; + } +EOFTEST + ok($w=~/Ignoring zero length/, + "Got expected zero length warning"); + warn $code; + + } + $handle= make_must_warn('Ignoring zero length'); + $handle->('qq(\\0) =~ /[\N{EMPTY-STR}XY]/'); + ok('AB'=~/(\N{EVIL})/ && $1 eq 'A',"Charname caching $1"); + ok('ABC'=~/(\N{EVIL})/,"Charname caching $1"); + ok('xy'=~/x\N{EMPTY-STR}y/, 'Empty string charname produces NOTHING node'); + ok(''=~/\N{EMPTY-STR}/, 'Empty string charname produces NOTHING node 2'); + +} +{ + print "# MORE LATIN SMALL LETTER SHARP S\n"; + + use charnames ':full'; + + #see also test #835 + ok("ss" =~ /[\N{LATIN SMALL LETTER SHARP S}x]/i, + "unoptimized named sequence in class 1"); + ok("SS" =~ /[\N{LATIN SMALL LETTER SHARP S}x]/i, + "unoptimized named sequence in class 2"); + ok("\N{LATIN SMALL LETTER SHARP S}" =~ /[\N{LATIN SMALL LETTER SHARP S}x]/, + "unoptimized named sequence in class 3"); + ok("\N{LATIN SMALL LETTER SHARP S}" =~ /[\N{LATIN SMALL LETTER SHARP S}x]/i, + "unoptimized named sequence in class 4"); + + ok('aabc' !~ /a\N{PLUS SIGN}b/,'/a\N{PLUS SIGN}b/ against aabc'); + ok('a+bc' =~ /a\N{PLUS SIGN}b/,'/a\N{PLUS SIGN}b/ against a+bc'); + ok('a+bc' =~ /a\N{PLUS SIGN}b/,'/a\N{PLUS SIGN}b/ against a+bc'); + + ok(' A B'=~/\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}/, + 'Intermixed named and unicode escapes 1'); + ok("\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}"=~ + /\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}/, + 'Intermixed named and unicode escapes 2'); + ok("\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042} 3"=~ + /[\N{SPACE}\N{U+0041}][\N{SPACE}\N{U+0042}]/, + 'Intermixed named and unicode escapes'); +} # Keep the following test last -- it may crash perl ok(("a" x (2**15 - 10)) =~ /^()(a|bb)*$/, "Recursive stack cracker: #24274") or print "# Unexpected outcome: should pass or crash perl\n"; -# last test 1231 +# Don't forget to update this! +BEGIN{print "1..1251\n"}; + diff --git a/toke.c b/toke.c index 3fec508..b097e39 100644 --- a/toke.c +++ b/toke.c @@ -1793,7 +1793,7 @@ S_scan_const(pTHX_ char *start) const char * const leaveit = /* set of acceptably-backslashed characters */ (const char *) (PL_lex_inpat - ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-nrtfeaxcz0123456789[{]} \t\n\r\f\v#" + ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-Nnrtfeaxcz0123456789[{]} \t\n\r\f\v#" : ""); if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { @@ -2179,6 +2179,7 @@ S_scan_const(pTHX_ char *start) SV *res; STRLEN len; const char *str; + SV *type; if (!e) { yyerror("Missing right brace on \\N{}"); @@ -2192,12 +2193,17 @@ S_scan_const(pTHX_ char *start) s += 3; len = e - s; uv = grok_hex(s, &len, &flags, NULL); + if ( len != e - s ) { + uv=0xFFFD; + } s = e + 1; goto NUM_ESCAPE_INSERT; } res = newSVpvn(s + 1, e - s - 1); + type = newSVpvn(s - 2,e - s + 3); res = new_constant( NULL, 0, "charnames", - res, NULL, "\\N{...}" ); + res, NULL, SvPVX(type) ); + SvREFCNT_dec(type); if (has_utf8) sv_utf8_upgrade(res); str = SvPV_const(res,len); -- 2.7.4