From 0a4db386e1881073eaec2c3026e38146ff1d6b18 Mon Sep 17 00:00:00 2001 From: Yves Orton Date: Thu, 12 Oct 2006 02:46:50 +0200 Subject: [PATCH] Add Regex conditionals. Various bugfixes. More tests. Message-ID: <9b18b3110610111546j74ca490dg21bd9fd1e7e10d42@mail.gmail.com> p4raw-id: //depot/perl@28998 --- embed.fnc | 1 + embed.h | 2 + pod/perlre.pod | 18 ++- proto.h | 5 + regcomp.c | 181 ++++++++++++++++++----------- regcomp.sym | 5 + regexec.c | 100 +++++++++++++--- regnodes.h | 351 ++++++++++++++++++++++++++++++--------------------------- t/op/pat.t | 10 +- t/op/re_tests | 22 ++++ 10 files changed, 439 insertions(+), 256 deletions(-) diff --git a/embed.fnc b/embed.fnc index 9be1e37..ea5450d 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1371,6 +1371,7 @@ ERsn |U8* |reghopmaybe3 |NN U8 *pos|I32 off|NN const U8 *lim ERs |char* |find_byclass |NN regexp * prog|NN const regnode *c|NN char *s|NN const char *strend|NULLOK const regmatch_info *reginfo Es |void |to_utf8_substr |NN regexp * prog Es |void |to_byte_substr |NN regexp * prog +ERs |I32 |reg_check_named_buff_matched |NN const regexp *rex|NN const regnode *prog # ifdef DEBUGGING Es |void |dump_exec_pos |NN const char *locinput|NN const regnode *scan|NN const char *loc_regeol\ |NN const char *loc_bostr|NN const char *loc_reg_starttry|const bool do_utf8 diff --git a/embed.h b/embed.h index dc5efad..bc884d7 100644 --- a/embed.h +++ b/embed.h @@ -1373,6 +1373,7 @@ #define find_byclass S_find_byclass #define to_utf8_substr S_to_utf8_substr #define to_byte_substr S_to_byte_substr +#define reg_check_named_buff_matched S_reg_check_named_buff_matched #endif # ifdef DEBUGGING #if defined(PERL_CORE) || defined(PERL_EXT) @@ -3572,6 +3573,7 @@ #define find_byclass(a,b,c,d,e) S_find_byclass(aTHX_ a,b,c,d,e) #define to_utf8_substr(a) S_to_utf8_substr(aTHX_ a) #define to_byte_substr(a) S_to_byte_substr(aTHX_ a) +#define reg_check_named_buff_matched(a,b) S_reg_check_named_buff_matched(aTHX_ a,b) #endif # ifdef DEBUGGING #if 
defined(PERL_CORE) || defined(PERL_EXT) diff --git a/pod/perlre.pod b/pod/perlre.pod index a22344f..f79b8c7 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -998,18 +998,28 @@ highly experimental, and may be changed or deleted without notice. Conditional expression. C<(condition)> should be either an integer in parentheses (which is valid if the corresponding pair of parentheses -matched), or look-ahead/look-behind/evaluate zero-width assertion. +matched), a look-ahead/look-behind/evaluate zero-width assertion, a +name in angle brackets or single quotes (which is valid if a buffer +with the given name matched), or the special symbol (R) (true when +evaluated inside of recursion or eval). Additionally the R may be +followed by a number (which will be true when evaluated when recursing +inside of the appropriate group), or by C<&NAME> in which case it will +be true only when evaluated during recursion into the named group. For example: - m{ ( \( )? - [^()]+ - (?(1) \) ) + m{ ( \( )? + [^()]+ + (?(1) \) ) }x matches a chunk of non-parentheses, possibly included in parentheses themselves. +An additional special form of this pattern is the DEFINE pattern, which +never executes its yes-pattern except by recursion, and does not allow +a no-pattern. 
+ =back =head2 Backtracking diff --git a/proto.h b/proto.h index 0e51ab4..520c86e 100644 --- a/proto.h +++ b/proto.h @@ -3750,6 +3750,11 @@ STATIC void S_to_utf8_substr(pTHX_ regexp * prog) STATIC void S_to_byte_substr(pTHX_ regexp * prog) __attribute__nonnull__(pTHX_1); +STATIC I32 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *prog) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1) + __attribute__nonnull__(pTHX_2); + # ifdef DEBUGGING STATIC void S_dump_exec_pos(pTHX_ const char *locinput, const regnode *scan, const char *loc_regeol, const char *loc_bostr, const char *loc_reg_starttry, const bool do_utf8) __attribute__nonnull__(pTHX_1) diff --git a/regcomp.c b/regcomp.c index 64e6c8d..e64702a 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2713,9 +2713,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } flags &= ~SCF_DO_STCLASS; } - else if (OP(scan)==RECURSE) { - ARG2L_SET( scan, RExC_parens[ARG(scan)-1] - scan ); - } else if (strchr((const char*)PL_varies,OP(scan))) { I32 mincount, maxcount, minnext, deltanext, fl = 0; I32 f = flags, pos_before = 0; @@ -3452,7 +3449,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (data) data->flags |= SF_HAS_EVAL; } - else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */ + else if ( (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ + || OP(scan)==RECURSE) /* recursion */ + { + if (OP(scan)==RECURSE) { + ARG2L_SET( scan, RExC_parens[ARG(scan)-1] - scan ); + } if (flags & SCF_DO_SUBSTR) { scan_commit(pRExC_state,data,minlenp); data->longest = &(data->longest_float); @@ -4301,8 +4303,7 @@ Perl_reg_named_buff_sv(pTHX_ SV* namesv) SV* sv_dat=HeVAL(he_str); I32 *nums=(I32*)SvPVX(sv_dat); for ( i=0; ilastcloseparen) >= nums[i] && - rx->startp[nums[i]] != -1 && + if ((I32)(rx->lastparen) >= nums[i] && rx->endp[nums[i]] != -1) { parno = nums[i]; @@ -4323,33 +4324,59 @@ Perl_reg_named_buff_sv(pTHX_ SV* namesv) } } + /* 
Scans the name of a named buffer from the pattern. - * If flags is true then returns an SV containing the name. + * If flags is REG_RSN_RETURN_NULL returns null. + * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name + * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding + * to the parsed name as looked up in the RExC_paren_names hash. + * If there is an error throws a vFAIL().. type exception. */ + +#define REG_RSN_RETURN_NULL 0 +#define REG_RSN_RETURN_NAME 1 +#define REG_RSN_RETURN_DATA 2 + STATIC SV* S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) { char *name_start = RExC_parse; - if (UTF) { + if ( UTF ) { STRLEN numlen; - while (isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse, - RExC_end - RExC_parse, - &numlen, UTF8_ALLOW_DEFAULT))) - RExC_parse += numlen; - } - else { - while (isIDFIRST(*RExC_parse)) + while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse, + RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT))) + { + RExC_parse += numlen; + } + } else { + while( isIDFIRST(*RExC_parse) ) RExC_parse++; } - if (flags) { - SV* svname = sv_2mortal(Perl_newSVpvn(aTHX_ name_start, - (int)(RExC_parse - name_start))); + if ( flags ) { + SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start, + (int)(RExC_parse - name_start))); if (UTF) - SvUTF8_on(svname); - return svname; - } - else { - return NULL; + SvUTF8_on(sv_name); + if ( flags == REG_RSN_RETURN_NAME) + return sv_name; + else if (flags==REG_RSN_RETURN_DATA) { + HE *he_str = NULL; + SV *sv_dat = NULL; + if ( ! sv_name ) /* should not happen*/ + Perl_croak(aTHX_ "panic: no svname in reg_scan_name"); + if (RExC_paren_names) + he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 ); + if ( he_str ) + sv_dat = HeVAL(he_str); + if ( ! 
sv_dat ) + vFAIL("Reference to nonexistent named group"); + return sv_dat; + } + else { + Perl_croak(aTHX_ "panic: bad flag in reg_scan_name"); + } + /* NOT REACHED */ } + return NULL; } #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ @@ -4376,9 +4403,9 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) { else \ num=REG_NODE_NUM(RExC_emit); \ if (RExC_lastnum!=num) \ - PerlIO_printf(Perl_debug_log,"|%4d",num); \ + PerlIO_printf(Perl_debug_log,"|%4d",num); \ else \ - PerlIO_printf(Perl_debug_log,"|%4s",""); \ + PerlIO_printf(Perl_debug_log,"|%4s",""); \ PerlIO_printf(Perl_debug_log,"|%*s%-4s", \ (int)((depth*2)), "", \ (funcname) \ @@ -4463,13 +4490,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '<': /* (?<...) */ if (*RExC_parse == '!') paren = ','; - else if (*RExC_parse != '=') { /* (?<...>) */ + else if (*RExC_parse != '=') + { /* (?<...>) */ char *name_start; SV *svname; paren= '>'; case '\'': /* (?'...') */ name_start= RExC_parse; - svname = reg_scan_name(pRExC_state,SIZE_ONLY); + svname = reg_scan_name(pRExC_state, + SIZE_ONLY ? 
/* reverse test from the others */ + REG_RSN_RETURN_NAME : + REG_RSN_RETURN_NULL); if (RExC_parse == name_start) goto unknown; if (*RExC_parse != paren) @@ -4543,27 +4574,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '&': /* (?&NAME) */ parse_start = RExC_parse - 1; { - char *name_start = RExC_parse; - SV *svname = reg_scan_name(pRExC_state, !SIZE_ONLY); - if (RExC_parse == name_start) - goto unknown; - if (*RExC_parse != ')') - vFAIL("Expecting close bracket"); - if (!SIZE_ONLY) { - HE *he_str = NULL; - SV *sv_dat; - if (!svname) /* shouldn't happen*/ - Perl_croak(aTHX_ "panic: reg_scan_name returned NULL"); - if (RExC_paren_names) - he_str = hv_fetch_ent( RExC_paren_names, svname, 0, 0 ); - if (he_str) - sv_dat = HeVAL(he_str); - else - vFAIL("Reference to nonexistent group"); - num = *((I32 *)SvPVX(sv_dat)); - } else { - num = 0; - } + SV *sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; } goto gen_recurse_regop; /* NOT REACHED */ @@ -4590,8 +4603,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) "Recurse #%"UVuf" to %"IVdf"\n", ARG(ret), ARG2L(ret))); } else { RExC_size++; - RExC_seen|=REG_SEEN_RECURSE; } + RExC_seen |= REG_SEEN_RECURSE; Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ Set_Node_Offset(ret, parse_start); /* MJD */ @@ -4682,6 +4695,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } case '(': /* (?(?{...})...) and (?(?=...)...) */ { + int is_define= 0; if (RExC_parse[0] == '?') { /* (?(?...)) */ if (RExC_parse[1] == '=' || RExC_parse[1] == '!' || RExC_parse[1] == '<' @@ -4695,6 +4709,55 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) goto insert_if; } } + else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */ + || RExC_parse[0] == '\'' ) /* (?('NAME')...) */ + { + char ch = RExC_parse[0] == '<' ? 
'>' : '\''; + char *name_start= RExC_parse++; + I32 num = 0; + SV *sv_dat=reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + if (RExC_parse == name_start || *RExC_parse != ch) + vFAIL2("Sequence (?(%c... not terminated", + (ch == '>' ? '<' : ch)); + RExC_parse++; + if (!SIZE_ONLY) { + num = add_data( pRExC_state, 1, "S" ); + RExC_rx->data->data[num]=(void*)sv_dat; + SvREFCNT_inc(sv_dat); + } + ret = reganode(pRExC_state,NGROUPP,num); + goto insert_if_check_paren; + } + else if (RExC_parse[0] == 'D' && + RExC_parse[1] == 'E' && + RExC_parse[2] == 'F' && + RExC_parse[3] == 'I' && + RExC_parse[4] == 'N' && + RExC_parse[5] == 'E') + { + ret = reganode(pRExC_state,DEFINEP,0); + RExC_parse +=6 ; + is_define = 1; + goto insert_if_check_paren; + } + else if (RExC_parse[0] == 'R') { + RExC_parse++; + parno = 0; + if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { + parno = atoi(RExC_parse++); + while (isDIGIT(*RExC_parse)) + RExC_parse++; + } else if (RExC_parse[0] == '&') { + SV *sv_dat; + RExC_parse++; + sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); + parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; + } + ret = reganode(pRExC_state,RECURSEP,parno); + goto insert_if_check_paren; + } else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { /* (?(1)...) */ char c; @@ -4704,6 +4767,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_parse++; ret = reganode(pRExC_state, GROUPP, parno); + insert_if_check_paren: if ((c = *nextchar(pRExC_state)) != ')') vFAIL("Switch condition not recognized"); insert_if: @@ -4717,6 +4781,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (flags&HASWIDTH) *flagp |= HASWIDTH; if (c == '|') { + if (is_define) + vFAIL("(?(DEFINE)....) does not allow branches"); lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. 
*/ regbranch(pRExC_state, &flags, 1,depth+1); REGTAIL(pRExC_state, ret, lastbr); @@ -5721,7 +5787,7 @@ tryagain: ++RExC_parse; ret= reg_namedseq(pRExC_state, NULL); break; - case 'k': + case 'k': /* Handle \k<NAME> and \k'NAME' */ { char ch= RExC_parse[1]; if (ch != '<' && ch != '\'') { @@ -5733,7 +5799,8 @@ tryagain: } else { char* name_start = (RExC_parse += 2); I32 num = 0; - SV *svname = reg_scan_name(pRExC_state,!SIZE_ONLY); + SV *sv_dat = reg_scan_name(pRExC_state, + SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); ch= (ch == '<') ? '>' : '\''; if (RExC_parse == name_start || *RExC_parse != ch) @@ -5748,18 +5815,6 @@ tryagain: if (!SIZE_ONLY) { - HE *he_str = NULL; - SV *sv_dat; - if (!svname) - Perl_croak(aTHX_ - "panic: reg_scan_name returned NULL"); - if (RExC_paren_names) - he_str = hv_fetch_ent( RExC_paren_names, svname, 0, 0 ); - if ( he_str ) { - sv_dat = HeVAL(he_str); - } else { - vFAIL("Reference to nonexistent group"); - } num = add_data( pRExC_state, 1, "S" ); ARG_SET(ret,num); RExC_rx->data->data[num]=(void*)sv_dat; diff --git a/regcomp.sym b/regcomp.sym index 21904e1..561b25d 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -164,6 +164,11 @@ NREFF NREF, no-sv 1 Match already matched string, folded NREFFL NREF, no-sv 1 Match already matched string, folded in loc. +#*Special conditionals +NGROUPP NGROUPP, no-sv 1 Whether the group matched. +RECURSEP RECURSEP, num 1 Whether we are in a specific recurse. +DEFINEP DEFINEP, none 1 Never execute directly. + # NEW STUFF ABOVE THIS LINE -- Please update counts below. ################################################################################ diff --git a/regexec.c b/regexec.c index 6660b60..30dd354 100644 --- a/regexec.c +++ b/regexec.c @@ -2508,6 +2508,30 @@ S_dump_exec_pos(pTHX_ const char *locinput, #endif +/* reg_check_named_buff_matched() + * Checks to see if a named buffer has matched. 
The data array of + * buffer numbers corresponding to the buffer is expected to reside + * in the regexp->data->data array in the slot stored in the ARG() of + * the node involved. Note that this routine doesn't actually care about the + * name; that information is not preserved from compilation to execution. + * Returns the index of the leftmost defined buffer with the given name + * or 0 if none of the buffers matched. + */ +STATIC I32 +S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) { + I32 n; + SV *sv_dat=(SV*)rex->data->data[ ARG( scan ) ]; + I32 *nums=(I32*)SvPVX(sv_dat); + for ( n=0; n= nums[n] && + PL_regendp[nums[n]] != -1) + { + return nums[n]; + } + } + return 0; +} + STATIC I32 /* 0 failure, 1 success */ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) { @@ -3300,22 +3324,15 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) case NREF: case NREFF: type = OP(scan); - { - SV *sv_dat=(SV*)rex->data->data[ ARG( scan ) ]; - I32 *nums=(I32*)SvPVX(sv_dat); - for ( n=0; n= nums[n] && - PL_regstartp[nums[n]] != -1 && - PL_regendp[nums[n]] != -1) - { - n = nums[n]; - type = REF + ( type - NREF ); - goto do_ref; - } - } + n = reg_check_named_buff_matched(rex,scan); + + if ( n ) { + type = REF + ( type - NREF ); + goto do_ref; + } else { sayNO; - /* unreached */ - } + } + /* unreached */ case REFFL: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ @@ -3594,6 +3611,17 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) n = ARG(scan); /* which paren pair */ sw = (bool)((I32)*PL_reglastparen >= n && PL_regendp[n] != -1); break; + case NGROUPP: + /* reg_check_named_buff_matched returns 0 for no match */ + sw = (bool)(0 < reg_check_named_buff_matched(rex,scan)); + break; + case RECURSEP: + n = ARG(scan); + sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n)); + break; + case DEFINEP: + sw = 0; + break; case IFTHEN: PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ if (sw) @@ -3747,7 +3775,6 @@ NULL 
case WHILEM: /* just matched an A in /A*B/ (for complex A) */ { /* see the discussion above about CURLYX/WHILEM */ - I32 n; assert(cur_curlyx); /* keep Coverity happy */ n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */ @@ -3968,6 +3995,7 @@ NULL for (n = *PL_reglastparen; n > ST.lastparen; n--) PL_regendp[n] = -1; *PL_reglastparen = n; + /*dmq: *PL_reglastcloseparen = n; */ scan = ST.next_branch; /* no more branches? */ if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) @@ -4047,13 +4075,21 @@ NULL ); locinput = PL_reginput; - if (ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) + + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == ST.me->flags) + goto fake_end; + + if ( ST.count < (ST.minmod ? ARG1(ST.me) : ARG2(ST.me)) ) goto curlym_do_A; /* try to match another A */ goto curlym_do_B; /* try to match B */ case CURLYM_A_fail: /* just failed to match an A */ REGCP_UNWIND(ST.cp); - if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ ) + + if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ + || (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == ST.me->flags)) sayNO; curlym_do_B: /* execute the B in /A{m,n}B/ */ @@ -4102,10 +4138,20 @@ NULL PL_regstartp[paren] = HOPc(PL_reginput, -ST.alen) - PL_bostr; PL_regendp[paren] = PL_reginput - PL_bostr; + /*dmq: *PL_reglastcloseparen = paren; */ } else PL_regendp[paren] = -1; + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == ST.me->flags) + { + if (ST.count) + goto fake_end; + else + sayNO; + } } + PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */ /* NOTREACHED */ @@ -4131,6 +4177,7 @@ NULL if (success) { \ PL_regstartp[paren] = HOPc(locinput, -1) - PL_bostr; \ PL_regendp[paren] = locinput - PL_bostr; \ + *PL_reglastcloseparen = paren; \ } \ else \ PL_regendp[paren] = -1; \ @@ -4156,6 +4203,11 @@ NULL *PL_reglastparen = ST.paren; ST.min = ARG1(scan); /* min to match */ ST.max = ARG2(scan); /* max to match */ + if 
(cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == ST.paren) { + ST.min=1; + ST.max=1; + } scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE); goto repeat; case CURLY: /* /A{m,n}B/ where A is width 1 */ @@ -4361,6 +4413,10 @@ NULL } PL_reginput = locinput; CURLY_SETPAREN(ST.paren, ST.count); + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == ST.paren) { + goto fake_end; + } PUSH_STATE_GOTO(CURLY_B_min_known, ST.B); } /* NOTREACHED */ @@ -4382,6 +4438,10 @@ NULL { curly_try_B_min: CURLY_SETPAREN(ST.paren, ST.count); + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == ST.paren) { + goto fake_end; + } PUSH_STATE_GOTO(CURLY_B_min, ST.B); } } @@ -4400,6 +4460,10 @@ NULL /* If it could work, try it. */ if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) { CURLY_SETPAREN(ST.paren, ST.count); + if (cur_eval && cur_eval->u.eval.close_paren && + cur_eval->u.eval.close_paren == ST.paren) { + goto fake_end; + } PUSH_STATE_GOTO(CURLY_B_max, ST.B); /* NOTREACHED */ } diff --git a/regnodes.h b/regnodes.h index 3030e04..f7ebda1 100644 --- a/regnodes.h +++ b/regnodes.h @@ -6,8 +6,8 @@ /* Regops and State definitions */ -#define REGNODE_MAX 71 -#define REGMATCH_STATE_MAX 101 +#define REGNODE_MAX 74 +#define REGMATCH_STATE_MAX 104 #define END 0 /* 0000 End of program. */ #define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */ @@ -79,41 +79,44 @@ #define NREF 67 /* 0x43 Match some already matched string */ #define NREFF 68 /* 0x44 Match already matched string, folded */ #define NREFFL 69 /* 0x45 Match already matched string, folded in loc. */ -#define OPTIMIZED 70 /* 0x46 Placeholder for dump. */ -#define PSEUDO 71 /* 0x47 Pseudo opcode for internal use. */ +#define NGROUPP 70 /* 0x46 Whether the group matched. */ +#define RECURSEP 71 /* 0x47 Whether we are in a specific recurse. */ +#define DEFINEP 72 /* 0x48 Never execute directly. 
*/ +#define OPTIMIZED 73 /* 0x49 Placeholder for dump. */ +#define PSEUDO 74 /* 0x4a Pseudo opcode for internal use. */ /* ------------ States ------------- */ -#define TRIE_next 72 /* 0x48 Regmatch state for TRIE */ -#define TRIE_next_fail 73 /* 0x49 Regmatch state for TRIE */ -#define EVAL_AB 74 /* 0x4a Regmatch state for EVAL */ -#define EVAL_AB_fail 75 /* 0x4b Regmatch state for EVAL */ -#define CURLYX_end 76 /* 0x4c Regmatch state for CURLYX */ -#define CURLYX_end_fail 77 /* 0x4d Regmatch state for CURLYX */ -#define WHILEM_A_pre 78 /* 0x4e Regmatch state for WHILEM */ -#define WHILEM_A_pre_fail 79 /* 0x4f Regmatch state for WHILEM */ -#define WHILEM_A_min 80 /* 0x50 Regmatch state for WHILEM */ -#define WHILEM_A_min_fail 81 /* 0x51 Regmatch state for WHILEM */ -#define WHILEM_A_max 82 /* 0x52 Regmatch state for WHILEM */ -#define WHILEM_A_max_fail 83 /* 0x53 Regmatch state for WHILEM */ -#define WHILEM_B_min 84 /* 0x54 Regmatch state for WHILEM */ -#define WHILEM_B_min_fail 85 /* 0x55 Regmatch state for WHILEM */ -#define WHILEM_B_max 86 /* 0x56 Regmatch state for WHILEM */ -#define WHILEM_B_max_fail 87 /* 0x57 Regmatch state for WHILEM */ -#define BRANCH_next 88 /* 0x58 Regmatch state for BRANCH */ -#define BRANCH_next_fail 89 /* 0x59 Regmatch state for BRANCH */ -#define CURLYM_A 90 /* 0x5a Regmatch state for CURLYM */ -#define CURLYM_A_fail 91 /* 0x5b Regmatch state for CURLYM */ -#define CURLYM_B 92 /* 0x5c Regmatch state for CURLYM */ -#define CURLYM_B_fail 93 /* 0x5d Regmatch state for CURLYM */ -#define IFMATCH_A 94 /* 0x5e Regmatch state for IFMATCH */ -#define IFMATCH_A_fail 95 /* 0x5f Regmatch state for IFMATCH */ -#define CURLY_B_min_known 96 /* 0x60 Regmatch state for CURLY */ -#define CURLY_B_min_known_fail 97 /* 0x61 Regmatch state for CURLY */ -#define CURLY_B_min 98 /* 0x62 Regmatch state for CURLY */ -#define CURLY_B_min_fail 99 /* 0x63 Regmatch state for CURLY */ -#define CURLY_B_max 100 /* 0x64 Regmatch state for CURLY */ -#define 
CURLY_B_max_fail 101 /* 0x65 Regmatch state for CURLY */ +#define TRIE_next 75 /* 0x4b Regmatch state for TRIE */ +#define TRIE_next_fail 76 /* 0x4c Regmatch state for TRIE */ +#define EVAL_AB 77 /* 0x4d Regmatch state for EVAL */ +#define EVAL_AB_fail 78 /* 0x4e Regmatch state for EVAL */ +#define CURLYX_end 79 /* 0x4f Regmatch state for CURLYX */ +#define CURLYX_end_fail 80 /* 0x50 Regmatch state for CURLYX */ +#define WHILEM_A_pre 81 /* 0x51 Regmatch state for WHILEM */ +#define WHILEM_A_pre_fail 82 /* 0x52 Regmatch state for WHILEM */ +#define WHILEM_A_min 83 /* 0x53 Regmatch state for WHILEM */ +#define WHILEM_A_min_fail 84 /* 0x54 Regmatch state for WHILEM */ +#define WHILEM_A_max 85 /* 0x55 Regmatch state for WHILEM */ +#define WHILEM_A_max_fail 86 /* 0x56 Regmatch state for WHILEM */ +#define WHILEM_B_min 87 /* 0x57 Regmatch state for WHILEM */ +#define WHILEM_B_min_fail 88 /* 0x58 Regmatch state for WHILEM */ +#define WHILEM_B_max 89 /* 0x59 Regmatch state for WHILEM */ +#define WHILEM_B_max_fail 90 /* 0x5a Regmatch state for WHILEM */ +#define BRANCH_next 91 /* 0x5b Regmatch state for BRANCH */ +#define BRANCH_next_fail 92 /* 0x5c Regmatch state for BRANCH */ +#define CURLYM_A 93 /* 0x5d Regmatch state for CURLYM */ +#define CURLYM_A_fail 94 /* 0x5e Regmatch state for CURLYM */ +#define CURLYM_B 95 /* 0x5f Regmatch state for CURLYM */ +#define CURLYM_B_fail 96 /* 0x60 Regmatch state for CURLYM */ +#define IFMATCH_A 97 /* 0x61 Regmatch state for IFMATCH */ +#define IFMATCH_A_fail 98 /* 0x62 Regmatch state for IFMATCH */ +#define CURLY_B_min_known 99 /* 0x63 Regmatch state for CURLY */ +#define CURLY_B_min_known_fail 100 /* 0x64 Regmatch state for CURLY */ +#define CURLY_B_min 101 /* 0x65 Regmatch state for CURLY */ +#define CURLY_B_min_fail 102 /* 0x66 Regmatch state for CURLY */ +#define CURLY_B_max 103 /* 0x67 Regmatch state for CURLY */ +#define CURLY_B_max_fail 104 /* 0x68 Regmatch state for CURLY */ /* PL_regkind[] What type of regop or state is this. 
*/ @@ -121,109 +124,112 @@ EXTCONST U8 PL_regkind[]; #else EXTCONST U8 PL_regkind[] = { - END, /* END */ - END, /* SUCCEED */ - BOL, /* BOL */ - BOL, /* MBOL */ - BOL, /* SBOL */ - EOL, /* EOS */ - EOL, /* EOL */ - EOL, /* MEOL */ - EOL, /* SEOL */ - BOUND, /* BOUND */ - BOUND, /* BOUNDL */ - NBOUND, /* NBOUND */ - NBOUND, /* NBOUNDL */ - GPOS, /* GPOS */ - REG_ANY, /* REG_ANY */ - REG_ANY, /* SANY */ - REG_ANY, /* CANY */ - ANYOF, /* ANYOF */ - ALNUM, /* ALNUM */ - ALNUM, /* ALNUML */ - NALNUM, /* NALNUM */ - NALNUM, /* NALNUML */ - SPACE, /* SPACE */ - SPACE, /* SPACEL */ - NSPACE, /* NSPACE */ - NSPACE, /* NSPACEL */ - DIGIT, /* DIGIT */ - DIGIT, /* DIGITL */ - NDIGIT, /* NDIGIT */ - NDIGIT, /* NDIGITL */ - CLUMP, /* CLUMP */ - BRANCH, /* BRANCH */ - BACK, /* BACK */ - EXACT, /* EXACT */ - EXACT, /* EXACTF */ - EXACT, /* EXACTFL */ - NOTHING, /* NOTHING */ - NOTHING, /* TAIL */ - STAR, /* STAR */ - PLUS, /* PLUS */ - CURLY, /* CURLY */ - CURLY, /* CURLYN */ - CURLY, /* CURLYM */ - CURLY, /* CURLYX */ - WHILEM, /* WHILEM */ - OPEN, /* OPEN */ - CLOSE, /* CLOSE */ - REF, /* REF */ - REF, /* REFF */ - REF, /* REFFL */ - BRANCHJ, /* IFMATCH */ - BRANCHJ, /* UNLESSM */ - BRANCHJ, /* SUSPEND */ - BRANCHJ, /* IFTHEN */ - GROUPP, /* GROUPP */ - LONGJMP, /* LONGJMP */ - BRANCHJ, /* BRANCHJ */ - EVAL, /* EVAL */ - MINMOD, /* MINMOD */ - LOGICAL, /* LOGICAL */ - BRANCHJ, /* RENUM */ - TRIE, /* TRIE */ - TRIE, /* TRIEC */ - TRIE, /* AHOCORASICK */ - TRIE, /* AHOCORASICKC */ - RECURSE, /* RECURSE */ - RECURSE, /* SRECURSE */ - NREF, /* NREF */ - NREF, /* NREFF */ - NREF, /* NREFFL */ - NOTHING, /* OPTIMIZED */ - PSEUDO, /* PSEUDO */ + END, /* END */ + END, /* SUCCEED */ + BOL, /* BOL */ + BOL, /* MBOL */ + BOL, /* SBOL */ + EOL, /* EOS */ + EOL, /* EOL */ + EOL, /* MEOL */ + EOL, /* SEOL */ + BOUND, /* BOUND */ + BOUND, /* BOUNDL */ + NBOUND, /* NBOUND */ + NBOUND, /* NBOUNDL */ + GPOS, /* GPOS */ + REG_ANY, /* REG_ANY */ + REG_ANY, /* SANY */ + REG_ANY, /* CANY */ + ANYOF, 
/* ANYOF */ + ALNUM, /* ALNUM */ + ALNUM, /* ALNUML */ + NALNUM, /* NALNUM */ + NALNUM, /* NALNUML */ + SPACE, /* SPACE */ + SPACE, /* SPACEL */ + NSPACE, /* NSPACE */ + NSPACE, /* NSPACEL */ + DIGIT, /* DIGIT */ + DIGIT, /* DIGITL */ + NDIGIT, /* NDIGIT */ + NDIGIT, /* NDIGITL */ + CLUMP, /* CLUMP */ + BRANCH, /* BRANCH */ + BACK, /* BACK */ + EXACT, /* EXACT */ + EXACT, /* EXACTF */ + EXACT, /* EXACTFL */ + NOTHING, /* NOTHING */ + NOTHING, /* TAIL */ + STAR, /* STAR */ + PLUS, /* PLUS */ + CURLY, /* CURLY */ + CURLY, /* CURLYN */ + CURLY, /* CURLYM */ + CURLY, /* CURLYX */ + WHILEM, /* WHILEM */ + OPEN, /* OPEN */ + CLOSE, /* CLOSE */ + REF, /* REF */ + REF, /* REFF */ + REF, /* REFFL */ + BRANCHJ, /* IFMATCH */ + BRANCHJ, /* UNLESSM */ + BRANCHJ, /* SUSPEND */ + BRANCHJ, /* IFTHEN */ + GROUPP, /* GROUPP */ + LONGJMP, /* LONGJMP */ + BRANCHJ, /* BRANCHJ */ + EVAL, /* EVAL */ + MINMOD, /* MINMOD */ + LOGICAL, /* LOGICAL */ + BRANCHJ, /* RENUM */ + TRIE, /* TRIE */ + TRIE, /* TRIEC */ + TRIE, /* AHOCORASICK */ + TRIE, /* AHOCORASICKC */ + RECURSE, /* RECURSE */ + RECURSE, /* SRECURSE */ + NREF, /* NREF */ + NREF, /* NREFF */ + NREF, /* NREFFL */ + NGROUPP, /* NGROUPP */ + RECURSEP, /* RECURSEP */ + DEFINEP, /* DEFINEP */ + NOTHING, /* OPTIMIZED */ + PSEUDO, /* PSEUDO */ /* ------------ States ------------- */ - TRIE, /* TRIE_next */ - TRIE, /* TRIE_next_fail */ - EVAL, /* EVAL_AB */ - EVAL, /* EVAL_AB_fail */ - CURLYX, /* CURLYX_end */ - CURLYX, /* CURLYX_end_fail */ - WHILEM, /* WHILEM_A_pre */ - WHILEM, /* WHILEM_A_pre_fail */ - WHILEM, /* WHILEM_A_min */ - WHILEM, /* WHILEM_A_min_fail */ - WHILEM, /* WHILEM_A_max */ - WHILEM, /* WHILEM_A_max_fail */ - WHILEM, /* WHILEM_B_min */ - WHILEM, /* WHILEM_B_min_fail */ - WHILEM, /* WHILEM_B_max */ - WHILEM, /* WHILEM_B_max_fail */ - BRANCH, /* BRANCH_next */ - BRANCH, /* BRANCH_next_fail */ - CURLYM, /* CURLYM_A */ - CURLYM, /* CURLYM_A_fail */ - CURLYM, /* CURLYM_B */ - CURLYM, /* CURLYM_B_fail */ - IFMATCH, /* 
IFMATCH_A */ - IFMATCH, /* IFMATCH_A_fail */ - CURLY, /* CURLY_B_min_known */ - CURLY, /* CURLY_B_min_known_fail */ - CURLY, /* CURLY_B_min */ - CURLY, /* CURLY_B_min_fail */ - CURLY, /* CURLY_B_max */ - CURLY, /* CURLY_B_max_fail */ + TRIE, /* TRIE_next */ + TRIE, /* TRIE_next_fail */ + EVAL, /* EVAL_AB */ + EVAL, /* EVAL_AB_fail */ + CURLYX, /* CURLYX_end */ + CURLYX, /* CURLYX_end_fail */ + WHILEM, /* WHILEM_A_pre */ + WHILEM, /* WHILEM_A_pre_fail */ + WHILEM, /* WHILEM_A_min */ + WHILEM, /* WHILEM_A_min_fail */ + WHILEM, /* WHILEM_A_max */ + WHILEM, /* WHILEM_A_max_fail */ + WHILEM, /* WHILEM_B_min */ + WHILEM, /* WHILEM_B_min_fail */ + WHILEM, /* WHILEM_B_max */ + WHILEM, /* WHILEM_B_max_fail */ + BRANCH, /* BRANCH_next */ + BRANCH, /* BRANCH_next_fail */ + CURLYM, /* CURLYM_A */ + CURLYM, /* CURLYM_A_fail */ + CURLYM, /* CURLYM_B */ + CURLYM, /* CURLYM_B_fail */ + IFMATCH, /* IFMATCH_A */ + IFMATCH, /* IFMATCH_A_fail */ + CURLY, /* CURLY_B_min_known */ + CURLY, /* CURLY_B_min_known_fail */ + CURLY, /* CURLY_B_min */ + CURLY, /* CURLY_B_min_fail */ + CURLY, /* CURLY_B_max */ + CURLY, /* CURLY_B_max_fail */ }; #endif @@ -301,6 +307,9 @@ static const U8 regarglen[] = { EXTRA_SIZE(struct regnode_1), /* NREF */ EXTRA_SIZE(struct regnode_1), /* NREFF */ EXTRA_SIZE(struct regnode_1), /* NREFFL */ + EXTRA_SIZE(struct regnode_1), /* NGROUPP */ + EXTRA_SIZE(struct regnode_1), /* RECURSEP */ + EXTRA_SIZE(struct regnode_1), /* DEFINEP */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -378,6 +387,9 @@ static const char reg_off_by_arg[] = { 0, /* NREF */ 0, /* NREFF */ 0, /* NREFFL */ + 0, /* NGROUPP */ + 0, /* RECURSEP */ + 0, /* DEFINEP */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -456,39 +468,42 @@ const char * reg_name[] = { "NREF", /* 0x43 */ "NREFF", /* 0x44 */ "NREFFL", /* 0x45 */ - "OPTIMIZED", /* 0x46 */ - "PSEUDO", /* 0x47 */ + "NGROUPP", /* 0x46 */ + "RECURSEP", /* 0x47 */ + "DEFINEP", /* 0x48 */ + "OPTIMIZED", /* 0x49 */ + "PSEUDO", /* 0x4a */ /* ------------ States 
------------- */ - "TRIE_next", /* 0x48 */ - "TRIE_next_fail", /* 0x49 */ - "EVAL_AB", /* 0x4a */ - "EVAL_AB_fail", /* 0x4b */ - "CURLYX_end", /* 0x4c */ - "CURLYX_end_fail", /* 0x4d */ - "WHILEM_A_pre", /* 0x4e */ - "WHILEM_A_pre_fail", /* 0x4f */ - "WHILEM_A_min", /* 0x50 */ - "WHILEM_A_min_fail", /* 0x51 */ - "WHILEM_A_max", /* 0x52 */ - "WHILEM_A_max_fail", /* 0x53 */ - "WHILEM_B_min", /* 0x54 */ - "WHILEM_B_min_fail", /* 0x55 */ - "WHILEM_B_max", /* 0x56 */ - "WHILEM_B_max_fail", /* 0x57 */ - "BRANCH_next", /* 0x58 */ - "BRANCH_next_fail", /* 0x59 */ - "CURLYM_A", /* 0x5a */ - "CURLYM_A_fail", /* 0x5b */ - "CURLYM_B", /* 0x5c */ - "CURLYM_B_fail", /* 0x5d */ - "IFMATCH_A", /* 0x5e */ - "IFMATCH_A_fail", /* 0x5f */ - "CURLY_B_min_known", /* 0x60 */ - "CURLY_B_min_known_fail", /* 0x61 */ - "CURLY_B_min", /* 0x62 */ - "CURLY_B_min_fail", /* 0x63 */ - "CURLY_B_max", /* 0x64 */ - "CURLY_B_max_fail", /* 0x65 */ + "TRIE_next", /* 0x4b */ + "TRIE_next_fail", /* 0x4c */ + "EVAL_AB", /* 0x4d */ + "EVAL_AB_fail", /* 0x4e */ + "CURLYX_end", /* 0x4f */ + "CURLYX_end_fail", /* 0x50 */ + "WHILEM_A_pre", /* 0x51 */ + "WHILEM_A_pre_fail", /* 0x52 */ + "WHILEM_A_min", /* 0x53 */ + "WHILEM_A_min_fail", /* 0x54 */ + "WHILEM_A_max", /* 0x55 */ + "WHILEM_A_max_fail", /* 0x56 */ + "WHILEM_B_min", /* 0x57 */ + "WHILEM_B_min_fail", /* 0x58 */ + "WHILEM_B_max", /* 0x59 */ + "WHILEM_B_max_fail", /* 0x5a */ + "BRANCH_next", /* 0x5b */ + "BRANCH_next_fail", /* 0x5c */ + "CURLYM_A", /* 0x5d */ + "CURLYM_A_fail", /* 0x5e */ + "CURLYM_B", /* 0x5f */ + "CURLYM_B_fail", /* 0x60 */ + "IFMATCH_A", /* 0x61 */ + "IFMATCH_A_fail", /* 0x62 */ + "CURLY_B_min_known", /* 0x63 */ + "CURLY_B_min_known_fail", /* 0x64 */ + "CURLY_B_min", /* 0x65 */ + "CURLY_B_min_fail", /* 0x66 */ + "CURLY_B_max", /* 0x67 */ + "CURLY_B_max_fail", /* 0x68 */ }; #endif /* DEBUGGING */ #else diff --git a/t/op/pat.t b/t/op/pat.t index 465757d..a6ea46c 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -3714,9 +3714,13 @@ sub 
iseq($$;$) { iseq(0+@k, 3, 'Got 3 keys in %+ via keys'); iseq("@k","A B C", "Got expected keys"); iseq("@v","bar baz foo", "Got expected values"); + eval' + print for $+{this_key_doesnt_exist}; + '; + ok(!$@,'lvalue $+{...} should not throw an exception'); } - - + + # stress test CURLYX/WHILEM. # # This test includes varying levels of nesting, and according to @@ -3831,5 +3835,5 @@ ok((q(a)x 100) =~ /^(??{'(.)'x 100})/, or print "# Unexpected outcome: should pass or crash perl\n"; # Don't forget to update this! -BEGIN{print "1..1274\n"}; +BEGIN{print "1..1275\n"}; diff --git a/t/op/re_tests b/t/op/re_tests index 83de44a..9f0e06b 100644 --- a/t/op/re_tests +++ b/t/op/re_tests @@ -1040,3 +1040,25 @@ X(?<=foo.)[YZ] ..XfooXY.. y pos 8 /^(?'main'<(?:[^<>]+|(?&crap)|(?&main))*>)(?'empty')(?'crap'!>!>!>)$/ <!>!>><>>!>!>!> y $+{main} <!>!>><>> /^(?'main'<(?:[^<>]+|(?&main))*>)$/ <<><<<><>>>> y $1 <<><<<><>>>> /(?'first'(?&second)*)(?'second'[fF]o+)/ fooFoFoo y $+{first}-$+{second} fooFo-Foo +(?<A>foo)?(?(<A>)bar|nada) foobar y $+{A} foo +(?<A>foo)?(?(<A>)bar|nada) foo-barnada y $& nada +(?<A>foo)?(?(1)bar|nada) foo-barnada y $& nada +(?<A>foo(?(R)bar))?(?1) foofoobar y $1 foo +(?<A>foo(?(R)bar))?(?1) foofoobar y $& foofoobar +(x)(?<A>foo(?(R&A)bar))?(?&A) xfoofoobar y $2 foo +(x)(?<A>foo(?(R&A)bar))?(?&A) xfoofoobar y $& xfoofoobar +(x)(?<A>foo(?(R2)bar))?(?&A) xfoofoobar y $2 foo +(x)(?<A>foo(?(R2)bar))?(?&A) xfoofoobar y $& xfoofoobar +(?1)(?(DEFINE)(blah)) blah y $& blah +/^(?<PAL>(?<CHAR>.)((?&PAL)|.?)\k<CHAR>)$/ madamimadam y $& madamimadam +/^(?<PAL>(?<CHAR>.)((?&PAL)|.?)\k<CHAR>)$/ madamiamadam n - - +/(a)?((?1))(fox)/ aafox y $1-$2-$3 a-a-fox +/(a)*((?1))(fox)/ aafox y $1-$2-$3 a-a-fox +/(a)+((?1))(fox)/ aafox y $1-$2-$3 a-a-fox +/(a){1,100}((?1))(fox)/ aafox y $1-$2-$3 a-a-fox +/(a){0,100}((?1))(fox)/ aafox y $1-$2-$3 a-a-fox +/(ab)?((?1))(fox)/ ababfox y $1-$2-$3 ab-ab-fox +/(ab)*((?1))(fox)/ ababfox y $1-$2-$3 ab-ab-fox +/(ab)+((?1))(fox)/ ababfox y $1-$2-$3 ab-ab-fox +/(ab){1,100}((?1))(fox)/ ababfox y $1-$2-$3 ab-ab-fox 
+/(ab){0,100}((?1))(fox)/ ababfox y $1-$2-$3 ab-ab-fox -- 2.7.4