From e64f369d58efdbd31eecf147ef006f0f69e17fc2 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Fri, 19 Oct 2012 11:40:24 -0600 Subject: [PATCH] regex: Make \R simple again. Commit 565fc1bb88638c2490cdab7a1055007f6b2d577c was based on my misunderstanding of what regrepeat() operates on. It is designed to operate on nodes that are quantifiable without having to be parenthesized. So, in 'a?', the node generated for 'a' is quantifiable as-is, but if you want to quantify 'ab', you have to parenthesize them, like '(?:ab)?'. \R is such a node, since you can say things like '\R+'. There was indeed a bug, and that commit fixed it, but suboptimally. This patch essentially reverts the commit mentioned above, and solves the problem in a better way; it adds comments so that future maintainers will be less likely to be led astray than I was. It also adds a 'SIMPLE' flag for an expression that in my previous misunderstanding I didn't think was simple. (It's enclosed in parentheses, and hence can serve as an operand to a quantifier.) --- regcomp.c | 8 ++++---- regexec.c | 22 ++++++++++++++++++++-- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/regcomp.c b/regcomp.c index c4875a3..307e4d5 100644 --- a/regcomp.c +++ b/regcomp.c @@ -231,10 +231,10 @@ typedef struct RExC_state_t { #define WORST 0 /* Worst case. */ #define HASWIDTH 0x01 /* Known to match non-null strings. */ -/* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single +/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single * character. Note that this is not the same thing as REGNODE_SIMPLE */ #define SIMPLE 0x02 -#define SPSTART 0x04 /* Starts with * or +. */ +#define SPSTART 0x04 /* Starts with * or + */ #define TRYAGAIN 0x08 /* Weeded out a declaration. 
*/ #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */ @@ -10274,7 +10274,7 @@ tryagain: goto finish_meta_pat; case 'R': ret = reg_node(pRExC_state, LNBREAK); - *flagp |= HASWIDTH; + *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'h': ret = reg_node(pRExC_state, HORIZWS); @@ -12374,7 +12374,7 @@ parseit: ret = reg(pRExC_state, 1, &reg_flags, depth+1); - *flagp |= reg_flags&(HASWIDTH|SPSTART|POSTPONED); + *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED); RExC_parse = save_parse; RExC_end = save_end; diff --git a/regexec.c b/regexec.c index add4b9e..fe5dfab 100644 --- a/regexec.c +++ b/regexec.c @@ -6526,6 +6526,9 @@ no_silent: /* - regrepeat - repeatedly match something simple, report how many * + * What 'simple' means is a node which can be the operand of a quantifier like + * '+', or {1,3} + * * startposp - pointer a pointer to the start position. This is updated * to point to the byte following the highest successful * match. @@ -7081,8 +7084,23 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma } break; case LNBREAK: - Perl_croak(aTHX_ "panic: regrepeat() should not be called with non-simple: LNBREAK"); - assert(0); /* NOTREACHED */ + if (utf8_target) { + while (hardcount < max && scan < loceol && + (c=is_LNBREAK_utf8_safe(scan, loceol))) { + scan += c; + hardcount++; + } + } else { + /* LNBREAK can match one or two latin chars, which is ok, but we + * have to use hardcount in this situation, and throw away the + * adjustment to loceol done before the switch statement */ + loceol = PL_regeol; + while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) { + scan+=c; + hardcount++; + } + } + break; case HORIZWS: if (utf8_target) { while (hardcount < max && scan < loceol && -- 2.7.4