From 63ac0dadb1aafcf0c171d3c1422c1923b611b2fc Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 28 Dec 2010 16:13:49 -0700 Subject: [PATCH] regex: Use BOUNDU regnodes This refactors one area in regexec.c to use BOUNDU, NBOUNDU for efficiency, and easier adding of the future BOUNDA. --- regcomp.c | 34 +++++++--- regcomp.sym | 1 + regexec.c | 206 +++++++++++++++++++++++++++++++----------------------------- 3 files changed, 135 insertions(+), 106 deletions(-) diff --git a/regcomp.c b/regcomp.c index cbcabdf..1eded27 100644 --- a/regcomp.c +++ b/regcomp.c @@ -7399,22 +7399,40 @@ tryagain: case 'b': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; - if (LOC) { - ret = reg_node(pRExC_state, (U8)(BOUNDL)); - } else { - ret = reg_node(pRExC_state, (U8)(BOUND)); + switch (get_regex_charset(RExC_flags)) { + case REGEX_LOCALE_CHARSET: + op = BOUNDL; + break; + case REGEX_UNICODE_CHARSET: + op = BOUNDU; + break; + case REGEX_DEPENDS_CHARSET: + op = BOUND; + break; + default: + goto bad_charset; } + ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; goto finish_meta_pat; case 'B': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; - if (LOC) { - ret = reg_node(pRExC_state, (U8)(NBOUNDL)); - } else { - ret = reg_node(pRExC_state, (U8)(NBOUND)); + switch (get_regex_charset(RExC_flags)) { + case REGEX_LOCALE_CHARSET: + op = NBOUNDL; + break; + case REGEX_UNICODE_CHARSET: + op = NBOUNDU; + break; + case REGEX_DEPENDS_CHARSET: + op = NBOUND; + break; + default: + goto bad_charset; } + ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; goto finish_meta_pat; diff --git a/regcomp.sym b/regcomp.sym index 36ebe6a..2fb1785 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -33,6 +33,7 @@ SEOL EOL, no ; Same, assuming singleline. 
BOUND BOUND, no ; Match "" at any word boundary using native charset semantics for non-utf8 BOUNDL BOUND, no ; Match "" at any locale word boundary BOUNDU BOUND, no ; Match "" at any word boundary using Unicode semantics +# All NBOUND nodes are required by a line in regexec.c to be greater than all BOUND ones NBOUND NBOUND, no ; Match "" at any word non-boundary using native charset semantics for non-utf8 NBOUNDL NBOUND, no ; Match "" at any locale word non-boundary NBOUNDU NBOUND, no ; Match "" at any word non-boundary using Unicode semantics diff --git a/regexec.c b/regexec.c index 4416d69..c360cb2 100644 --- a/regexec.c +++ b/regexec.c @@ -1332,6 +1332,63 @@ if ((!reginfo || regtry(reginfo, &s))) \ #define DUMP_EXEC_POS(li,s,doutf8) \ dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8) +/* The only difference between the BOUND and NBOUND cases is that + * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in + * NBOUND. This is accomplished by passing it in either the if or else clause, + * with the other one being empty */ +#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \ + FBC_BOUND_COMMON(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, ) + +#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \ + FBC_BOUND_COMMON(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8, , REXEC_FBC_TRYIT) + +/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to + * be passed in completely with the variable name being tested, which isn't + * such a clean interface, but this is easier to read than it was before. We + * are looking for the boundary (or non-boundary) between a word and non-word + * character. The utf8 and non-utf8 cases have the same logic, but the details + * must be different. Find the "wordness" of the character just prior to this + * one, and compare it with the wordness of this one. If they differ, we have + * a boundary. 
At the beginning of the string, pretend that the previous + * character was a new-line */ +#define FBC_BOUND_COMMON(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8, \ + IF_SUCCESS, IF_FAIL) \ + if (utf8_target) { \ + if (s == PL_bostr) { \ + tmp = '\n'; \ + } \ + else { \ + U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \ + tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \ + } \ + tmp = TEST1_UTF8; \ + LOAD_UTF8_CHARCLASS_ALNUM(); \ + REXEC_FBC_UTF8_SCAN( \ + if (tmp == ! (TEST2_UTF8)) { \ + tmp = !tmp; \ + IF_SUCCESS; \ + } \ + else { \ + IF_FAIL; \ + } \ + ); \ + } \ + else { /* Not utf8 */ \ + tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \ + tmp = TEST_NON_UTF8(tmp); \ + REXEC_FBC_SCAN( \ + if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ + tmp = !tmp; \ + IF_SUCCESS; \ + } \ + else { \ + IF_FAIL; \ + } \ + ); \ + } \ + if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \ + goto got_it; + /* We know what class REx starts with. Try to find this position... */ /* if reginfo is NULL, its a dryrun */ /* annoyingly all the vars in this routine have different names from their counterparts @@ -1524,91 +1581,35 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; case BOUNDL: PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case BOUND: - if (utf8_target) { - if (s == PL_bostr) - tmp = '\n'; - else { - U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); - tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); - } - tmp = (FLAGS(c) != REGEX_LOCALE_CHARSET ? - isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))); - LOAD_UTF8_CHARCLASS_ALNUM(); - REXEC_FBC_UTF8_SCAN( - if (tmp == !(FLAGS(c) != REGEX_LOCALE_CHARSET ? - cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) : - isALNUM_LC_utf8((U8*)s))) - { - tmp = !tmp; - REXEC_FBC_TRYIT; - } - ); - } - else { /* Not utf8 */ - tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; - tmp = ((FLAGS(c) == REGEX_LOCALE_CHARSET) - ? 
isALNUM_LC(tmp) - : (isWORDCHAR_L1(tmp) - && (isASCII(tmp) || (FLAGS(c) == REGEX_UNICODE_CHARSET)))); - REXEC_FBC_SCAN( - if (tmp == - !((FLAGS(c) == REGEX_LOCALE_CHARSET) - ? isALNUM_LC(*s) - : (isWORDCHAR_L1((U8) *s) - && (isASCII((U8) *s) || (FLAGS(c) == REGEX_UNICODE_CHARSET))))) - { - tmp = !tmp; - REXEC_FBC_TRYIT; - } - ); - } - if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) - goto got_it; + FBC_BOUND(isALNUM_LC, + isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)), + isALNUM_LC_utf8((U8*)s)); break; case NBOUNDL: PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ + FBC_NBOUND(isALNUM_LC, + isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)), + isALNUM_LC_utf8((U8*)s)); + break; + case BOUND: + FBC_BOUND(isWORDCHAR, + isALNUM_uni(tmp), + cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target))); + break; case NBOUND: - if (utf8_target) { - if (s == PL_bostr) - tmp = '\n'; - else { - U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); - tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); - } - tmp = (FLAGS(c) != REGEX_LOCALE_CHARSET ? - isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))); - LOAD_UTF8_CHARCLASS_ALNUM(); - REXEC_FBC_UTF8_SCAN( - if (tmp == !(FLAGS(c) != REGEX_LOCALE_CHARSET ? - cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) : - isALNUM_LC_utf8((U8*)s))) - tmp = !tmp; - else REXEC_FBC_TRYIT; - ); - } - else { - tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; - tmp = ((FLAGS(c) == REGEX_LOCALE_CHARSET) - ? isALNUM_LC(tmp) - : (isWORDCHAR_L1(tmp) - && (isASCII(tmp) || (FLAGS(c) == REGEX_UNICODE_CHARSET)))); - REXEC_FBC_SCAN( - if (tmp == ! ( - (FLAGS(c) == REGEX_LOCALE_CHARSET) - ? 
isALNUM_LC(*s) - : (isWORDCHAR_L1((U8) *s) - && (isASCII((U8) *s) || (FLAGS(c) == REGEX_UNICODE_CHARSET))))) - { - tmp = !tmp; - } - else REXEC_FBC_TRYIT; - ); - } - if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s))) - goto got_it; + FBC_NBOUND(isWORDCHAR, + isALNUM_uni(tmp), + cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target))); + break; + case BOUNDU: + FBC_BOUND(isWORDCHAR_L1, + isALNUM_uni(tmp), + cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target))); + break; + case NBOUNDU: + FBC_NBOUND(isWORDCHAR_L1, + isALNUM_uni(tmp), + cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target))); break; case ALNUML: REXEC_FBC_CSCAN_TAINT( @@ -3641,12 +3642,18 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) nextchr = UCHARAT(locinput); break; } + + /* XXX Could improve efficiency by separating these all out using a + * macro or in-line function. At that point regcomp.c would no longer + * have to set the FLAGS fields of these */ case BOUNDL: case NBOUNDL: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case BOUND: + case BOUNDU: case NBOUND: + case NBOUNDU: /* was last char in word? */ if (utf8_target) { if (locinput == PL_bostr) @@ -3656,7 +3663,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags); } - if (OP(scan) == BOUND || OP(scan) == NBOUND) { + if (FLAGS(scan) != REGEX_LOCALE_CHARSET) { ln = isALNUM_uni(ln); LOAD_UTF8_CHARCLASS_ALNUM(); n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target); @@ -3669,24 +3676,27 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) else { ln = (locinput != PL_bostr) ? 
UCHARAT(locinput - 1) : '\n'; - if (FLAGS(scan) == REGEX_UNICODE_CHARSET) { - - /* Here, can't be BOUNDL or NBOUNDL because they never set - * the flags to REGEX_UNICODE_CHARSET */ - ln = isWORDCHAR_L1(ln); - n = isWORDCHAR_L1(nextchr); - } - else if (OP(scan) == BOUND || OP(scan) == NBOUND) { - ln = isALNUM(ln); - n = isALNUM(nextchr); - } - else { - ln = isALNUM_LC(ln); - n = isALNUM_LC(nextchr); + switch (FLAGS(scan)) { + case REGEX_UNICODE_CHARSET: + ln = isWORDCHAR_L1(ln); + n = isWORDCHAR_L1(nextchr); + break; + case REGEX_LOCALE_CHARSET: + ln = isALNUM_LC(ln); + n = isALNUM_LC(nextchr); + break; + case REGEX_DEPENDS_CHARSET: + ln = isALNUM(ln); + n = isALNUM(nextchr); + break; + default: + Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan)); + break; } } - if (((!ln) == (!n)) == (OP(scan) == BOUND || - OP(scan) == BOUNDL)) + /* Note requires that all BOUNDs be lower than all NBOUNDs in + * regcomp.sym */ + if (((!ln) == (!n)) == (OP(scan) < NBOUND)) sayNO; break; case ANYOFV: -- 2.7.4