From e6a3850e182c1d286b5e83a9f9917b7f0ddc4178 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 16 Sep 2012 10:51:01 -0600 Subject: [PATCH] regexec.c: Avoid unnecessary calculation When matching an EXACT node and the target string and the pattern differ in utf8ness, the code prior to this patch calculated each code point from the utf8 version in order to do the EXACT comparison with the non-utf8 version. But it is unnecessary to do this full calculation. Code points above Latin1 cannot possibly match a non-UTF8 string; there is no need to know precisely which code point it is in order to know that it won't match. Similarly, invariant code points can be checked directly; and the Latin1 variants can be downgraded for comparison by a simple macro. --- regexec.c | 59 ++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/regexec.c b/regexec.c index 322e596..f207cda 100644 --- a/regexec.c +++ b/regexec.c @@ -3756,31 +3756,56 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const char * const e = s + ln; if (utf8_target) { - /* The target is utf8, the pattern is not utf8. */ + /* The target is utf8, the pattern is not utf8. + * Above-Latin1 code points can't match the pattern; + * invariants match exactly, and the other Latin1 ones need + * to be downgraded to a single byte in order to do the + * comparison. 
(If we could be confident that the target + * is not malformed, this could be refactored to have fewer + * tests by just assuming that if the first bytes match, it + * is an invariant, but there are tests in the test suite + * dealing with (??{...}) which violate this) */ while (s < e) { - STRLEN ulen; if (l >= PL_regeol) sayNO; - if (NATIVE_TO_UNI(*(U8*)s) != - utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen, - uniflags)) - sayNO; - l += ulen; - s ++; + if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) { + sayNO; + } + if (UTF8_IS_INVARIANT(*(U8*)l)) { + if (*l != *s) { + sayNO; + } + l++; + } + else { + if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) { + sayNO; + } + l += 2; + } + s++; } } else { /* The target is not utf8, the pattern is utf8. */ while (s < e) { - STRLEN ulen; - if (l >= PL_regeol) - sayNO; - if (NATIVE_TO_UNI(*((U8*)l)) != - utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen, - uniflags)) - sayNO; - s += ulen; - l ++; + if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s)) + { + sayNO; + } + if (UTF8_IS_INVARIANT(*(U8*)s)) { + if (*s != *l) { + sayNO; + } + s++; + } + else { + if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) { + sayNO; + } + s += 2; + } + l++; } } locinput = l; -- 2.7.4