regexec.c: Avoid unnecessary calculation
author Karl Williamson <public@khwilliamson.com>
Sun, 16 Sep 2012 16:51:01 +0000 (10:51 -0600)
committer Karl Williamson <public@khwilliamson.com>
Mon, 17 Sep 2012 04:31:23 +0000 (22:31 -0600)
When matching an EXACT node and the target string and the pattern differ
in utf8ness, the code prior to this patch calculated each code point from
the utf8 version in order to do the EXACT comparison with the non-utf8
version.  But it is unnecessary to do this full calculation.  Code
points above Latin1 cannot possibly match a non-UTF8 string; there is no
need to know precisely which code point it is in order to know that it
won't match.  Similarly, invariant code points can be checked directly;
and the Latin1 variants can be downgraded for comparison by a simple
macro.

regexec.c

index 322e596..f207cda 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -3756,31 +3756,56 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                const char * const e = s + ln;
 
                if (utf8_target) {
-                   /* The target is utf8, the pattern is not utf8. */
+                    /* The target is utf8, the pattern is not utf8.
+                     * Above-Latin1 code points can't match the pattern;
+                     * invariants match exactly, and the other Latin1 ones need
+                     * to be downgraded to a single byte in order to do the
+                     * comparison.  (If we could be confident that the target
+                     * is not malformed, this could be refactored to have fewer
+                     * tests by just assuming that if the first bytes match, it
+                     * is an invariant, but there are tests in the test suite
+                     * dealing with (??{...}) which violate this) */
                    while (s < e) {
-                       STRLEN ulen;
                        if (l >= PL_regeol)
                             sayNO;
-                       if (NATIVE_TO_UNI(*(U8*)s) !=
-                           utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
-                                           uniflags))
-                            sayNO;
-                       l += ulen;
-                       s ++;
+                        if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
+                            sayNO;
+                        }
+                        if (UTF8_IS_INVARIANT(*(U8*)l)) {
+                           if (*l != *s) {
+                                sayNO;
+                            }
+                            l++;
+                        }
+                        else {
+                            if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
+                                sayNO;
+                            }
+                            l += 2;
+                        }
+                       s++;
                    }
                }
                else {
                    /* The target is not utf8, the pattern is utf8. */
                    while (s < e) {
-                       STRLEN ulen;
-                       if (l >= PL_regeol)
-                           sayNO;
-                       if (NATIVE_TO_UNI(*((U8*)l)) !=
-                           utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
-                                          uniflags))
-                           sayNO;
-                       s += ulen;
-                       l ++;
+                        if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
+                        {
+                            sayNO;
+                        }
+                        if (UTF8_IS_INVARIANT(*(U8*)s)) {
+                           if (*s != *l) {
+                                sayNO;
+                            }
+                            s++;
+                        }
+                        else {
+                            if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
+                                sayNO;
+                            }
+                            s += 2;
+                        }
+                       l++;
                    }
                }
                locinput = l;