regexec.c: refactor find-by-class EXACTish code
author Karl Williamson <public@khwilliamson.com>
Sun, 13 Feb 2011 02:23:34 +0000 (19:23 -0700)
committer Karl Williamson <public@khwilliamson.com>
Mon, 14 Feb 2011 15:41:39 +0000 (08:41 -0700)
This code is way out-of-date, using upper and lower case instead of fold-case.

regexec.c

index e5e6e27..a7f5526 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -1256,8 +1256,8 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
         if ( f != c                                   \
              && (f == c1 || f == c2)                  \
              && (ln == len ||                         \
-               foldEQ_utf8(s, &my_strend, 0,  utf8_target,\
-                             m, NULL, ln, cBOOL(UTF_PATTERN)))\
+               foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,\
+                             m, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags))\
              && (!reginfo || regtry(reginfo, &s)) )   \
              goto got_it;                             \
     }                                                  \
@@ -1266,17 +1266,9 @@ s += len
 
 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
 STMT_START {                                              \
-    re_fold_t folder;                                   \
-    switch (OP(c)) {                                      \
-       case EXACTFU: folder = foldEQ_latin1; break;      \
-       case EXACTFL: folder = foldEQ_locale; break;      \
-       case EXACTF:  folder = foldEQ; break;             \
-       default:                                          \
-           Perl_croak(aTHX_ "panic: Unexpected op %u", OP(c)); \
-    }                                                     \
     while (s <= e) {                                      \
        if ( (CoNd)                                       \
-            && (ln == 1 || folder(s, m, ln))             \
+            && (ln == 1 || folder(s, pat_string, ln))    \
             && (!reginfo || regtry(reginfo, &s)) )       \
            goto got_it;                                  \
        s++;                                              \
@@ -1447,15 +1439,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 {
        dVAR;
        const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
-       char *m;
+       char *pat_string;   /* The pattern's exactish string */
+       char *pat_end;      /* ptr to end char of pat_string */
+       re_fold_t folder;       /* Function for computing non-utf8 folds */
+       const U8 *fold_array;   /* array for folding ords < 256 */
        STRLEN ln;
        STRLEN lnc;
        register STRLEN uskip;
-       unsigned int c1;
-       unsigned int c2;
+       U8 c1;
+       U8 c2;
        char *e;
        register I32 tmp = 1;   /* Scratch variable? */
        register const bool utf8_target = PL_reg_match_utf8;
+       UV utf8_fold_flags;
         RXi_GET_DECL(prog,progi);
 
        PERL_ARGS_ASSERT_FIND_BYCLASS;
@@ -1498,7 +1494,108 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            );
            break;
        case EXACTFU:
+           if (UTF_PATTERN || utf8_target) {
+               utf8_fold_flags = 0;
+               goto do_exactf_utf8;
+           }
+           fold_array = PL_fold_latin1;
+           folder = foldEQ_latin1;
+           /* XXX This uses the full utf8 fold because if the pattern contains
+            * 'ss' it could match LATIN_SMALL_LETTER SHARP_S in the string.
+            * There could be a new node type, say EXACTFU_SS, which is
+            * generated by regcomp only if there is an 'ss', and then every
+            * other case could goto do_exactf_non_utf8;*/
+           goto do_exactf_utf8;
+
        case EXACTF:
+           if (UTF_PATTERN || utf8_target) {
+               utf8_fold_flags = 0;
+               goto do_exactf_utf8;
+           }
+           fold_array = PL_fold;
+           folder = foldEQ;
+           goto do_exactf_non_utf8;
+
+       case EXACTFL:
+           if (UTF_PATTERN || utf8_target) {
+               utf8_fold_flags = 0; /* XXX, add new flag for locale */
+               goto do_exactf_utf8;
+           }
+           fold_array = PL_fold_locale;
+           folder = foldEQ_locale;
+
+           /* FALL THROUGH */
+
+       do_exactf_non_utf8: /* Neither pattern nor string are UTF8 */
+
+           /* The idea in the non-utf8 EXACTF* cases is to first find the
+            * first character of the EXACTF* node and then, if necessary,
+            * case-insensitively compare the full text of the node.  c1 is the
+            * first character.  c2 is its fold.  This logic will not work for
+            * Unicode semantics and the german sharp ss, which hence should
+            * not be compiled into a node that gets here. */
+           pat_string = STRING(c);
+           ln  = STR_LEN(c);   /* length to match in octets/bytes */
+
+           e = HOP3c(strend, -((I32)ln), s);
+
+           if (!reginfo && e < s) {
+               e = s;                  /* Due to minlen logic of intuit() */
+           }
+
+           c1 = *pat_string;
+           c2 = fold_array[c1];
+           if (c1 == c2) { /* If char and fold are the same */
+               REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
+           }
+           else {
+               REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
+           }
+           break;
+
+       do_exactf_utf8:
+
+           /* If one of the operands is in utf8, we can't use the simpler
+            * folding above, due to the fact that many different characters
+            * can have the same fold, or portion of a fold, or different-
+            * length fold */
+           pat_string = STRING(c);
+           ln  = STR_LEN(c);   /* length to match in octets/bytes */
+           pat_end = pat_string + ln;
+           lnc = (UTF_PATTERN) /* length to match in characters */
+                   ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
+                   : ln;
+
+           e = HOP3c(strend, -((I32)lnc), s);
+
+           if (!reginfo && e < s) {
+               e = s;                  /* Due to minlen logic of intuit() */
+           }
+
+           while (s <= e) {
+               char *my_strend= (char *)strend;
+               if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
+                     pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
+                   && (!reginfo || regtry(reginfo, &s)) )
+               {
+                   goto got_it;
+               }
+               s += UTF8SKIP(s);
+           }
+           break;
+
+
+#if 0
+       case EXACTFA:
+           utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
+           goto do_exactf_non_locale;
+
+       case EXACTFU:
+       case EXACTF:
+           utf8_fold_flags = 0;
+
+       do_exactf_non_locale:
+
            m   = STRING(c);
            ln  = STR_LEN(c);   /* length to match in octets/bytes */
            lnc = (I32) ln;     /* length to match in characters */
@@ -1625,6 +1722,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                    REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
            }
            break;
+#endif
        case BOUNDL:
            PL_reg_flags |= RF_tainted;
            FBC_BOUND(isALNUM_LC,