regexec.c: White space only; no code changes

author Karl Williamson <public@khwilliamson.com>

Thu, 6 Dec 2012 17:14:38 +0000 (10:14 -0700)

committer Karl Williamson <public@khwilliamson.com>

Sun, 9 Dec 2012 19:08:28 +0000 (12:08 -0700)
author Karl Williamson <public@khwilliamson.com>
Thu, 6 Dec 2012 17:14:38 +0000 (10:14 -0700)
committer Karl Williamson <public@khwilliamson.com>
Sun, 9 Dec 2012 19:08:28 +0000 (12:08 -0700)
diff --git a/regexec.c b/regexec.c

index d235cde..b3149f6 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -1437,637 +1437,630 @@ STATIC char *
  S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, 
      const char *strend, regmatch_info *reginfo)
  {
-       dVAR;
-       const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
-       char *pat_string;   /* The pattern's exactish string */
-       char *pat_end;      /* ptr to end char of pat_string */
-       re_fold_t folder;       /* Function for computing non-utf8 folds */
-       const U8 *fold_array;   /* array for folding ords < 256 */
-       STRLEN ln;
-       STRLEN lnc;
-       STRLEN uskip;
-       U8 c1;
-       U8 c2;
-       char *e;
-       I32 tmp = 1;    /* Scratch variable? */
-       const bool utf8_target = PL_reg_match_utf8;
-       UV utf8_fold_flags = 0;
-        RXi_GET_DECL(prog,progi);
-
-       PERL_ARGS_ASSERT_FIND_BYCLASS;
-        
-       /* We know what class it must start with. */
-       switch (OP(c)) {
-       case ANYOF:
-           if (utf8_target) {
-               REXEC_FBC_UTF8_CLASS_SCAN(
-                          reginclass(prog, c, (U8*)s, utf8_target));
-           }
-           else {
-               REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
-           }
-           break;
-       case CANY:
-           REXEC_FBC_SCAN(
-               if (tmp && (!reginfo || regtry(reginfo, &s)))
-                   goto got_it;
-               else
-                   tmp = doevery;
-           );
-           break;
-
-       case EXACTFA:
-           if (UTF_PATTERN || utf8_target) {
-               utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
-               goto do_exactf_utf8;
-           }
-           fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
-           folder = foldEQ_latin1;         /* /a, except the sharp s one which */
-           goto do_exactf_non_utf8;        /* isn't dealt with by these */
+    dVAR;
+    const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
+    char *pat_string;   /* The pattern's exactish string */
+    char *pat_end;         /* ptr to end char of pat_string */
+    re_fold_t folder;  /* Function for computing non-utf8 folds */
+    const U8 *fold_array;   /* array for folding ords < 256 */
+    STRLEN ln;
+    STRLEN lnc;
+    STRLEN uskip;
+    U8 c1;
+    U8 c2;
+    char *e;
+    I32 tmp = 1;       /* Scratch variable? */
+    const bool utf8_target = PL_reg_match_utf8;
+    UV utf8_fold_flags = 0;
+    RXi_GET_DECL(prog,progi);
  
-       case EXACTF:
-           if (utf8_target) {
+    PERL_ARGS_ASSERT_FIND_BYCLASS;
  
-               /* regcomp.c already folded this if pattern is in UTF-8 */
-               utf8_fold_flags = 0;
-               goto do_exactf_utf8;
-           }
-           fold_array = PL_fold;
-           folder = foldEQ;
-           goto do_exactf_non_utf8;
+    /* We know what class it must start with. */
+    switch (OP(c)) {
+    case ANYOF:
+        if (utf8_target) {
+            REXEC_FBC_UTF8_CLASS_SCAN(
+                      reginclass(prog, c, (U8*)s, utf8_target));
+        }
+        else {
+            REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
+        }
+        break;
+    case CANY:
+        REXEC_FBC_SCAN(
+            if (tmp && (!reginfo || regtry(reginfo, &s)))
+                goto got_it;
+            else
+                tmp = doevery;
+        );
+        break;
  
-       case EXACTFL:
-           if (UTF_PATTERN || utf8_target) {
-               utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
-               goto do_exactf_utf8;
-           }
-           fold_array = PL_fold_locale;
-           folder = foldEQ_locale;
-           goto do_exactf_non_utf8;
+    case EXACTFA:
+        if (UTF_PATTERN || utf8_target) {
+            utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
+            goto do_exactf_utf8;
+        }
+        fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
+        folder = foldEQ_latin1;                /* /a, except the sharp s one which */
+        goto do_exactf_non_utf8;       /* isn't dealt with by these */
  
-       case EXACTFU_SS:
-           if (UTF_PATTERN) {
-               utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
-           }
-           goto do_exactf_utf8;
+    case EXACTF:
+        if (utf8_target) {
  
-       case EXACTFU_TRICKYFOLD:
-       case EXACTFU:
-           if (UTF_PATTERN || utf8_target) {
-               utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
-               goto do_exactf_utf8;
-           }
+            /* regcomp.c already folded this if pattern is in UTF-8 */
+            utf8_fold_flags = 0;
+            goto do_exactf_utf8;
+        }
+        fold_array = PL_fold;
+        folder = foldEQ;
+        goto do_exactf_non_utf8;
  
-           /* Any 'ss' in the pattern should have been replaced by regcomp,
-            * so we don't have to worry here about this single special case
-            * in the Latin1 range */
-           fold_array = PL_fold_latin1;
-           folder = foldEQ_latin1;
+    case EXACTFL:
+        if (UTF_PATTERN || utf8_target) {
+            utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
+            goto do_exactf_utf8;
+        }
+        fold_array = PL_fold_locale;
+        folder = foldEQ_locale;
+        goto do_exactf_non_utf8;
  
-           /* FALL THROUGH */
+    case EXACTFU_SS:
+        if (UTF_PATTERN) {
+            utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
+        }
+        goto do_exactf_utf8;
  
-       do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
-                              are no glitches with fold-length differences
-                              between the target string and pattern */
-
-           /* The idea in the non-utf8 EXACTF* cases is to first find the
-            * first character of the EXACTF* node and then, if necessary,
-            * case-insensitively compare the full text of the node.  c1 is the
-            * first character.  c2 is its fold.  This logic will not work for
-            * Unicode semantics and the german sharp ss, which hence should
-            * not be compiled into a node that gets here. */
-           pat_string = STRING(c);
-           ln  = STR_LEN(c);   /* length to match in octets/bytes */
-
-           /* We know that we have to match at least 'ln' bytes (which is the
-            * same as characters, since not utf8).  If we have to match 3
-            * characters, and there are only 2 availabe, we know without
-            * trying that it will fail; so don't start a match past the
-            * required minimum number from the far end */
-           e = HOP3c(strend, -((I32)ln), s);
-
-           if (!reginfo && e < s) {
-               e = s;                  /* Due to minlen logic of intuit() */
-           }
+    case EXACTFU_TRICKYFOLD:
+    case EXACTFU:
+        if (UTF_PATTERN || utf8_target) {
+            utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
+            goto do_exactf_utf8;
+        }
  
-           c1 = *pat_string;
-           c2 = fold_array[c1];
-           if (c1 == c2) { /* If char and fold are the same */
-               REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
-           }
-           else {
-               REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
-           }
-           break;
+        /* Any 'ss' in the pattern should have been replaced by regcomp,
+         * so we don't have to worry here about this single special case
+         * in the Latin1 range */
+        fold_array = PL_fold_latin1;
+        folder = foldEQ_latin1;
+
+        /* FALL THROUGH */
+
+    do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
+                           are no glitches with fold-length differences
+                           between the target string and pattern */
+
+        /* The idea in the non-utf8 EXACTF* cases is to first find the
+         * first character of the EXACTF* node and then, if necessary,
+         * case-insensitively compare the full text of the node.  c1 is the
+         * first character.  c2 is its fold.  This logic will not work for
+         * Unicode semantics and the german sharp ss, which hence should
+         * not be compiled into a node that gets here. */
+        pat_string = STRING(c);
+        ln  = STR_LEN(c);      /* length to match in octets/bytes */
+
+        /* We know that we have to match at least 'ln' bytes (which is the
+         * same as characters, since not utf8).  If we have to match 3
+         * characters, and there are only 2 availabe, we know without
+         * trying that it will fail; so don't start a match past the
+         * required minimum number from the far end */
+        e = HOP3c(strend, -((I32)ln), s);
+
+        if (!reginfo && e < s) {
+            e = s;                     /* Due to minlen logic of intuit() */
+        }
  
-       do_exactf_utf8:
-       {
-           unsigned expansion;
-
-
-           /* If one of the operands is in utf8, we can't use the simpler
-            * folding above, due to the fact that many different characters
-            * can have the same fold, or portion of a fold, or different-
-            * length fold */
-           pat_string = STRING(c);
-           ln  = STR_LEN(c);   /* length to match in octets/bytes */
-           pat_end = pat_string + ln;
-           lnc = (UTF_PATTERN) /* length to match in characters */
-                   ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
-                   : ln;
-
-           /* We have 'lnc' characters to match in the pattern, but because of
-            * multi-character folding, each character in the target can match
-            * up to 3 characters (Unicode guarantees it will never exceed
-            * this) if it is utf8-encoded; and up to 2 if not (based on the
-            * fact that the Latin 1 folds are already determined, and the
-            * only multi-char fold in that range is the sharp-s folding to
-            * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
-            * string character.  Adjust lnc accordingly, rounding up, so that
-            * if we need to match at least 4+1/3 chars, that really is 5. */
-           expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
-           lnc = (lnc + expansion - 1) / expansion;
-
-           /* As in the non-UTF8 case, if we have to match 3 characters, and
-            * only 2 are left, it's guaranteed to fail, so don't start a
-            * match that would require us to go beyond the end of the string
-            */
-           e = HOP3c(strend, -((I32)lnc), s);
+        c1 = *pat_string;
+        c2 = fold_array[c1];
+        if (c1 == c2) { /* If char and fold are the same */
+            REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
+        }
+        else {
+            REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
+        }
+        break;
  
-           if (!reginfo && e < s) {
-               e = s;                  /* Due to minlen logic of intuit() */
-           }
+    do_exactf_utf8:
+    {
+        unsigned expansion;
+
+        /* If one of the operands is in utf8, we can't use the simpler folding
+         * above, due to the fact that many different characters can have the
+         * same fold, or portion of a fold, or different- length fold */
+        pat_string = STRING(c);
+        ln  = STR_LEN(c);      /* length to match in octets/bytes */
+        pat_end = pat_string + ln;
+        lnc = (UTF_PATTERN)     /* length to match in characters */
+                ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
+                : ln;
+
+        /* We have 'lnc' characters to match in the pattern, but because of
+         * multi-character folding, each character in the target can match
+         * up to 3 characters (Unicode guarantees it will never exceed
+         * this) if it is utf8-encoded; and up to 2 if not (based on the
+         * fact that the Latin 1 folds are already determined, and the
+         * only multi-char fold in that range is the sharp-s folding to
+         * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
+         * string character.  Adjust lnc accordingly, rounding up, so that
+         * if we need to match at least 4+1/3 chars, that really is 5. */
+        expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
+        lnc = (lnc + expansion - 1) / expansion;
+
+        /* As in the non-UTF8 case, if we have to match 3 characters, and
+         * only 2 are left, it's guaranteed to fail, so don't start a
+         * match that would require us to go beyond the end of the string
+         */
+        e = HOP3c(strend, -((I32)lnc), s);
+
+        if (!reginfo && e < s) {
+            e = s;                     /* Due to minlen logic of intuit() */
+        }
  
-           /* XXX Note that we could recalculate e to stop the loop earlier,
-            * as the worst case expansion above will rarely be met, and as we
-            * go along we would usually find that e moves further to the left.
-            * This would happen only after we reached the point in the loop
-            * where if there were no expansion we should fail.  Unclear if
-            * worth the expense */
-
-           while (s <= e) {
-               char *my_strend= (char *)strend;
-               if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
-                     pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
-                   && (!reginfo || regtry(reginfo, &s)) )
-               {
-                   goto got_it;
-               }
-               s += (utf8_target) ? UTF8SKIP(s) : 1;
-           }
-           break;
-       }
-       case BOUNDL:
-           PL_reg_flags |= RF_tainted;
-           FBC_BOUND(isALNUM_LC,
-                     isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
-                     isALNUM_LC_utf8((U8*)s));
-           break;
-       case NBOUNDL:
-           PL_reg_flags |= RF_tainted;
-           FBC_NBOUND(isALNUM_LC,
-                      isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
-                      isALNUM_LC_utf8((U8*)s));
-           break;
-       case BOUND:
-           FBC_BOUND(isWORDCHAR,
-                     isALNUM_uni(tmp),
-                     cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
-           break;
-       case BOUNDA:
-           FBC_BOUND_NOLOAD(isWORDCHAR_A,
-                            isWORDCHAR_A(tmp),
-                            isWORDCHAR_A((U8*)s));
-           break;
-       case NBOUND:
-           FBC_NBOUND(isWORDCHAR,
-                      isALNUM_uni(tmp),
-                      cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
-           break;
-       case NBOUNDA:
-           FBC_NBOUND_NOLOAD(isWORDCHAR_A,
-                             isWORDCHAR_A(tmp),
-                             isWORDCHAR_A((U8*)s));
-           break;
-       case BOUNDU:
-           FBC_BOUND(isWORDCHAR_L1,
-                     isALNUM_uni(tmp),
-                     cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
-           break;
-       case NBOUNDU:
-           FBC_NBOUND(isWORDCHAR_L1,
-                      isALNUM_uni(tmp),
-                      cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
-           break;
-       case ALNUML:
-           REXEC_FBC_CSCAN_TAINT(
-               isALNUM_LC_utf8((U8*)s),
-               isALNUM_LC(*s)
-           );
-           break;
-       case ALNUMU:
-           REXEC_FBC_CSCAN_PRELOAD(
-               LOAD_UTF8_CHARCLASS_ALNUM(),
-               swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
-                isWORDCHAR_L1((U8) *s)
-           );
-           break;
-       case ALNUM:
-           REXEC_FBC_CSCAN_PRELOAD(
-               LOAD_UTF8_CHARCLASS_ALNUM(),
-               swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
-                isWORDCHAR((U8) *s)
-           );
-           break;
-       case ALNUMA:
-           /* Don't need to worry about utf8, as it can match only a single
-            * byte invariant character */
-           REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
-           break;
-       case NALNUMU:
-           REXEC_FBC_CSCAN_PRELOAD(
-               LOAD_UTF8_CHARCLASS_ALNUM(),
-               !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
-                ! isWORDCHAR_L1((U8) *s)
-           );
-           break;
-       case NALNUM:
-           REXEC_FBC_CSCAN_PRELOAD(
-               LOAD_UTF8_CHARCLASS_ALNUM(),
-               !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
-                ! isALNUM(*s)
-           );
-           break;
-       case NALNUMA:
-           REXEC_FBC_CSCAN(
-               !isWORDCHAR_A(*s),
-               !isWORDCHAR_A(*s)
-           );
-           break;
-       case NALNUML:
-           REXEC_FBC_CSCAN_TAINT(
-               !isALNUM_LC_utf8((U8*)s),
-               !isALNUM_LC(*s)
-           );
-           break;
-       case SPACEU:
-           REXEC_FBC_CSCAN(
-               is_XPERLSPACE_utf8(s),
-                isSPACE_L1((U8) *s)
-           );
-           break;
-       case SPACE:
-           REXEC_FBC_CSCAN(
-               is_XPERLSPACE_utf8(s),
-                isSPACE((U8) *s)
-           );
-           break;
-       case SPACEA:
-           /* Don't need to worry about utf8, as it can match only a single
-            * byte invariant character */
-           REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
-           break;
-       case SPACEL:
-           REXEC_FBC_CSCAN_TAINT(
-               isSPACE_LC_utf8((U8*)s),
-               isSPACE_LC(*s)
-           );
-           break;
-       case NSPACEU:
-           REXEC_FBC_CSCAN(
-               ! is_XPERLSPACE_utf8(s),
-                ! isSPACE_L1((U8) *s)
-           );
-           break;
-       case NSPACE:
-           REXEC_FBC_CSCAN(
-               ! is_XPERLSPACE_utf8(s),
-                ! isSPACE((U8) *s)
-           );
-           break;
-       case NSPACEA:
-           REXEC_FBC_CSCAN(
-               !isSPACE_A(*s),
-               !isSPACE_A(*s)
-           );
-           break;
-       case NSPACEL:
-           REXEC_FBC_CSCAN_TAINT(
-               !isSPACE_LC_utf8((U8*)s),
-               !isSPACE_LC(*s)
-           );
-           break;
-       case DIGIT:
-           REXEC_FBC_CSCAN_PRELOAD(
-               LOAD_UTF8_CHARCLASS_DIGIT(),
-               swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
-               isDIGIT(*s)
-           );
-           break;
-       case DIGITA:
-           /* Don't need to worry about utf8, as it can match only a single
-            * byte invariant character */
-           REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
-           break;
-       case DIGITL:
-           REXEC_FBC_CSCAN_TAINT(
-               isDIGIT_LC_utf8((U8*)s),
-               isDIGIT_LC(*s)
-           );
-           break;
-       case NDIGIT:
-           REXEC_FBC_CSCAN_PRELOAD(
-               LOAD_UTF8_CHARCLASS_DIGIT(),
-               !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
-               !isDIGIT(*s)
-           );
-           break;
-       case NDIGITA:
-           REXEC_FBC_CSCAN(
-               !isDIGIT_A(*s),
-               !isDIGIT_A(*s)
-           );
-           break;
-       case NDIGITL:
-           REXEC_FBC_CSCAN_TAINT(
-               !isDIGIT_LC_utf8((U8*)s),
-               !isDIGIT_LC(*s)
-           );
-           break;
-       case LNBREAK:
-           REXEC_FBC_CSCAN(
-               is_LNBREAK_utf8_safe(s, strend),
-               is_LNBREAK_latin1_safe(s, strend)
-           );
-           break;
-       case VERTWS:
-           REXEC_FBC_CSCAN(
-               is_VERTWS_utf8_safe(s, strend),
-               is_VERTWS_latin1_safe(s, strend)
-           );
-           break;
-       case NVERTWS:
-           REXEC_FBC_CSCAN(
-               !is_VERTWS_utf8_safe(s, strend),
-               !is_VERTWS_latin1_safe(s, strend)
-           );
-           break;
-       case HORIZWS:
-           REXEC_FBC_CSCAN(
-               is_HORIZWS_utf8_safe(s, strend),
-               is_HORIZWS_latin1_safe(s, strend)
-           );
-           break;
-       case NHORIZWS:
-           REXEC_FBC_CSCAN(
-               !is_HORIZWS_utf8_safe(s, strend),
-               !is_HORIZWS_latin1_safe(s, strend)
-           );      
-           break;
-       case POSIXA:
-           /* Don't need to worry about utf8, as it can match only a single
-            * byte invariant character.  The flag in this node type is the
-            * class number to pass to _generic_isCC() to build a mask for
-            * searching in PL_charclass[] */
-           REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
-           break;
-       case NPOSIXA:
-           REXEC_FBC_CSCAN(
-               !_generic_isCC_A(*s, FLAGS(c)),
-               !_generic_isCC_A(*s, FLAGS(c))
-           );
-           break;
+        /* XXX Note that we could recalculate e to stop the loop earlier,
+         * as the worst case expansion above will rarely be met, and as we
+         * go along we would usually find that e moves further to the left.
+         * This would happen only after we reached the point in the loop
+         * where if there were no expansion we should fail.  Unclear if
+         * worth the expense */
+
+        while (s <= e) {
+            char *my_strend= (char *)strend;
+            if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
+                  pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
+                && (!reginfo || regtry(reginfo, &s)) )
+            {
+                goto got_it;
+            }
+            s += (utf8_target) ? UTF8SKIP(s) : 1;
+        }
+        break;
+    }
+    case BOUNDL:
+        PL_reg_flags |= RF_tainted;
+        FBC_BOUND(isALNUM_LC,
+                  isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
+                  isALNUM_LC_utf8((U8*)s));
+        break;
+    case NBOUNDL:
+        PL_reg_flags |= RF_tainted;
+        FBC_NBOUND(isALNUM_LC,
+                   isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
+                   isALNUM_LC_utf8((U8*)s));
+        break;
+    case BOUND:
+        FBC_BOUND(isWORDCHAR,
+                  isALNUM_uni(tmp),
+                  cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
+        break;
+    case BOUNDA:
+        FBC_BOUND_NOLOAD(isWORDCHAR_A,
+                         isWORDCHAR_A(tmp),
+                         isWORDCHAR_A((U8*)s));
+        break;
+    case NBOUND:
+        FBC_NBOUND(isWORDCHAR,
+                   isALNUM_uni(tmp),
+                   cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
+        break;
+    case NBOUNDA:
+        FBC_NBOUND_NOLOAD(isWORDCHAR_A,
+                          isWORDCHAR_A(tmp),
+                          isWORDCHAR_A((U8*)s));
+        break;
+    case BOUNDU:
+        FBC_BOUND(isWORDCHAR_L1,
+                  isALNUM_uni(tmp),
+                  cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
+        break;
+    case NBOUNDU:
+        FBC_NBOUND(isWORDCHAR_L1,
+                   isALNUM_uni(tmp),
+                   cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
+        break;
+    case ALNUML:
+        REXEC_FBC_CSCAN_TAINT(
+            isALNUM_LC_utf8((U8*)s),
+            isALNUM_LC(*s)
+        );
+        break;
+    case ALNUMU:
+        REXEC_FBC_CSCAN_PRELOAD(
+            LOAD_UTF8_CHARCLASS_ALNUM(),
+            swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
+            isWORDCHAR_L1((U8) *s)
+        );
+        break;
+    case ALNUM:
+        REXEC_FBC_CSCAN_PRELOAD(
+            LOAD_UTF8_CHARCLASS_ALNUM(),
+            swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
+            isWORDCHAR((U8) *s)
+        );
+        break;
+    case ALNUMA:
+        /* Don't need to worry about utf8, as it can match only a single
+         * byte invariant character */
+        REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
+        break;
+    case NALNUMU:
+        REXEC_FBC_CSCAN_PRELOAD(
+            LOAD_UTF8_CHARCLASS_ALNUM(),
+            !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
+            ! isWORDCHAR_L1((U8) *s)
+        );
+        break;
+    case NALNUM:
+        REXEC_FBC_CSCAN_PRELOAD(
+            LOAD_UTF8_CHARCLASS_ALNUM(),
+            !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
+            ! isALNUM(*s)
+        );
+        break;
+    case NALNUMA:
+        REXEC_FBC_CSCAN(
+            !isWORDCHAR_A(*s),
+            !isWORDCHAR_A(*s)
+        );
+        break;
+    case NALNUML:
+        REXEC_FBC_CSCAN_TAINT(
+            !isALNUM_LC_utf8((U8*)s),
+            !isALNUM_LC(*s)
+        );
+        break;
+    case SPACEU:
+        REXEC_FBC_CSCAN(
+            is_XPERLSPACE_utf8(s),
+            isSPACE_L1((U8) *s)
+        );
+        break;
+    case SPACE:
+        REXEC_FBC_CSCAN(
+            is_XPERLSPACE_utf8(s),
+            isSPACE((U8) *s)
+        );
+        break;
+    case SPACEA:
+        /* Don't need to worry about utf8, as it can match only a single
+         * byte invariant character */
+        REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
+        break;
+    case SPACEL:
+        REXEC_FBC_CSCAN_TAINT(
+            isSPACE_LC_utf8((U8*)s),
+            isSPACE_LC(*s)
+        );
+        break;
+    case NSPACEU:
+        REXEC_FBC_CSCAN(
+            ! is_XPERLSPACE_utf8(s),
+            ! isSPACE_L1((U8) *s)
+        );
+        break;
+    case NSPACE:
+        REXEC_FBC_CSCAN(
+            ! is_XPERLSPACE_utf8(s),
+            ! isSPACE((U8) *s)
+        );
+        break;
+    case NSPACEA:
+        REXEC_FBC_CSCAN(
+            !isSPACE_A(*s),
+            !isSPACE_A(*s)
+        );
+        break;
+    case NSPACEL:
+        REXEC_FBC_CSCAN_TAINT(
+            !isSPACE_LC_utf8((U8*)s),
+            !isSPACE_LC(*s)
+        );
+        break;
+    case DIGIT:
+        REXEC_FBC_CSCAN_PRELOAD(
+            LOAD_UTF8_CHARCLASS_DIGIT(),
+            swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
+            isDIGIT(*s)
+        );
+        break;
+    case DIGITA:
+        /* Don't need to worry about utf8, as it can match only a single
+         * byte invariant character */
+        REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
+        break;
+    case DIGITL:
+        REXEC_FBC_CSCAN_TAINT(
+            isDIGIT_LC_utf8((U8*)s),
+            isDIGIT_LC(*s)
+        );
+        break;
+    case NDIGIT:
+        REXEC_FBC_CSCAN_PRELOAD(
+            LOAD_UTF8_CHARCLASS_DIGIT(),
+            !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
+            !isDIGIT(*s)
+        );
+        break;
+    case NDIGITA:
+        REXEC_FBC_CSCAN(
+            !isDIGIT_A(*s),
+            !isDIGIT_A(*s)
+        );
+        break;
+    case NDIGITL:
+        REXEC_FBC_CSCAN_TAINT(
+            !isDIGIT_LC_utf8((U8*)s),
+            !isDIGIT_LC(*s)
+        );
+        break;
+    case LNBREAK:
+        REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
+                        is_LNBREAK_latin1_safe(s, strend)
+        );
+        break;
+    case VERTWS:
+        REXEC_FBC_CSCAN(
+            is_VERTWS_utf8_safe(s, strend),
+            is_VERTWS_latin1_safe(s, strend)
+        );
+        break;
+    case NVERTWS:
+        REXEC_FBC_CSCAN(
+            !is_VERTWS_utf8_safe(s, strend),
+            !is_VERTWS_latin1_safe(s, strend)
+        );
+        break;
+    case HORIZWS:
+        REXEC_FBC_CSCAN(
+            is_HORIZWS_utf8_safe(s, strend),
+            is_HORIZWS_latin1_safe(s, strend)
+        );
+        break;
+    case NHORIZWS:
+        REXEC_FBC_CSCAN(
+            !is_HORIZWS_utf8_safe(s, strend),
+            !is_HORIZWS_latin1_safe(s, strend)
+        );
+        break;
+    case POSIXA:
+        /* Don't need to worry about utf8, as it can match only a single
+        * byte invariant character.  The flag in this node type is the
+        * class number to pass to _generic_isCC() to build a mask for
+        * searching in PL_charclass[] */
+        REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
+        break;
+    case NPOSIXA:
+        REXEC_FBC_CSCAN(
+            !_generic_isCC_A(*s, FLAGS(c)),
+            !_generic_isCC_A(*s, FLAGS(c))
+        );
+        break;
  
-       case AHOCORASICKC:
-       case AHOCORASICK: 
-           {
-               DECL_TRIE_TYPE(c);
-                /* what trie are we using right now */
-               reg_ac_data *aho
-                   = (reg_ac_data*)progi->data->data[ ARG( c ) ];
-               reg_trie_data *trie
-                   = (reg_trie_data*)progi->data->data[ aho->trie ];
-               HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
+    case AHOCORASICKC:
+    case AHOCORASICK:
+        {
+            DECL_TRIE_TYPE(c);
+            /* what trie are we using right now */
+            reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
+            reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
+            HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
  
-               const char *last_start = strend - trie->minlen;
+            const char *last_start = strend - trie->minlen;
  #ifdef DEBUGGING
-               const char *real_start = s;
+            const char *real_start = s;
  #endif
-               STRLEN maxlen = trie->maxlen;
-               SV *sv_points;
-               U8 **points; /* map of where we were in the input string
-                               when reading a given char. For ASCII this
-                               is unnecessary overhead as the relationship
-                               is always 1:1, but for Unicode, especially
-                               case folded Unicode this is not true. */
-               U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
-               U8 *bitmap=NULL;
-
-
-                GET_RE_DEBUG_FLAGS_DECL;
-
-                /* We can't just allocate points here. We need to wrap it in
-                 * an SV so it gets freed properly if there is a croak while
-                 * running the match */
-                ENTER;
-               SAVETMPS;
-                sv_points=newSV(maxlen * sizeof(U8 *));
-                SvCUR_set(sv_points,
-                    maxlen * sizeof(U8 *));
-                SvPOK_on(sv_points);
-                sv_2mortal(sv_points);
-                points=(U8**)SvPV_nolen(sv_points );
-                if ( trie_type != trie_utf8_fold 
-                     && (trie->bitmap || OP(c)==AHOCORASICKC) ) 
-                {
-                    if (trie->bitmap) 
-                        bitmap=(U8*)trie->bitmap;
-                    else
-                        bitmap=(U8*)ANYOF_BITMAP(c);
-                }
-                /* this is the Aho-Corasick algorithm modified a touch
-                   to include special handling for long "unknown char" 
-                   sequences. The basic idea being that we use AC as long
-                   as we are dealing with a possible matching char, when
-                   we encounter an unknown char (and we have not encountered
-                   an accepting state) we scan forward until we find a legal 
-                   starting char. 
-                   AC matching is basically that of trie matching, except
-                   that when we encounter a failing transition, we fall back
-                   to the current states "fail state", and try the current char 
-                   again, a process we repeat until we reach the root state, 
-                   state 1, or a legal transition. If we fail on the root state 
-                   then we can either terminate if we have reached an accepting 
-                   state previously, or restart the entire process from the beginning 
-                   if we have not.
+            STRLEN maxlen = trie->maxlen;
+            SV *sv_points;
+            U8 **points; /* map of where we were in the input string
+                            when reading a given char. For ASCII this
+                            is unnecessary overhead as the relationship
+                            is always 1:1, but for Unicode, especially
+                            case folded Unicode this is not true. */
+            U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+            U8 *bitmap=NULL;
+
+
+            GET_RE_DEBUG_FLAGS_DECL;
+
+            /* We can't just allocate points here. We need to wrap it in
+             * an SV so it gets freed properly if there is a croak while
+             * running the match */
+            ENTER;
+            SAVETMPS;
+            sv_points=newSV(maxlen * sizeof(U8 *));
+            SvCUR_set(sv_points,
+                maxlen * sizeof(U8 *));
+            SvPOK_on(sv_points);
+            sv_2mortal(sv_points);
+            points=(U8**)SvPV_nolen(sv_points );
+            if ( trie_type != trie_utf8_fold
+                 && (trie->bitmap || OP(c)==AHOCORASICKC) )
+            {
+                if (trie->bitmap)
+                    bitmap=(U8*)trie->bitmap;
+                else
+                    bitmap=(U8*)ANYOF_BITMAP(c);
+            }
+            /* this is the Aho-Corasick algorithm modified a touch
+               to include special handling for long "unknown char" sequences.
+               The basic idea being that we use AC as long as we are dealing
+               with a possible matching char, when we encounter an unknown char
+               (and we have not encountered an accepting state) we scan forward
+               until we find a legal starting char.
+               AC matching is basically that of trie matching, except that when
+               we encounter a failing transition, we fall back to the current
+               states "fail state", and try the current char again, a process
+               we repeat until we reach the root state, state 1, or a legal
+               transition. If we fail on the root state then we can either
+               terminate if we have reached an accepting state previously, or
+               restart the entire process from the beginning if we have not.
  
-                 */
-                while (s <= last_start) {
-                    const U32 uniflags = UTF8_ALLOW_DEFAULT;
-                    U8 *uc = (U8*)s;
-                    U16 charid = 0;
-                    U32 base = 1;
-                    U32 state = 1;
-                    UV uvc = 0;
-                    STRLEN len = 0;
-                    STRLEN foldlen = 0;
-                    U8 *uscan = (U8*)NULL;
-                    U8 *leftmost = NULL;
-#ifdef DEBUGGING                    
-                    U32 accepted_word= 0;
+             */
+            while (s <= last_start) {
+                const U32 uniflags = UTF8_ALLOW_DEFAULT;
+                U8 *uc = (U8*)s;
+                U16 charid = 0;
+                U32 base = 1;
+                U32 state = 1;
+                UV uvc = 0;
+                STRLEN len = 0;
+                STRLEN foldlen = 0;
+                U8 *uscan = (U8*)NULL;
+                U8 *leftmost = NULL;
+#ifdef DEBUGGING
+                U32 accepted_word= 0;
  #endif
-                    U32 pointpos = 0;
-
-                    while ( state && uc <= (U8*)strend ) {
-                        int failed=0;
-                        U32 word = aho->states[ state ].wordnum;
-
-                        if( state==1 ) {
-                            if ( bitmap ) {
-                                DEBUG_TRIE_EXECUTE_r(
-                                    if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
-                                        dump_exec_pos( (char *)uc, c, strend, real_start, 
-                                            (char *)uc, utf8_target );
-                                        PerlIO_printf( Perl_debug_log,
-                                            " Scanning for legal start char...\n");
-                                    }
-                                );
-                               if (utf8_target) {
-                                   while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
-                                       uc += UTF8SKIP(uc);
-                                   }
-                               } else {
-                                   while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
-                                       uc++;
-                                   }
-                               }
-                                s= (char *)uc;
-                            }
-                            if (uc >(U8*)last_start) break;
-                        }
-                                            
-                        if ( word ) {
-                            U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
-                            if (!leftmost || lpos < leftmost) {
-                                DEBUG_r(accepted_word=word);
-                                leftmost= lpos;
+                U32 pointpos = 0;
+
+                while ( state && uc <= (U8*)strend ) {
+                    int failed=0;
+                    U32 word = aho->states[ state ].wordnum;
+
+                    if( state==1 ) {
+                        if ( bitmap ) {
+                            DEBUG_TRIE_EXECUTE_r(
+                                if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
+                                    dump_exec_pos( (char *)uc, c, strend, real_start,
+                                        (char *)uc, utf8_target );
+                                    PerlIO_printf( Perl_debug_log,
+                                        " Scanning for legal start char...\n");
+                                }
+                            );
+                            if (utf8_target) {
+                                while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
+                                    uc += UTF8SKIP(uc);
+                                }
+                            } else {
+                                while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
+                                    uc++;
+                                }
                              }
-                            if (base==0) break;
-                            
-                        }
-                        points[pointpos++ % maxlen]= uc;
-                        if (foldlen || uc < (U8*)strend) {
-                            REXEC_TRIE_READ_CHAR(trie_type, trie,
-                                             widecharmap, uc,
-                                            uscan, len, uvc, charid, foldlen,
-                                            foldbuf, uniflags);
-                            DEBUG_TRIE_EXECUTE_r({
-                                dump_exec_pos( (char *)uc, c, strend,
-                                            real_start, s, utf8_target);
-                                PerlIO_printf(Perl_debug_log,
-                                    " Charid:%3u CP:%4"UVxf" ",
-                                     charid, uvc);
-                            });
+                            s= (char *)uc;
                          }
-                        else {
-                            len = 0;
-                            charid = 0;
+                        if (uc >(U8*)last_start) break;
+                    }
+
+                    if ( word ) {
+                        U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
+                        if (!leftmost || lpos < leftmost) {
+                            DEBUG_r(accepted_word=word);
+                            leftmost= lpos;
                          }
+                        if (base==0) break;
+
+                    }
+                    points[pointpos++ % maxlen]= uc;
+                    if (foldlen || uc < (U8*)strend) {
+                        REXEC_TRIE_READ_CHAR(trie_type, trie,
+                                         widecharmap, uc,
+                                         uscan, len, uvc, charid, foldlen,
+                                         foldbuf, uniflags);
+                        DEBUG_TRIE_EXECUTE_r({
+                            dump_exec_pos( (char *)uc, c, strend,
+                                        real_start, s, utf8_target);
+                            PerlIO_printf(Perl_debug_log,
+                                " Charid:%3u CP:%4"UVxf" ",
+                                 charid, uvc);
+                        });
+                    }
+                    else {
+                        len = 0;
+                        charid = 0;
+                    }
  
  
-                        do {
+                    do {
  #ifdef DEBUGGING
-                            word = aho->states[ state ].wordnum;
+                        word = aho->states[ state ].wordnum;
  #endif
-                            base = aho->states[ state ].trans.base;
-
-                            DEBUG_TRIE_EXECUTE_r({
-                                if (failed) 
-                                    dump_exec_pos( (char *)uc, c, strend, real_start, 
-                                        s,   utf8_target );
-                                PerlIO_printf( Perl_debug_log,
-                                    "%sState: %4"UVxf", word=%"UVxf,
-                                    failed ? " Fail transition to " : "",
-                                    (UV)state, (UV)word);
-                            });
-                            if ( base ) {
-                                U32 tmp;
-                               I32 offset;
-                                if (charid &&
-                                    ( ((offset = base + charid
-                                       - 1 - trie->uniquecharcount)) >= 0)
-                                     && ((U32)offset < trie->lasttrans)
-                                     && trie->trans[offset].check == state
-                                     && (tmp=trie->trans[offset].next))
-                                {
-                                    DEBUG_TRIE_EXECUTE_r(
-                                        PerlIO_printf( Perl_debug_log," - legal\n"));
-                                    state = tmp;
-                                    break;
-                                }
-                                else {
-                                    DEBUG_TRIE_EXECUTE_r(
-                                        PerlIO_printf( Perl_debug_log," - fail\n"));
-                                    failed = 1;
-                                    state = aho->fail[state];
-                                }
+                        base = aho->states[ state ].trans.base;
+
+                        DEBUG_TRIE_EXECUTE_r({
+                            if (failed)
+                                dump_exec_pos( (char *)uc, c, strend, real_start,
+                                    s,   utf8_target );
+                            PerlIO_printf( Perl_debug_log,
+                                "%sState: %4"UVxf", word=%"UVxf,
+                                failed ? " Fail transition to " : "",
+                                (UV)state, (UV)word);
+                        });
+                        if ( base ) {
+                            U32 tmp;
+                            I32 offset;
+                            if (charid &&
+                                 ( ((offset = base + charid
+                                    - 1 - trie->uniquecharcount)) >= 0)
+                                 && ((U32)offset < trie->lasttrans)
+                                 && trie->trans[offset].check == state
+                                 && (tmp=trie->trans[offset].next))
+                            {
+                                DEBUG_TRIE_EXECUTE_r(
+                                    PerlIO_printf( Perl_debug_log," - legal\n"));
+                                state = tmp;
+                                break;
                              }
                              else {
-                                /* we must be accepting here */
                                  DEBUG_TRIE_EXECUTE_r(
-                                        PerlIO_printf( Perl_debug_log," - accepting\n"));
+                                    PerlIO_printf( Perl_debug_log," - fail\n"));
                                  failed = 1;
-                                break;
+                                state = aho->fail[state];
                              }
-                        } while(state);
-                        uc += len;
-                        if (failed) {
-                            if (leftmost)
-                                break;
-                            if (!state) state = 1;
                          }
-                    }
-                    if ( aho->states[ state ].wordnum ) {
-                        U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
-                        if (!leftmost || lpos < leftmost) {
-                            DEBUG_r(accepted_word=aho->states[ state ].wordnum);
-                            leftmost = lpos;
+                        else {
+                            /* we must be accepting here */
+                            DEBUG_TRIE_EXECUTE_r(
+                                    PerlIO_printf( Perl_debug_log," - accepting\n"));
+                            failed = 1;
+                            break;
                          }
+                    } while(state);
+                    uc += len;
+                    if (failed) {
+                        if (leftmost)
+                            break;
+                        if (!state) state = 1;
                      }
-                    if (leftmost) {
-                        s = (char*)leftmost;
-                        DEBUG_TRIE_EXECUTE_r({
-                            PerlIO_printf( 
-                                Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
-                                (UV)accepted_word, (IV)(s - real_start)
-                            );
-                        });
-                        if (!reginfo || regtry(reginfo, &s)) {
-                            FREETMPS;
-                           LEAVE;
-                            goto got_it;
-                        }
-                        s = HOPc(s,1);
-                        DEBUG_TRIE_EXECUTE_r({
-                            PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
-                        });
-                    } else {
-                        DEBUG_TRIE_EXECUTE_r(
-                            PerlIO_printf( Perl_debug_log,"No match.\n"));
-                        break;
+                }
+                if ( aho->states[ state ].wordnum ) {
+                    U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
+                    if (!leftmost || lpos < leftmost) {
+                        DEBUG_r(accepted_word=aho->states[ state ].wordnum);
+                        leftmost = lpos;
                      }
                  }
-                FREETMPS;
-                LEAVE;
-           }
-           break;
-       default:
-           Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
-           break;
-       }
-       return 0;
-      got_it:
-       return s;
+                if (leftmost) {
+                    s = (char*)leftmost;
+                    DEBUG_TRIE_EXECUTE_r({
+                        PerlIO_printf(
+                            Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
+                            (UV)accepted_word, (IV)(s - real_start)
+                        );
+                    });
+                    if (!reginfo || regtry(reginfo, &s)) {
+                        FREETMPS;
+                        LEAVE;
+                        goto got_it;
+                    }
+                    s = HOPc(s,1);
+                    DEBUG_TRIE_EXECUTE_r({
+                        PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
+                    });
+                } else {
+                    DEBUG_TRIE_EXECUTE_r(
+                        PerlIO_printf( Perl_debug_log,"No match.\n"));
+                    break;
+                }
+            }
+            FREETMPS;
+            LEAVE;
+        }
+        break;
+    default:
+        Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
+        break;
+    }
+    return 0;
+  got_it:
+    return s;
  }
author	Karl Williamson <public@khwilliamson.com>
	Thu, 6 Dec 2012 17:14:38 +0000 (10:14 -0700)
committer	Karl Williamson <public@khwilliamson.com>
	Sun, 9 Dec 2012 19:08:28 +0000 (12:08 -0700)