regex: Separate nodes for Unicode semantics \s \w
author Karl Williamson <public@khwilliamson.com>
Mon, 27 Dec 2010 19:04:58 +0000 (12:04 -0700)
committer Karl Williamson <public@khwilliamson.com>
Mon, 17 Jan 2011 02:13:23 +0000 (19:13 -0700)
This patch converts the Unicode-semantics variants of \s, \w and their
complements to use separate nodes instead of the flags field of the
existing nodes.  This gains some efficiency, especially useful in tight
loops and backtracking of regexec.c, and prepares the way for easily
adding other semantic variations, such as /a.

It refactors the CCC_TRY... macros.  I tried to break this piece up into
smaller chunks, but found it much easier to get to this in one step.
Further patches will do some more refactoring of these.

As part of the CCC_TRY macro refactoring, the lines that include the
test 'if (! nextchr)' are changed to detect the end of the string by
position (locinput >= PL_regeol) rather than by the next character
being NUL.  In a locale it is possible, however unlikely, that NUL is a
real alphabetic, digit, or space character.

regcomp.c
regexec.c

index e1f08e0..cbcabdf 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -3632,7 +3632,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
-                            if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
+                            if (OP(scan) == ALNUMU) {
                                 for (value = 0; value < 256; value++) {
                                     if (!isWORDCHAR_L1(value)) {
                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
@@ -3650,7 +3650,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    else {
                        if (data->start_class->flags & ANYOF_LOCALE)
                            ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
-                        else if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
+                        else if (OP(scan) == ALNUMU) {
                             for (value = 0; value < 256; value++) {
                                 if (isWORDCHAR_L1(value)) {
                                     ANYOF_BITMAP_SET(data->start_class, value);
@@ -3669,7 +3669,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
-                            if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
+                            if (OP(scan) == NALNUMU) {
                                 for (value = 0; value < 256; value++) {
                                     if (isWORDCHAR_L1(value)) {
                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
@@ -3688,7 +3688,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        if (data->start_class->flags & ANYOF_LOCALE)
                            ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
                        else {
-                           if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
+                            if (OP(scan) == NALNUMU) {
                                 for (value = 0; value < 256; value++) {
                                     if (! isWORDCHAR_L1(value)) {
                                         ANYOF_BITMAP_SET(data->start_class, value);
@@ -3708,7 +3708,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
-                           if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
+                           if (OP(scan) == SPACEU) {
                                 for (value = 0; value < 256; value++) {
                                     if (!isSPACE_L1(value)) {
                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
@@ -3727,7 +3727,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         if (data->start_class->flags & ANYOF_LOCALE) {
                            ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
                         }
-                        else if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
+                        else if (OP(scan) == SPACEU) {
                             for (value = 0; value < 256; value++) {
                                 if (isSPACE_L1(value)) {
                                     ANYOF_BITMAP_SET(data->start_class, value);
@@ -3746,7 +3746,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    if (flags & SCF_DO_STCLASS_AND) {
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
-                            if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
+                            if (OP(scan) == NSPACEU) {
                                 for (value = 0; value < 256; value++) {
                                     if (isSPACE_L1(value)) {
                                         ANYOF_BITMAP_CLEAR(data->start_class, value);
@@ -3764,7 +3764,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    else {
                        if (data->start_class->flags & ANYOF_LOCALE)
                            ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
-                        else if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
+                        else if (OP(scan) == NSPACEU) {
                             for (value = 0; value < 256; value++) {
                                 if (!isSPACE_L1(value)) {
                                     ANYOF_BITMAP_SET(data->start_class, value);
@@ -7191,6 +7191,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
     register regnode *ret = NULL;
     I32 flags;
     char *parse_start = RExC_parse;
+    U8 op;
     GET_RE_DEBUG_FLAGS_DECL;
     DEBUG_PARSE("atom");
     *flagp = WORST;            /* Tentatively. */
@@ -7362,21 +7363,37 @@ tryagain:
            *flagp |= HASWIDTH;
            goto finish_meta_pat;
        case 'w':
-           if (LOC) {
-                ret = reg_node(pRExC_state, (U8)(ALNUML));
-            } else {
-                ret = reg_node(pRExC_state, (U8)(ALNUM));
+           switch (get_regex_charset(RExC_flags)) {
+               case REGEX_LOCALE_CHARSET:
+                   op = ALNUML;
+                   break;
+               case REGEX_UNICODE_CHARSET:
+                   op = ALNUMU;
+                   break;
+               case REGEX_DEPENDS_CHARSET:
+                   op = ALNUM;
+                   break;
+               default:
+                   goto bad_charset;
             }
-           FLAGS(ret) = get_regex_charset(RExC_flags);
+           ret = reg_node(pRExC_state, op);
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'W':
-            if (LOC) {
-                ret = reg_node(pRExC_state, (U8)(NALNUML));
-            } else {
-                ret = reg_node(pRExC_state, (U8)(NALNUM));
+           switch (get_regex_charset(RExC_flags)) {
+               case REGEX_LOCALE_CHARSET:
+                   op = NALNUML;
+                   break;
+               case REGEX_UNICODE_CHARSET:
+                   op = NALNUMU;
+                   break;
+               case REGEX_DEPENDS_CHARSET:
+                   op = NALNUM;
+                   break;
+               default:
+                   goto bad_charset;
             }
-           FLAGS(ret) = get_regex_charset(RExC_flags);
+           ret = reg_node(pRExC_state, op);
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'b':
@@ -7402,21 +7419,37 @@ tryagain:
            *flagp |= SIMPLE;
            goto finish_meta_pat;
        case 's':
-            if (LOC) {
-                ret = reg_node(pRExC_state, (U8)(SPACEL));
-            } else {
-                ret = reg_node(pRExC_state, (U8)(SPACE));
+           switch (get_regex_charset(RExC_flags)) {
+               case REGEX_LOCALE_CHARSET:
+                   op = SPACEL;
+                   break;
+               case REGEX_UNICODE_CHARSET:
+                   op = SPACEU;
+                   break;
+               case REGEX_DEPENDS_CHARSET:
+                   op = SPACE;
+                   break;
+               default:
+                   goto bad_charset;
             }
-           FLAGS(ret) = get_regex_charset(RExC_flags);
+           ret = reg_node(pRExC_state, op);
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'S':
-            if (LOC) {
-                ret = reg_node(pRExC_state, (U8)(NSPACEL));
-            } else {
-                ret = reg_node(pRExC_state, (U8)(NSPACE));
+           switch (get_regex_charset(RExC_flags)) {
+               case REGEX_LOCALE_CHARSET:
+                   op = NSPACEL;
+                   break;
+               case REGEX_UNICODE_CHARSET:
+                   op = NSPACEU;
+                   break;
+               case REGEX_DEPENDS_CHARSET:
+                   op = NSPACE;
+                   break;
+               default:
+                   goto bad_charset;
             }
-           FLAGS(ret) = get_regex_charset(RExC_flags);
+           ret = reg_node(pRExC_state, op);
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'd':
@@ -7962,6 +7995,11 @@ tryagain:
     }
 
     return(ret);
+
+/* Jumped to when an unrecognized character set is encountered */
+bad_charset:
+    Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
+    return(NULL);
 }
 
 STATIC char *
index cb5eb10..47cee7e 100644 (file)
--- a/regexec.c
+++ b/regexec.c
 #endif
 
 
-#define _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)          \
-        case NAMEL:                                                         \
-            PL_reg_flags |= RF_tainted;                                     \
-            /* FALL THROUGH */                                              \
-        case NAME:                                                          \
-            if (!nextchr)                                                   \
-                sayNO;                                                      \
-            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                \
-                if (!CAT2(PL_utf8_,CLASS)) {                                \
-                    LOAD_UTF8_CHARCLASS(CLASS, STR);                        \
-                }                                                           \
-                if (!(OP(scan) == NAME                                      \
-                    ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
-                    : LCFUNC_utf8((U8*)locinput)))                          \
-                {                                                           \
-                    sayNO;                                                  \
-                }                                                           \
-                locinput += PL_utf8skip[nextchr];                           \
-                nextchr = UCHARAT(locinput);                                \
-                break;                                                      \
-            }                                                               \
-           /* Drops through to the macro that calls this one */
-
-#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)           \
-    _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)              \
-            if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))      \
-                sayNO;                                                      \
-            nextchr = UCHARAT(++locinput);                                  \
-            break
-
-/* Almost identical to the above, but has a case for a node that matches chars
- * between 128 and 255 using Unicode (latin1) semantics. */
-#define CCC_TRY_AFF_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
-    _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
-            if (!(OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) == REGEX_UNICODE_CHARSET))))) \
-                sayNO;                                                       \
-            nextchr = UCHARAT(++locinput);                                   \
-            break
-
-#define _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)           \
-        case NAMEL:                                                          \
-            PL_reg_flags |= RF_tainted;                                      \
-            /* FALL THROUGH */                                               \
-        case NAME :                                                          \
-            if (!nextchr && locinput >= PL_regeol)                           \
-                sayNO;                                                       \
-            if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                 \
-                if (!CAT2(PL_utf8_,CLASS)) {                                 \
-                    LOAD_UTF8_CHARCLASS(CLASS, STR);                         \
-                }                                                            \
-                if ((OP(scan) == NAME                                        \
-                    ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
-                    : LCFUNC_utf8((U8*)locinput)))                           \
-                {                                                            \
-                    sayNO;                                                   \
-                }                                                            \
-                locinput += PL_utf8skip[nextchr];                            \
-                nextchr = UCHARAT(locinput);                                 \
-                break;                                                       \
-            }
-
-#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)            \
-    _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
-            if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))        \
-                sayNO;                                                       \
-            nextchr = UCHARAT(++locinput);                                   \
-            break
-
-
-#define CCC_TRY_NEG_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
-    _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU)              \
-            if ((OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) == REGEX_UNICODE_CHARSET))))) \
-                sayNO;                                                       \
-            nextchr = UCHARAT(++locinput);                                   \
-            break
-
+#define _CCC_TRY_CODE(LOAD, CLASS, STR, FUNC, TEST, POS_OR_NEG)     \
+    if (locinput >= PL_regeol) {                                    \
+       sayNO;                                                      \
+    }                                                               \
+    if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                \
+       LOAD(CLASS, STR);                                           \
+       if (POS_OR_NEG (TEST)) {                                    \
+           sayNO;                                                  \
+       }                                                           \
+       locinput += PL_utf8skip[nextchr];                           \
+       nextchr = UCHARAT(locinput);                                \
+       break;                                                      \
+    }                                                               \
+    if (POS_OR_NEG (FUNC(nextchr))) {                               \
+       sayNO;                                                      \
+    }                                                               \
+    nextchr = UCHARAT(++locinput);                                  \
+    break;
+
+# define _CCC_TRY_AFF_INTERIOR(LOAD, CLASS, STR, FUNC, TEST)      \
+    _CCC_TRY_CODE(LOAD, CLASS, STR, FUNC, TEST, ! )
+
+# define _CCC_TRY_NEG_INTERIOR(LOAD, CLASS, STR, FUNC, TEST)      \
+    _CCC_TRY_CODE(LOAD, CLASS, STR, FUNC, TEST, )
+
+#define CCC_TRY_AFF(NAME, NAMEL, CLASS, STR, LCFUNC_utf8, FUNC, LCFUNC)           \
+    case NAMEL:                                                         \
+       PL_reg_flags |= RF_tainted;                                     \
+       _CCC_TRY_AFF_INTERIOR(LOAD_UTF8_CHARCLASS, CLASS, STR, LCFUNC, LCFUNC_utf8((U8*)locinput))      \
+    case NAME:                                                          \
+       _CCC_TRY_AFF_INTERIOR(LOAD_UTF8_CHARCLASS, CLASS, STR, FUNC, cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target)))        \
+
+#define CCC_TRY_AFF_U(NAME, NAMEL, NAMEU, CLASS, STR, LCFUNC_utf8, FUNC, FUNCU, LCFUNC)         \
+    CCC_TRY_AFF(NAME, NAMEL, CLASS, STR, LCFUNC_utf8, FUNC, LCFUNC)           \
+    case NAMEU:                                                         \
+       _CCC_TRY_AFF_INTERIOR(LOAD_UTF8_CHARCLASS, CLASS, STR, FUNCU, cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target)))
+
+#define CCC_TRY_NEG(NAME, NAMEL, CLASS, STR, LCFUNC_utf8, FUNC, LCFUNC)        \
+    case NAMEL:                                                                \
+       PL_reg_flags |= RF_tainted;                                            \
+       _CCC_TRY_NEG_INTERIOR(LOAD_UTF8_CHARCLASS, CLASS, STR, LCFUNC, LCFUNC_utf8((U8*)locinput))      \
+    case NAME:                                                          \
+       _CCC_TRY_NEG_INTERIOR(LOAD_UTF8_CHARCLASS, CLASS, STR, FUNC, cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target)))
+
+#define CCC_TRY_NEG_U(NAME, NAMEL, NAMEU, CLASS, STR, LCFUNC_utf8, FUNC, FUNCU, LCFUNC)         \
+    CCC_TRY_NEG(NAME, NAMEL, CLASS, STR, LCFUNC_utf8, FUNC, LCFUNC)           \
+    case NAMEU:                                                         \
+       _CCC_TRY_NEG_INTERIOR(LOAD_UTF8_CHARCLASS, CLASS, STR, FUNCU, cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target)))
 
 
 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
@@ -1606,44 +1578,68 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
            if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
                goto got_it;
            break;
-       case ALNUM:
-           REXEC_FBC_CSCAN_PRELOAD(
-               LOAD_UTF8_CHARCLASS_PERL_WORD(),
-               swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
-                (FLAGS(c) == REGEX_UNICODE_CHARSET) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)
-           );
        case ALNUML:
            REXEC_FBC_CSCAN_TAINT(
                isALNUM_LC_utf8((U8*)s),
                isALNUM_LC(*s)
            );
+       case ALNUMU:
+           REXEC_FBC_CSCAN_PRELOAD(
+               LOAD_UTF8_CHARCLASS_PERL_WORD(),
+               swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
+                isWORDCHAR_L1((U8) *s)
+           );
+       case ALNUM:
+           REXEC_FBC_CSCAN_PRELOAD(
+               LOAD_UTF8_CHARCLASS_PERL_WORD(),
+               swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
+                isWORDCHAR((U8) *s)
+           );
+       case NALNUMU:
+           REXEC_FBC_CSCAN_PRELOAD(
+               LOAD_UTF8_CHARCLASS_PERL_WORD(),
+               swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target),
+                ! isWORDCHAR_L1((U8) *s)
+           );
        case NALNUM:
            REXEC_FBC_CSCAN_PRELOAD(
                LOAD_UTF8_CHARCLASS_PERL_WORD(),
                !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
-                ! ((FLAGS(c) == REGEX_UNICODE_CHARSET) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s))
+                ! isALNUM(*s)
            );
        case NALNUML:
            REXEC_FBC_CSCAN_TAINT(
                !isALNUM_LC_utf8((U8*)s),
                !isALNUM_LC(*s)
            );
+       case SPACEU:
+           REXEC_FBC_CSCAN_PRELOAD(
+               LOAD_UTF8_CHARCLASS_PERL_SPACE(),
+               *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
+                isSPACE_L1((U8) *s)
+           );
        case SPACE:
            REXEC_FBC_CSCAN_PRELOAD(
                LOAD_UTF8_CHARCLASS_PERL_SPACE(),
                *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
-                isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) == REGEX_UNICODE_CHARSET))
+                isSPACE((U8) *s)
            );
        case SPACEL:
            REXEC_FBC_CSCAN_TAINT(
                isSPACE_LC_utf8((U8*)s),
                isSPACE_LC(*s)
            );
+       case NSPACEU:
+           REXEC_FBC_CSCAN_PRELOAD(
+               LOAD_UTF8_CHARCLASS_PERL_SPACE(),
+               !( *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
+                ! isSPACE_L1((U8) *s)
+           );
        case NSPACE:
            REXEC_FBC_CSCAN_PRELOAD(
                LOAD_UTF8_CHARCLASS_PERL_SPACE(),
                !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
-                !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) == REGEX_UNICODE_CHARSET)))
+                ! isSPACE((U8) *s)
            );
        case NSPACEL:
            REXEC_FBC_CSCAN_TAINT(
@@ -3686,11 +3682,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
            }
            break;
        /* Special char classes - The defines start on line 129 or so */
-        CCC_TRY_AFF_U( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
-        CCC_TRY_NEG_U(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
+        CCC_TRY_AFF_U( ALNUM,  ALNUML,  ALNUMU, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR, isWORDCHAR_L1, isALNUM_LC);
+        CCC_TRY_NEG_U(NALNUM, NALNUML, NALNUMU, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR, isWORDCHAR_L1, isALNUM_LC);
 
-        CCC_TRY_AFF_U( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
-        CCC_TRY_NEG_U(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
+        CCC_TRY_AFF_U( SPACE,  SPACEL,  SPACEU, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_L1, isSPACE_LC);
+        CCC_TRY_NEG_U(NSPACE, NSPACEL, NSPACEU, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_L1, isSPACE_LC);
 
        CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
        CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
@@ -5922,8 +5918,9 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
                scan++;
        }
        break;
-    case ALNUM:
+    case ALNUMU:
        if (utf8_target) {
+    utf8_wordchar:
            loceol = PL_regeol;
            LOAD_UTF8_CHARCLASS_ALNUM();
            while (hardcount < max && scan < loceol &&
@@ -5932,14 +5929,17 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
                scan += UTF8SKIP(scan);
                hardcount++;
            }
-        } else if (FLAGS(p) == REGEX_UNICODE_CHARSET) {
+        } else {
             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
                 scan++;
             }
-       } else {
-            while (scan < loceol && isALNUM((U8) *scan)) {
-                scan++;
-            }
+       }
+       break;
+    case ALNUM:
+       if (utf8_target)
+           goto utf8_wordchar;
+       while (scan < loceol && isALNUM((U8) *scan)) {
+           scan++;
        }
        break;
     case ALNUML:
@@ -5956,24 +5956,30 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
                scan++;
        }
        break;
-    case NALNUM:
+    case NALNUMU:
        if (utf8_target) {
+
+    utf8_Nwordchar:
+
            loceol = PL_regeol;
            LOAD_UTF8_CHARCLASS_ALNUM();
            while (hardcount < max && scan < loceol &&
-                   !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
+                   ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
             {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
-        } else if (FLAGS(p) == REGEX_UNICODE_CHARSET) {
+        } else {
             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
                 scan++;
             }
-       } else {
-            while (scan < loceol && ! isALNUM((U8) *scan)) {
-                scan++;
-            }
+       }
+       break;
+    case NALNUM:
+       if (utf8_target)
+           goto utf8_Nwordchar;
+       while (scan < loceol && ! isALNUM((U8) *scan)) {
+           scan++;
        }
        break;
     case NALNUML:
@@ -5990,8 +5996,11 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
                scan++;
        }
        break;
-    case SPACE:
+    case SPACEU:
        if (utf8_target) {
+
+    utf8_space:
+
            loceol = PL_regeol;
            LOAD_UTF8_CHARCLASS_SPACE();
            while (hardcount < max && scan < loceol &&
@@ -6001,13 +6010,20 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
                scan += UTF8SKIP(scan);
                hardcount++;
            }
-        } else if (FLAGS(p) == REGEX_UNICODE_CHARSET) {
+           break;
+       }
+       else {
             while (scan < loceol && isSPACE_L1((U8) *scan)) {
                 scan++;
             }
-       } else {
-            while (scan < loceol && isSPACE((U8) *scan))
-                scan++;
+           break;
+       }
+    case SPACE:
+       if (utf8_target)
+           goto utf8_space;
+
+       while (scan < loceol && isSPACE((U8) *scan)) {
+           scan++;
        }
        break;
     case SPACEL:
@@ -6024,25 +6040,34 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
                scan++;
        }
        break;
-    case NSPACE:
+    case NSPACEU:
        if (utf8_target) {
+
+    utf8_Nspace:
+
            loceol = PL_regeol;
            LOAD_UTF8_CHARCLASS_SPACE();
            while (hardcount < max && scan < loceol &&
-                  !(*scan == ' ' ||
-                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
+                  ! (*scan == ' ' ||
+                      swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
             {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
-        } else if (FLAGS(p) == REGEX_UNICODE_CHARSET) {
+           break;
+       }
+       else {
             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
                 scan++;
             }
-       } else {
-            while (scan < loceol && ! isSPACE((U8) *scan)) {
-                scan++;
-            }
+       }
+       break;
+    case NSPACE:
+       if (utf8_target)
+           goto utf8_Nspace;
+
+       while (scan < loceol && ! isSPACE((U8) *scan)) {
+           scan++;
        }
        break;
     case NSPACEL: