regex: Remove old code that tried to handle multi-char folds
author Karl Williamson <public@khwilliamson.com>
Sat, 13 Oct 2012 14:49:26 +0000 (08:49 -0600)
committer Karl Williamson <public@khwilliamson.com>
Sun, 14 Oct 2012 15:03:37 +0000 (09:03 -0600)
A recent commit has changed the algorithm used to handle multi-character
folding in bracketed character classes.  The old code is no longer
needed.

embed.fnc
embed.h
proto.h
regcomp.c
regcomp.sym
regexec.c
regnodes.h

diff --git a/embed.fnc b/embed.fnc
index 43f131b..b85173d 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1040,7 +1040,6 @@ Ap        |SV*    |regclass_swash |NULLOK const regexp *prog \
                                |NN const struct regnode *node|bool doinit \
                                |NULLOK SV **listsvp|NULLOK SV **altsvp
 #ifdef PERL_IN_REGCOMP_C
-EMs    |void   |add_alternate  |NN AV** alternate_ptr|NN U8* string|STRLEN len
 EMsR   |SV*    |_new_invlist_C_array|NN UV* list
 : Not used currently: EXMs     |bool   |_invlistEQ     |NN SV* const a|NN SV* const b|bool complement_b
 #endif
@@ -2014,7 +2013,7 @@ Es        |void   |regcppop       |NN regexp *rex
 ERsn   |U8*    |reghop3        |NN U8 *s|I32 off|NN const U8 *lim
 ERsM   |SV*    |core_regclass_swash|NULLOK const regexp *prog \
                                |NN const struct regnode *node|bool doinit \
-                               |NULLOK SV **listsvp|NULLOK SV **altsvp
+                               |NULLOK SV **listsvp
 :not currently used EiR        |bool   |is_utf8_X_LV           |NN const U8 *p
 EiR    |bool   |is_utf8_X_LVT          |NN const U8 *p
 #ifdef XXX_dmq
diff --git a/embed.h b/embed.h
index c0f1e9b..0b54ebb 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define _append_range_to_invlist(a,b,c)        S__append_range_to_invlist(aTHX_ a,b,c)
 #define _invlist_array_init(a,b)       S__invlist_array_init(aTHX_ a,b)
 #define _new_invlist_C_array(a)        S__new_invlist_C_array(aTHX_ a)
-#define add_alternate(a,b,c)   S_add_alternate(aTHX_ a,b,c)
 #define add_cp_to_invlist(a,b) S_add_cp_to_invlist(aTHX_ a,b)
 #define add_data               S_add_data
 #define alloc_maybe_populate_EXACT(a,b,c,d,e)  S_alloc_maybe_populate_EXACT(aTHX_ a,b,c,d,e)
 #define _swash_to_invlist(a)   Perl__swash_to_invlist(aTHX_ a)
 #  endif
 #  if defined(PERL_IN_REGEXEC_C)
-#define core_regclass_swash(a,b,c,d,e) S_core_regclass_swash(aTHX_ a,b,c,d,e)
+#define core_regclass_swash(a,b,c,d)   S_core_regclass_swash(aTHX_ a,b,c,d)
 #define find_byclass(a,b,c,d,e)        S_find_byclass(aTHX_ a,b,c,d,e)
 #define is_utf8_X_LVT(a)       S_is_utf8_X_LVT(aTHX_ a)
 #define reg_check_named_buff_matched(a,b)      S_reg_check_named_buff_matched(aTHX_ a,b)
diff --git a/proto.h b/proto.h
index e290c47..c6b05b7 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -6336,12 +6336,6 @@ STATIC SV*       S__new_invlist_C_array(pTHX_ UV* list)
 #define PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY  \
        assert(list)
 
-STATIC void    S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
-                       __attribute__nonnull__(pTHX_1)
-                       __attribute__nonnull__(pTHX_2);
-#define PERL_ARGS_ASSERT_ADD_ALTERNATE \
-       assert(alternate_ptr); assert(string)
-
 PERL_STATIC_INLINE SV* S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp)
                        __attribute__warn_unused_result__;
 
@@ -6740,7 +6734,7 @@ PERL_CALLCONV SV* Perl__swash_to_invlist(pTHX_ SV* const swash)
 
 #endif
 #if defined(PERL_IN_REGEXEC_C)
-STATIC SV*     S_core_regclass_swash(pTHX_ const regexp *prog, const struct regnode *node, bool doinit, SV **listsvp, SV **altsvp)
+STATIC SV*     S_core_regclass_swash(pTHX_ const regexp *prog, const struct regnode *node, bool doinit, SV **listsvp)
                        __attribute__warn_unused_result__
                        __attribute__nonnull__(pTHX_2);
 #define PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH   \
diff --git a/regcomp.c b/regcomp.c
index d9281f4..75736c6 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -11405,24 +11405,6 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
        }                                                                  \
     }
 
-STATIC void
-S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
-{
-    /* Adds input 'string' with length 'len' to the ANYOF node's unicode
-     * alternate list, pointed to by 'alternate_ptr'.  This is an array of
-     * the multi-character folds of characters in the node */
-    SV *sv;
-
-    PERL_ARGS_ASSERT_ADD_ALTERNATE;
-
-    if (! *alternate_ptr) {
-       *alternate_ptr = newAV();
-    }
-    sv = newSVpvn_utf8((char*)string, len, TRUE);
-    av_push(*alternate_ptr, sv);
-    return;
-}
-
 /* The names of properties whose definitions are not known at compile time are
  * stored in this SV, after a constant heading.  So if the length has been
  * changed since initialization, then there is a run-time definition. */
@@ -11500,8 +11482,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
      * of the target string */
     SV* cp_list = NULL;
 
-    /* List of multi-character folds that are matched by this node */
-    AV* unicode_alternate  = NULL;
 #ifdef EBCDIC
     /* In a range, counts how many 0-2 of the ends of it came from literals,
      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
@@ -12664,6 +12644,7 @@ parseit:
                U8 foldbuf[UTF8_MAXBYTES_CASE+1];
                STRLEN foldlen;
                 UV f;
+                SV** listp;
 
                 if (j < 256) {
 
@@ -12693,30 +12674,13 @@ parseit:
                         && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
                     {
                         /* Certain Latin1 characters have matches outside
-                         * Latin1, or are multi-character.  To get here, 'j' is
-                         * one of those characters.   None of these matches is
-                         * valid for ASCII characters under /aa, which is why
-                         * the 'if' just above excludes those.  The matches
-                         * fall into three categories:
-                         * 1) They are singly folded-to or -from an above 255
-                         *    character, e.g., LATIN SMALL LETTER Y WITH
-                         *    DIAERESIS and LATIN CAPITAL LETTER Y WITH
-                         *    DIAERESIS;
-                         * 2) They are part of a multi-char fold with another
-                         *    latin1 character; only LATIN SMALL LETTER
-                         *    SHARP S => "ss" fits this;
-                         * 3) They are part of a multi-char fold with a
-                         *    character outside of Latin1, such as various
-                         *    ligatures.
-                        * We aren't dealing fully with multi-char folds, except
-                        * we do deal with the pattern containing a character
-                        * that has a multi-char fold (not so much the inverse).
-                        * For types 1) and 3), the matches only happen when the
-                        * target string is utf8; that's not true for 2), and we
-                        * set a flag for it.
-                        *
-                        * The code below adds the single fold closures for 'j'
-                        * to the inversion list. */
+                         * Latin1.  To get here, <j> is one of those
+                         * characters.   None of these matches is valid for
+                         * ASCII characters under /aa, which is why the 'if'
+                         * just above excludes those.  These matches only
+                         * happen when the target string is utf8.  The code
+                         * below adds the single fold closures for <j> to the
+                         * inversion list. */
                         switch (j) {
                             case 'k':
                             case 'K':
@@ -12746,20 +12710,6 @@ parseit:
                             case LATIN_SMALL_LETTER_SHARP_S:
                                 cp_list = add_cp_to_invlist(cp_list,
                                                 LATIN_CAPITAL_LETTER_SHARP_S);
-
-                                /* Under /a, /d, and /u, this can match the two
-                                 * chars "ss" */
-                                if (! ASCII_FOLD_RESTRICTED) {
-                                    add_alternate(&unicode_alternate,
-                                                  (U8 *) "ss", 2);
-
-                                    /* And under /u or /a, it can match even if
-                                     * the target is not utf8 */
-                                    if (AT_LEAST_UNI_SEMANTICS) {
-                                        ANYOF_FLAGS(ret) |=
-                                                    ANYOF_NONBITMAP_NON_UTF8;
-                                    }
-                                }
                                 break;
                             case 'F': case 'f':
                             case 'I': case 'i':
@@ -12776,7 +12726,8 @@ parseit:
                                  * express, so they can't match unless the
                                  * target string is in UTF-8, so no action here
                                  * is necessary, as regexec.c properly handles
-                                 * the general case for UTF-8 matching */
+                                 * the general case for UTF-8 matching and
+                                 * multi-char folds */
                                 break;
                             default:
                                 /* Use deprecated warning to increase the
@@ -12789,50 +12740,19 @@ parseit:
                 }
 
                 /* Here is an above Latin1 character.  We don't have the rules
-                 * hard-coded for it.  First, get its fold */
+                 * hard-coded for it.  First, get its fold.  This is the simple
+                 * fold, as the multi-character folds have been handled earlier
+                 * and separated out */
                f = _to_uni_fold_flags(j, foldbuf, &foldlen,
-                                    ((allow_full_fold) ? FOLD_FLAGS_FULL : 0)
-                                    | ((LOC)
+                                        ((LOC)
                                         ? FOLD_FLAGS_LOCALE
                                         : (ASCII_FOLD_RESTRICTED)
                                             ? FOLD_FLAGS_NOMIX_ASCII
                                             : 0));
 
-               if (foldlen > (STRLEN)UNISKIP(f)) {
-
-                   /* Any multicharacter foldings (disallowed in lookbehind
-                    * patterns) require the following transform: [ABCDEF] ->
-                    * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
-                    * folds into "rst", all other characters fold to single
-                    * characters.  We save away these multicharacter foldings,
-                    * to be later saved as part of the additional "s" data. */
-                   if (! RExC_in_lookbehind) {
-                       U8* loc = foldbuf;
-                       U8* e = foldbuf + foldlen;
-
-                       /* If any of the folded characters of this are in the
-                        * Latin1 range, tell the regex engine that this can
-                        * match a non-utf8 target string.  */
-                        while (loc < e) {
-                            if (UTF8_IS_INVARIANT(*loc)
-                                || UTF8_IS_DOWNGRADEABLE_START(*loc))
-                            {
-                                ANYOF_FLAGS(ret)
-                                        |= ANYOF_NONBITMAP_NON_UTF8;
-                                break;
-                            }
-                            loc += UTF8SKIP(loc);
-                        }
-
-                       add_alternate(&unicode_alternate, foldbuf, foldlen);
-                   }
-               }
-                else {
                     /* Single character fold of above Latin1.  Add everything
                      * in its fold closure to the list that this node should
                      * match */
-                   SV** listp;
-
                    /* The fold closures data structure is a hash with the keys
                     * being every character that is folded to, like 'k', and
                     * the values each an array of everything that folds to its
@@ -12871,7 +12791,6 @@ parseit:
                            }
                        }
                    }
-               }
             }
        }
        SvREFCNT_dec(fold_intersection);
@@ -12980,7 +12899,6 @@ parseit:
     if (invert
         && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
        && ! depends_list
-       && ! unicode_alternate
        && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
     {
         _invlist_invert(cp_list);
@@ -12999,7 +12917,7 @@ parseit:
      * until runtime; set the run-time fold flag for these.  (We don't have to
      * worry about properties folding, as that is taken care of by the swash
      * fetching) */
-    if (FOLD && (LOC || unicode_alternate))
+    if (FOLD && LOC)
     {
        ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
     }
@@ -13020,7 +12938,6 @@ parseit:
      * node types they could possibly match using _invlistEQ(). */
 
     if (cp_list
-        && ! unicode_alternate
         && ! invert
         && ! depends_list
         && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
@@ -13209,12 +13126,10 @@ parseit:
     }
 
     if (! cp_list
-       && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
-       && ! unicode_alternate)
+       && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
     {
        ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
        SvREFCNT_dec(listsv);
-       SvREFCNT_dec(unicode_alternate);
     }
     else {
        /* av[0] stores the character class description in its textual form:
@@ -13223,8 +13138,7 @@ parseit:
         * av[1] if NULL, is a placeholder to later contain the swash computed
         *       from av[0].  But if no further computation need be done, the
         *       swash is stored there now.
-        * av[2] stores the multicharacter foldings, used later in
-        *       regexec.c:S_reginclass().
+         * av[2] is always NULL
         * av[3] stores the cp_list inversion list for use in addition or
         *       instead of av[0]; used only if av[1] is NULL
         * av[4] is set if any component of the class is from a user-defined
@@ -13247,17 +13161,7 @@ parseit:
            }
        }
 
-        /* Store any computed multi-char folds only if we are allowing
-         * them */
-        if (allow_full_fold) {
-            av_store(av, 2, MUTABLE_SV(unicode_alternate));
-            if (unicode_alternate) { /* This node is variable length */
-                OP(ret) = ANYOFV;
-            }
-        }
-        else {
             av_store(av, 2, NULL);
-        }
        rv = newRV_noinc(MUTABLE_SV(av));
        n = add_data(pRExC_state, 1, "s");
        RExC_rxi->data->data[n] = (void*)rv;
@@ -14068,7 +13972,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
 
        if (ANYOF_NONBITMAP(o)) {
            SV *lv; /* Set if there is something outside the bit map */
-           SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
+           SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
             bool byte_output = FALSE;   /* If something in the bitmap has been
                                            output */
 
diff --git a/regcomp.sym b/regcomp.sym
index e8400b9..3b4db9c 100644 (file)
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -55,7 +55,6 @@ REG_ANY     REG_ANY,    no 0 S    ; Match any one character (except newline).
 SANY        REG_ANY,    no 0 S    ; Match any one character.
 CANY        REG_ANY,    no 0 S    ; Match any one byte.
 ANYOF       ANYOF,      sv 0 S    ; Match character in (or not in) this class, single char match only
-ANYOFV      ANYOF,      sv 0 V    ; Match character in (or not in) this class, can match-multiple chars
 
 # Order (within each group) of the below is important.  See ordering comment
 # above.  The PLACEHOLDERn ones are wasting a value.  Right now, we have plenty
index febc222..5adb305 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -101,7 +101,7 @@ const char* const non_utf8_target_but_utf8_required
 #define        STATIC  static
 #endif
 
-/* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
+/* Valid for non-utf8 strings: avoids the reginclass
  * call if there are no complications: i.e., if everything matchable is
  * straight forward in the bitmap */
 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
@@ -1452,9 +1452,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         
        /* We know what class it must start with. */
        switch (OP(c)) {
-       case ANYOFV:
        case ANYOF:
-           if (utf8_target || OP(c) == ANYOFV) {
+           if (utf8_target) {
                STRLEN inclasslen = strend - s;
                REXEC_FBC_UTF8_CLASS_SCAN(
                           reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
@@ -4224,11 +4223,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                    sayNO;
            break;
 
-       case ANYOFV: /*  /[abx{df}]/i  */
        case ANYOF:  /*  /[abc]/       */
             if (NEXTCHR_IS_EOS)
                 sayNO;
-           if (utf8_target || state_num == ANYOFV) {
+           if (utf8_target) {
                STRLEN inclasslen = PL_regeol - locinput;
                if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
                    sayNO;
@@ -6613,9 +6611,8 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
            }
        }
        break;
-    case ANYOFV:
     case ANYOF:
-       if (utf8_target || OP(p) == ANYOFV) {
+       if (utf8_target) {
            STRLEN inclasslen;
            loceol = PL_regeol;
            inclasslen = loceol - scan;
@@ -7030,32 +7027,35 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
 /*
 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
-create a copy so that changes the caller makes won't change the shared one
+create a copy so that changes the caller makes won't change the shared one.
+If <altsvp> is non-null, will return NULL in it, for back-compat.
  */
 SV *
 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
 {
     PERL_ARGS_ASSERT_REGCLASS_SWASH;
-    return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp));
+
+    if (altsvp) {
+        *altsvp = NULL;
+    }
+
+    return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
 }
 #endif
 
 STATIC SV *
-S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
+S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp)
 {
     /* Returns the swash for the input 'node' in the regex 'prog'.
      * If <doinit> is true, will attempt to create the swash if not already
      *   done.
      * If <listsvp> is non-null, will return the swash initialization string in
      *   it.
-     * If <altsvp> is non-null, will return the alternates to the regular swash
-     *   in it
      * Tied intimately to how regcomp.c sets up the data structure */
 
     dVAR;
     SV *sw  = NULL;
     SV *si  = NULL;
-    SV *alt = NULL;
     SV*  invlist = NULL;
 
     RXi_GET_DECL(prog,progi);
@@ -7105,13 +7105,6 @@ S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bo
                                      &swash_init_flags);
                (void)av_store(av, 1, sw);
            }
-
-           /* Element [2] is for any multi-char folds.  Note that is a
-            * fundamentally flawed design, because can't backtrack and try
-            * again.  See [perl #89774] */
-           if (SvTYPE(ary[2]) == SVt_PVAV) {
-               alt = ary[2];
-           }
        }
     }
        
@@ -7136,9 +7129,6 @@ S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bo
        *listsvp = matches_string;
     }
 
-    if (altsvp)
-       *altsvp  = alt;
-
     return sw;
 }
 
@@ -7278,167 +7268,19 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
                             || (! (flags & ANYOF_LOCALE))
                             || (flags & ANYOF_IS_SYNTHETIC)))))
        {
-           AV *av;
-           SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av);
-
+           SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
            if (sw) {
                U8 * utf8_p;
                if (utf8_target) {
                    utf8_p = (U8 *) p;
-               } else {
-
-                   /* Not utf8.  Convert as much of the string as available up
-                    * to the limit of how far the (single) character in the
-                    * pattern can possibly match (no need to go further).  If
-                    * the node is a straight ANYOF or not folding, it can't
-                    * match more than one.  Otherwise, It can match up to how
-                    * far a single char can fold to.  Since not utf8, each
-                    * character is a single byte, so the max it can be in
-                    * bytes is the same as the max it can be in characters */
-                   STRLEN len = (OP(n) == ANYOF
-                                 || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
-                                 ? 1
-                                 : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
-                                   ? maxlen
-                                   : UTF8_MAX_FOLD_CHAR_EXPAND;
+               } else { /* Convert to utf8 */
+                   STRLEN len = 1;
                    utf8_p = bytes_to_utf8(p, &len);
                }
 
-               if (swash_fetch(sw, utf8_p, TRUE))
+               if (swash_fetch(sw, utf8_p, TRUE)) {
                    match = TRUE;
-               else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
-
-                   /* Here, we need to test if the fold of the target string
-                    * matches.  The non-multi char folds have all been moved to
-                     * the compilation phase, and the multi-char folds have
-                     * been stored by regcomp into 'av'; we linearly check to
-                     * see if any match the target string (folded).   We know
-                     * that the originals were each one character, but we don't
-                     * currently know how many characters/bytes each folded to,
-                     * except we do know that there are small limits imposed by
-                     * Unicode.  XXX A performance enhancement would be to have
-                     * regcomp.c store the max number of chars/bytes that are
-                     * in an av entry, as, say the 0th element.  Even better
-                     * would be to have a hash of the few characters that can
-                     * start a multi-char fold to the max number of chars of
-                     * those folds.
-                    *
-                    * If there is a match, we will need to advance (if lenp is
-                    * specified) the match pointer in the target string.  But
-                    * what we are comparing here isn't that string directly,
-                    * but its fold, whose length may differ from the original.
-                    * As we go along in constructing the fold, therefore, we
-                    * create a map so that we know how many bytes in the
-                    * source to advance given that we have matched a certain
-                    * number of bytes in the fold.  This map is stored in
-                    * 'map_fold_len_back'.  Let n mean the number of bytes in
-                    * the fold of the first character that we are folding.
-                    * Then map_fold_len_back[n] is set to the number of bytes
-                    * in that first character.  Similarly let m be the
-                    * corresponding number for the second character to be
-                    * folded.  Then map_fold_len_back[n+m] is set to the
-                    * number of bytes occupied by the first two source
-                    * characters. ... */
-                   U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
-                   U8 folded[UTF8_MAXBYTES_CASE+1];
-                   STRLEN foldlen = 0; /* num bytes in fold of 1st char */
-                   STRLEN total_foldlen = 0; /* num bytes in fold of all
-                                                 chars */
-
-                   if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
-
-                       /* Here, only need to fold the first char of the target
-                        * string.  It the source wasn't utf8, is 1 byte long */
-                       to_utf8_fold(utf8_p, folded, &foldlen);
-                       total_foldlen = foldlen;
-                       map_fold_len_back[foldlen] = (utf8_target)
-                                                    ? UTF8SKIP(utf8_p)
-                                                    : 1;
-                   }
-                   else {
-
-                       /* Here, need to fold more than the first char.  Do so
-                        * up to the limits */
-                       U8* source_ptr = utf8_p;    /* The source for the fold
-                                                      is the regex target
-                                                      string */
-                       U8* folded_ptr = folded;
-                       U8* e = utf8_p + maxlen;    /* Can't go beyond last
-                                                      available byte in the
-                                                      target string */
-                       U8 i;
-                       for (i = 0;
-                            i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
-                            i++)
-                       {
-
-                           /* Fold the next character */
-                           U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
-                           STRLEN this_char_foldlen;
-                           to_utf8_fold(source_ptr,
-                                        this_char_folded,
-                                        &this_char_foldlen);
-
-                           /* Bail if it would exceed the byte limit for
-                            * folding a single char. */
-                           if (this_char_foldlen + folded_ptr - folded >
-                                                           UTF8_MAXBYTES_CASE)
-                           {
-                               break;
-                           }
-
-                           /* Add the fold of this character */
-                           Copy(this_char_folded,
-                                folded_ptr,
-                                this_char_foldlen,
-                                U8);
-                           source_ptr += UTF8SKIP(source_ptr);
-                           folded_ptr += this_char_foldlen;
-                           total_foldlen = folded_ptr - folded;
-
-                           /* Create map from the number of bytes in the fold
-                            * back to the number of bytes in the source.  If
-                            * the source isn't utf8, the byte count is just
-                            * the number of characters so far */
-                           map_fold_len_back[total_foldlen]
-                                                     = (utf8_target)
-                                                       ? source_ptr - utf8_p
-                                                       : i + 1;
-                       }
-                       *folded_ptr = '\0';
-                   }
-
-
-                   /* Do the linear search to see if the fold is in the list
-                    * of multi-char folds. */
-                   if (av) {
-                       I32 i;
-                       for (i = 0; i <= av_len(av); i++) {
-                           SV* const sv = *av_fetch(av, i, FALSE);
-                           STRLEN len;
-                           const char * const s = SvPV_const(sv, len);
-
-                           if (len <= total_foldlen
-                               && memEQ(s, (char*)folded, len)
-
-                                  /* If 0, means matched a partial char. See
-                                   * [perl #90536] */
-                               && map_fold_len_back[len])
-                           {
-
-                               /* Advance the target string ptr to account for
-                                * this fold, but have to translate from the
-                                * folded length to the corresponding source
-                                * length. */
-                               if (lenp) {
-                                   *lenp = map_fold_len_back[len];
-                               }
-                               match = TRUE;
-                               break;
-                           }
-                       }
-                   }
-               }
+                }
 
                /* If we allocated a string above, free it */
                if (! utf8_target) Safefree(utf8_p);
diff --git a/regnodes.h b/regnodes.h
index b53487d..b8278cc 100644 (file)
--- a/regnodes.h
+++ b/regnodes.h
@@ -6,8 +6,8 @@
 
 /* Regops and State definitions */
 
-#define REGNODE_MAX            122
-#define REGMATCH_STATE_MAX     162
+#define REGNODE_MAX            121
+#define REGMATCH_STATE_MAX     161
 
 #define        END                     0       /* 0000 End of program. */
 #define        SUCCEED                 1       /* 0x01 Return from a subroutine, basically. */
 #define        SANY                    19      /* 0x13 Match any one character. */
 #define        CANY                    20      /* 0x14 Match any one byte. */
 #define        ANYOF                   21      /* 0x15 Match character in (or not in) this class, single char match only */
-#define        ANYOFV                  22      /* 0x16 Match character in (or not in) this class, can match-multiple chars */
-#define        ALNUM                   23      /* 0x17 Match any alphanumeric character using native charset semantics for non-utf8 */
-#define        ALNUML                  24      /* 0x18 Match any alphanumeric char in locale */
-#define        ALNUMU                  25      /* 0x19 Match any alphanumeric char using Unicode semantics */
-#define        ALNUMA                  26      /* 0x1a Match [A-Za-z_0-9] */
-#define        NALNUM                  27      /* 0x1b Match any non-alphanumeric character using native charset semantics for non-utf8 */
-#define        NALNUML                 28      /* 0x1c Match any non-alphanumeric char in locale */
-#define        NALNUMU                 29      /* 0x1d Match any non-alphanumeric char using Unicode semantics */
-#define        NALNUMA                 30      /* 0x1e Match [^A-Za-z_0-9] */
-#define        SPACE                   31      /* 0x1f Match any whitespace character using native charset semantics for non-utf8 */
-#define        SPACEL                  32      /* 0x20 Match any whitespace char in locale */
-#define        SPACEU                  33      /* 0x21 Match any whitespace char using Unicode semantics */
-#define        SPACEA                  34      /* 0x22 Match [ \t\n\f\r] */
-#define        NSPACE                  35      /* 0x23 Match any non-whitespace character using native charset semantics for non-utf8 */
-#define        NSPACEL                 36      /* 0x24 Match any non-whitespace char in locale */
-#define        NSPACEU                 37      /* 0x25 Match any non-whitespace char using Unicode semantics */
-#define        NSPACEA                 38      /* 0x26 Match [^ \t\n\f\r] */
-#define        DIGIT                   39      /* 0x27 Match any numeric character using native charset semantics for non-utf8 */
-#define        DIGITL                  40      /* 0x28 Match any numeric character in locale */
-#define        PLACEHOLDER1            41      /* 0x29 placeholder for missing DIGITU */
-#define        DIGITA                  42      /* 0x2a Match [0-9] */
-#define        NDIGIT                  43      /* 0x2b Match any non-numeric character using native charset semantics for non-utf8 */
-#define        NDIGITL                 44      /* 0x2c Match any non-numeric character in locale */
-#define        PLACEHOLDER2            45      /* 0x2d placeholder for missing NDIGITU */
-#define        NDIGITA                 46      /* 0x2e Match [^0-9] */
-#define        POSIXD                  47      /* 0x2f currently unused except as a placeholder */
-#define        POSIXL                  48      /* 0x30 currently unused except as a placeholder */
-#define        POSIXU                  49      /* 0x31 currently unused except as a placeholder */
-#define        POSIXA                  50      /* 0x32 Some [[:class:]] under /a; the FLAGS field gives which one */
-#define        NPOSIXD                 51      /* 0x33 currently unused except as a placeholder */
-#define        NPOSIXL                 52      /* 0x34 currently unused except as a placeholder */
-#define        NPOSIXU                 53      /* 0x35 currently unused except as a placeholder */
-#define        NPOSIXA                 54      /* 0x36 complement of POSIXA, [[:^class:]] */
-#define        CLUMP                   55      /* 0x37 Match any extended grapheme cluster sequence */
-#define        BRANCH                  56      /* 0x38 Match this alternative, or the next... */
-#define        BACK                    57      /* 0x39 Match "", "next" ptr points backward. */
-#define        EXACT                   58      /* 0x3a Match this string (preceded by length). */
-#define        EXACTF                  59      /* 0x3b Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). */
-#define        EXACTFL                 60      /* 0x3c Match this string (not guaranteed to be folded) using /il rules (w/len). */
-#define        EXACTFU                 61      /* 0x3d Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */
-#define        EXACTFA                 62      /* 0x3e Match this string (not guaranteed to be folded) using /iaa rules (w/len). */
-#define        EXACTFU_SS              63      /* 0x3f Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */
-#define        EXACTFU_TRICKYFOLD      64      /* 0x40 Match this folded UTF-8 string using /iu rules */
-#define        NOTHING                 65      /* 0x41 Match empty string. */
-#define        TAIL                    66      /* 0x42 Match empty string. Can jump here from outside. */
-#define        STAR                    67      /* 0x43 Match this (simple) thing 0 or more times. */
-#define        PLUS                    68      /* 0x44 Match this (simple) thing 1 or more times. */
-#define        CURLY                   69      /* 0x45 Match this simple thing {n,m} times. */
-#define        CURLYN                  70      /* 0x46 Capture next-after-this simple thing */
-#define        CURLYM                  71      /* 0x47 Capture this medium-complex thing {n,m} times. */
-#define        CURLYX                  72      /* 0x48 Match this complex thing {n,m} times. */
-#define        WHILEM                  73      /* 0x49 Do curly processing and see if rest matches. */
-#define        OPEN                    74      /* 0x4a Mark this point in input as start of */
-#define        CLOSE                   75      /* 0x4b Analogous to OPEN. */
-#define        REF                     76      /* 0x4c Match some already matched string */
-#define        REFF                    77      /* 0x4d Match already matched string, folded using native charset semantics for non-utf8 */
-#define        REFFL                   78      /* 0x4e Match already matched string, folded in loc. */
-#define        REFFU                   79      /* 0x4f Match already matched string, folded using unicode semantics for non-utf8 */
-#define        REFFA                   80      /* 0x50 Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
-#define        NREF                    81      /* 0x51 Match some already matched string */
-#define        NREFF                   82      /* 0x52 Match already matched string, folded using native charset semantics for non-utf8 */
-#define        NREFFL                  83      /* 0x53 Match already matched string, folded in loc. */
-#define        NREFFU                  84      /* 0x54 Match already matched string, folded using unicode semantics for non-utf8 */
-#define        NREFFA                  85      /* 0x55 Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
-#define        IFMATCH                 86      /* 0x56 Succeeds if the following matches. */
-#define        UNLESSM                 87      /* 0x57 Fails if the following matches. */
-#define        SUSPEND                 88      /* 0x58 "Independent" sub-RE. */
-#define        IFTHEN                  89      /* 0x59 Switch, should be preceded by switcher . */
-#define        GROUPP                  90      /* 0x5a Whether the group matched. */
-#define        LONGJMP                 91      /* 0x5b Jump far away. */
-#define        BRANCHJ                 92      /* 0x5c BRANCH with long offset. */
-#define        EVAL                    93      /* 0x5d Execute some Perl code. */
-#define        MINMOD                  94      /* 0x5e Next operator is not greedy. */
-#define        LOGICAL                 95      /* 0x5f Next opcode should set the flag only. */
-#define        RENUM                   96      /* 0x60 Group with independently numbered parens. */
-#define        TRIE                    97      /* 0x61 Match many EXACT(F[ALU]?)? at once. flags==type */
-#define        TRIEC                   98      /* 0x62 Same as TRIE, but with embedded charclass data */
-#define        AHOCORASICK             99      /* 0x63 Aho Corasick stclass. flags==type */
-#define        AHOCORASICKC            100     /* 0x64 Same as AHOCORASICK, but with embedded charclass data */
-#define        GOSUB                   101     /* 0x65 recurse to paren arg1 at (signed) ofs arg2 */
-#define        GOSTART                 102     /* 0x66 recurse to start of pattern */
-#define        NGROUPP                 103     /* 0x67 Whether the group matched. */
-#define        INSUBP                  104     /* 0x68 Whether we are in a specific recurse. */
-#define        DEFINEP                 105     /* 0x69 Never execute directly. */
-#define        ENDLIKE                 106     /* 0x6a Used only for the type field of verbs */
-#define        OPFAIL                  107     /* 0x6b Same as (?!) */
-#define        ACCEPT                  108     /* 0x6c Accepts the current matched string. */
-#define        VERB                    109     /* 0x6d Used only for the type field of verbs */
-#define        PRUNE                   110     /* 0x6e Pattern fails at this startpoint if no-backtracking through this */
-#define        MARKPOINT               111     /* 0x6f Push the current location for rollback by cut. */
-#define        SKIP                    112     /* 0x70 On failure skip forward (to the mark) before retrying */
-#define        COMMIT                  113     /* 0x71 Pattern fails outright if backtracking through this */
-#define        CUTGROUP                114     /* 0x72 On failure go to the next alternation in the group */
-#define        KEEPS                   115     /* 0x73 $& begins here. */
-#define        LNBREAK                 116     /* 0x74 generic newline pattern */
-#define        VERTWS                  117     /* 0x75 vertical whitespace         (Perl 6) */
-#define        NVERTWS                 118     /* 0x76 not vertical whitespace     (Perl 6) */
-#define        HORIZWS                 119     /* 0x77 horizontal whitespace       (Perl 6) */
-#define        NHORIZWS                120     /* 0x78 not horizontal whitespace   (Perl 6) */
-#define        OPTIMIZED               121     /* 0x79 Placeholder for dump. */
-#define        PSEUDO                  122     /* 0x7a Pseudo opcode for internal use. */
+#define        ALNUM                   22      /* 0x16 Match any alphanumeric character using native charset semantics for non-utf8 */
+#define        ALNUML                  23      /* 0x17 Match any alphanumeric char in locale */
+#define        ALNUMU                  24      /* 0x18 Match any alphanumeric char using Unicode semantics */
+#define        ALNUMA                  25      /* 0x19 Match [A-Za-z_0-9] */
+#define        NALNUM                  26      /* 0x1a Match any non-alphanumeric character using native charset semantics for non-utf8 */
+#define        NALNUML                 27      /* 0x1b Match any non-alphanumeric char in locale */
+#define        NALNUMU                 28      /* 0x1c Match any non-alphanumeric char using Unicode semantics */
+#define        NALNUMA                 29      /* 0x1d Match [^A-Za-z_0-9] */
+#define        SPACE                   30      /* 0x1e Match any whitespace character using native charset semantics for non-utf8 */
+#define        SPACEL                  31      /* 0x1f Match any whitespace char in locale */
+#define        SPACEU                  32      /* 0x20 Match any whitespace char using Unicode semantics */
+#define        SPACEA                  33      /* 0x21 Match [ \t\n\f\r] */
+#define        NSPACE                  34      /* 0x22 Match any non-whitespace character using native charset semantics for non-utf8 */
+#define        NSPACEL                 35      /* 0x23 Match any non-whitespace char in locale */
+#define        NSPACEU                 36      /* 0x24 Match any non-whitespace char using Unicode semantics */
+#define        NSPACEA                 37      /* 0x25 Match [^ \t\n\f\r] */
+#define        DIGIT                   38      /* 0x26 Match any numeric character using native charset semantics for non-utf8 */
+#define        DIGITL                  39      /* 0x27 Match any numeric character in locale */
+#define        PLACEHOLDER1            40      /* 0x28 placeholder for missing DIGITU */
+#define        DIGITA                  41      /* 0x29 Match [0-9] */
+#define        NDIGIT                  42      /* 0x2a Match any non-numeric character using native charset semantics for non-utf8 */
+#define        NDIGITL                 43      /* 0x2b Match any non-numeric character in locale */
+#define        PLACEHOLDER2            44      /* 0x2c placeholder for missing NDIGITU */
+#define        NDIGITA                 45      /* 0x2d Match [^0-9] */
+#define        POSIXD                  46      /* 0x2e currently unused except as a placeholder */
+#define        POSIXL                  47      /* 0x2f currently unused except as a placeholder */
+#define        POSIXU                  48      /* 0x30 currently unused except as a placeholder */
+#define        POSIXA                  49      /* 0x31 Some [[:class:]] under /a; the FLAGS field gives which one */
+#define        NPOSIXD                 50      /* 0x32 currently unused except as a placeholder */
+#define        NPOSIXL                 51      /* 0x33 currently unused except as a placeholder */
+#define        NPOSIXU                 52      /* 0x34 currently unused except as a placeholder */
+#define        NPOSIXA                 53      /* 0x35 complement of POSIXA, [[:^class:]] */
+#define        CLUMP                   54      /* 0x36 Match any extended grapheme cluster sequence */
+#define        BRANCH                  55      /* 0x37 Match this alternative, or the next... */
+#define        BACK                    56      /* 0x38 Match "", "next" ptr points backward. */
+#define        EXACT                   57      /* 0x39 Match this string (preceded by length). */
+#define        EXACTF                  58      /* 0x3a Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). */
+#define        EXACTFL                 59      /* 0x3b Match this string (not guaranteed to be folded) using /il rules (w/len). */
+#define        EXACTFU                 60      /* 0x3c Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */
+#define        EXACTFA                 61      /* 0x3d Match this string (not guaranteed to be folded) using /iaa rules (w/len). */
+#define        EXACTFU_SS              62      /* 0x3e Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */
+#define        EXACTFU_TRICKYFOLD      63      /* 0x3f Match this folded UTF-8 string using /iu rules */
+#define        NOTHING                 64      /* 0x40 Match empty string. */
+#define        TAIL                    65      /* 0x41 Match empty string. Can jump here from outside. */
+#define        STAR                    66      /* 0x42 Match this (simple) thing 0 or more times. */
+#define        PLUS                    67      /* 0x43 Match this (simple) thing 1 or more times. */
+#define        CURLY                   68      /* 0x44 Match this simple thing {n,m} times. */
+#define        CURLYN                  69      /* 0x45 Capture next-after-this simple thing */
+#define        CURLYM                  70      /* 0x46 Capture this medium-complex thing {n,m} times. */
+#define        CURLYX                  71      /* 0x47 Match this complex thing {n,m} times. */
+#define        WHILEM                  72      /* 0x48 Do curly processing and see if rest matches. */
+#define        OPEN                    73      /* 0x49 Mark this point in input as start of */
+#define        CLOSE                   74      /* 0x4a Analogous to OPEN. */
+#define        REF                     75      /* 0x4b Match some already matched string */
+#define        REFF                    76      /* 0x4c Match already matched string, folded using native charset semantics for non-utf8 */
+#define        REFFL                   77      /* 0x4d Match already matched string, folded in loc. */
+#define        REFFU                   78      /* 0x4e Match already matched string, folded using unicode semantics for non-utf8 */
+#define        REFFA                   79      /* 0x4f Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
+#define        NREF                    80      /* 0x50 Match some already matched string */
+#define        NREFF                   81      /* 0x51 Match already matched string, folded using native charset semantics for non-utf8 */
+#define        NREFFL                  82      /* 0x52 Match already matched string, folded in loc. */
+#define        NREFFU                  83      /* 0x53 Match already matched string, folded using unicode semantics for non-utf8 */
+#define        NREFFA                  84      /* 0x54 Match already matched string, folded using unicode semantics for non-utf8, no mixing ASCII, non-ASCII */
+#define        IFMATCH                 85      /* 0x55 Succeeds if the following matches. */
+#define        UNLESSM                 86      /* 0x56 Fails if the following matches. */
+#define        SUSPEND                 87      /* 0x57 "Independent" sub-RE. */
+#define        IFTHEN                  88      /* 0x58 Switch, should be preceded by switcher . */
+#define        GROUPP                  89      /* 0x59 Whether the group matched. */
+#define        LONGJMP                 90      /* 0x5a Jump far away. */
+#define        BRANCHJ                 91      /* 0x5b BRANCH with long offset. */
+#define        EVAL                    92      /* 0x5c Execute some Perl code. */
+#define        MINMOD                  93      /* 0x5d Next operator is not greedy. */
+#define        LOGICAL                 94      /* 0x5e Next opcode should set the flag only. */
+#define        RENUM                   95      /* 0x5f Group with independently numbered parens. */
+#define        TRIE                    96      /* 0x60 Match many EXACT(F[ALU]?)? at once. flags==type */
+#define        TRIEC                   97      /* 0x61 Same as TRIE, but with embedded charclass data */
+#define        AHOCORASICK             98      /* 0x62 Aho Corasick stclass. flags==type */
+#define        AHOCORASICKC            99      /* 0x63 Same as AHOCORASICK, but with embedded charclass data */
+#define        GOSUB                   100     /* 0x64 recurse to paren arg1 at (signed) ofs arg2 */
+#define        GOSTART                 101     /* 0x65 recurse to start of pattern */
+#define        NGROUPP                 102     /* 0x66 Whether the group matched. */
+#define        INSUBP                  103     /* 0x67 Whether we are in a specific recurse. */
+#define        DEFINEP                 104     /* 0x68 Never execute directly. */
+#define        ENDLIKE                 105     /* 0x69 Used only for the type field of verbs */
+#define        OPFAIL                  106     /* 0x6a Same as (?!) */
+#define        ACCEPT                  107     /* 0x6b Accepts the current matched string. */
+#define        VERB                    108     /* 0x6c Used only for the type field of verbs */
+#define        PRUNE                   109     /* 0x6d Pattern fails at this startpoint if no-backtracking through this */
+#define        MARKPOINT               110     /* 0x6e Push the current location for rollback by cut. */
+#define        SKIP                    111     /* 0x6f On failure skip forward (to the mark) before retrying */
+#define        COMMIT                  112     /* 0x70 Pattern fails outright if backtracking through this */
+#define        CUTGROUP                113     /* 0x71 On failure go to the next alternation in the group */
+#define        KEEPS                   114     /* 0x72 $& begins here. */
+#define        LNBREAK                 115     /* 0x73 generic newline pattern */
+#define        VERTWS                  116     /* 0x74 vertical whitespace         (Perl 6) */
+#define        NVERTWS                 117     /* 0x75 not vertical whitespace     (Perl 6) */
+#define        HORIZWS                 118     /* 0x76 horizontal whitespace       (Perl 6) */
+#define        NHORIZWS                119     /* 0x77 not horizontal whitespace   (Perl 6) */
+#define        OPTIMIZED               120     /* 0x78 Placeholder for dump. */
+#define        PSEUDO                  121     /* 0x79 Pseudo opcode for internal use. */
        /* ------------ States ------------- */
 #define        TRIE_next               (REGNODE_MAX + 1)       /* state for TRIE */
 #define        TRIE_next_fail          (REGNODE_MAX + 2)       /* state for TRIE */
@@ -202,7 +201,6 @@ EXTCONST U8 PL_regkind[] = {
        REG_ANY,        /* SANY                   */
        REG_ANY,        /* CANY                   */
        ANYOF,          /* ANYOF                  */
-       ANYOF,          /* ANYOFV                 */
        ALNUM,          /* ALNUM                  */
        ALNUM,          /* ALNUML                 */
        ALNUM,          /* ALNUMU                 */
@@ -373,7 +371,6 @@ static const U8 regarglen[] = {
        0,                                      /* SANY         */
        0,                                      /* CANY         */
        0,                                      /* ANYOF        */
-       0,                                      /* ANYOFV       */
        0,                                      /* ALNUM        */
        0,                                      /* ALNUML       */
        0,                                      /* ALNUMU       */
@@ -501,7 +498,6 @@ static const char reg_off_by_arg[] = {
        0,      /* SANY         */
        0,      /* CANY         */
        0,      /* ANYOF        */
-       0,      /* ANYOFV       */
        0,      /* ALNUM        */
        0,      /* ALNUML       */
        0,      /* ALNUMU       */
@@ -634,107 +630,106 @@ EXTCONST char * const PL_reg_name[] = {
        "SANY",                         /* 0x13 */
        "CANY",                         /* 0x14 */
        "ANYOF",                        /* 0x15 */
-       "ANYOFV",                       /* 0x16 */
-       "ALNUM",                        /* 0x17 */
-       "ALNUML",                       /* 0x18 */
-       "ALNUMU",                       /* 0x19 */
-       "ALNUMA",                       /* 0x1a */
-       "NALNUM",                       /* 0x1b */
-       "NALNUML",                      /* 0x1c */
-       "NALNUMU",                      /* 0x1d */
-       "NALNUMA",                      /* 0x1e */
-       "SPACE",                        /* 0x1f */
-       "SPACEL",                       /* 0x20 */
-       "SPACEU",                       /* 0x21 */
-       "SPACEA",                       /* 0x22 */
-       "NSPACE",                       /* 0x23 */
-       "NSPACEL",                      /* 0x24 */
-       "NSPACEU",                      /* 0x25 */
-       "NSPACEA",                      /* 0x26 */
-       "DIGIT",                        /* 0x27 */
-       "DIGITL",                       /* 0x28 */
-       "PLACEHOLDER1",                 /* 0x29 */
-       "DIGITA",                       /* 0x2a */
-       "NDIGIT",                       /* 0x2b */
-       "NDIGITL",                      /* 0x2c */
-       "PLACEHOLDER2",                 /* 0x2d */
-       "NDIGITA",                      /* 0x2e */
-       "POSIXD",                       /* 0x2f */
-       "POSIXL",                       /* 0x30 */
-       "POSIXU",                       /* 0x31 */
-       "POSIXA",                       /* 0x32 */
-       "NPOSIXD",                      /* 0x33 */
-       "NPOSIXL",                      /* 0x34 */
-       "NPOSIXU",                      /* 0x35 */
-       "NPOSIXA",                      /* 0x36 */
-       "CLUMP",                        /* 0x37 */
-       "BRANCH",                       /* 0x38 */
-       "BACK",                         /* 0x39 */
-       "EXACT",                        /* 0x3a */
-       "EXACTF",                       /* 0x3b */
-       "EXACTFL",                      /* 0x3c */
-       "EXACTFU",                      /* 0x3d */
-       "EXACTFA",                      /* 0x3e */
-       "EXACTFU_SS",                   /* 0x3f */
-       "EXACTFU_TRICKYFOLD",           /* 0x40 */
-       "NOTHING",                      /* 0x41 */
-       "TAIL",                         /* 0x42 */
-       "STAR",                         /* 0x43 */
-       "PLUS",                         /* 0x44 */
-       "CURLY",                        /* 0x45 */
-       "CURLYN",                       /* 0x46 */
-       "CURLYM",                       /* 0x47 */
-       "CURLYX",                       /* 0x48 */
-       "WHILEM",                       /* 0x49 */
-       "OPEN",                         /* 0x4a */
-       "CLOSE",                        /* 0x4b */
-       "REF",                          /* 0x4c */
-       "REFF",                         /* 0x4d */
-       "REFFL",                        /* 0x4e */
-       "REFFU",                        /* 0x4f */
-       "REFFA",                        /* 0x50 */
-       "NREF",                         /* 0x51 */
-       "NREFF",                        /* 0x52 */
-       "NREFFL",                       /* 0x53 */
-       "NREFFU",                       /* 0x54 */
-       "NREFFA",                       /* 0x55 */
-       "IFMATCH",                      /* 0x56 */
-       "UNLESSM",                      /* 0x57 */
-       "SUSPEND",                      /* 0x58 */
-       "IFTHEN",                       /* 0x59 */
-       "GROUPP",                       /* 0x5a */
-       "LONGJMP",                      /* 0x5b */
-       "BRANCHJ",                      /* 0x5c */
-       "EVAL",                         /* 0x5d */
-       "MINMOD",                       /* 0x5e */
-       "LOGICAL",                      /* 0x5f */
-       "RENUM",                        /* 0x60 */
-       "TRIE",                         /* 0x61 */
-       "TRIEC",                        /* 0x62 */
-       "AHOCORASICK",                  /* 0x63 */
-       "AHOCORASICKC",                 /* 0x64 */
-       "GOSUB",                        /* 0x65 */
-       "GOSTART",                      /* 0x66 */
-       "NGROUPP",                      /* 0x67 */
-       "INSUBP",                       /* 0x68 */
-       "DEFINEP",                      /* 0x69 */
-       "ENDLIKE",                      /* 0x6a */
-       "OPFAIL",                       /* 0x6b */
-       "ACCEPT",                       /* 0x6c */
-       "VERB",                         /* 0x6d */
-       "PRUNE",                        /* 0x6e */
-       "MARKPOINT",                    /* 0x6f */
-       "SKIP",                         /* 0x70 */
-       "COMMIT",                       /* 0x71 */
-       "CUTGROUP",                     /* 0x72 */
-       "KEEPS",                        /* 0x73 */
-       "LNBREAK",                      /* 0x74 */
-       "VERTWS",                       /* 0x75 */
-       "NVERTWS",                      /* 0x76 */
-       "HORIZWS",                      /* 0x77 */
-       "NHORIZWS",                     /* 0x78 */
-       "OPTIMIZED",                    /* 0x79 */
-       "PSEUDO",                       /* 0x7a */
+       "ALNUM",                        /* 0x16 */
+       "ALNUML",                       /* 0x17 */
+       "ALNUMU",                       /* 0x18 */
+       "ALNUMA",                       /* 0x19 */
+       "NALNUM",                       /* 0x1a */
+       "NALNUML",                      /* 0x1b */
+       "NALNUMU",                      /* 0x1c */
+       "NALNUMA",                      /* 0x1d */
+       "SPACE",                        /* 0x1e */
+       "SPACEL",                       /* 0x1f */
+       "SPACEU",                       /* 0x20 */
+       "SPACEA",                       /* 0x21 */
+       "NSPACE",                       /* 0x22 */
+       "NSPACEL",                      /* 0x23 */
+       "NSPACEU",                      /* 0x24 */
+       "NSPACEA",                      /* 0x25 */
+       "DIGIT",                        /* 0x26 */
+       "DIGITL",                       /* 0x27 */
+       "PLACEHOLDER1",                 /* 0x28 */
+       "DIGITA",                       /* 0x29 */
+       "NDIGIT",                       /* 0x2a */
+       "NDIGITL",                      /* 0x2b */
+       "PLACEHOLDER2",                 /* 0x2c */
+       "NDIGITA",                      /* 0x2d */
+       "POSIXD",                       /* 0x2e */
+       "POSIXL",                       /* 0x2f */
+       "POSIXU",                       /* 0x30 */
+       "POSIXA",                       /* 0x31 */
+       "NPOSIXD",                      /* 0x32 */
+       "NPOSIXL",                      /* 0x33 */
+       "NPOSIXU",                      /* 0x34 */
+       "NPOSIXA",                      /* 0x35 */
+       "CLUMP",                        /* 0x36 */
+       "BRANCH",                       /* 0x37 */
+       "BACK",                         /* 0x38 */
+       "EXACT",                        /* 0x39 */
+       "EXACTF",                       /* 0x3a */
+       "EXACTFL",                      /* 0x3b */
+       "EXACTFU",                      /* 0x3c */
+       "EXACTFA",                      /* 0x3d */
+       "EXACTFU_SS",                   /* 0x3e */
+       "EXACTFU_TRICKYFOLD",           /* 0x3f */
+       "NOTHING",                      /* 0x40 */
+       "TAIL",                         /* 0x41 */
+       "STAR",                         /* 0x42 */
+       "PLUS",                         /* 0x43 */
+       "CURLY",                        /* 0x44 */
+       "CURLYN",                       /* 0x45 */
+       "CURLYM",                       /* 0x46 */
+       "CURLYX",                       /* 0x47 */
+       "WHILEM",                       /* 0x48 */
+       "OPEN",                         /* 0x49 */
+       "CLOSE",                        /* 0x4a */
+       "REF",                          /* 0x4b */
+       "REFF",                         /* 0x4c */
+       "REFFL",                        /* 0x4d */
+       "REFFU",                        /* 0x4e */
+       "REFFA",                        /* 0x4f */
+       "NREF",                         /* 0x50 */
+       "NREFF",                        /* 0x51 */
+       "NREFFL",                       /* 0x52 */
+       "NREFFU",                       /* 0x53 */
+       "NREFFA",                       /* 0x54 */
+       "IFMATCH",                      /* 0x55 */
+       "UNLESSM",                      /* 0x56 */
+       "SUSPEND",                      /* 0x57 */
+       "IFTHEN",                       /* 0x58 */
+       "GROUPP",                       /* 0x59 */
+       "LONGJMP",                      /* 0x5a */
+       "BRANCHJ",                      /* 0x5b */
+       "EVAL",                         /* 0x5c */
+       "MINMOD",                       /* 0x5d */
+       "LOGICAL",                      /* 0x5e */
+       "RENUM",                        /* 0x5f */
+       "TRIE",                         /* 0x60 */
+       "TRIEC",                        /* 0x61 */
+       "AHOCORASICK",                  /* 0x62 */
+       "AHOCORASICKC",                 /* 0x63 */
+       "GOSUB",                        /* 0x64 */
+       "GOSTART",                      /* 0x65 */
+       "NGROUPP",                      /* 0x66 */
+       "INSUBP",                       /* 0x67 */
+       "DEFINEP",                      /* 0x68 */
+       "ENDLIKE",                      /* 0x69 */
+       "OPFAIL",                       /* 0x6a */
+       "ACCEPT",                       /* 0x6b */
+       "VERB",                         /* 0x6c */
+       "PRUNE",                        /* 0x6d */
+       "MARKPOINT",                    /* 0x6e */
+       "SKIP",                         /* 0x6f */
+       "COMMIT",                       /* 0x70 */
+       "CUTGROUP",                     /* 0x71 */
+       "KEEPS",                        /* 0x72 */
+       "LNBREAK",                      /* 0x73 */
+       "VERTWS",                       /* 0x74 */
+       "NVERTWS",                      /* 0x75 */
+       "HORIZWS",                      /* 0x76 */
+       "NHORIZWS",                     /* 0x77 */
+       "OPTIMIZED",                    /* 0x78 */
+       "PSEUDO",                       /* 0x79 */
        /* ------------ States ------------- */
        "TRIE_next",                    /* REGNODE_MAX +0x01 */
        "TRIE_next_fail",               /* REGNODE_MAX +0x02 */
@@ -828,9 +823,9 @@ EXTCONST char * const PL_reg_extflags_name[] = {
 EXTCONST U8 PL_varies[] __attribute__deprecated__;
 #else
 EXTCONST U8 PL_varies[] __attribute__deprecated__ = {
-    ANYOFV, CLUMP, BRANCH, BACK, STAR, PLUS, CURLY, CURLYN, CURLYM, CURLYX,
-    WHILEM, REF, REFF, REFFL, REFFU, REFFA, NREF, NREFF, NREFFL, NREFFU,
-    NREFFA, SUSPEND, IFTHEN, BRANCHJ,
+    CLUMP, BRANCH, BACK, STAR, PLUS, CURLY, CURLYN, CURLYM, CURLYX, WHILEM,
+    REF, REFF, REFFL, REFFU, REFFA, NREF, NREFF, NREFFL, NREFFU, NREFFA,
+    SUSPEND, IFTHEN, BRANCHJ,
     0
 };
 #endif /* DOINIT */
@@ -839,7 +834,7 @@ EXTCONST U8 PL_varies[] __attribute__deprecated__ = {
 EXTCONST U8 PL_varies_bitmask[];
 #else
 EXTCONST U8 PL_varies_bitmask[] = {
-    0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x80, 0x03, 0xF8, 0xF3, 0x3F, 0x13, 0x00, 0x00, 0x00, 0x00
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC0, 0x01, 0xFC, 0xF9, 0x9F, 0x09, 0x00, 0x00, 0x00, 0x00
 };
 #endif /* DOINIT */
 
@@ -864,7 +859,7 @@ EXTCONST U8 PL_simple[] __attribute__deprecated__ = {
 EXTCONST U8 PL_simple_bitmask[];
 #else
 EXTCONST U8 PL_simple_bitmask[] = {
-    0x00, 0x00, 0xBC, 0xFF, 0xFF, 0xDD, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0, 0x01
+    0x00, 0x00, 0xFC, 0xFF, 0xFF, 0xEE, 0x3F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x00
 };
 #endif /* DOINIT */