Move Unicode property defn processing to compile time
authorKarl Williamson <public@khwilliamson.com>
Mon, 28 Nov 2011 17:26:28 +0000 (10:26 -0700)
committerKarl Williamson <public@khwilliamson.com>
Fri, 13 Jan 2012 16:58:36 +0000 (09:58 -0700)
This patch moves the processing of most Unicode property definitions
from execution (regexec.c) to compilation (regcomp.c).  There is a cost
to do this.  By deferring it to execution, it may be that the affected
path will never be taken, and hence the work won't have to be done;
whereas, it's always done if it gets done at compilation.

However, doing it at compilation, has many advantages.  We can't
optimize what we don't know about, so this allows for better
optimization, as well as feature enhancements, such as set
manipulations, restricting matches to certain scripts, etc.  A big one,
about to be committed allows for significantly reducing the number of
copies of the data structure used for each property.  (Currently, every
mention in every regular expression of a given property will generate a
new instance of its hash, and so results of look-ups of code points in
one instance aren't automatically known to other instances, so the code
point has to be looked-up again.)

This commit leaves the processing to execution time when the class is to
be inverted.  This was done purely to make the commit smaller, and will
be removed in a future commit; hence the redundant test here will be
removed shortly.

It also has to leave to execution time processing of properties whose
definition is not known yet.  That can happen when the property is
user-defined.  We call _core_swash_init(), and if it fails, we assume
that it's because it's such a property, and if it turns out that it was
an unknown property, we leave to execution time the raising of a warning
for it, just as before.

Currently, the processing of properties in inverted character classes is
also left to execution time.  This restriction will be lifted in a
future commit, and this patch assumes that, and doesn't indent some code
that it otherwise would, in anticipation of the surrounding 'if' tests
being removed.

regcomp.c

index eb2b951..5dd7c4e 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -10082,8 +10082,19 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
     SV *listsv = NULL;
     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
                                      than just initialized.  */
+    SV* properties = NULL;    /* Code points that match \p{} \P{} */
+    UV element_count = 0;   /* Number of distinct elements in the class.
+                              Optimizations may be possible if this is tiny */
     UV n;
 
+    /* Unicode properties are stored in a swash; this holds the current one
+     * being parsed. */
+    SV* swash = NULL;          /* Code points that match \p{} \P{} */
+
+    /* Set if a component of this character class is user-defined; just passed
+     * on to the engine */
+    UV has_user_defined_property = 0;
+
     /* code points this node matches that can't be stored in the bitmap */
     SV* nonbitmap = NULL;
 
@@ -10177,8 +10188,10 @@ parseit:
 
        namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
 
-       if (!range)
+       if (!range) {
            rangebegin = RExC_parse;
+           element_count++;
+       }
        if (UTF) {
            value = utf8n_to_uvchr((U8*)RExC_parse,
                                   RExC_end - RExC_parse,
@@ -10254,6 +10267,9 @@ parseit:
                    n = 1;
                }
                if (!SIZE_ONLY) {
+                    SV** invlistsvp;
+                    SV* invlist;
+                    char* name;
                    if (UCHARAT(RExC_parse) == '^') {
                         RExC_parse++;
                         n--;
@@ -10263,18 +10279,106 @@ parseit:
                              n--;
                         }
                    }
+                    /* Try to get the definition of the property into
+                     * <invlist>.  If /i is in effect, the effective property
+                     * will have its name be <__NAME_i>.  The design is
+                     * discussed in commit
+                     * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
+                    Newx(name, n + sizeof("_i__\n"), char);
+
+                    sprintf(name, "%s%.*s%s\n",
+                                    (FOLD) ? "__" : "",
+                                    (int)n,
+                                    RExC_parse,
+                                    (FOLD) ? "_i" : ""
+                    );
+
+                    /* Look up the property name, and get its swash and
+                     * inversion list, if the property is found  */
+                    if (! (ANYOF_FLAGS(ret) & ANYOF_INVERT)) {
+                    if (swash) {
+                        SvREFCNT_dec(swash);
+                    }
+                    swash = _core_swash_init("utf8", name, &PL_sv_undef,
+                                             1, /* binary */
+                                             0, /* not tr/// */
+                                             TRUE, /* this routine will handle
+                                                      undefined properties */
+                                             NULL, FALSE /* No inversion list */
+                                            );
+                    }
 
-                   /* Add the property name to the list.  If /i matching, give
-                    * a different name which consists of the normal name
-                    * sandwiched between two underscores and '_i'.  The design
-                    * is discussed in the commit message for this. */
-                   Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
-                                       (value=='p' ? '+' : '!'),
-                                       (FOLD) ? "__" : "",
-                                       (int)n,
-                                       RExC_parse,
-                                       (FOLD) ? "_i" : ""
-                                   );
+                    if (   ANYOF_FLAGS(ret) & ANYOF_INVERT
+                        || ! swash
+                        || ! SvROK(swash)
+                        || ! SvTYPE(SvRV(swash)) == SVt_PVHV
+                        || ! (invlistsvp =
+                               hv_fetchs(MUTABLE_HV(SvRV(swash)),
+                                "INVLIST", FALSE))
+                        || ! (invlist = *invlistsvp))
+                   {
+                        if (swash) {
+                            SvREFCNT_dec(swash);
+                            swash = NULL;
+                        }
+
+                        /* Here didn't find it.  It could be a user-defined
+                         * property that will be available at run-time.  Add it
+                         * to the list to look up then */
+                        Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
+                                        (value == 'p' ? '+' : '!'),
+                                        name);
+                        has_user_defined_property = 1;
+
+                        /* We don't know yet, so have to assume that the
+                         * property could match something in the Latin1 range,
+                         * hence something that isn't utf8 */
+                        ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
+                    }
+                    else {
+
+                        /* Here, did get the swash and its inversion list.  If
+                         * the swash is from a user-defined property, then this
+                         * whole character class should be regarded as such */
+                        SV** user_defined_svp =
+                                            hv_fetchs(MUTABLE_HV(SvRV(swash)),
+                                                        "USER_DEFINED", FALSE);
+                        if (user_defined_svp) {
+                            has_user_defined_property
+                                                    |= SvUV(*user_defined_svp);
+                        }
+
+                        /* Invert if asking for the complement */
+                        if (value == 'P') {
+
+                           /* Add to any existing list */
+                           if (! properties) {
+                               properties = invlist_clone(invlist);
+                               _invlist_invert(properties);
+                           }
+                           else {
+                               invlist = invlist_clone(invlist);
+                               _invlist_invert(invlist);
+                               _invlist_union(properties, invlist, &properties);
+                               SvREFCNT_dec(invlist);
+                           }
+
+                            /* The swash can't be used as-is, because we've
+                            * inverted things; delay removing it to here after
+                            * have copied its invlist above */
+                            SvREFCNT_dec(swash);
+                            swash = NULL;
+                        }
+                        else {
+                           if (! properties) {
+                               properties = invlist_clone(invlist);
+                           }
+                           else {
+                               _invlist_union(properties, invlist, &properties);
+                           }
+                       }
+                   }
+                   Safefree(name);
                }
                RExC_parse = e + 1;
 
@@ -10794,6 +10898,20 @@ parseit:
        }
     }
 
+    /* And combine the result (if any) with any inversion list from properties.
+     * The lists are kept separate up to now because we don't want to fold the
+     * properties */
+    if (properties) {
+       if (nonbitmap) {
+           _invlist_union(nonbitmap, properties, &nonbitmap);
+           SvREFCNT_dec(properties);
+       }
+       else {
+           nonbitmap = properties;
+       }
+    }
+
+
     /* Here, we have calculated what code points should be in the character
      * class.
      *
@@ -10950,6 +11068,12 @@ parseit:
         return ret;
     }
 
+    /* If there is a swash and more than one element, we can't use the swash in
+     * the optimization below. */
+    if (swash && element_count > 1) {
+       SvREFCNT_dec(swash);
+       swash = NULL;
+    }
     if (! nonbitmap
        && SvCUR(listsv) == initial_listsv_len
        && ! unicode_alternate)
@@ -10970,14 +11094,15 @@ parseit:
         * used later (regexec.c:S_reginclass()).
         * Element [3] stores the nonbitmap inversion list for use in addition
         * or instead of element [0].
-        * Element [4] is currently FALSE */
+        * Element [4] is set if any component of the class is from a
+        * user-defined property */
        av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
                        ? &PL_sv_undef
                        : listsv);
        av_store(av, 1, NULL);  /* Placeholder for generated swash */
            if (nonbitmap) {
                av_store(av, 3, nonbitmap);
-               av_store(av, 4, newSVuv(0));
+               av_store(av, 4, newSVuv(has_user_defined_property));
            }
 
         /* Store any computed multi-char folds only if we are allowing