regcomp.c: Avoid calling heavy duty functions when possible
authorKarl Williamson <public@khwilliamson.com>
Fri, 24 Jan 2014 03:09:43 +0000 (20:09 -0700)
committerKarl Williamson <public@khwilliamson.com>
Tue, 28 Jan 2014 06:03:47 +0000 (23:03 -0700)
This code calls the general purpose functions to fold and convert to
utf8.  These can be avoided for many frequently used code points.

regcomp.c

index 6ec7788..9bf05c0 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -10801,7 +10801,20 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
 
     if (! len_passed_in) {
         if (UTF) {
-            if (FOLD && (! LOC || code_point > 255)) {
+            if (UNI_IS_INVARIANT(code_point)) {
+                if (LOC || ! FOLD) {    /* /l defers folding until runtime */
+                    *character = (U8) code_point;
+                }
+                else { /* Here is /i and not /l (toFOLD() is defined on just
+                          ASCII, which isn't the same thing as INVARIANT on
+                          EBCDIC, but it works there, as the extra invariants
+                          fold to themselves) */
+                    *character = toFOLD((U8) code_point);
+                }
+                len = 1;
+            }
+            else if (FOLD && (! LOC || code_point > 255)) {
+                /* Folding, and ok to do so now */
                 _to_uni_fold_flags(code_point,
                                    character,
                                    &len,
@@ -10811,6 +10824,13 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
                                                        ? FOLD_FLAGS_NOMIX_ASCII
                                                        : 0));
             }
+            else if (code_point <= MAX_UTF8_TWO_BYTE) {
+
+                /* Not folding this cp, and can output it directly */
+                *character = UTF8_TWO_BYTE_HI(code_point);
+                *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
+                len = 2;
+            }
             else {
                 uvchr_to_utf8( character, code_point);
                 len = UTF8SKIP(character);