regcomp.c: Rework join_exact()

author Karl Williamson <public@khwilliamson.com>

Sat, 24 Dec 2011 02:46:10 +0000 (19:46 -0700)

committer Karl Williamson <public@khwilliamson.com>

Thu, 19 Jan 2012 18:58:19 +0000 (11:58 -0700)
author Karl Williamson <public@khwilliamson.com>
Sat, 24 Dec 2011 02:46:10 +0000 (19:46 -0700)
committer Karl Williamson <public@khwilliamson.com>
Thu, 19 Jan 2012 18:58:19 +0000 (11:58 -0700)
diff --git a/regcomp.c b/regcomp.c

index 9e7960c..02a6f75 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -2599,61 +2599,70 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, IV *min_change, U32
       * can now analyze for sequences of problematic code points.  (Prior to
       * this final joining, sequences could have been split over boundaries, and
       * hence missed).  The sequences only happen in folding */
+    if (OP(scan) != EXACT) {
+        char *s, *t;
+        char * s0 = STRING(scan);
+        char * const s_end = s0 + STR_LEN(scan);
+
+       /* First we look at the sequences that can occur only in UTF-8 strings.
+        * The sequences are of length 6 */
+       if (UTF && STR_LEN(scan) >= 6) {
+
+           /* Two problematic code points in Unicode casefolding of EXACT
+            * nodes:
+            *
+            * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+            * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+            *
+            * which casefold to
+            *
+            * Unicode                      UTF-8
+            *
+            * U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
+            * U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
+             *
+            * This means that in case-insensitive matching (or "loose
+            * matching", as Unicode calls it), an EXACTF of length six (the
+            * UTF-8 encoded byte length of the above casefolded versions) can
+            * match a target string of length two (the byte length of UTF-8
+            * encoded U+0390 or U+03B0).  This would rather mess up the
+            * minimum length computation.  (there are other code points that
+            * also fold to these two sequences, but the delta is smaller)
+            *
+            * What we'll do is to look for the tail four bytes, and then peek
+            * at the preceding two bytes to see whether we need to decrease
+            * the minimum length by four (six minus two).
+            *
+            * Thanks to the design of UTF-8, there cannot be false matches:
+            * A sequence of valid UTF-8 bytes cannot be a subsequence of
+            * another valid sequence of UTF-8 bytes. */
  
-    if (UTF
-       && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
-       && ( STR_LEN(scan) >= 6 ) )
-    {
-    /*
-    Two problematic code points in Unicode casefolding of EXACT nodes:
-    
-    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
-    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
-    
-    which casefold to
-    
-    Unicode                      UTF-8
-    
-    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
-    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
-    
-    This means that in case-insensitive matching (or "loose matching",
-    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
-    length of the above casefolded versions) can match a target string
-    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
-    This would rather mess up the minimum length computation.
-    
-    What we'll do is to look for the tail four bytes, and then peek
-    at the preceding two bytes to see whether we need to decrease
-    the minimum length by four (six minus two).
-    
-    Thanks to the design of UTF-8, there cannot be false matches:
-    A sequence of valid UTF-8 bytes cannot be a subsequence of
-    another valid sequence of UTF-8 bytes.
-    
-    */
-         char * const s0 = STRING(scan), *s, *t;
-         char * const s1 = s0 + STR_LEN(scan) - 1;
-         char * const s2 = s1 - 4;
  #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
-        const char t0[] = "\xaf\x49\xaf\x42";
-#else
-         const char t0[] = "\xcc\x88\xcc\x81";
-#endif
-         const char * const t1 = t0 + 3;
-    
-         for (s = s0 + 2;
-              s < s2 && (t = ninstr(s, s1, t0, t1));
-              s = t + 4) {
-#ifdef EBCDIC
-             if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
-                 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
+           const char U390_first_byte = '\xb4';
+           const char U390_2nd_byte = '\x68';
+           const char U3B0_first_byte = '\xb5';
+           const char U3B0_2nd_byte = '\x46';
+           const char tail[] = "\xaf\x49\xaf\x42";
  #else
-              if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
-                  ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
+           const char U390_first_byte = '\xce';
+           const char U390_2nd_byte = '\xb9';
+           const char U3B0_first_byte = '\xcf';
+           const char U3B0_2nd_byte = '\x85';
+           const char tail[] = "\xcc\x88\xcc\x81";
  #endif
+           const STRLEN tail_len = sizeof(tail) - 1;
+           for (s = s0 + 2;    /* +2 is to skip the non-tail */
+                s <= s_end - tail_len
+                  && (t = ninstr(s, s_end, tail, tail + tail_len));
+                s = t + tail_len)
+           {
+               if ((t[-1] == U390_2nd_byte && t[-2] == U390_first_byte)
+                   || (t[-1] == U3B0_2nd_byte && t[-2] == U3B0_first_byte))
+               {
                     *min_change -= 4;
-         }
+               }
+           }
+       }
      }
  
  #ifdef DEBUGGING
author	Karl Williamson <public@khwilliamson.com>
	Sat, 24 Dec 2011 02:46:10 +0000 (19:46 -0700)
committer	Karl Williamson <public@khwilliamson.com>
	Thu, 19 Jan 2012 18:58:19 +0000 (11:58 -0700)