regexec.c: Fix EXACT node handling in regrepeat()

author Karl Williamson <public@khwilliamson.com>

Thu, 11 Oct 2012 20:56:27 +0000 (14:56 -0600)

committer Karl Williamson <public@khwilliamson.com>

Fri, 12 Oct 2012 02:37:50 +0000 (20:37 -0600)
author Karl Williamson <public@khwilliamson.com>
Thu, 11 Oct 2012 20:56:27 +0000 (14:56 -0600)
committer Karl Williamson <public@khwilliamson.com>
Fri, 12 Oct 2012 02:37:50 +0000 (20:37 -0600)
diff --git a/regexec.c b/regexec.c

index bad11f2..febc222 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -6488,31 +6488,48 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
      case EXACT:
         c = (U8)*STRING(p);
  
-       if (! utf8_target || UNI_IS_INVARIANT(c)) {
+        /* Can use a simple loop if the pattern char to match on is invariant
+         * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
+         * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
+         * true iff it doesn't matter if the argument is in UTF-8 or not */
+        if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
             while (scan < loceol && UCHARAT(scan) == c) {
                 scan++;
             }
         }
         else if (UTF_PATTERN) {
-            STRLEN scan_char_len;
-
-           loceol = PL_regeol;
+            if (utf8_target) {
+                STRLEN scan_char_len;
+                loceol = PL_regeol;
+
+                /* When both target and pattern are UTF-8, we have to do a
+                 * string EQ */
+                while (hardcount < max
+                       && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
+                       && scan_char_len <= STR_LEN(p)
+                       && memEQ(scan, STRING(p), scan_char_len))
+                {
+                    scan += scan_char_len;
+                    hardcount++;
+                }
+            }
+            else if (! UTF8_IS_ABOVE_LATIN1(c)) {
  
-           while (hardcount < max
-                   && scan + (scan_char_len = UTF8SKIP(scan)) < loceol
-                   && scan_char_len <= STR_LEN(p)
-                   && memEQ(scan, STRING(p), scan_char_len))
-            {
-               scan += scan_char_len;
-               hardcount++;
-           }
+                /* Target isn't utf8; convert the character in the UTF-8
+                 * pattern to non-UTF8, and do a simple loop */
+                c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
+                while (scan < loceol && UCHARAT(scan) == c) {
+                    scan++;
+                }
+            } /* else pattern char is above Latin1, can't possibly match the
+                 non-UTF-8 target */
          }
-       else {
+        else {
  
-           /* Here, the string is utf8, the pattern isn't, but <c> is different
-            * in utf8 than not, so can't compare them directly.  Outside the
-            * loop, find the two utf8 bytes that represent c, and then
-            * look for those in sequence in the utf8 string */
+            /* Here, the string must be utf8; pattern isn't, and <c> is
+             * different in utf8 than not, so can't compare them directly.
+             * Outside the loop, find the two utf8 bytes that represent c, and
+             * then look for those in sequence in the utf8 string */
             U8 high = UTF8_TWO_BYTE_HI(c);
             U8 low = UTF8_TWO_BYTE_LO(c);
             loceol = PL_regeol;
@@ -6527,6 +6544,7 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
             }
         }
         break;
+
      case EXACTFA:
         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
         goto do_exactf;
diff --git a/t/re/pat.t b/t/re/pat.t

index 08f784d..619b2ea 100644 (file)
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -19,7 +19,7 @@ BEGIN {
      require './test.pl';
  }
  
-plan tests => 453;  # Update this when adding/deleting tests.
+plan tests => 465;  # Update this when adding/deleting tests.
  
  run_tests() unless caller;
  
@@ -1284,6 +1284,21 @@ EOP
          ok("\x{017F}\x{017F}" =~ qr/^[\x{00DF}]?$/i, "[] to EXACTish optimization");
      }
  
+    {
+        for my $char (":", "\x{f7}", "\x{2010}") {
+            my $utf8_char = $char;
+            utf8::upgrade($utf8_char);
+            my $display = $char;
+            $display = display($display);
+            my $utf8_display = "utf8::upgrade(\"$display\")";
+
+            like($char, qr/^$char?$/, "\"$display\" =~ /^$display?\$/");
+            like($char, qr/^$utf8_char?$/, "my \$p = \"$display\"; utf8::upgrade(\$p); \"$display\" =~ /^\$p?\$/");
+            like($utf8_char, qr/^$char?$/, "my \$c = \"$display\"; utf8::upgrade(\$c); \"\$c\" =~ /^$display?\$/");
+            like($utf8_char, qr/^$utf8_char?$/, "my \$c = \"$display\"; utf8::upgrade(\$c); my \$p = \"$display\"; utf8::upgrade(\$p); \"\$c\" =~ /^\$p?\$/");
+        }
+    }
+
  } # End of sub run_tests
  
  1;
author	Karl Williamson <public@khwilliamson.com>
	Thu, 11 Oct 2012 20:56:27 +0000 (14:56 -0600)
committer	Karl Williamson <public@khwilliamson.com>
	Fri, 12 Oct 2012 02:37:50 +0000 (20:37 -0600)
regexec.c		patch | blob | history
t/re/pat.t		patch | blob | history