utf8.c: Don't invert beyond-Unicode code points

author Karl Williamson <public@khwilliamson.com>

Wed, 28 Sep 2011 01:36:35 +0000 (19:36 -0600)

committer Karl Williamson <public@khwilliamson.com>

Sat, 1 Oct 2011 15:15:32 +0000 (09:15 -0600)
author Karl Williamson <public@khwilliamson.com>
Wed, 28 Sep 2011 01:36:35 +0000 (19:36 -0600)
committer Karl Williamson <public@khwilliamson.com>
Sat, 1 Oct 2011 15:15:32 +0000 (09:15 -0600)
diff --git a/t/re/pat.t b/t/re/pat.t

index 4ef9663..988c23e 100644 (file)
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -21,7 +21,7 @@ BEGIN {
      require './test.pl';
  }
  
-plan tests => 451;  # Update this when adding/deleting tests.
+plan tests => 455;  # Update this when adding/deleting tests.
  
  run_tests() unless caller;
  
@@ -1167,6 +1167,20 @@ sub run_tests {
          is($got,$want,'RT #84294: check that "ab" =~ /((\w+)(?{ push @got, $2 })){2}/ leaves @got in the correct state');
      }
  
+    {
+        # Suppress warnings, as the non-unicode one comes out even if turn off
+        # warnings here (because the execution is done in another scope).
+        local $SIG{__WARN__} = sub {};
+        my $str = "\x{110000}";
+
+        # No non-unicode code points match any Unicode property, even inverse
+        # ones
+        unlike($str, qr/\p{ASCII_Hex_Digit=True}/, "Non-Unicode doesn't match \\p{}");
+        unlike($str, qr/\p{ASCII_Hex_Digit=False}/, "Non-Unicode doesn't match \\p{}");
+        like($str, qr/\P{ASCII_Hex_Digit=True}/, "Non-Unicode matches \\P{}");
+        like($str, qr/\P{ASCII_Hex_Digit=False}/, "Non-Unicode matches \\P{}");
+    }
+
  } # End of sub run_tests
  
  1;
diff --git a/utf8.c b/utf8.c

index 003e3fc..b8a5227 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -2476,11 +2476,25 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span)
  
      /* Invert if the data says it should be */
      if (invert_it_svp && SvUV(*invert_it_svp)) {
+
+       /* Unicode properties should come with all bits above PERL_UNICODE_MAX
+        * be 0, and their inversion should also be 0, as we don't succeed any
+        * Unicode property matches for non-Unicode code points */
+       if (start <= PERL_UNICODE_MAX) {
+
+           /* The code below assumes that we never cross the
+            * Unicode/above-Unicode boundary in a range, as otherwise we would
+            * have to figure out where to stop flipping the bits.  Since this
+            * boundary is divisible by a large power of 2, and swatches comes
+            * in small powers of 2, this should be a valid assumption */
+           assert(start + span - 1 <= PERL_UNICODE_MAX);
+
         send = s + scur;
         while (s < send) {
             *s = ~(*s);
             s++;
         }
+       }
      }
  
      /* read $swash->{EXTRAS}
author	Karl Williamson <public@khwilliamson.com>
	Wed, 28 Sep 2011 01:36:35 +0000 (19:36 -0600)
committer	Karl Williamson <public@khwilliamson.com>
	Sat, 1 Oct 2011 15:15:32 +0000 (09:15 -0600)
t/re/pat.t		patch \| blob \| history
utf8.c		patch \| blob \| history