UCD.pm: Don't use CompositionExclusions.txt

author Karl Williamson <public@khwilliamson.com>

Fri, 19 Nov 2010 19:04:53 +0000 (12:04 -0700)

committer Father Chrysostomos <sprout@cpan.org>

Sun, 21 Nov 2010 02:15:15 +0000 (18:15 -0800)
author Karl Williamson <public@khwilliamson.com>
Fri, 19 Nov 2010 19:04:53 +0000 (12:04 -0700)
committer Father Chrysostomos <sprout@cpan.org>
Sun, 21 Nov 2010 02:15:15 +0000 (18:15 -0800)
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm

index eb4de28..522c540 100644 (file)
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -710,45 +710,43 @@ sub bidi_types {
  
      my $compexcl = compexcl(0x09dc);
  
-This returns B<true> if the
-L</code point argument> should not be produced by composition normalization,
-B<AND> if that fact is not otherwise determinable from the Unicode data base.
-It currently does not return B<true> if the code point has a decomposition
+This routine is included for backwards compatibility, but as of Perl 5.12, for
+most purposes it is probably more convenient to use one of the following
+instead:
+
+    my $compexcl = chr(0x09dc) =~ /\p{Comp_Ex}/;
+    my $compexcl = chr(0x09dc) =~ /\p{Full_Composition_Exclusion}/;
+
+or even
+
+    my $compexcl = chr(0x09dc) =~ /\p{CE}/;
+    my $compexcl = chr(0x09dc) =~ /\p{Composition_Exclusion}/;
+
+The first two forms return B<true> if the L</code point argument> should not
+be produced by composition normalization.  The final two forms
+additionally require that this fact not otherwise be determinable from
+the Unicode data base for them to return B<true>.
+
+This routine behaves identically to the final two forms.  That is,
+it does not return B<true> if the code point has a decomposition
  consisting of another single code point, nor if its decomposition starts
  with a code point whose combining class is non-zero.  Code points that meet
  either of these conditions should also not be produced by composition
-normalization.
+normalization, which is probably why you should use the
+C<Full_Composition_Exclusion> property instead, as shown above.
  
-It returns B<false> otherwise.
+The routine returns B<false> otherwise.
  
  =cut
  
-my %COMPEXCL;
-
-sub _compexcl {
-    unless (%COMPEXCL) {
-       if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
-           local $_;
-           while (<$COMPEXCLFH>) {
-               if (/^([0-9A-F]+)\s+\#\s+/) {
-                   my $code = hex($1);
-                   $COMPEXCL{$code} = undef;
-               }
-           }
-           close($COMPEXCLFH);
-       }
-    }
-}
-
  sub compexcl {
      my $arg  = shift;
      my $code = _getcode($arg);
      croak __PACKAGE__, "::compexcl: unknown code '$arg'"
         unless defined $code;
  
-    _compexcl() unless %COMPEXCL;
-
-    return exists $COMPEXCL{$code};
+    no warnings "utf8";     # Silence utf8 warnings so surrogates and above-Unicode code points can be tested
+    return chr($code) =~ /\p{Composition_Exclusion}/;
  }
  
  =head2 B<casefold()>
@@ -1233,8 +1231,6 @@ if you are wondering where one of your filehandles went, that's where.
  
  Does not yet support EBCDIC platforms.
  
-L</compexcl()> should give a complete list of excluded code points.
-
  =head1 AUTHOR
  
  Jarkko Hietaniemi
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t

index 795888a..ae8432c 100644 (file)
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -301,6 +301,8 @@ is(Unicode::UCD::UnicodeVersion, '6.0.0', 'UnicodeVersion');
  use Unicode::UCD qw(compexcl);
  
  ok(!compexcl(0x0100), 'compexcl');
+ok(!compexcl(0xD801), 'compexcl of surrogate');
+ok(!compexcl(0x110000), 'compexcl of non-Unicode code point');
  ok( compexcl(0x0958));
  
  use Unicode::UCD qw(casefold);
author	Karl Williamson <public@khwilliamson.com>
	Fri, 19 Nov 2010 19:04:53 +0000 (12:04 -0700)
committer	Father Chrysostomos <sprout@cpan.org>
	Sun, 21 Nov 2010 02:15:15 +0000 (18:15 -0800)
lib/Unicode/UCD.pm		patch \| blob \| history
lib/Unicode/UCD.t		patch \| blob \| history