From 71a442a8e083048f771614ff45898af022ade6b7 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Fri, 19 Nov 2010 12:04:53 -0700 Subject: [PATCH] UCD.pm: Don't use CompositionExclusions.txt The motivation for this patch was to remove dependence of UCD on another Unicode DB .txt file. But the subroutine that uses it is out-of-date, now that this property, and an even more convenient one, are accessible from the core. So the documentation is also updated to educate people. Instead of using the file, the routine just uses the core's access method. --- lib/Unicode/UCD.pm | 52 ++++++++++++++++++++++++---------------------------- lib/Unicode/UCD.t | 2 ++ 2 files changed, 26 insertions(+), 28 deletions(-) diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm index eb4de28..522c540 100644 --- a/lib/Unicode/UCD.pm +++ b/lib/Unicode/UCD.pm @@ -710,45 +710,43 @@ sub bidi_types { my $compexcl = compexcl(0x09dc); -This returns B<true> if the -L</code point argument> should not be produced by composition normalization, -B<AND> if that fact is not otherwise determinable from the Unicode data base. -It currently does not return B<true> if the code point has a decomposition +This routine is included for backwards compatibility, but as of Perl 5.12, for +most purposes it is probably more convenient to use one of the following +instead: + + my $compexcl = chr(0x09dc) =~ /\p{Comp_Ex}/; + my $compexcl = chr(0x09dc) =~ /\p{Full_Composition_Exclusion}/; + +or even + + my $compexcl = chr(0x09dc) =~ /\p{CE}/; + my $compexcl = chr(0x09dc) =~ /\p{Composition_Exclusion}/; + +The first two forms return B<true> if the L</code point argument> should not +be produced by composition normalization. The final two forms +additionally require that this fact not otherwise be determinable from +the Unicode data base for them to return B<true>. + +This routine behaves identically to the final two forms. 
That is, +it does not return B<true> if the code point has a decomposition consisting of another single code point, nor if its decomposition starts with a code point whose combining class is non-zero. Code points that meet either of these conditions should also not be produced by composition -normalization. +normalization, which is probably why you should use the +C<Full_Composition_Exclusion> property instead, as shown above. -It returns B<false> otherwise. +The routine returns B<false> otherwise. =cut -my %COMPEXCL; - -sub _compexcl { - unless (%COMPEXCL) { - if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) { - local $_; - while (<$COMPEXCLFH>) { - if (/^([0-9A-F]+)\s+\#\s+/) { - my $code = hex($1); - $COMPEXCL{$code} = undef; - } - } - close($COMPEXCLFH); - } - } -} - sub compexcl { my $arg = shift; my $code = _getcode($arg); croak __PACKAGE__, "::compexcl: unknown code '$arg'" unless defined $code; - _compexcl() unless %COMPEXCL; - - return exists $COMPEXCL{$code}; + no warnings "utf8"; # So works on surrogates and non-Unicode code points + return chr($code) =~ /\p{Composition_Exclusion}/; } =head2 B<casefold()> @@ -1233,8 +1231,6 @@ if you are wondering where one of your filehandles went, that's where. Does not yet support EBCDIC platforms. -L</compexcl()> should give a complete list of excluded code points. - =head1 AUTHOR Jarkko Hietaniemi diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t index 795888a..ae8432c 100644 --- a/lib/Unicode/UCD.t +++ b/lib/Unicode/UCD.t @@ -301,6 +301,8 @@ is(Unicode::UCD::UnicodeVersion, '6.0.0', 'UnicodeVersion'); use Unicode::UCD qw(compexcl); ok(!compexcl(0x0100), 'compexcl'); +ok(!compexcl(0xD801), 'compexcl of surrogate'); +ok(!compexcl(0x110000), 'compexcl of non-Unicode code point'); ok( compexcl(0x0958)); use Unicode::UCD qw(casefold); -- 2.7.4