From: Karl Williamson Date: Thu, 1 Jul 2010 22:06:51 +0000 (-0600) Subject: Extend \N{} enhancements to vianame() X-Git-Tag: accepted/trunk/20130322.191538~8549 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=630981911ba00041d18690de9fd4a6105d539fba;p=platform%2Fupstream%2Fperl.git Extend \N{} enhancements to vianame() This patch refactors charnames so that vianame and \N call the same common subroutine so that they have as identical behavior as possible. --- diff --git a/lib/charnames.pm b/lib/charnames.pm index 67babcc..ba580f8 100644 --- a/lib/charnames.pm +++ b/lib/charnames.pm @@ -2,7 +2,7 @@ package charnames; use strict; use warnings; use File::Spec; -our $VERSION = '1.10'; +our $VERSION = '1.11'; use bytes (); # for $bytes::hint_bits @@ -473,31 +473,33 @@ sub alias_file ($) 0; } # alias_file -# This is not optimized in any way yet -sub charnames -{ + +sub lookup_name { my $name = shift; + my $runtime = shift; # compile vs run time + + # Finds the ordinal of a character name, first in the aliases, then in + # the large table. If not found, returns undef if runtime; complains + # and returns the Unicode replacement if compile. + # This is not optimized in any way yet + my $ord; - my $fname; # User alias should be checked first or else can't override ours, and if we # add any, could conflict with theirs. if (exists $user_numeric_aliases{$name}) { $ord = $user_numeric_aliases{$name}; - $fname = $name; } elsif (exists $user_name_aliases{$name}) { $name = $user_name_aliases{$name}; } elsif (exists $system_aliases{$name}) { $ord = $system_aliases{$name}; - $fname = $name; } elsif (exists $deprecated_aliases{$name}) { require warnings; warnings::warnif('deprecated', "Unicode character name \"$name\" is deprecated, use \"" . viacode($deprecated_aliases{$name}) . "\" instead"); $ord = $deprecated_aliases{$name}; - $fname = $name; } my @off; @@ -511,41 +513,45 @@ sub charnames ## @off will hold the index into the code/name string of the start and ## end of the name as we find it. - ## If :full, look for the name exactly - if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) { - @off = ($-[0], $+[0]); + ## If :full, look for the name exactly; runtime implies full + if (($runtime || $^H{charnames_full}) && $txt =~ /\t\t\Q$name\E$/m) { + @off = ($-[0] + 2, $+[0]); # The 2 is for the 2 tabs } ## If we didn't get above, and :short allowed, look for the short name. ## The short name is like "greek:Sigma" unless (@off) { - if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) { - my ($script, $cname) = ($1, $2); - my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"; - if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) { - @off = ($-[0], $+[0]); - } + if (($runtime || $^H{charnames_short}) && $name =~ /^(.+?):(.+)/s) { + my ($script, $cname) = ($1, $2); + my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"; + if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) { + @off = ($-[0] + 2, $+[0]); + } } } ## If we still don't have it, check for the name among the loaded ## scripts. - if (not @off) { + if (! $runtime && not @off) { my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"; for my $script (@{$^H{charnames_scripts}}) { - if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) { - @off = ($-[0], $+[0]); - last; - } + if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) { + @off = ($-[0] + 2, $+[0]); + last; + } } } ## If we don't have it by now, give up. unless (@off) { + return if $runtime; carp "Unknown charname '$name'"; return "\x{FFFD}"; } + # Get the official name in case need to output a message + $name = substr($txt, $off[0], $off[1] - $off[0]); + ## ## Now know where in the string the name starts. ## The code, in hex, is before that. @@ -563,22 +569,30 @@ sub charnames ## we know where it starts, so turn into number - ## the ordinal for the char. - $ord = CORE::hex substr($txt, $hexstart, $off[0] - $hexstart); + $ord = CORE::hex substr($txt, $hexstart, $off[0] - 2 - $hexstart); } - if ($^H & $bytes::hint_bits) { # "use bytes" in effect? - use bytes; - return chr $ord if $ord <= 255; - my $hex = sprintf "%04x", $ord; - if (not defined $fname) { - $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2; - } - croak "Character 0x$hex with name '$fname' is above 0xFF"; - } + return $ord if $runtime || $ord <= 255 || ! ($^H & $bytes::hint_bits); + + # Here is compile time, "use bytes" is in effect, and the character + # won't fit in a byte + + croak sprintf("Character 0x%04x with name '$name' is above 0xFF", $ord); +} # lookup_name + +sub charnames { + my $name = shift; + + # For \N{...}. Looks up the character name and returns its ordinal if + # found, undef otherwise. If not in 'use bytes', forces into utf8 + + my $ord = lookup_name($name, 0); # 0 means compile-time + return unless defined $ord; + return chr $ord if $^H & $bytes::hint_bits; no warnings 'utf8'; # allow even illegal characters return pack "U", $ord; -} # charnames +} sub import { @@ -641,10 +655,12 @@ sub import } } # import -my %viacode; +my %viacode; # Cache of already-found codes + +sub viacode { + + # Returns the name of the code point argument -sub viacode -{ if (@_ != 1) { carp "charnames::viacode() expects one argument"; return; @@ -690,7 +706,7 @@ sub viacode return $inverse_user_aliases{$hex}; } # viacode -my %vianame; +my %vianame; # Cache of already-found names sub vianame { @@ -699,30 +715,24 @@ sub vianame return () } - my $arg = shift; + # Looks up the character name and returns its ordinal if + # found, undef otherwise. - return chr CORE::hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/; + my $arg = shift; - return $vianame{$arg} if exists $vianame{$arg}; + if ($arg =~ /^U\+([0-9a-fA-F]+)$/) { - $txt = do "unicore/Name.pl" unless $txt; + # khw claims that this is bad. The function should return either a + # an ord or a chr for all inputs; not be bipolar. Also, under 'use + # bytes', can create a chr above 255. + return chr CORE::hex $1; + } - my $pos = index $txt, "\t\t$arg\n"; - if (0 <= $pos) { - my $posLF = rindex $txt, "\n", $pos; - (my $code = substr $txt, $posLF + 1, 6) =~ tr/\t//d; - return $vianame{$arg} = CORE::hex $code; - - # If $pos is at the 1st line, $posLF must be -1 (not found); - # then $posLF + 1 equals to 0 (at the beginning of $txt). - # Otherwise $posLF is the position of "\n"; - # then $posLF + 1 must be the position of the next to "\n" - # (the beginning of the line). - # substr($txt, $posLF + 1, 6) may be "0000\t\t", "00A1\t\t", - # "10300\t", "100000", etc. So we can get the code via removing TAB. - } else { - return; + if (! exists $vianame{$arg}) { + $vianame{$arg} = lookup_name($arg, 1); # 1 means run-time } + + return $vianame{$arg}; } # vianame diff --git a/lib/charnames.t b/lib/charnames.t index 8df4d70..38a3c61 100644 --- a/lib/charnames.t +++ b/lib/charnames.t @@ -56,6 +56,7 @@ EOE mychar2 => 983040, # U+F0000 mychar3 => "U+100000", myctrl => 0x80, + mylarge => "U+111000", }; is ("\N{mychar1}", chr(0xE8000), "Verify that can define hex alias"); is (charnames::viacode(0xE8000), "mychar1", "And that can get the alias back"); @@ -63,6 +64,7 @@ EOE is (charnames::viacode(0xF0000), "mychar2", "And that can get the alias back"); is ("\N{mychar3}", chr(0x100000), "Verify that can define U+... alias"); is (charnames::viacode(0x100000), "mychar3", "And that can get the alias back"); + is ("\N{mylarge}", chr(0x111000), "Verify that can define alias beyond Unicode"); is (charnames::viacode(0x80), "myctrl", "Verify that can name a nameless control"); } @@ -151,13 +153,19 @@ sub to_bytes { { is(charnames::viacode(0x1234), "ETHIOPIC SYLLABLE SEE"); - # Unused Hebrew. - ok(! defined charnames::viacode(0x0590)); + # No name + ok(! defined charnames::viacode(0xFFFF)); } { is(sprintf("%04X", charnames::vianame("GOTHIC LETTER AHSA")), "10330"); + use warnings; + my $warning_count = @WARN; ok (! defined charnames::vianame("NONE SUCH")); + cmp_ok($warning_count, '==', @WARN, "Verify vianame doesn't warn on unknown names"); + + use bytes; + is(charnames::vianame("GOTHIC LETTER AHSA"), 0x10330, "Verify vianame \\N{name} is unaffected by 'use bytes'"); } { diff --git a/pod/perl5133delta.pod b/pod/perl5133delta.pod index 1341d65..013d29d 100644 --- a/pod/perl5133delta.pod +++ b/pod/perl5133delta.pod @@ -28,12 +28,13 @@ here, but most should go in the L section. [ List each enhancement as a =head2 entry ] -=head2 C<\N{I}> enhancements +=head2 C<\N{I}> and C enhancements -C<\N{}> now knows about the abbreviated character names listed by Unicode, such -as NBSP, SHY, LRO, ZWJ, etc., as well as all the customary abbreviations for -the C0 and C1 control characters (such as ACK, BEL, CAN, etc.), as well as a -few new variants in common usage of some C1 full names. +C<\N{}> and C now know about the abbreviated character +names listed by Unicode, such as NBSP, SHY, LRO, ZWJ, etc., as well as all the +customary abbreviations for the C0 and C1 control characters (such as ACK, BEL, +CAN, etc.), as well as a few new variants in common usage of some C1 full +names. In the past, it was ineffective to override one of Perl's abbreviations with your own custom alias. Now it works.