From: Karl Williamson Date: Sat, 16 Feb 2013 16:35:56 +0000 (-0700) Subject: Unicode::UCD: Work on non-ASCII platforms X-Git-Tag: upstream/5.20.0~2089^2~75 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=a1ae4420d1f4dbfd69d098a251e40794ffa6ef9a;p=platform%2Fupstream%2Fperl.git Unicode::UCD: Work on non-ASCII platforms Now that mktables generates native tables, it is a fairly simple matter to get Unicode::UCD to work on those platforms. --- diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm index f1b00a4..81e6710 100644 --- a/lib/Unicode/UCD.pm +++ b/lib/Unicode/UCD.pm @@ -5,7 +5,7 @@ use warnings; no warnings 'surrogate'; # surrogates can be inputs to this use charnames (); -our $VERSION = '0.53'; +our $VERSION = '0.54'; require Exporter; @@ -30,6 +30,8 @@ our @EXPORT_OK = qw(charinfo use Carp; +sub IS_ASCII_PLATFORM { ord("A") == 65 } + =head1 NAME Unicode::UCD - Unicode character database @@ -104,18 +106,18 @@ Character Database. =head2 code point argument Some of the functions are called with a I, which is either -a decimal or a hexadecimal scalar designating a Unicode code point, or C -followed by hexadecimals designating a Unicode code point. In other words, if -you want a code point to be interpreted as a hexadecimal number, you must -prefix it with either C<0x> or C, because a string like e.g. C<123> will be -interpreted as a decimal code point. +a decimal or a hexadecimal scalar designating a code point in the platform's +native character set (extended to Unicode), or C followed by hexadecimals +designating a Unicode code point. A leading 0 will force a hexadecimal +interpretation, as will a hexadecimal digit that isn't a decimal digit. Examples: - 223 # Decimal 223 - 0223 # Hexadecimal 223 (= 547 decimal) - 0xDF # Hexadecimal DF (= 223 decimal - U+DF # Hexadecimal DF + 223 # Decimal 223 in native character set + 0223 # Hexadecimal 223, native (= 547 decimal) + 0xDF # Hexadecimal DF, native (= 223 decimal + U+DF # Hexadecimal DF, in Unicode's character set + (= LATIN SMALL LETTER SHARP S) Note that the largest code point in Unicode is U+10FFFF. @@ -197,7 +199,8 @@ The keys in the hash with the meanings of their values are: =item B -the input L expressed in hexadecimal, with leading zeros +the input native L expressed in hexadecimal, with +leading zeros added if necessary to make it contain at least four hexdigits =item B @@ -326,8 +329,16 @@ sub _getcode { if ($arg =~ /^[1-9]\d*$/) { return $arg; - } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) { - return hex($1); + } + elsif ($arg =~ /^(?:0[xX])?([[:xdigit:]]+)$/) { + return CORE::hex($1); + } + elsif ($arg =~ /^[Uu]\+([[:xdigit:]]+)$/) { # Is of form U+0000, means + # wants the Unicode code + # point, not the native one + my $decimal = CORE::hex($1); + return $decimal if IS_ASCII_PLATFORM; + return utf8::unicode_to_native($decimal); } return; @@ -596,16 +607,15 @@ have blocks, all code points are considered to be in C.) See also L. If supplied with an argument that can't be a code point, charblock() tries to -do the opposite and interpret the argument as an old-style block name. The -return value -is a I with one range: an anonymous list with a single element that -consists of another anonymous list whose first element is the first code point -in the block, and whose second (and final) element is the final code point in -the block. (The extra list consisting of just one element is so that the same -program logic can be used to handle both this return, and the return from -L which can have multiple ranges.) You can test whether a code -point is in a range using the L function. If the argument is -not a known block, C is returned. +do the opposite and interpret the argument as an old-style block name. On an +ASCII platform, the return value is a I with one range: an +anonymous list with a single element that consists of another anonymous list +whose first element is the first code point in the block, and whose second +(and final) element is the final code point in the block. On an EBCDIC +platform, the first two Unicode blocks are not contiguous. Their range sets +are lists containing I, I code point pairs. You +can test whether a code point is in a range set using the L +function. If the argument is not a known block, C is returned. =cut @@ -635,6 +645,36 @@ sub _charblocks { } } close($BLOCKSFH); + if (! IS_ASCII_PLATFORM) { + # The first two blocks, through 0xFF, are wrong on EBCDIC + # platforms. + + my @new_blocks = _read_table("To/Blk.pl"); + + # Get rid of the first two ranges in the Unicode version, and + # replace them with the ones computed by mktables. + shift @BLOCKS; + shift @BLOCKS; + delete $BLOCKS{'Basic Latin'}; + delete $BLOCKS{'Latin-1 Supplement'}; + + # But there are multiple entries in the computed versions, and + # we change their names to (which we know) to be the old-style + # ones. + for my $i (0.. @new_blocks - 1) { + if ($new_blocks[$i][2] =~ s/Basic_Latin/Basic Latin/ + or $new_blocks[$i][2] =~ + s/Latin_1_Supplement/Latin-1 Supplement/) + { + push @{$BLOCKS{$new_blocks[$i][2]}}, $new_blocks[$i]; + } + else { + splice @new_blocks, $i; + last; + } + } + unshift @BLOCKS, @new_blocks; + } } } } @@ -978,7 +1018,8 @@ with the following fields is returned: =item B -the input L expressed in hexadecimal, with leading zeros +the input native L expressed in hexadecimal, with +leading zeros added if necessary to make it contain at least four hexdigits =item B @@ -1243,7 +1284,8 @@ The keys in the bottom layer hash with the meanings of their values are: =item B -the input L expressed in hexadecimal, with leading zeros +the input native L expressed in hexadecimal, with +leading zeros added if necessary to make it contain at least four hexdigits =item B @@ -1331,6 +1373,20 @@ sub _casespec { my ($hexcode, $lower, $title, $upper, $condition) = ($1, $2, $3, $4, $5); + if (! IS_ASCII_PLATFORM) { # Remap entry to native + foreach my $var_ref (\$hexcode, + \$lower, + \$title, + \$upper) + { + next unless defined $$var_ref; + $$var_ref = join " ", + map { sprintf("%04X", + utf8::unicode_to_native(hex $_)) } + split " ", $$var_ref; + } + } + my $code = hex($hexcode); # In 2.1.8, there were duplicate entries; ignore all but @@ -2905,10 +2961,8 @@ RETRY: my $code_point = hex $hex_code_point; # The name of all controls is the default: the empty string. - # The set of controls is immutable, so these hard-coded - # constants work. - next if $code_point <= 0x9F - && ($code_point <= 0x1F || $code_point >= 0x7F); + # The set of controls is immutable + next if chr($code_point) =~ /[[:cntrl:]]/u; # If this is a name_alias, it isn't a name next if grep { $_ eq $name } @{$aliases{$code_point}}; @@ -3669,10 +3723,6 @@ for its block using C). Note that starting in Unicode 6.1, many of the block names have shorter synonyms. These are always given in the new style. -=head1 BUGS - -Does not yet support EBCDIC platforms. - =head1 AUTHOR Jarkko Hietaniemi. Now maintained by perl5 porters.