Unicode::UCD: Work on non-ASCII platforms

author Karl Williamson <public@khwilliamson.com>

Sat, 16 Feb 2013 16:35:56 +0000 (09:35 -0700)

committer Karl Williamson <public@khwilliamson.com>

Thu, 29 Aug 2013 15:55:51 +0000 (09:55 -0600)
author Karl Williamson <public@khwilliamson.com>
Sat, 16 Feb 2013 16:35:56 +0000 (09:35 -0700)
committer Karl Williamson <public@khwilliamson.com>
Thu, 29 Aug 2013 15:55:51 +0000 (09:55 -0600)
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm

index f1b00a4..81e6710 100644 (file)
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -5,7 +5,7 @@ use warnings;
  no warnings 'surrogate';    # surrogates can be inputs to this
  use charnames ();
  
-our $VERSION = '0.53';
+our $VERSION = '0.54';
  
  require Exporter;
  
@@ -30,6 +30,8 @@ our @EXPORT_OK = qw(charinfo
  
  use Carp;
  
+sub IS_ASCII_PLATFORM { ord("A") == 65 }
+
  =head1 NAME
  
  Unicode::UCD - Unicode character database
@@ -104,18 +106,18 @@ Character Database.
  =head2 code point argument
  
  Some of the functions are called with a I<code point argument>, which is either
-a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+>
-followed by hexadecimals designating a Unicode code point.  In other words, if
-you want a code point to be interpreted as a hexadecimal number, you must
-prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be
-interpreted as a decimal code point.
+a decimal or a hexadecimal scalar designating a code point in the platform's
+native character set (extended to Unicode), or C<U+> followed by hexadecimals
+designating a Unicode code point.  A leading 0 will force a hexadecimal
+interpretation, as will a hexadecimal digit that isn't a decimal digit.
  
  Examples:
  
-    223     # Decimal 223
-    0223    # Hexadecimal 223 (= 547 decimal)
-    0xDF    # Hexadecimal DF (= 223 decimal
-    U+DF    # Hexadecimal DF
+    223     # Decimal 223 in native character set
+    0223    # Hexadecimal 223, native (= 547 decimal)
+    0xDF    # Hexadecimal DF, native (= 223 decimal
+    U+DF    # Hexadecimal DF, in Unicode's character set
+                              (= LATIN SMALL LETTER SHARP S)
  
  Note that the largest code point in Unicode is U+10FFFF.
  
@@ -197,7 +199,8 @@ The keys in the hash with the meanings of their values are:
  
  =item B<code>
  
-the input L</code point argument> expressed in hexadecimal, with leading zeros
+the input native L</code point argument> expressed in hexadecimal, with
+leading zeros
  added if necessary to make it contain at least four hexdigits
  
  =item B<name>
@@ -326,8 +329,16 @@ sub _getcode {
  
      if ($arg =~ /^[1-9]\d*$/) {
         return $arg;
-    } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
-       return hex($1);
+    }
+    elsif ($arg =~ /^(?:0[xX])?([[:xdigit:]]+)$/) {
+       return CORE::hex($1);
+    }
+    elsif ($arg =~ /^[Uu]\+([[:xdigit:]]+)$/) { # Is of form U+0000, means
+                                                # wants the Unicode code
+                                                # point, not the native one
+        my $decimal = CORE::hex($1);
+        return $decimal if IS_ASCII_PLATFORM;
+        return utf8::unicode_to_native($decimal);
      }
  
      return;
@@ -596,16 +607,15 @@ have blocks, all code points are considered to be in C<No_Block>.)
  See also L</Blocks versus Scripts>.
  
  If supplied with an argument that can't be a code point, charblock() tries to
-do the opposite and interpret the argument as an old-style block name. The
-return value
-is a I<range set> with one range: an anonymous list with a single element that
-consists of another anonymous list whose first element is the first code point
-in the block, and whose second (and final) element is the final code point in
-the block.  (The extra list consisting of just one element is so that the same
-program logic can be used to handle both this return, and the return from
-L</charscript()> which can have multiple ranges.) You can test whether a code
-point is in a range using the L</charinrange()> function.  If the argument is
-not a known block, C<undef> is returned.
+do the opposite and interpret the argument as an old-style block name.  On an
+ASCII platform, the return value is a I<range set> with one range: an
+anonymous list with a single element that consists of another anonymous list
+whose first element is the first code point in the block, and whose second
+(and final) element is the final code point in the block.  On an EBCDIC
+platform, the first two Unicode blocks are not contiguous.  Their range sets
+are lists containing I<start-of-range>, I<end-of-range> code point pairs. You
+can test whether a code point is in a range set using the L</charinrange()>
+function. If the argument is not a known block, C<undef> is returned.
  
  =cut
  
@@ -635,6 +645,36 @@ sub _charblocks {
                 }
             }
             close($BLOCKSFH);
+            if (! IS_ASCII_PLATFORM) {
+                # The first two blocks, through 0xFF, are wrong on EBCDIC
+                # platforms.
+
+                my @new_blocks = _read_table("To/Blk.pl");
+
+                # Get rid of the first two ranges in the Unicode version, and
+                # replace them with the ones computed by mktables.
+                shift @BLOCKS;
+                shift @BLOCKS;
+                delete $BLOCKS{'Basic Latin'};
+                delete $BLOCKS{'Latin-1 Supplement'};
+
+                # But there are multiple entries in the computed versions, and
+                # we change their names to (which we know) to be the old-style
+                # ones.
+                for my $i (0.. @new_blocks - 1) {
+                    if ($new_blocks[$i][2] =~ s/Basic_Latin/Basic Latin/
+                        or $new_blocks[$i][2] =~
+                                    s/Latin_1_Supplement/Latin-1 Supplement/)
+                    {
+                        push @{$BLOCKS{$new_blocks[$i][2]}}, $new_blocks[$i];
+                    }
+                    else {
+                        splice @new_blocks, $i;
+                        last;
+                    }
+                }
+                unshift @BLOCKS, @new_blocks;
+            }
         }
      }
  }
@@ -978,7 +1018,8 @@ with the following fields is returned:
  
  =item B<code>
  
-the input L</code point argument> expressed in hexadecimal, with leading zeros
+the input native L</code point argument> expressed in hexadecimal, with
+leading zeros
  added if necessary to make it contain at least four hexdigits
  
  =item B<full>
@@ -1243,7 +1284,8 @@ The keys in the bottom layer hash with the meanings of their values are:
  
  =item B<code>
  
-the input L</code point argument> expressed in hexadecimal, with leading zeros
+the input native L</code point argument> expressed in hexadecimal, with
+leading zeros
  added if necessary to make it contain at least four hexdigits
  
  =item B<lower>
@@ -1331,6 +1373,20 @@ sub _casespec {
  
                     my ($hexcode, $lower, $title, $upper, $condition) =
                         ($1, $2, $3, $4, $5);
+                    if (! IS_ASCII_PLATFORM) { # Remap entry to native
+                        foreach my $var_ref (\$hexcode,
+                                             \$lower,
+                                             \$title,
+                                             \$upper)
+                        {
+                            next unless defined $$var_ref;
+                            $$var_ref = join " ",
+                                        map { sprintf("%04X",
+                                              utf8::unicode_to_native(hex $_)) }
+                                        split " ", $$var_ref;
+                        }
+                    }
+
                     my $code = hex($hexcode);
  
                      # In 2.1.8, there were duplicate entries; ignore all but
@@ -2905,10 +2961,8 @@ RETRY:
                  my $code_point = hex $hex_code_point;
  
                  # The name of all controls is the default: the empty string.
-                # The set of controls is immutable, so these hard-coded
-                # constants work.
-                next if $code_point <= 0x9F
-                        && ($code_point <= 0x1F || $code_point >= 0x7F);
+                # The set of controls is immutable
+                next if chr($code_point) =~ /[[:cntrl:]]/u;
  
                  # If this is a name_alias, it isn't a name
                  next if grep { $_ eq $name } @{$aliases{$code_point}};
@@ -3669,10 +3723,6 @@ for its block using C<charblock>).
  Note that starting in Unicode 6.1, many of the block names have shorter
  synonyms.  These are always given in the new style.
  
-=head1 BUGS
-
-Does not yet support EBCDIC platforms.
-
  =head1 AUTHOR
  
  Jarkko Hietaniemi.  Now maintained by perl5 porters.
author	Karl Williamson <public@khwilliamson.com>
	Sat, 16 Feb 2013 16:35:56 +0000 (09:35 -0700)
committer	Karl Williamson <public@khwilliamson.com>
	Thu, 29 Aug 2013 15:55:51 +0000 (09:55 -0600)