charnames: Make work in EBCDIC

author Karl Williamson <public@khwilliamson.com>

Sat, 16 Feb 2013 18:05:44 +0000 (11:05 -0700)

committer Karl Williamson <public@khwilliamson.com>

Thu, 29 Aug 2013 15:55:51 +0000 (09:55 -0600)
author Karl Williamson <public@khwilliamson.com>
Sat, 16 Feb 2013 18:05:44 +0000 (11:05 -0700)
committer Karl Williamson <public@khwilliamson.com>
Thu, 29 Aug 2013 15:55:51 +0000 (09:55 -0600)
diff --git a/lib/_charnames.pm b/lib/_charnames.pm

index 7492e65..8955b6f 100644 (file)
--- a/lib/_charnames.pm
+++ b/lib/_charnames.pm
@@ -7,7 +7,7 @@ package _charnames;
  use strict;
  use warnings;
  use File::Spec;
-our $VERSION = '1.37';
+our $VERSION = '1.39';
  use unicore::Name;    # mktables-generated algorithmically-defined names
  
  use bytes ();          # for $bytes::hint_bits
@@ -66,10 +66,10 @@ $Carp::Internal{ (__PACKAGE__) } = 1;
  
  my %system_aliases = (
  
-    'SINGLE-SHIFT 2'                => pack("U", 0x8E),
-    'SINGLE-SHIFT 3'                => pack("U", 0x8F),
-    'PRIVATE USE 1'                 => pack("U", 0x91),
-    'PRIVATE USE 2'                 => pack("U", 0x92),
+    'SINGLE-SHIFT 2'                => pack("U", utf8::unicode_to_native(0x8E)),
+    'SINGLE-SHIFT 3'                => pack("U", utf8::unicode_to_native(0x8F)),
+    'PRIVATE USE 1'                 => pack("U", utf8::unicode_to_native(0x91)),
+    'PRIVATE USE 2'                 => pack("U", utf8::unicode_to_native(0x92)),
  );
  
  # These are the aliases above that differ under :loose and :full matching
@@ -78,7 +78,7 @@ my %system_aliases = (
  #);
  
  #my %deprecated_aliases;
-#$deprecated_aliases{'BELL'} = pack("U", 0x07) if $^V lt v5.17.0;
+#$deprecated_aliases{'BELL'} = pack("U", utf8::unicode_to_native(0x07)) if $^V lt v5.17.0;
  
  #my %loose_deprecated_aliases = (
  #);
@@ -157,7 +157,9 @@ sub alias (@) # Set up a single alias
      # hex, but makes the code easier to maintain, and is called
      # infrequently, only at compile-time
      if ($value !~ $decimal_qr && $value =~ $hex_qr) {
-      $value = CORE::hex $1;
+      my $temp = CORE::hex $1;
+      $temp = utf8::unicode_to_native($temp) if $value =~ /^[Uu]\+/;
+      $value = $temp;
      }
      if ($value =~ $decimal_qr) {
          no warnings qw(non_unicode surrogate nonchar); # Allow any of these
@@ -199,7 +201,8 @@ sub alias (@) # Set up a single alias
    if (@errors) {
      foreach my $name (@errors) {
        my $ok = "";
-      $ok = $1 if $name =~ / ^ ( \p{Alpha} [-\p{XPosixWord} ():\xa0]* ) /x;
+      my $nbsp = chr utf8::unicode_to_native(0xa0);
+      $ok = $1 if $name =~ / ^ ( \p{Alpha} [-\p{XPosixWord} ():$nbsp]* ) /x;
        my $first_bad = substr($name, length($ok), 1);
        $name = "Invalid character in charnames alias definition; marked by <-- HERE in '$ok$first_bad<-- HERE " . substr($name, length($ok) + 1) . "'";
      }
@@ -697,6 +700,11 @@ sub import
  # not an issue.
  my %viacode;
  
+my $no_name_code_points_re = join "|", map { sprintf("%05X",
+                                             utf8::unicode_to_native($_)) }
+                                            0x80, 0x81, 0x84, 0x99;
+$no_name_code_points_re = qr/$no_name_code_points_re/;
+
  sub viacode {
  
    # Returns the name of the code point argument
@@ -717,8 +725,10 @@ sub viacode {
    if ($arg =~ $decimal_qr) {
      $hex = sprintf "%05X", $arg;
    } elsif ($arg =~ $hex_qr) {
+    $hex = CORE::hex $1;
+    $hex = utf8::unicode_to_native($hex) if $arg =~ /^[Uu]\+/;
      # Below is the line that differs from the _getcode() source
-    $hex = sprintf "%05X", hex $1;
+    $hex = sprintf "%05X", $hex;
    } else {
      carp("unexpected arg \"$arg\" to charnames::viacode()");
      return;
@@ -751,7 +761,7 @@ sub viacode {
          $return = substr($txt, $+[0], index($txt, "\n", $+[0]) - $+[0]);
  
          # If not one of these 4 code points, return what we've found.
-        if ($hex !~ / ^ 000 (?: 8[014] | 99 ) $ /x) {
+        if ($hex !~ / ^ $no_name_code_points_re $ /x) {
            $viacode{$hex} = $return;
            return $return;
          }
diff --git a/lib/charnames.pm b/lib/charnames.pm

index acb174d..52f8cee 100644 (file)
--- a/lib/charnames.pm
+++ b/lib/charnames.pm
@@ -1,7 +1,7 @@
  package charnames;
  use strict;
  use warnings;
-our $VERSION = '1.38';
+our $VERSION = '1.39';
  use unicore::Name;    # mktables-generated algorithmically-defined names
  use _charnames ();    # The submodule for this where most of the work gets done
  
@@ -171,7 +171,7 @@ charnames ();">> did not enable C<\N{I<CHARNAME>}>.)
  
  Note that C<\N{U+I<...>}>, where the I<...> is a hexadecimal number,
  also inserts a character into a string.
-The character it inserts is the one whose code point
+The character it inserts is the one whose Unicode code point
  (ordinal value) is equal to the number.  For example, C<"\N{U+263a}"> is
  the Unicode (white background, black foreground) smiley face
  equivalent to C<"\N{WHITE SMILING FACE}">.
@@ -300,6 +300,10 @@ with C<"U+"> or C<"0x"> with the remainder considered to be a
  hexadecimal integer.  A literal numeric constant must be unsigned; it
  will be interpreted as hex if it has a leading zero or contains
  non-decimal hex digits; otherwise it will be interpreted as decimal.
+If it begins with C<"U+">, it is interpreted as the Unicode code point;
+otherwise it is interpreted as native.  (Only code points below 256 can
+differ between Unicode and native.)  Thus C<U+41> is always the Latin letter
+"A"; but C<0x41> can be "NO-BREAK SPACE" on EBCDIC platforms.
  
  Aliases are added either by the use of anonymous hashes:
  
@@ -402,6 +406,10 @@ with C<"U+"> or C<"0x"> with the remainder considered to be a
  hexadecimal integer.  A literal numeric constant must be unsigned; it
  will be interpreted as hex if it has a leading zero or contains
  non-decimal hex digits; otherwise it will be interpreted as decimal.
+If it begins with C<"U+">, it is interpreted as the Unicode code point;
+otherwise it is interpreted as native.  (Only code points below 256 can
+differ between Unicode and native.)  Thus C<U+41> is always the Latin letter
+"A"; but C<0x41> can be "NO-BREAK SPACE" on EBCDIC platforms.
  
  As mentioned above under L</ALIASES>, Unicode 6.1 defines extra names
  (synonyms or aliases) for some code points, most of which were already
author	Karl Williamson <public@khwilliamson.com>
	Sat, 16 Feb 2013 18:05:44 +0000 (11:05 -0700)
committer	Karl Williamson <public@khwilliamson.com>
	Thu, 29 Aug 2013 15:55:51 +0000 (09:55 -0600)
lib/_charnames.pm		patch \| blob \| history
lib/charnames.pm		patch \| blob \| history