use strict;
use warnings;
use File::Spec;
-our $VERSION = '1.37';
+our $VERSION = '1.39';
use unicore::Name; # mktables-generated algorithmically-defined names
use bytes (); # for $bytes::hint_bits
my %system_aliases = (
- 'SINGLE-SHIFT 2' => pack("U", 0x8E),
- 'SINGLE-SHIFT 3' => pack("U", 0x8F),
- 'PRIVATE USE 1' => pack("U", 0x91),
- 'PRIVATE USE 2' => pack("U", 0x92),
+ 'SINGLE-SHIFT 2' => pack("U", utf8::unicode_to_native(0x8E)),
+ 'SINGLE-SHIFT 3' => pack("U", utf8::unicode_to_native(0x8F)),
+ 'PRIVATE USE 1' => pack("U", utf8::unicode_to_native(0x91)),
+ 'PRIVATE USE 2' => pack("U", utf8::unicode_to_native(0x92)),
);
# These are the aliases above that differ under :loose and :full matching
#);
#my %deprecated_aliases;
-#$deprecated_aliases{'BELL'} = pack("U", 0x07) if $^V lt v5.17.0;
+#$deprecated_aliases{'BELL'} = pack("U", utf8::unicode_to_native(0x07)) if $^V lt v5.17.0;
#my %loose_deprecated_aliases = (
#);
# hex, but makes the code easier to maintain, and is called
# infrequently, only at compile-time
if ($value !~ $decimal_qr && $value =~ $hex_qr) {
- $value = CORE::hex $1;
+ my $temp = CORE::hex $1;
+ $temp = utf8::unicode_to_native($temp) if $value =~ /^[Uu]\+/;
+ $value = $temp;
}
if ($value =~ $decimal_qr) {
no warnings qw(non_unicode surrogate nonchar); # Allow any of these
if (@errors) {
foreach my $name (@errors) {
my $ok = "";
- $ok = $1 if $name =~ / ^ ( \p{Alpha} [-\p{XPosixWord} ():\xa0]* ) /x;
+ my $nbsp = chr utf8::unicode_to_native(0xa0);
+ $ok = $1 if $name =~ / ^ ( \p{Alpha} [-\p{XPosixWord} ():$nbsp]* ) /x;
my $first_bad = substr($name, length($ok), 1);
$name = "Invalid character in charnames alias definition; marked by <-- HERE in '$ok$first_bad<-- HERE " . substr($name, length($ok) + 1) . "'";
}
# not an issue.
my %viacode;
+my $no_name_code_points_re = join "|", map { sprintf("%05X",
+ utf8::unicode_to_native($_)) }
+ 0x80, 0x81, 0x84, 0x99;
+$no_name_code_points_re = qr/$no_name_code_points_re/;
+
sub viacode {
# Returns the name of the code point argument
if ($arg =~ $decimal_qr) {
$hex = sprintf "%05X", $arg;
} elsif ($arg =~ $hex_qr) {
+ $hex = CORE::hex $1;
+ $hex = utf8::unicode_to_native($hex) if $arg =~ /^[Uu]\+/;
# Below is the line that differs from the _getcode() source
- $hex = sprintf "%05X", hex $1;
+ $hex = sprintf "%05X", $hex;
} else {
carp("unexpected arg \"$arg\" to charnames::viacode()");
return;
$return = substr($txt, $+[0], index($txt, "\n", $+[0]) - $+[0]);
# If not one of these 4 code points, return what we've found.
- if ($hex !~ / ^ 000 (?: 8[014] | 99 ) $ /x) {
+ if ($hex !~ / ^ $no_name_code_points_re $ /x) {
$viacode{$hex} = $return;
return $return;
}
package charnames;
use strict;
use warnings;
-our $VERSION = '1.38';
+our $VERSION = '1.39';
use unicore::Name; # mktables-generated algorithmically-defined names
use _charnames (); # The submodule for this where most of the work gets done
Note that C<\N{U+I<...>}>, where the I<...> is a hexadecimal number,
also inserts a character into a string.
-The character it inserts is the one whose code point
+The character it inserts is the one whose Unicode code point
(ordinal value) is equal to the number. For example, C<"\N{U+263a}"> is
the Unicode (white background, black foreground) smiley face
equivalent to C<"\N{WHITE SMILING FACE}">.
hexadecimal integer. A literal numeric constant must be unsigned; it
will be interpreted as hex if it has a leading zero or contains
non-decimal hex digits; otherwise it will be interpreted as decimal.
+If it begins with C<"U+">, it is interpreted as the Unicode code point;
+otherwise it is interpreted as native. (Only code points below 256 can
+differ between Unicode and native.) Thus C<U+41> is always the Latin letter
+"A"; but C<0x41> can be "NO-BREAK SPACE" on EBCDIC platforms.
Aliases are added either by the use of anonymous hashes:
hexadecimal integer. A literal numeric constant must be unsigned; it
will be interpreted as hex if it has a leading zero or contains
non-decimal hex digits; otherwise it will be interpreted as decimal.
+If it begins with C<"U+">, it is interpreted as the Unicode code point;
+otherwise it is interpreted as native. (Only code points below 256 can
+differ between Unicode and native.) Thus C<U+41> is always the Latin letter
+"A"; but C<0x41> can be "NO-BREAK SPACE" on EBCDIC platforms.
As mentioned above under L</ALIASES>, Unicode 6.1 defines extra names
(synonyms or aliases) for some code points, most of which were already