From 58b75e361ab3c69893639c37026611faa2ed9b35 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Thu, 29 Dec 2011 19:14:07 -0700 Subject: [PATCH] Unicode::UCD: Use 6.1 format for Name_Alias The Name_Alias property is changing significantly in Unicode 6.1. It will have an extra value for each alias (and there will be a lot more aliases). That extra value is considered informative, and gives the type of alias this is. prop_invmap() should return both the alias and its type. This commit changes it so that even in 6.0, it will return what it is going to return in 6.1 (for the 6.0 aliases). This function has not been released in a stable Perl version. Thus, this gets the ultimate format into Perl before anyone has come to depend on the earlier one. squish --- lib/Unicode/UCD.pm | 23 ++++++++++++++++++----- lib/unicore/mktables | 12 ++++++++++-- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm index e1a6cdb..42303c2 100644 --- a/lib/Unicode/UCD.pm +++ b/lib/Unicode/UCD.pm @@ -6,7 +6,7 @@ no warnings 'surrogate'; # surrogates can be inputs to this use charnames (); use Unicode::Normalize qw(getCombinClass NFD); -our $VERSION = '0.37'; +our $VERSION = '0.38'; use Storable qw(dclone); @@ -2233,6 +2233,23 @@ of calling C() with the "Script Extensions" property: Here, the code points 0x964 and 0x965 are used in the Bengali, Devanagari, Gurmukhi, and Oriya scripts. +The Name_Alias property is of this form. But each scalar consists of two +components: 1) the name, and 2) the type of alias this is. They are +separated by a colon and a space. In Unicode 6.0, there are two alias types: +C<"correction">, which indicates that the name is a corrected form for the +original name (which remains valid) for the same code point; and C<"control">, +which adds a new name for a control character. + +For example, + + @aliases_ranges @alias_maps + ... 
+ 0x01A2 LATIN CAPITAL LETTER GHA: correction + 0x01A3 LATIN SMALL LETTER GHA: correction + +Unicode 6.1 will introduce other types, and some map entries will be lists of +multiple name-alias pairs for a single code point. + =item C means that all the elements of the map array are either rational numbers or @@ -2452,10 +2469,6 @@ Note that the inversion maps returned for the C<Case_Folding> and C<Simple_Case_Folding> properties do not include the Turkic-locale mappings. Use L</casefold()> for these. -The C<Name_Alias> property is potentially undergoing signficant revision by -Unicode at the time of this writing. The format of the values returned for it -may change substantially in future Unicode versions. - C<prop_invmap> does not know about any user-defined properties, and will return C<undef> if called with one of those. diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 69c71a7..7bcf8f1 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -11392,7 +11392,12 @@ sub filter_script_extensions_line { } sub setup_v6_name_alias { - property_ref('Name_Alias')->add_map(7, 7, "ALERT"); + property_ref('Name_Alias')->add_map(7, 7, "ALERT: control"); +} + +sub filter_early_version_name_alias_line { + $_ .= ": correction"; + return; } sub finish_Unicode() { @@ -12211,7 +12216,8 @@ sub compile_perl() { if ($range->start != $range->end) { Carp::my_carp("Expecting only one code point in the range $range. Just to keep going, using just the first code point;"); } - $perl_charname->add_duplicate($range->start, $range->value); + $perl_charname->add_duplicate($range->start, + $range->value =~ s/:.*//r); } $alias_sentence = < ($v_version ge v6.0.0) ? \&setup_v6_name_alias : undef, + Each_Line_Handler => + \&filter_early_version_name_alias_line, ), Input_file->new("BidiTest.txt", v5.2.0, Skip => 'Validation Tests', -- 2.7.4