Unicode::UCD: add prop_aliases(), prop_value_aliases()

author Karl Williamson <public@khwilliamson.com>

Mon, 31 Oct 2011 20:07:25 +0000 (14:07 -0600)

committer Karl Williamson <public@khwilliamson.com>

Tue, 8 Nov 2011 15:09:32 +0000 (08:09 -0700)
author Karl Williamson <public@khwilliamson.com>
Mon, 31 Oct 2011 20:07:25 +0000 (14:07 -0600)
committer Karl Williamson <public@khwilliamson.com>
Tue, 8 Nov 2011 15:09:32 +0000 (08:09 -0700)
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm

index 81ef541..2087333 100644 (file)
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -6,7 +6,7 @@ no warnings 'surrogate';    # surrogates can be inputs to this
  use charnames ();
  use Unicode::Normalize qw(getCombinClass NFD);
  
-our $VERSION = '0.36';
+our $VERSION = '0.37';
  
  use Storable qw(dclone);
  
@@ -23,6 +23,8 @@ our @EXPORT_OK = qw(charinfo
                     casefold casespec
                     namedseq
                      num
+                    prop_aliases
+                    prop_value_aliases
                  );
  
  use Carp;
@@ -62,6 +64,12 @@ Unicode::UCD - Unicode character database
      my $categories = general_categories();
      my $types = bidi_types();
  
+    use Unicode::UCD 'prop_aliases';
+    my @space_names = prop_aliases("space");
+
+    use Unicode::UCD 'prop_value_aliases';
+    my @gc_punct_names = prop_value_aliases("Gc", "Punct");
+
      use Unicode::UCD 'compexcl';
      my $compexcl = compexcl($codepoint);
  
@@ -154,6 +162,9 @@ C<E<lt>controlE<gt>>.
  The short name of the general category of I<code>.
  This will match one of the keys in the hash returned by L</general_categories()>.
  
+The L</prop_value_aliases()> function can be used to get all the synonyms
+of the category name.
+
  =item B<combining>
  
  the combining class number for I<code> used in the Canonical Ordering Algorithm.
@@ -161,11 +172,17 @@ For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior
  available at
  L<http://www.unicode.org/versions/Unicode5.1.0/>
  
+The L</prop_value_aliases()> function can be used to get all the synonyms
+of the combining class number.
+
  =item B<bidi>
  
  bidirectional type of I<code>.
  This will match one of the keys in the hash returned by L</bidi_types()>.
  
+The L</prop_value_aliases()> function can be used to get all the synonyms
+of the bidi type name.
+
  =item B<decomposition>
  
  is empty if I<code> has no decomposition; or is one or more codes
@@ -732,6 +749,9 @@ from the long names to the short names.  The general category is the
  one returned from
  L</charinfo()> under the C<category> key.
  
+The L</prop_value_aliases()> function can be used to get all the synonyms of
+the category name.
+
  =cut
  
  my %BIDI_TYPES =
@@ -774,6 +794,9 @@ the Unicode TR9 is recommended reading:
  L<http://www.unicode.org/reports/tr9/>
  (as of Unicode 5.0.0)
  
+The L</prop_value_aliases()> function can be used to get all the synonyms of
+the bidi type name.
+
  =cut
  
  sub bidi_types {
@@ -1382,6 +1405,382 @@ sub num {
      return $value;
  }
  
+=pod
+
+=head2 B<prop_aliases()>
+
+    use Unicode::UCD 'prop_aliases';
+
+    my ($short_name, $full_name, @other_names) = prop_aliases("space");
+    my $same_full_name = prop_aliases("Space");     # Scalar context
+    my ($same_short_name) = prop_aliases("Space");  # gets 0th element
+    print "The full name is $full_name\n";
+    print "The short name is $short_name\n";
+    print "The other aliases are: ", join(", ", @other_names), "\n";
+
+    prints:
+    The full name is White_Space
+    The short name is WSpace
+    The other aliases are: Space
+
+Most Unicode properties have several synonymous names.  Typically, there is at
+least a short name, convenient to type, and a long name that more fully
+describes the property, and hence is more easily understood.
+
+If you know one name for a Unicode property, you can use C<prop_aliases> to find
+either the long name (when called in scalar context), or a list of all of the
+names, somewhat ordered so that the short name is in the 0th element, the long
+name in the next element, and any other synonyms are in the remaining
+elements, in no particular order.
+
+The long name is returned in a form nicely capitalized, suitable for printing.
+
+The input parameter name is loosely matched, which means that white space,
+hyphens, and underscores are ignored (except for the trailing underscore in
+the old_form grandfathered-in C<"L_">, which is better written as C<"LC">, and
+both of which mean C<General_Category=Cased Letter>).
+
+If the name is unknown, C<undef> is returned (or an empty list in list
+context).  Note that Perl typically recognizes property names in regular
+expressions with an optional C<"Is_>" (with or without the underscore)
+prefixed to them, such as C<\p{isgc=punct}>.  This function does not recognize
+those in the input, returning C<undef>.  Nor are they included in the output
+as possible synonyms.
+
+C<prop_aliases> does know about the Perl extensions to Unicode properties,
+such as C<Any> and C<XPosixAlpha>, and the single form equivalents to Unicode
+properties such as C<XDigit>, C<Greek>, C<In_Greek>, and C<Is_Greek>.  The
+final example demonstrates that the C<"Is_"> prefix is recognized for these
+extensions; it is needed to resolve ambiguities.  For example,
+C<prop_aliases('lc')> returns the list C<(lc, Lowercase_Mapping)>, but
+C<prop_aliases('islc')> returns C<(Is_LC, Cased_Letter)>.  This is
+because C<islc> is a Perl extension which is short for
+C<General_Category=Cased Letter>.  The lists returned for the Perl extensions
+will not include the C<"Is_"> prefix (whether or not the input had it) unless
+needed to resolve ambiguities, as shown in the C<"islc"> example, where the
+returned list had one element containing C<"Is_">, and the other without.
+
+It is also possible for the reverse to happen:  C<prop_aliases('isc')> returns
+the list C<(isc, ISO_Comment)>; whereas C<prop_aliases('c')> returns
+C<(C, Other)> (the latter being a Perl extension meaning
+C<General_Category=Other>.  L<perluniprops> lists the available forms,
+including which ones are discouraged from use.
+
+Those discouraged forms are accepted as input to C<prop_aliases>, but are not
+returned in the lists.  C<prop_aliases('isL&')> and C<prop_aliases('isL_')>,
+which are old synonyms for C<"Is_LC"> and should not be used in new code, are
+examples of this.  These both return C<(Is_LC, Cased_Letter)>.  Thus this
+function allows you to take a discourarged form, and find its acceptable
+alternatives.  The same goes with single-form Block property equivalences.
+Only the forms that begin with C<"In_"> are not discouraged; if you pass
+C<prop_aliases> a discouraged form, you will get back the equivalent ones that
+begin with C<"In_">.  It will otherwise look like a new-style block name (see.
+L</Old-style versus new-style block names>).
+
+C<prop_aliases> does not know about any user-defined properties, and will
+return C<undef> if called with one of those.  Likewise for Perl internal
+properties, with the exception of "Perl_Decimal_Digit" which it does know
+about (and which is documented below in L</prop_invmap()>).
+
+=cut
+
+# It may be that there are use cases where the discouraged forms should be
+# returned.  If that comes up, an optional boolean second parameter to the
+# function could be created, for example.
+
+# These are created by mktables for this routine and stored in unicore/UCD.pl
+# where their structures are described.
+our %string_property_loose_to_name;
+our %ambiguous_names;
+our %loose_perlprop_to_name;
+our %prop_aliases;
+
+sub prop_aliases ($) {
+    my $prop = $_[0];
+    return unless defined $prop;
+
+    require "unicore/UCD.pl";
+    require "unicore/Heavy.pl";
+    require "utf8_heavy.pl";
+
+    # The property name may be loosely or strictly matched; we don't know yet.
+    # But both types use lower-case.
+    $prop = lc $prop;
+
+    # It is loosely matched if its lower case isn't known to be strict.
+    my $list_ref;
+    if (! exists $utf8::stricter_to_file_of{$prop}) {
+        my $loose = utf8::_loose_name($prop);
+
+        # There is a hash that converts from any loose name to its standard
+        # form, mapping all synonyms for a  name to one name that can be used
+        # as a key into another hash.  The whole concept is for memory
+        # savings, as the second hash doesn't have to have all the
+        # combinations.  Actually, there are two hashes that do the
+        # converstion.  One is used in utf8_heavy.pl (stored in Heavy.pl) for
+        # looking up properties matchable in regexes.  This function needs to
+        # access string properties, which aren't available in regexes, so a
+        # second conversion hash is made for them (stored in UCD.pl).  Look in
+        # the string one now, as the rest can have an optional 'is' prefix,
+        # which these don't.
+        if (exists $string_property_loose_to_name{$loose}) {
+
+            # Convert to its standard loose name.
+            $prop = $string_property_loose_to_name{$loose};
+        }
+        else {
+            my $retrying = 0;   # bool.  ? Has an initial 'is' been stripped
+        RETRY:
+            if (exists $utf8::loose_property_name_of{$loose}
+                && (! $retrying
+                    || ! exists $ambiguous_names{$loose}))
+            {
+                # Found an entry giving the standard form.  We don't get here
+                # (in the test above) when we've stripped off an
+                # 'is' and the result is an ambiguous name.  That is because
+                # these are official Unicode properties (though Perl can have
+                # an optional 'is' prefix meaning the official property), and
+                # all ambiguous cases involve a Perl single-form extension
+                # for the gc, script, or block properties, and the stripped
+                # 'is' means that they mean one of those, and not one of
+                # these
+                $prop = $utf8::loose_property_name_of{$loose};
+            }
+            elsif (exists $loose_perlprop_to_name{$loose}) {
+
+                # This hash is specifically for this function to list Perl
+                # extensions that aren't in the earlier hashes.  If there is
+                # only one element, the short and long names are identical.
+                # Otherwise the form is already in the same form as
+                # %prop_aliases, which is handled at the end of the function.
+                $list_ref = $loose_perlprop_to_name{$loose};
+                if (@$list_ref == 1) {
+                    my @list = ($list_ref->[0], $list_ref->[0]);
+                    $list_ref = \@list;
+                }
+            }
+            elsif (! exists $utf8::loose_to_file_of{$loose}) {
+
+                # loose_to_file_of is a complete list of loose names.  If not
+                # there, the input is unknown.
+                return;
+            }
+            else {
+
+                # Here we found the name but not its aliases, so it has to
+                # exist.  This means it must be one of the Perl single-form
+                # extensions.  First see if it is for a property-value
+                # combination in one of the following properties.
+                my @list;
+                foreach my $property ("gc", "script") {
+                    @list = prop_value_aliases($property, $loose);
+                    last if @list;
+                }
+                if (@list) {
+
+                    # Here, it is one of those property-value combination
+                    # single-form synonyms.  There are ambiguities with some
+                    # of these.  Check against the list for these, and adjust
+                    # if necessary.
+                    for my $i (0 .. @list -1) {
+                        if (exists $ambiguous_names
+                                   {utf8::_loose_name(lc $list[$i])})
+                        {
+                            # The ambiguity is resolved by toggling whether or
+                            # not it has an 'is' prefix
+                            $list[$i] =~ s/^Is_// or $list[$i] =~ s/^/Is_/;
+                        }
+                    }
+                    return @list;
+                }
+
+                # Here, it wasn't one of the gc or script single-form
+                # extensions.  It could be a block property single-form
+                # extension.  An 'in' prefix definitely means that, and should
+                # be looked up without the prefix.
+                my $began_with_in = $loose =~ s/^in//;
+                @list = prop_value_aliases("block", $loose);
+                if (@list) {
+                    map { $_ =~ s/^/In_/ } @list;
+                    return @list;
+                }
+
+                # Here still haven't found it.  The last opportunity for it
+                # being valid is only if it began with 'is'.  We retry without
+                # the 'is', setting a flag to that effect so that we don't
+                # accept things that begin with 'isis...'
+                if (! $retrying && ! $began_with_in && $loose =~ s/^is//) {
+                    $retrying = 1;
+                    goto RETRY;
+                }
+
+                # Here, didn't find it.  Since it was in %loose_to_file_of, we
+                # should have been able to find it.
+                carp __PACKAGE__, "::prop_aliases: Unexpectedly could not find '$prop'.  Send bug report to perlbug\@perl.org";
+                return;
+            }
+        }
+    }
+
+    if (! $list_ref) {
+        # Here, we have set $prop to a standard form name of the input.  Look
+        # it up in the structure created by mktables for this purpose, which
+        # contains both strict and loosely matched properties.  Avoid
+        # autovivifying.
+        $list_ref = $prop_aliases{$prop} if exists $prop_aliases{$prop};
+        return unless $list_ref;
+    }
+
+    # The full name is in element 1.
+    return $list_ref->[1] unless wantarray;
+
+    return @{dclone $list_ref};
+}
+
+=pod
+
+=head2 B<prop_value_aliases()>
+
+    use Unicode::UCD 'prop_value_aliases';
+
+    my ($short_name, $full_name, @other_names)
+                                   = prop_value_aliases("Gc", "Punct");
+    my $same_full_name = prop_value_aliases("Gc", "P");   # Scalar cntxt
+    my ($same_short_name) = prop_value_aliases("Gc", "P"); # gets 0th
+                                                           # element
+    print "The full name is $full_name\n";
+    print "The short name is $short_name\n";
+    print "The other aliases are: ", join(", ", @other_names), "\n";
+
+    prints:
+    The full name is Punctuation
+    The short name is P
+    The other aliases are: Punct
+
+Some Unicode properties have a restricted set of legal values.  For example,
+all binary properties are restricted to just C<true> or C<false>; and there
+are only a few dozen possible General Categories.
+
+For such properties, there are usually several synonyms for each possible
+value.  For example, in binary properties, I<truth> can be represented by any of
+the strings "Y", "Yes", "T", or "True"; and the General Category
+"Punctuation" by that string, or "Punct", or simply "P".
+
+Like property names, there is typically at least a short name for each such
+property-value, and a long name.  If you know any name of the property-value,
+you can use C<prop_value_aliases>() to get the long name (when called in
+scalar context), or a list of all the names, with the short name in the 0th
+element, the long name in the next element, and any other synonyms in the
+remaining elements, in no particular order, except that any all-numeric
+synonyms will be last.
+
+The long name is returned in a form nicely capitalized, suitable for printing.
+
+Case, white space, hyphens, and underscores are ignored in the input parameters
+(except for the trailing underscore in the old-form grandfathered-in general
+category property value C<"L_">, which is better written as C<"LC">).
+
+If either name is unknown, C<undef> is returned.  Note that Perl typically
+recognizes property names in regular expressions with an optional C<"Is_>"
+(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>.
+This function does not recognize those in the property parameter, returning
+C<undef>.
+
+If called with a property that doesn't have synonyms for its values, it
+returns the input value, possibly normalized with capitalization and
+underscores.
+
+For the block property, new-style block names are returned (see
+L</Old-style versus new-style block names>).
+
+To find the synonyms for single-forms, such as C<\p{Any}>, use
+L</prop_aliases()> instead.
+
+C<prop_value_aliases> does not know about any user-defined properties, and
+will return C<undef> if called with one of those.
+
+=cut
+
+# These are created by mktables for this routine and stored in unicore/UCD.pl
+# where their structures are described.
+our %loose_to_standard_value;
+our %prop_value_aliases;
+
+sub prop_value_aliases ($$) {
+    my ($prop, $value) = @_;
+    return unless defined $prop && defined $value;
+
+    require "unicore/UCD.pl";
+    require "utf8_heavy.pl";
+
+    # Find the property name synonym that's used as the key in other hashes,
+    # which is element 0 in the returned list.
+    ($prop) = prop_aliases($prop);
+    return if ! $prop;
+    $prop = utf8::_loose_name(lc $prop);
+
+    # Here is a legal property, but the hash below (created by mktables for
+    # this purpose) only knows about the properties that have a very finite
+    # number of potential values, that is not ones whose value could be
+    # anything, like most (if not all) string properties.  These don't have
+    # synonyms anyway.  Simply return the input.  For example, there is no
+    # synonym for ('Uppercase_Mapping', A').
+    return $value if ! exists $prop_value_aliases{$prop};
+
+    # The value name may be loosely or strictly matched; we don't know yet.
+    # But both types use lower-case.
+    $value = lc $value;
+
+    # If the name isn't found under loose matching, it certainly won't be
+    # found under strict
+    my $loose_value = utf8::_loose_name($value);
+    return unless exists $loose_to_standard_value{"$prop=$loose_value"};
+
+    # Similarly if the combination under loose matching doesn't exist, it
+    # won't exist under strict.
+    my $standard_value = $loose_to_standard_value{"$prop=$loose_value"};
+    return unless exists $prop_value_aliases{$prop}{$standard_value};
+
+    # Here we did find a combination under loose matching rules.  But it could
+    # be that is a strict property match that shouldn't have matched.
+    # %prop_value_aliases is set up so that the strict matches will appear as
+    # if they were in loose form.  Thus, if the non-loose version is legal,
+    # we're ok, can skip the further check.
+    if (! exists $utf8::stricter_to_file_of{"$prop=$value"}
+
+        # We're also ok and skip the further check if value loosely matches.
+        # mktables has verified that no strict name under loose rules maps to
+        # an existing loose name.  This code relies on the very limited
+        # circumstances that strict names can be here.  Strict name matching
+        # happens under two conditions:
+        # 1) when the name begins with an underscore.  But this function
+        #    doesn't accept those, and %prop_value_aliases doesn't have
+        #    them.
+        # 2) When the values are numeric, in which case we need to look
+        #    further, but their squeezed-out loose values will be in
+        #    %stricter_to_file_of
+        && exists $utf8::stricter_to_file_of{"$prop=$loose_value"})
+    {
+        # The only thing that's legal loosely under strict is that can have an
+        # underscore between digit pairs XXX
+        while ($value =~ s/(\d)_(\d)/$1$2/g) {}
+        return unless exists $utf8::stricter_to_file_of{"$prop=$value"};
+    }
+
+    # Here, we know that the combination exists.  Return it.
+    my $list_ref = $prop_value_aliases{$prop}{$standard_value};
+    if (@$list_ref > 1) {
+        # The full name is in element 1.
+        return $list_ref->[1] unless wantarray;
+
+        return @{dclone $list_ref};
+    }
+
+    return $list_ref->[0] unless wantarray;
+
+    # Only 1 element means that it repeats
+    return ( $list_ref->[0], $list_ref->[0] );
+}
  
  
  =head2 Unicode::UCD::UnicodeVersion
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t

index 98bab65..df601d1 100644 (file)
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -508,4 +508,345 @@ is(num("\N{ETHIOPIC NUMBER TEN THOUSAND}"), 10000, 'Verify num("\N{ETHIOPIC NUMB
  is(num("\N{NORTH INDIC FRACTION ONE HALF}"), .5, 'Verify num("\N{NORTH INDIC FRACTION ONE HALF}") == .5');
  is(num("\N{U+12448}"), 9, 'Verify num("\N{U+12448}") == 9');
  
+# Create a user-defined property
+sub InKana {<<'END'}
+3040    309F
+30A0    30FF
+END
+
+use Unicode::UCD qw(prop_aliases);
+
+is(prop_aliases(undef), undef, "prop_aliases(undef) returns <undef>");
+is(prop_aliases("unknown property"), undef,
+                "prop_aliases(<unknown property>) returns <undef>");
+is(prop_aliases("InKana"), undef,
+                "prop_aliases(<user-defined property>) returns <undef>");
+is(prop_aliases("Perl_Decomposition_Mapping"), undef, "prop_aliases('Perl_Decomposition_Mapping') returns <undef> since internal-Perl-only");
+is(prop_aliases("Perl_Charnames"), undef,
+    "prop_aliases('Perl_Charnames') returns <undef> since internal-Perl-only");
+is(prop_aliases("isgc"), undef,
+    "prop_aliases('isgc') returns <undef> since is not covered Perl extension");
+is(prop_aliases("Is_Is_Any"), undef,
+                "prop_aliases('Is_Is_Any') returns <undef> since two is's");
+
+require 'utf8_heavy.pl';
+require "unicore/Heavy.pl";
+
+# Keys are lists of properties. Values are defined if have been tested.
+my %props;
+
+# To test for loose matching, add in the characters that are ignored there.
+my $extra_chars = "-_ ";
+
+# The one internal property we accept
+$props{'Perl_Decimal_Digit'} = 1;
+my @list = prop_aliases("perldecimaldigit");
+is_deeply(\@list,
+          [ "Perl_Decimal_Digit",
+            "Perl_Decimal_Digit"
+          ], "prop_aliases('perldecimaldigit') returns Perl_Decimal_Digit as both short and full names");
+
+# Get the official Unicode property name synonyms and test them.
+open my $props, "<", "../lib/unicore/PropertyAliases.txt"
+                or die "Can't open Unicode PropertyAliases.txt";
+while (<$props>) {
+    s/\s*#.*//;           # Remove comments
+    next if /^\s* $/x;    # Ignore empty and comment lines
+
+    chomp;
+    my $count = 0;  # 0th field in line is short name; 1th is long name
+    my $short_name;
+    my $full_name;
+    my @names_via_short;
+    foreach my $alias (split /\s*;\s*/) {    # Fields are separated by
+                                             # semi-colons
+        # Add in the characters that are supposed to be ignored, to test loose
+        # matching, which the tested function does on all inputs.
+        my $mod_name = "$extra_chars$alias";
+
+        my $loose = utf8::_loose_name(lc $alias);
+
+        # Indicate we have tested this.
+        $props{$loose} = 1;
+
+        my @all_names = prop_aliases($mod_name);
+        if (grep { $_ eq $loose } @Unicode::UCD::suppressed_properties) {
+            is(@all_names, 0, "prop_aliases('$mod_name') returns undef since $alias is not installed");
+            next;
+        }
+        elsif (! @all_names) {
+            fail("prop_aliases('$mod_name')");
+            diag("'$alias' is unknown to prop_aliases()");
+            next;
+        }
+
+        if ($count == 0) {  # Is short name
+
+            @names_via_short = prop_aliases($mod_name);
+
+            # If the 0th test fails, no sense in continuing with the others
+            last unless is($names_via_short[0], $alias,
+                    "prop_aliases: '$alias' is the short name for '$mod_name'");
+            $short_name = $alias;
+        }
+        elsif ($count == 1) {   # Is full name
+
+            # Some properties have the same short and full name; no sense
+            # repeating the test if the same.
+            if ($alias ne $short_name) {
+                my @names_via_full = prop_aliases($mod_name);
+                is_deeply(\@names_via_full, \@names_via_short, "prop_aliases() returns the same list for both '$short_name' and '$mod_name'");
+            }
+
+            # Tests scalar context
+            is(prop_aliases($short_name), $alias,
+                "prop_aliases: '$alias' is the long name for '$short_name'");
+        }
+        else {  # Is another alias
+            is_deeply(\@all_names, \@names_via_short, "prop_aliases() returns the same list for both '$short_name' and '$mod_name'");
+            ok((grep { $_ =~ /^$alias$/i } @all_names),
+                "prop_aliases: '$alias' is listed as an alias for '$mod_name'");
+        }
+
+        $count++;
+    }
+}
+
+# Now test anything we can find that wasn't covered by the tests of the
+# official properties.  We have no way of knowing if mktables omitted a Perl
+# extension or not, but we do the best we can from its generated lists
+
+foreach my $alias (keys %utf8::loose_to_file_of) {
+    next if $alias =~ /=/;
+    my $lc_name = lc $alias;
+    my $loose = utf8::_loose_name($lc_name);
+    next if exists $props{$loose};  # Skip if already tested
+    $props{$loose} = 1;
+    my $mod_name = "$extra_chars$alias";    # Tests loose matching
+    my @aliases = prop_aliases($mod_name);
+    my $found_it = grep { utf8::_loose_name(lc $_) eq $lc_name } @aliases;
+    if ($found_it) {
+        pass("prop_aliases: '$lc_name' is listed as an alias for '$mod_name'");
+    }
+    elsif ($lc_name =~ /l[_&]$/) {
+
+        # These two names are special in that they don't appear in the
+        # returned list because they are discouraged from use.  Verify
+        # that they return the same list as a non-discouraged version.
+        my @LC = prop_aliases('Is_LC');
+        is_deeply(\@aliases, \@LC, "prop_aliases: '$lc_name' returns the same list as 'Is_LC'");
+    }
+    else {
+        my $stripped = $lc_name =~ s/^is//;
+
+        # Could be that the input includes a prefix 'is', which is rarely
+        # returned as an alias, so having successfully stripped it off above,
+        # try again.
+        if ($stripped) {
+            $found_it = grep { utf8::_loose_name(lc $_) eq $lc_name } @aliases;
+        }
+
+        # If that didn't work, it could be that it's a block, which is always
+        # returned with a leading 'In_' to avoid ambiguity.  Try comparing
+        # with that stripped off.
+        if (! $found_it) {
+            $found_it = grep { utf8::_loose_name(s/^In_(.*)/\L$1/r) eq $lc_name }
+                              @aliases;
+            # Could check that is a real block, but tests for invmap will
+            # likely pickup any errors, since this will be tested there.
+            $lc_name = "in$lc_name" if $found_it;   # Change for message below
+        }
+        my $message = "prop_aliases: '$lc_name' is listed as an alias for '$mod_name'";
+        ($found_it) ? pass($message) : fail($message);
+    }
+}
+
+my $done_equals = 0;
+foreach my $alias (keys %utf8::stricter_to_file_of) {
+    if ($alias =~ /=/) {    # Only test one case where there is an equals
+        next if $done_equals;
+        $done_equals = 1;
+    }
+    my $lc_name = lc $alias;
+    my @list = prop_aliases($alias);
+    if ($alias =~ /^_/) {
+        is(@list, 0, "prop_aliases: '$lc_name' returns an empty list since it is internal_only");
+    }
+    elsif ($alias =~ /=/) {
+        is(@list, 0, "prop_aliases: '$lc_name' returns an empty list since is illegal property name");
+    }
+    else {
+        ok((grep { lc $_ eq $lc_name } @list),
+                "prop_aliases: '$lc_name' is listed as an alias for '$alias'");
+    }
+}
+
+use Unicode::UCD qw(prop_value_aliases);
+
+is(prop_value_aliases("unknown property", "unknown value"), undef,
+    "prop_value_aliases(<unknown property>, <unknown value>) returns <undef>");
+is(prop_value_aliases(undef, undef), undef,
+                           "prop_value_aliases(undef, undef) returns <undef>");
+is((prop_value_aliases("na", "A")), "A", "test that prop_value_aliases returns its input for properties that don't have synonyms");
+is(prop_value_aliases("isgc", "C"), undef, "prop_value_aliases('isgc', 'C') returns <undef> since is not covered Perl extension");
+is(prop_value_aliases("gc", "isC"), undef, "prop_value_aliases('gc', 'isC') returns <undef> since is not covered Perl extension");
+
+# We have no way of knowing if mktables omitted a Perl extension that it
+# shouldn't have, but we can check if it omitted an official Unicode property
+# name synonym.  And for those, we can check if the short and full names are
+# correct.
+
+my %pva_tested;   # List of things already tested.
+open my $propvalues, "<", "../lib/unicore/PropValueAliases.txt"
+     or die "Can't open Unicode PropValueAliases.txt";
+while (<$propvalues>) {
+    s/\s*#.*//;           # Remove comments
+    next if /^\s* $/x;    # Ignore empty and comment lines
+    chomp;
+
+    my @fields = split /\s*;\s*/; # Fields are separated by semi-colons
+    my $prop = shift @fields;   # 0th field is the property,
+    my $count = 0;  # 0th field in line (after shifting off the property) is
+                    # short name; 1th is long name
+    my $short_name;
+    my @names_via_short;    # Saves the values between iterations
+
+    # The property on the lhs of the = is always loosely matched.  Add in
+    # characters that are ignored under loose matching to test that
+    my $mod_prop = "$extra_chars$prop";
+
+    if ($fields[0] eq 'n/a') {  # See comments in input file, essentially
+                                # means full name and short name are identical
+        $fields[0] = $fields[1];
+    }
+    elsif ($fields[0] ne $fields[1]
+           && utf8::_loose_name(lc $fields[0])
+               eq utf8::_loose_name(lc $fields[1])
+           && $fields[1] !~ /[[:upper:]]/)
+    {
+        # Also, there is a bug in the file in which "n/a" is omitted, and
+        # the two fields are identical except for case, and the full name
+        # is all lower case.  Copy the "short" name unto the full one to
+        # give it some upper case.
+
+        $fields[1] = $fields[0];
+    }
+
+    # The ccc property in the file is special; has an extra numeric field
+    # (0th), which should go at the end, since we use the next two fields as
+    # the short and full names, respectively.  See comments in input file.
+    splice (@fields, 0, 0, splice(@fields, 1, 2)) if $prop eq 'ccc';
+
+    my $loose_prop = utf8::_loose_name(lc $prop);
+    my $suppressed = grep { $_ eq $loose_prop }
+                          @Unicode::UCD::suppressed_properties;
+    foreach my $value (@fields) {
+        if ($suppressed) {
+            is(prop_value_aliases($prop, $value), undef, "prop_value_aliases('$prop', '$value') returns undef for suppressed property $prop");
+            next;
+        }
+        elsif (grep { $_ eq ("$loose_prop=" . utf8::_loose_name(lc $value)) } @Unicode::UCD::suppressed_properties) {
+            is(prop_value_aliases($prop, $value), undef, "prop_value_aliases('$prop', '$value') returns undef for suppressed property $prop=$value");
+            next;
+        }
+
+        # Add in test for loose matching.
+        my $mod_value = "$extra_chars$value";
+
+        # If the value is a number, optionally negative, including a floating
+        # point or rational numer, it should be only strictly matched, so the
+        # loose matching should fail.
+        if ($value =~ / ^ -? \d+ (?: [\/.] \d+ )? $ /x) {
+            is(prop_value_aliases($mod_prop, $mod_value), undef, "prop_value_aliases('$mod_prop', '$mod_value') returns undef because '$mod_value' should be strictly matched");
+
+            # And reset so below tests just the strict matching.
+            $mod_value = $value;
+        }
+
+        if ($count == 0) {
+
+            @names_via_short = prop_value_aliases($mod_prop, $mod_value);
+
+            # If the 0th test fails, no sense in continuing with the others
+            last unless is($names_via_short[0], $value, "prop_value_aliases: In '$prop', '$value' is the short name for '$mod_value'");
+            $short_name = $value;
+        }
+        elsif ($count == 1) {
+
+            # Some properties have the same short and full name; no sense
+            # repeating the test if the same.
+            if ($value ne $short_name) {
+                my @names_via_full =
+                            prop_value_aliases($mod_prop, $mod_value);
+                is_deeply(\@names_via_full, \@names_via_short, "In '$prop', prop_value_aliases() returns the same list for both '$short_name' and '$mod_value'");
+            }
+
+            # Tests scalar context
+            is(prop_value_aliases($prop, $short_name), $value, "'$value' is the long name for prop_value_aliases('$prop', '$short_name')");
+        }
+        else {
+            my @all_names = prop_value_aliases($mod_prop, $mod_value);
+            is_deeply(\@all_names, \@names_via_short, "In '$prop', prop_value_aliases() returns the same list for both '$short_name' and '$mod_value'");
+            ok((grep { utf8::_loose_name(lc $_) eq utf8::_loose_name(lc $value) } prop_value_aliases($prop, $short_name)), "'$value' is listed as an alias for prop_value_aliases('$prop', '$short_name')");
+        }
+
+        $pva_tested{utf8::_loose_name(lc $prop) . "=" . utf8::_loose_name(lc $value)} = 1;
+        $count++;
+    }
+}
+
+# And test as best we can, the non-official pva's that mktables generates.
+foreach my $hash (\%utf8::loose_to_file_of, \%utf8::stricter_to_file_of) {
+    foreach my $test (keys %$hash) {
+        next if exists $pva_tested{$test};  # Skip if already tested
+
+        my ($prop, $value) = split "=", $test;
+        next unless defined $value; # prop_value_aliases() requires an input
+                                    # 'value'
+        my $mod_value;
+        if ($hash == \%utf8::loose_to_file_of) {
+
+            # Add extra characters to test loose-match rhs value
+            $mod_value = "$extra_chars$value";
+        }
+        else { # Here value is strictly matched.
+
+            # Extra elements are added by mktables to this hash so that
+            # something like "age=6.0" has a synonym of "age=6".  It's not
+            # clear to me (khw) if we should be encouraging those synonyms, so
+            # don't test for them.
+            next if $value !~ /\D/ && exists $hash->{"$prop=$value.0"};
+
+            # Verify that loose matching fails when only strict is called for.
+            next unless is(prop_value_aliases($prop, "$extra_chars$value"), undef,
+                        "prop_value_aliases('$prop', '$extra_chars$value') returns undef since '$value' should be strictly matched"),
+
+            # Strict matching does allow for underscores between digits.  Test
+            # for that.
+            $mod_value = $value;
+            while ($mod_value =~ s/(\d)(\d)/$1_$2/g) {}
+        }
+
+        # The lhs property is always loosely matched, so add in extra
+        # characters to test that.
+        my $mod_prop = "$extra_chars$prop";
+
+        if ($prop eq 'gc' && $value =~ /l[_&]$/) {
+            # These two names are special in that they don't appear in the
+            # returned list because they are discouraged from use.  Verify
+            # that they return the same list as a non-discouraged version.
+            my @LC = prop_value_aliases('gc', 'lc');
+            my @l_ = prop_value_aliases($mod_prop, $mod_value);
+            is_deeply(\@l_, \@LC, "prop_value_aliases('$mod_prop', '$mod_value) returns the same list as prop_value_aliases('gc', 'lc')");
+        }
+        else {
+            ok((grep { utf8::_loose_name(lc $_) eq utf8::_loose_name(lc $value) }
+                prop_value_aliases($mod_prop, $mod_value)),
+                "'$value' is listed as an alias for prop_value_aliases('$mod_prop', '$mod_value')");
+        }
+    }
+}
+
+undef %pva_tested;
+
  done_testing();
diff --git a/pod/perldelta.pod b/pod/perldelta.pod

index 904c6de..d6c43af 100644 (file)
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -192,6 +192,13 @@ Locales updated to CLDR 2.0: mk, mt, nb, nn, ro, ru, sk, sr, sv, uk
  Newly supported locales: fa, ml, mr, or, pa, si, si__dictionary,
  sr_Latn, sv__reformed, ta, te, th, ur, wae.
  
+=item *
+
+L<Unicode::UCD> has been upgraded from version 0.36 to version 0.37.
+This adds two new functions, C<prop_aliases()>, and
+C<prop_value_aliases()> which are used to find all the Unicode-approved
+synonyms for property names, or to convert from one name to another.
+
  =back
  
  =head2 Removed Modules and Pragmata
author	Karl Williamson <public@khwilliamson.com>
	Mon, 31 Oct 2011 20:07:25 +0000 (14:07 -0600)
committer	Karl Williamson <public@khwilliamson.com>
	Tue, 8 Nov 2011 15:09:32 +0000 (08:09 -0700)
lib/Unicode/UCD.pm		patch \| blob \| history
lib/Unicode/UCD.t		patch \| blob \| history
pod/perldelta.pod		patch \| blob \| history