From 8c32d378be54f33019ceb25427ae2173078043b9 Mon Sep 17 00:00:00 2001
From: Karl Williamson <public@khwilliamson.com>
Date: Wed, 15 Jun 2011 16:22:38 -0600
Subject: [PATCH] mktables: Allow for loose \N{} matching

mktables makes several tables and defines a subroutine for looking up
algorithmically determinable names.  Extend this to allow for Unicode
loose matching of names.

This is one patch in a series that adds loose matching to \N{}.
---
 lib/unicore/mktables | 64 +++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 48 insertions(+), 16 deletions(-)

diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index e93e0a2..31ab73d 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -1097,9 +1097,13 @@ my $MAX_UNICODE_CODEPOINTS = $LAST_UNICODE_CODEPOINT + 1;
 
 # Matches legal code point.  4-6 hex numbers, If there are 6, the first
 # two must be 10; if there are 5, the first must not be a 0.  Written this way
-# to decrease backtracking
-my $code_point_re =
-        qr/ \b (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
+# to decrease backtracking.  The first regex lets the code point be run on to
+# the end of a word; for that to work properly, the part of the word before the
+# code point shouldn't end in a hex digit.  The second requires the code point
+# to start at a word boundary, so it doesn't have the run-on issue.
+my $run_on_code_point_re =
+            qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
+my $code_point_re = qr/\b$run_on_code_point_re/;
 
 # This matches the beginning of the line in the Unicode db files that give the
 # defaults for code points not listed (i.e., missing) in the file.  The code
@@ -5819,6 +5823,7 @@ END
     # array giving all the ranges that use this base name.  Each range
     # is actually a hash giving the 'low' and 'high' values of it.
     my %names_ending_in_code_point;
+    my %loose_names_ending_in_code_point;
 
     # Inverse mapping.  The list of ranges that have these kinds of
     # names.  Each element contains the low, high, and base names in a
@@ -5862,6 +5867,10 @@ END
             push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
             push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
 
+            my $squeezed = $map =~ s/[-\s]+//gr;
+            push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}}, $low;
+            push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}}, $high;
+
             push @code_points_ending_in_code_point, { low => $low,
                                                         high => $high,
                                                         name => $map
@@ -5985,6 +5994,8 @@ END
                                     ' ' x 8);
             my $names = main::simple_dumper(\%names_ending_in_code_point,
                                             ' ' x 8);
+            my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
+                                            ' ' x 8);
 
             # Do the same with the Hangul names,
             my $jamo;
@@ -6037,16 +6048,25 @@ END
 
     # Matches legal code point.  4-6 hex numbers, If there are 6, the
     # first two must be '10'; if there are 5, the first must not be a '0'.
+    # The first regex can match a code point run on to the end of a word,
+    # provided the part of the word before it doesn't end in a hex digit.
+    my \$run_on_code_point_re = qr/$run_on_code_point_re/;
     my \$code_point_re = qr/$code_point_re/;
 
     # In the following hash, the keys are the bases of names which includes
     # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The values
     # of each key is another hash which is used to get the low and high ends
-    # for each range of code points that apply to the name
+    # for each range of code points that apply to the name.
     my %names_ending_in_code_point = (
 $names
     );
 
+    # The following hash is a copy of the previous one, except is for loose
+    # matching, so each name has blanks and dashes squeezed out
+    my %loose_names_ending_in_code_point = (
+$loose_names
+    );
+
     # And the following array gives the inverse mapping from code points to
     # names.  Lowest code points are first
     my \@code_points_ending_in_code_point = (
@@ -6083,7 +6103,7 @@ $jamo_t
     my \$syllable_re = qr/$jamo_re/;
 
     my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
-    my \$HANGUL_SYLLABLE_LENGTH = length \$HANGUL_SYLLABLE;
+    my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
 
     # These constants names and values were taken from the Unicode standard,
     # version 5.1, section 3.12.  They are used in conjunction with Hangul
@@ -6103,16 +6123,19 @@ END
             $pre_body .= << 'END';
 
     sub name_to_code_point_special {
-        my $name = shift;
+        my ($name, $loose) = @_;
 
         # Returns undef if not one of the specially handled names; otherwise
         # returns the code point equivalent to the input name
+        # If $loose is true, loose matching is used; in that case $name must
+        # be passed in upper case with all blanks and dashes squeezed out.
 END
             if ($has_hangul_syllables) {
                 $pre_body .= << 'END';
 
-        if (substr($name, 0, $HANGUL_SYLLABLE_LENGTH) eq $HANGUL_SYLLABLE) {
-            $name = substr($name, $HANGUL_SYLLABLE_LENGTH);
+        if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
+            || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
+        {
             return if $name !~ qr/^$syllable_re$/;
             my $L = $Jamo_L{$1};
             my $V = $Jamo_V{$2};
@@ -6123,22 +6146,30 @@ END
             }
             $pre_body .= << 'END';
 
-        # Name must end in '-code_point' for this to handle.
-        if ($name !~ /^ (.*) - ($code_point_re) $/x) {
-            return;
-        }
+        # Name must end in 'code_point' for this to handle.
+        return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
+                   || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
 
         my $base = $1;
         my $code_point = CORE::hex $2;
+        my $names_ref;
+
+        if ($loose) {
+            $names_ref = \%loose_names_ending_in_code_point;
+        }
+        else {
+            return if $base !~ s/-$//;
+            $names_ref = \%names_ending_in_code_point;
+        }
 
         # Name must be one of the ones which has the code point in it.
-        return if ! $names_ending_in_code_point{$base};
+        return if ! $names_ref->{$base};
 
         # Look through the list of ranges that apply to this name to see if
         # the code point is in one of them.
-        for (my $i = 0; $i < scalar @{$names_ending_in_code_point{$base}{'low'}}; $i++) {
-            return if $names_ending_in_code_point{$base}{'low'}->[$i] > $code_point;
-            next if $names_ending_in_code_point{$base}{'high'}->[$i] < $code_point;
+        for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
+            return if $names_ref->{$base}{'low'}->[$i] > $code_point;
+            next if $names_ref->{$base}{'high'}->[$i] < $code_point;
 
             # Here, the code point is in the range.
             return $code_point;
@@ -6229,6 +6260,7 @@ END
         $has_hangul_syllables = 0;
         undef @multi_code_point_maps;
         undef %names_ending_in_code_point;
+        undef %loose_names_ending_in_code_point;
         undef @code_points_ending_in_code_point;
 
         # Calculate the format of the table if not already done.
-- 
2.7.4