Unicode::UCD::prop_invmap(): New improved API

author Karl Williamson <public@khwilliamson.com>

Fri, 10 Feb 2012 21:50:18 +0000 (14:50 -0700)

committer Karl Williamson <public@khwilliamson.com>

Fri, 10 Feb 2012 22:54:26 +0000 (15:54 -0700)
author Karl Williamson <public@khwilliamson.com>
Fri, 10 Feb 2012 21:50:18 +0000 (14:50 -0700)
committer Karl Williamson <public@khwilliamson.com>
Fri, 10 Feb 2012 22:54:26 +0000 (15:54 -0700)
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm

index a9d49e6..ceb4917 100644 (file)
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -406,17 +406,17 @@ sub charinfo {
  
      %SIMPLE_UPPER = _read_table("To/Uc.pl", "use_hash") unless %SIMPLE_UPPER;
      $prop{'upper'} = (defined $SIMPLE_UPPER{$code})
-                     ? sprintf("%04X", $SIMPLE_UPPER{$code} + $code)
+                     ? sprintf("%04X", $SIMPLE_UPPER{$code})
                       : "";
  
      %SIMPLE_LOWER = _read_table("To/Lc.pl", "use_hash") unless %SIMPLE_LOWER;
      $prop{'lower'} = (defined $SIMPLE_LOWER{$code})
-                     ? sprintf("%04X", $SIMPLE_LOWER{$code} + $code)
+                     ? sprintf("%04X", $SIMPLE_LOWER{$code})
                       : "";
  
      %SIMPLE_TITLE = _read_table("To/Tc.pl", "use_hash") unless %SIMPLE_TITLE;
      $prop{'title'} = (defined $SIMPLE_TITLE{$code})
-                     ? sprintf("%04X", $SIMPLE_TITLE{$code} + $code)
+                     ? sprintf("%04X", $SIMPLE_TITLE{$code})
                       : "";
  
      $prop{block}  = charblock($code);
@@ -475,8 +475,17 @@ sub _read_table ($;$) {
      my @return;
      my %return;
      local $_;
+    my $list = do "unicore/$table";
  
-    for (split /^/m, do "unicore/$table") {
+    # Look up if this property requires adjustments, which we do below if it
+    # does.
+    require "unicore/Heavy.pl";
+    my $property = $table =~ s/\.pl//r;
+    $property = $utf8::file_to_swash_name{$property};
+    my $to_adjust = defined $property
+                    && $utf8::SwashInfo{$property}{'format'} eq 'a';
+
+    for (split /^/m, $list) {
          my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
                                          \s* ( \# .* )?  # Optional comment
                                          $ /x;
@@ -484,11 +493,14 @@ sub _read_table ($;$) {
          my $decimal_end = ($end eq "") ? $decimal_start : hex $end;
          if ($return_hash) {
              foreach my $i ($decimal_start .. $decimal_end) {
-                $return{$i} = $value;
+                $return{$i} = ($to_adjust)
+                              ? $value + $i - $decimal_start
+                              : $value;
              }
          }
-        elsif (@return &&
-               $return[-1][1] == $decimal_start - 1
+        elsif (! $to_adjust
+               && @return
+               && $return[-1][1] == $decimal_start - 1
                 && $return[-1][2] eq $value)
          {
              # If this is merely extending the previous range, do just that.
@@ -2288,11 +2300,11 @@ For example,
  A map to the empty string means that there is no alias defined for the code
  point.
  
-=item B<C<c>>
+=item B<C<a>>
  
  is like C<"s"> in that all the map array elements are scalars, but here they are
-restricted to all being integers, and each has to be tweaked to get the correct
-result by adding the code point number to it.  For example, in:
+restricted to all being integers, and some have to be adjusted (hence the name
+C<"a">) to get the correct result.  For example, in:
  
   my ($uppers_ranges_ref, $uppers_maps_ref, $format)
                            = prop_invmap("Simple_Uppercase_Mapping");
@@ -2301,25 +2313,32 @@ the returned arrays look like this:
  
   @$uppers_ranges_ref    @$uppers_maps_ref   Note
         0                      0
-      97                    -32          'a' maps to 'A', b => B ...
+      97                     65          'a' maps to 'A', b => B ...
       123                      0
-     181                    743          MICRO SIGN => Greek Cap MU
+     181                    924          MICRO SIGN => Greek Cap MU
       182                      0
       ...
  
-The first line means that the uppercase of code point 0 is 0+0; the uppercase
-of code point 1 is 1+0; ...  of code point 96 is 96+0.  In other words, the
-uppercase of each of the first 0..96 code points is itself.  The second line
-means that code point 97 maps to 97-32 (=65) or the uppercase of 'a' is 'A';
-98 => 98-32 (=66) or the uppercase of 'b' is 'B'; ... 122 => 122-32 (=90) or
-the uppercase of 'z' is 'Z'.
+Let's start with the second line.  It says that the uppercase of code point 97
+is 65; or C<uc("a")> == "A".  But the line is for the entire range of code
+points 97 through 122.  To get the mapping for any code point in a range, you
+take the offset it has from the beginning code point of the range, and add
+that to the mapping for that first code point.  So, the mapping for 122 ("z")
+is derived by taking the offset of 122 from 97 (=25) and adding that to 65,
+yielding 90 ("z").  Likewise for everything in between.
+
+The first line works the same way.  The first map in a range is always the
+correct value for its code point (because the adjustment is 0).  Thus the
+C<uc(chr(0))> is just itself.  Also, C<uc(chr(1))> is also itself, as the
+adjustment is 0+1-0 .. C<uc(chr(96))> is 96.
  
-By requiring adding the code point to the returned result, the arrays are made
-significantly smaller, which speeds up searching them.
+Requiring this simple adjustment allows the returned arrays to be
+significantly smaller than otherwise, up to a factor of 10, speeding up
+searching through them.
  
-=item B<C<cl>>
+=item B<C<al>>
  
-means that some of the map array elements have the form given by C<"c">, and
+means that some of the map array elements have the form given by C<"a">, and
  the rest are ordered lists of code points.
  For example, in:
  
@@ -2330,34 +2349,30 @@ the returned arrays look like this:
  
   @$uppers_ranges_ref    @$uppers_maps_ref
         0                      0
-      97                    -32
+      97                     65
       123                      0
-     181                    743
+     181                    924
       182                      0
       ...
      0x0149              [ 0x02BC 0x004E ]
      0x014A                    0
-    0x014B                   -1
+    0x014B                  330
       ...
  
  This is the full Uppercase_Mapping property (as opposed to the
-Simple_Uppercase_Mapping given in the example for format C<"c">).  The only
+Simple_Uppercase_Mapping given in the example for format C<"a">).  The only
  difference between the two in the ranges shown is that the code point at
  0x0149 (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE) maps to a string of two
  characters, 0x02BC (MODIFIER LETTER APOSTROPHE) followed by 0x004E (LATIN
  CAPITAL LETTER N).
  
-Yes, there is an inconsistency here.  When the map is a single element the
-correct value must be derived by adding the code point number to it; when the
-map is a list of code points, they are the final correct values.  The reason
-for forcing the addition is to make the returned map array significantly more
-compact.  There is no such advantage to doing the same thing to the elements
-that are lists, and the addition is extra work.
+No adjustments are needed to entries that are references to arrays; each such
+entry will have exactly one element in its range, so the offset is always 0.
  
-=item B<C<ce>>
+=item B<C<ae>>
  
-This is like C<"c">, but some elements are the empty string, so not all are
-integers.
+This is like C<"a">, but some elements are the empty string, and should not be
+adjusted.
  The one internal Perl property accessible by C<prop_invmap> is of this type:
  "Perl_Decimal_Digit" returns an inversion map which gives the numeric values
  that are represented by the Unicode decimal digit characters.  Characters that
@@ -2365,40 +2380,39 @@ don't represent decimal digits map to the empty string, like so:
  
   @digits    @values
   0x0000       ""
- 0x0030       -48
+ 0x0030        0
   0x003A:      ""
- 0x0660:    -1632
+ 0x0660:       0
   0x066A:      ""
- 0x06F0:    -1776
+ 0x06F0:       0
   0x06FA:      ""
- 0x07C0:    -1984
+ 0x07C0:       0
   0x07CA:      ""
- 0x0966:    -2406
+ 0x0966:       0
   ...
  
  This means that the code points from 0 to 0x2F do not represent decimal digits;
-the code point 0x30 (DIGIT ZERO, =48 decimal) represents 48-48 = 0;  code
-point 0x31, (DIGIT ONE), represents 49-48 = 1; ... code point 0x39, (DIGIT
-NINE), represents 57-48 = 9; ... code points 0x3A through 0x65F do not
-represent decimal digits; 0x660 (ARABIC-INDIC DIGIT ZERO, =1632 decimal),
-represents 1632-1632 = 0; ... 0x07C1 (NKO DIGIT ONE, = 1985), represents
-1985-1984 = 1 ...
+the code point 0x30 (DIGIT ZERO) represents 0;  code point 0x31, (DIGIT ONE),
+represents 0+1-0 = 1; ... code point 0x39, (DIGIT NINE), represents 0+9-0 = 9;
+... code points 0x3A through 0x65F do not represent decimal digits; 0x660
+(ARABIC-INDIC DIGIT ZERO), represents 0; ... 0x07C1 (NKO DIGIT ONE),
+represents 0+1-0 = 1 ...
  
-=item B<C<cle>>
+=item B<C<ale>>
  
-is a combination of the C<"cl"> type and the C<"ce"> type.  Some of
-the map array elements have the forms given by C<"cl">, and
+is a combination of the C<"al"> type and the C<"ae"> type.  Some of
+the map array elements have the forms given by C<"al">, and
  the rest are the empty string.  The property C<NFKC_Casefold> has this form.
  An example slice is:
  
   @$ranges_ref  @$maps_ref         Note
      ...
-   0x00AA     -73                 FEMININE ORDINAL INDICATOR => 'a'
-   0x00AB       0
+   0x00AA       97                FEMININE ORDINAL INDICATOR => 'a'
+   0x00AB        0
     0x00AD                         SOFT HYPHEN => ""
-   0x00AE       0
+   0x00AE        0
     0x00AF     [ 0x0020, 0x0304 ]  MACRON => SPACE . COMBINING MACRON
-   0x00B0       0
+   0x00B0        0
     ...
  
  =item B<C<r>>
@@ -2467,9 +2481,9 @@ string.  This function returns that real name, the empty string.  (There are
  names for these characters, but they are considered aliases, not the Name
  property name, and are contained in the C<Name_Alias> property.)
  
-=item B<C<d>>
+=item B<C<ad>>
  
-means the Decomposition_Mapping property.  This property is like C<"cl">
+means the Decomposition_Mapping property.  This property is like C<"al">
  properties, except that one of the scalar elements is of the form:
  
   <hangul syllable>
@@ -2485,6 +2499,16 @@ and to get the final decomposition, it may need to be applied recursively.
  
  =back
  
+Note that a format begins with the letter "a" if and only the property it is
+for requires adjustments by adding the offsets in multi-element ranges.  For
+all these properties, an entry should be adjusted only if the map is a scalar
+which is an integer.  That is, it must match the regular expression:
+
+    / ^ -? \d+ $ /xa
+
+Further, the first element in a range never needs adjustment, as the
+adjustment would be just adding 0.
+
  A binary search can be used to quickly find a code point in the inversion
  list, and hence its corresponding mapping.
  
@@ -2498,7 +2522,7 @@ potentially make your data structure much smaller.  As you construct your data
  structure from the one returned by this function, simply ignore those ranges
  that map to this value, generally called the "default" value.  For example, to
  convert to the data structure searchable by L</charinrange()>, you can follow
-this recipe:
+this recipe (valid only for properties that don't require adjustments):
  
   my ($list_ref, $map_ref, $format, $missing) = prop_invmap($property);
   my @range_list;
@@ -2718,16 +2742,7 @@ RETRY:
                  $decomps{'TYPE'} = "ToDt";
                  $utf8::SwashInfo{'ToDt'}{'missing'} = "None";
                  $utf8::SwashInfo{'ToDt'}{'format'} = "s";
-            }
-            else {
-                $decomps{'TYPE'} = "ToDm";
-                $utf8::SwashInfo{'ToDm'}{'missing'} = "0";
-                $utf8::SwashInfo{'ToDm'}{'format'} = 'i';
-
-                # Use a special internal-to-this_routine format, 'dm', to
-                # distinguish from 'd', meaning decimal.
-                $utf8::SwashInfo{'ToDm'}{'format'} = "dm";
-            }
+            }   # 'dm' is handled below, with 'nfkccf'
  
              $decomps{'LIST'} = "";
  
@@ -2750,35 +2765,6 @@ RETRY:
                               ? "Canonical" :
                               $type_and_map;
                  }
-                if ($second_try eq 'dm') {
-                    my @map = map { hex } split " ", $value;
-
-                    if (@map == 1) {
-
-                        # Single character maps are converted to deltas, as
-                        # this file is stored, for backwards compatibility,
-                        # not using them.
-                        $value = $map[0] - $code_point;
-
-                        # If this is a multi-char range, process the rest of
-                        # it by doing a 'redo' after this line is done.  Fix
-                        # up the line to contain the rest of the range for
-                        # that redo.
-                        if ($hex_upper ne "" && hex $hex_upper != $code_point) {
-                            $line = sprintf("%04X\t%s\t%s",
-                                            $code_point + 1,
-                                            $hex_upper,
-                                            $type_and_map);
-                            $redo = 1;
-
-                            # Pretend that this is a single element range.
-                            $hex_upper = $hex_lower;
-                        }
-                    }
-                    else {
-                        $value = join " ", @map;
-                    }
-                }
  
                  # Insert the hangul range at the appropriate spot.
                  if (! $done_hangul && $code_point > $HANGUL_BEGIN) {
@@ -2799,37 +2785,149 @@ RETRY:
              }
              $swash = \%decomps;
          }
-        elsif ($second_try eq 'nfkccf') {
+        elsif ($second_try ne 'nfkccf') { # Don't know this property. Fail.
+            return;
+        }
+
+        if ($second_try eq 'nfkccf' || $second_try eq 'dm') {
  
-            # This property is stored in the old format for backwards
-            # compatibility for any applications that read its file directly.
-            # So here we convert it to delta format for compatibility with the
-            # other properties similar to it.
-            my %nfkccf;
+            # The 'nfkccf' property is stored in the old format for backwards
+            # compatibility for any applications that has read its file
+            # directly before prop_invmap() existed.
+            # And the code above has extracted the 'dm' property from its file
+            # yielding the same format.  So here we convert them to adjusted
+            # format for compatibility with the other properties similar to
+            # them.
+            my %revised_swash;
  
-            # Create a new LIST with deltas instead of code points.
+            # We construct a new converted list.
              my $list = "";
-            foreach my $range (split "\n", $swash->{'LIST'}) {
-                my ($hex_begin, $hex_end, $map) = split "\t", $range;
+
+            my @ranges = split "\n", $swash->{'LIST'};
+            for (my $i = 0; $i < @ranges; $i++) {
+                my ($hex_begin, $hex_end, $map) = split "\t", $ranges[$i];
+
+                # The dm property has maps that are space separated sequences
+                # of code points, as well as the special entry "<hangul
+                # syllable>, which also contains a blank.
+                my @map = split " ", $map;
+                if (@map > 1) {
+
+                    # If it's just the special entry, append as-is.
+                    if ($map eq '<hangul syllable>') {
+                        $list .= "$ranges[$i]\n";
+                    }
+                    else {
+
+                        # These should all single-element ranges.
+                        croak __PACKAGE__, "Not expecting a mapping with multiple code points in a multi-element range, $ranges[$i]" if $hex_end ne "";
+
+                        # Convert them to decimal, as that's what's expected.
+                        $list .= "$hex_begin\t\t"
+                            . join(" ", map { hex } @map)
+                            . "\n";
+                    }
+                    next;
+                }
+
+                # Here, the mapping doesn't have a blank, is for a single code
+                # point.
                  my $begin = hex $hex_begin;
                  my $end = (defined $hex_end && $hex_end ne "")
                          ? hex $hex_end
                          : $begin;
+
+                # Again, the output is to be in decimal.
                  my $decimal_map = hex $map;
-                foreach my $code_point ($begin .. $end) {
-                    $list .= sprintf("%04X\t\t%d\n", $code_point, $decimal_map - $code_point);
+
+                # We know that multi-element ranges with the same mapping
+                # should not be adjusted, as after the adjustment
+                # multi-element ranges are for consecutive increasing code
+                # points.  Further, the final element in the list won't be
+                # adjusted, as there is nothing after it to include in the
+                # adjustment
+                if ($begin != $end || $i == @ranges -1) {
+
+                    # So just convert these to single-element ranges
+                    foreach my $code_point ($begin .. $end) {
+                        $list .= sprintf("%04X\t\t%d\n",
+                                        $code_point, $decimal_map);
+                    }
                  }
-            }
+                else {
  
-            $nfkccf{'LIST'} = $list;
-            $nfkccf{'TYPE'} = "ToNFKCCF";
-            $nfkccf{'SPECIALS'} = $swash->{'SPECIALS'};
-            $swash = \%nfkccf;
-            $utf8::SwashInfo{'ToNFKCCF'}{'missing'} = 0;
-            $utf8::SwashInfo{'ToNFKCCF'}{'format'} = 'i';
-        }
-        else {  # Don't know this property. Fail.
-            return;
+                    # Here, we have a candidate for adjusting.  What we do is
+                    # look through the subsequent adjacent elements in the
+                    # input.  If the map to the next one differs by 1 from the
+                    # one before, then we combine into a larger range with the
+                    # initial map.  Loop doing this until we find one that
+                    # can't be combined.
+
+                    my $offset = 0;     # How far away are we from the initial
+                                        # map
+                    my $squished = 0;   # ? Did we squish at least two
+                                        # elements together into one range
+                    for ( ; $i < @ranges; $i++) {
+                        my ($next_hex_begin, $next_hex_end, $next_map)
+                                                = split "\t", $ranges[$i+1];
+
+                        # In the case of 'dm', the map may be a sequence of
+                        # multiple code points, which are never combined with
+                        # another range
+                        last if $next_map =~ / /;
+
+                        $offset++;
+                        my $next_decimal_map = hex $next_map;
+
+                        # If the next map is not next in sequence, it
+                        # shouldn't be combined.
+                        last if $next_decimal_map != $decimal_map + $offset;
+
+                        my $next_begin = hex $next_hex_begin;
+
+                        # Likewise, if the next element isn't adjacent to the
+                        # previous one, it shouldn't be combined.
+                        last if $next_begin != $begin + $offset;
+
+                        my $next_end = (defined $next_hex_end
+                                        && $next_hex_end ne "")
+                                            ? hex $next_hex_end
+                                            : $next_begin;
+
+                        # And finally, if the next element is a multi-element
+                        # range, it shouldn't be combined.
+                        last if $next_end != $next_begin;
+
+                        # Here, we will combine.  Loop to see if we should
+                        # combine the next element too.
+                        $squished = 1;
+                    }
+
+                    if ($squished) {
+
+                        # Here, 'i' is the element number of the last element to
+                        # be combined, and the range is single-element, or we
+                        # wouldn't be combining.  Get it's code point.
+                        my ($hex_end, undef, undef) = split "\t", $ranges[$i];
+                        $list .= "$hex_begin\t$hex_end\t$decimal_map\n";
+                    } else {
+
+                        # Here, no combining done.  Just appen the initial
+                        # (and current) values.
+                        $list .= "$hex_begin\t\t$decimal_map\n";
+                    }
+                }
+            } # End of loop constructing the converted list
+
+            # Finish up the data structure for our converted swash
+            my $type = ($second_try eq 'nfkccf') ? 'ToNFKCCF' : 'ToDm';
+            $revised_swash{'LIST'} = $list;
+            $revised_swash{'TYPE'} = $type;
+            $revised_swash{'SPECIALS'} = $swash->{'SPECIALS'};
+            $swash = \%revised_swash;
+
+            $utf8::SwashInfo{$type}{'missing'} = 0;
+            $utf8::SwashInfo{$type}{'format'} = 'a';
          }
      }
  
@@ -2849,6 +2947,8 @@ RETRY:
      $format = $utf8::SwashInfo{$returned_prop}{'format'};
      $format = 'b' unless defined $format;
  
+    my $requires_adjustment = $format =~ /^a/;
+
      # The LIST input lines look like:
      # ...
      # 0374\t\tCommon
@@ -2929,10 +3029,12 @@ RETRY:
  
              # If the input isn't in the most compact form, so that there are
              # two adjacent ranges that map to the same thing, they should be
-            # combined.  This happens in our constructed dt mapping, as
-            # Element [-2] is the map for the latest range so far processed.
-            # Just set the beginning point of the map to $missing (in
-            # invlist[-1]) to 1 beyond where this range ends.  For example, in
+            # combined (EXCEPT where the arrays require adjustments, in which
+            # case everything is already set up correctly).  This happens in
+            # our constructed dt mapping, as Element [-2] is the map for the
+            # latest range so far processed.  Just set the beginning point of
+            # the map to $missing (in invlist[-1]) to 1 beyond where this
+            # range ends.  For example, in
              # 12\t13\tXYZ
              # 14\t17\tXYZ
              # we have set it up so that it looks like
@@ -2942,7 +3044,7 @@ RETRY:
              # We now see that it should be
              # 12 => XYZ
              # 18 => $missing
-            if (@invlist > 1 && ( (defined $map)
+            if (! $requires_adjustment && @invlist > 1 && ( (defined $map)
                                    ? $invmap[-2] eq $map
                                    : $invmap[-2] eq 'Y'))
              {
@@ -2960,7 +3062,7 @@ RETRY:
  
          # Add the range beginning, and the range's map.
          push @invlist, $begin;
-        if ($format eq 'dm') {
+        if ($returned_prop eq 'ToDm') {
  
              # The decomposition maps are either a line like <hangul syllable>
              # which are to be taken as is; or a sequence of code points in hex
@@ -3023,16 +3125,16 @@ RETRY:
      if ($overrides) {
  
          # A negative $overrides implies that the SPECIALS should be ignored,
-        # and a simple 'c' list is the value.
+        # and a simple 'a' list is the value.
          if ($overrides < 0) {
-            $format = 'c';
+            $format = 'a';
          }
          else {
  
              # Currently, all overrides are for properties that normally map to
              # single code points, but now some will map to lists of code
              # points (but there is an exception case handled below).
-            $format = 'cl';
+            $format = 'al';
  
              # Look through the overrides.
              foreach my $cp_maybe_utf8 (keys %$overrides) {
@@ -3047,16 +3149,16 @@ RETRY:
  
                      # The empty string will show up unpacked as an empty
                      # array.
-                    $format = 'cle' if @map == 0;
+                    $format = 'ale' if @map == 0;
                  }
                  else {
  
                      # But if we generated the overrides, we didn't bother to
                      # pack them, and we, so far, do this only for properties
-                    # that are 'c' ones.
+                    # that are 'a' ones.
                      $cp = $cp_maybe_utf8;
                      @map = hex $overrides->{$cp};
-                    $format = 'c';
+                    $format = 'a';
                  }
  
                  # Find the range that the override applies to.
@@ -3135,8 +3237,8 @@ RETRY:
          # converted to decimal.
          $format = 'i';
      }
-    elsif ($format eq 'dm') {
-        $format = 'd';
+    elsif ($returned_prop eq 'ToDm') {
+        $format = 'ad';
      }
      elsif ($format eq 'sw') { # blank-separated elements to form a list.
          map { $_ = [ split " ", $_  ] if $_ =~ / / } @invmap;
@@ -3149,7 +3251,7 @@ RETRY:
          $format = 'sl';
      }
      elsif ($returned_prop eq 'ToPerlDecimalDigit') {
-        $format = 'ce';
+        $format = 'ae';
      }
      elsif ($format ne 'n' && $format ne 'r') {
  
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t

index 6018638..4188671 100644 (file)
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -872,31 +872,31 @@ use Unicode::UCD qw(prop_invlist prop_invmap MAX_CP);
  # whole thing.
  my $prop = "uc";
  my ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
-is($format, 'cl', "prop_invmap() format of '$prop' is 'cl'");
+is($format, 'al', "prop_invmap() format of '$prop' is 'al'");
  is($missing, '0', "prop_invmap() missing of '$prop' is '0'");
  is($invlist_ref->[1], 0x61, "prop_invmap('$prop') list[1] is 0x61");
-is($invmap_ref->[1], -32, "prop_invmap('$prop') map[1] is -32");
+is($invmap_ref->[1], 0x41, "prop_invmap('$prop') map[1] is 0x41");
  
  $prop = "upper";
  ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
-is($format, 's', "prop_invmap() format of '$prop' is 'cl'");
-is($missing, 'N', "prop_invmap() missing of '$prop' is '<code point>'");
+is($format, 's', "prop_invmap() format of '$prop' is 's");
+is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'");
  is($invlist_ref->[1], 0x41, "prop_invmap('$prop') list[1] is 0x41");
  is($invmap_ref->[1], 'Y', "prop_invmap('$prop') map[1] is 'Y'");
  
  $prop = "lower";
  ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
-is($format, 's', "prop_invmap() format of '$prop' is 'cl'");
-is($missing, 'N', "prop_invmap() missing of '$prop' is '<code point>'");
+is($format, 's', "prop_invmap() format of '$prop' is 's'");
+is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'");
  is($invlist_ref->[1], 0x61, "prop_invmap('$prop') list[1] is 0x61");
  is($invmap_ref->[1], 'Y', "prop_invmap('$prop') map[1] is 'Y'");
  
  $prop = "lc";
  ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
-is($format, 'cl', "prop_invmap() format of '$prop' is 'cl'");
+is($format, 'al', "prop_invmap() format of '$prop' is 'al'");
  is($missing, '0', "prop_invmap() missing of '$prop' is '0'");
  is($invlist_ref->[1], 0x41, "prop_invmap('$prop') list[1] is 0x41");
-is($invmap_ref->[1], 32, "prop_invmap('$prop') map[1] is 32");
+is($invmap_ref->[1], 0x61, "prop_invmap('$prop') map[1] is 0x61");
  
  # This property is stable and small, so can test all of it
  $prop = "ASCII_Hex_Digit";
@@ -1259,7 +1259,7 @@ foreach my $prop (keys %props) {
              next PROPERTY;
          }
      }
-    elsif ($format =~ /^ c /x) {
+    elsif ($format =~ /^ a /x) {
          if ($full_name eq 'Perl_Decimal_Digit') {
              if ($missing ne "") {
                  fail("prop_invmap('$mod_prop')");
@@ -1273,13 +1273,6 @@ foreach my $prop (keys %props) {
              next PROPERTY;
          }
      }
-    elsif ($format =~ /^ d /x) {
-        if ($missing ne "0") {
-            fail("prop_invmap('$mod_prop')");
-            diag("The missings should be '<code point>'; got '$missing'");
-            next PROPERTY;
-        }
-    }
      elsif ($missing =~ /[<>]/) {
          fail("prop_invmap('$mod_prop')");
          diag("The missings should NOT be something with <...>'");
@@ -1305,9 +1298,9 @@ foreach my $prop (keys %props) {
              # We captured the U, L, or T, leading to uc, lc, or tc.
              $proxy_prop = lc $1 . "c";
          }
-        if ($format ne "c") {
+        if ($format ne "a") {
              fail("prop_invmap('$mod_prop')");
-            diag("The format should be 'c'; got '$format'");
+            diag("The format should be 'a'; got '$format'");
              next PROPERTY;
          }
      }
@@ -1356,7 +1349,7 @@ foreach my $prop (keys %props) {
              }
          }
          else {
-            $base_file = "Decomposition" if $format eq 'd';
+            $base_file = "Decomposition" if $format eq 'ad';
  
              # Above leaves $base_file undefined only if it came from the hash
              # below.  This should happen only when it is a binary property
@@ -1381,7 +1374,7 @@ foreach my $prop (keys %props) {
              # Get rid of any trailing space and comments in the file.
              $official =~ s/\s*(#.*)?$//mg;
  
-            if ($format eq 'd') {
+            if ($format eq 'ad') {
                  my @official = split /\n/, $official;
                  $official = "";
                  foreach my $line (@official) {
@@ -1543,7 +1536,7 @@ foreach my $prop (keys %props) {
              # don't do so if the format doesn't allow references, so that an
              # improper format will generate an error.
              if (ref $invmap_ref->[$i]
-                && ($format eq 'd' || $format =~ /^ . l /x))
+                && ($format eq 'ad' || $format =~ /^ . l /x))
              {
                  # The stringification depends on the format.
                  if ($format eq 'sl') {
@@ -1577,9 +1570,9 @@ foreach my $prop (keys %props) {
                          }
                      }
                  }
-                elsif ($format =~ / ^ cl e? $/x) {
+                elsif ($format =~ / ^ al e? $/x) {
  
-                    # For a cl property, the stringified result should be in
+                    # For a al property, the stringified result should be in
                      # the specials hash.  The key is the packed code point,
                      # and the value is the packed map.
                      my $value;
@@ -1607,7 +1600,7 @@ foreach my $prop (keys %props) {
                      }
                      next;
                  }
-                elsif ($format eq 'd') {
+                elsif ($format eq 'ad') {
  
                      # The decomposition mapping file has the code points as
                      # a string of space-separated hex constants.
@@ -1619,36 +1612,36 @@ foreach my $prop (keys %props) {
                      next PROPERTY;
                  }
              }
-            elsif ($format eq 'd' || $format eq 'cle') {
+            elsif ($format eq 'ad' || $format eq 'ale') {
  
-                # The numerics in the returned map are stored as deltas.  The
-                # defaults are 0, and don't appear in $official, and are
-                # excluded later, but the elements must be converted back to
-                # their real code point values before comparing with
+                # The numerics in the returned map are stored as adjusted
+                # decimal integers.  The defaults are 0, and don't appear in
+                # $official, and are excluded later, but the elements must be
+                # converted back to their hex values before comparing with
                  # $official, as these files, for backwards compatibility, are
-                # not stored as deltas.  (There currently is only one cle
+                # not stored as adjusted.  (There currently is only one ale
                  # property, nfkccf.  If that changed this would also have to.)
                  if ($invmap_ref->[$i] =~ / ^ -? \d+ $ /x
                      && $invmap_ref->[$i] != 0)
                  {
-                    my $delta = $invmap_ref->[$i];
-                    $invmap_ref->[$i] += $invlist_ref->[$i];
-
-                    # If there are other elements with this same delta, they
-                    # must individually be re-mapped.  Do this by splicing in
-                    # a new element into the list and the map containing the
-                    # remainder of the range.  Next time through we will look
-                    # at that (possibly splicing again until the whole range
-                    # is processed).
+                    my $next = $invmap_ref->[$i] + 1;
+                    $invmap_ref->[$i] = sprintf("%04X", $invmap_ref->[$i]);
+
+                    # If there are other elements in this range they need to
+                    # be adjusted; they must individually be re-mapped.  Do
+                    # this by splicing in a new element into the list and the
+                    # map containing the remainder of the range.  Next time
+                    # through we will look at that (possibly splicing again
+                    # until the whole range is processed).
                      if ($invlist_ref->[$i+1] > $invlist_ref->[$i] + 1) {
                          splice @$invlist_ref, $i+1, 0,
                                  $invlist_ref->[$i] + 1;
-                        splice @$invmap_ref, $i+1, 0, $delta;
+                        splice @$invmap_ref, $i+1, 0, $next;
                      }
                  }
-                if ($format eq 'cle' && $invmap_ref->[$i] eq "") {
+                if ($format eq 'ale' && $invmap_ref->[$i] eq "") {
  
-                    # cle properties have maps to the empty string that also
+                    # ale properties have maps to the empty string that also
                      # should be in the specials hash, with the key the packed
                      # code point, and the map just empty.
                      my $value;
@@ -1694,17 +1687,9 @@ foreach my $prop (keys %props) {
                  next;
              }
  
-            # The 'd' property and 'c' properties whose underlying format is
-            # hexadecimal have the mapping expressed in hex in the file
-            if ($format eq 'd'
-                || ($format =~ /^c/
-                    && $swash_name
-                    && $utf8::SwashInfo{$swash_name}{'format'} eq 'x'))
-            {
-
-                # The d property has one entry which isn't in the file.
+                # The ad property has one entry which isn't in the file.
                  # Ignore it, but make sure it is in order.
-                if ($format eq 'd'
+                if ($format eq 'ad'
                      && $invmap_ref->[$i] eq '<hangul syllable>'
                      && $invlist_ref->[$i] == 0xAC00)
                  {
@@ -1717,9 +1702,6 @@ foreach my $prop (keys %props) {
                      }
                      next;
                  }
-                $invmap_ref->[$i] = sprintf("%04X", $invmap_ref->[$i])
-                                  if $invmap_ref->[$i] =~ / ^ [A-Fa-f0-9]+ $/x;
-            }
  
              # Finally have figured out what the map column in the file should
              # be.  Append the line to the running string.
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index 9b9dd7f..5a223ad 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -1245,6 +1245,7 @@ my $INTEGER_FORMAT = 'i';
  my $HEX_FORMAT = 'x';
  my $RATIONAL_FORMAT = 'r';
  my $STRING_FORMAT = 's';
+my $ADJUST_FORMAT = 'a';
  my $DECOMP_STRING_FORMAT = 'c';
  my $STRING_WHITE_SPACE_LIST = 'sw';
  
@@ -1256,6 +1257,7 @@ my %map_table_formats = (
      $HEX_FORMAT => 'non-negative hex whole number; a code point',
      $RATIONAL_FORMAT => 'rational: an integer or a fraction',
      $STRING_FORMAT => 'string',
+    $ADJUST_FORMAT => 'some entries need adjustment',
      $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
      $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
  );
@@ -5258,6 +5260,7 @@ END
              my $next_start = 0;
              my $next_end;
              my $next_value;
+            my $offset = 0;
  
              # Output each range as part of the here document.
              RANGE:
@@ -5275,7 +5278,7 @@ END
                                && $value eq $suppress_value;
  
                  {   # This bare block encloses the scope where we may need to
-                    # split a range (when outputting deltas), and each time
+                    # split a range (when outputting adjusts), and each time
                      # through we handle the next portion of the original by
                      # ending the block with a 'redo'.   The values to use for
                      # that next time through are set up just below in the
@@ -5283,12 +5286,12 @@ END
  
                      if ($use_delta_cp) {
  
-                        # When converting to deltas, we can handle only single
-                        # element ranges.  Set up so that this time through
-                        # the loop, we look at the first element, and the next
-                        # time through, we start off with the remainder.  Thus
-                        # each time through we look at the first element of
-                        # the range
+                        # When converting to use adjustments, we can handle
+                        # only single element ranges.  Set up so that this
+                        # time through the loop, we look at the first element,
+                        # and the next time through, we start off with the
+                        # remainder.  Thus each time through we look at the
+                        # first element of the range
                          if ($end != $start) {
                              $next_start = $start + 1;
                              $next_end = $end;
@@ -5296,22 +5299,33 @@ END
                              $end = $start;
                          }
  
-                        # The values for these tables is stored as hex
-                        # strings.  Get the delta by subtracting the code
-                        # point.
-                        $value = hex($value) - $start;
+                        # The values for these tables are stored as hex
+                        # strings.  Convert to decimal
+                        if ($value =~ / ^ [A-Fa-f0-9]+ $ /x) {
+                            $value = hex($value) if $self->default_map eq $CODE_POINT;
+                        }
  
                          # If this range is adjacent to the previous one, and
-                        # the values in each are the same, then this range
-                        # really extends the previous one that is already in
-                        # element $OUT[-1].  So we pop that element, and
-                        # pretend that the range starts with whatever it
-                        # started with.
+                        # the values in each are integers that are also
+                        # adjacent (differ by 1), then this range really
+                        # extends the previous one that is already in element
+                        # $OUT[-1].  So we pop that element, and pretend that
+                        # the range starts with whatever it started with.
+                        # $offset is incremented by 1 each time so that it
+                        # gives the current offset from the first element in
+                        # the accumulating range, and we keep in $value the
+                        # value of that first element.
                          if ($start == $previous_end + 1
-                            && $value == $previous_value)
+                            && $value =~ /^ -? \d+ $/xa
+                            && $previous_value =~ /^ -? \d+ $/xa
+                            && ($value == ($previous_value + ++$offset)))
                          {
                              pop @OUT;
                              $start = $previous_start;
+                            $value = $previous_value;
+                        }
+                        else {
+                            $offset = 0;
                          }
  
                          # Save the current values for the next time through
@@ -6392,13 +6406,14 @@ END
              if ($specials_name) {
                  $return .= <<END;
  # The mappings in the non-hash portion of this file must be modified to get the
-# correct values by adding the code point ordinal number to each.
+# correct values by adding the code point ordinal number to each one that is
+# numeric.
  END
              }
              else {
                  $return .= <<END;
  # The mappings must be modified to get the correct values by adding the code
-# point ordinal number to each.
+# point ordinal number to each one that is numeric.
  END
              }
          }
@@ -6524,11 +6539,11 @@ END
          }
  
          # If the output is a delta instead of the actual value, the format of
-        # the table that gets output is actually 'i' instead of whatever it is
+        # the table that gets output is actually 'a' instead of whatever it is
          # stored internally as.
          my $output_deltas = ($self->to_output_map == $OUTPUT_DELTAS);
          if ($output_deltas) {
-            $format = 'i';
+            $format = $ADJUST_FORMAT;
          }
  
          $self->_set_format($format);
diff --git a/utf8.c b/utf8.c

index b9efb36..bfcc40c 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -2908,7 +2908,6 @@ S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
                     char *after_strtol = (char *) lend;
                     *val = Strtol((char *)l, &after_strtol, 10);
                     l = (U8 *) after_strtol;
-                   *val += *min;
                 }
                 else { /* Other tables are in hex, and are the correct result
                           without tweaking */
author	Karl Williamson <public@khwilliamson.com>
	Fri, 10 Feb 2012 21:50:18 +0000 (14:50 -0700)
committer	Karl Williamson <public@khwilliamson.com>
	Fri, 10 Feb 2012 22:54:26 +0000 (15:54 -0700)
lib/Unicode/UCD.pm		patch \| blob \| history
lib/Unicode/UCD.t		patch \| blob \| history
lib/unicore/mktables		patch \| blob \| history
utf8.c		patch \| blob \| history