From 2a59a71e7ec8027f4c08e367896eb56205f24b4c Mon Sep 17 00:00:00 2001
From: Karl Williamson <public@khwilliamson.com>
Date: Mon, 28 Oct 2013 16:43:01 -0600
Subject: [PATCH] mktables: Stop generating most leading zeros

Leading zeros were generated to conform with Unicode usage, but these
are machine-read files so this just takes up some extra space and extra
parsing cycles at run-time.  It's a small matter, but we should design
our files to be the most efficient possible.  It is possible to get more
human-readable files by using the -annotate option to mktables.

Certain files whose existence has been published have their formats
unchanged, in case some application is reading them.  The files contain
comments that their use is deprecated, but there is no warning generated
if they are opened and read, nor is it really feasible to add such a
warning.  At some time in the future, we may feel it's ok to remove
these files, as their contents have been available since v5.16 through a
stable API in Unicode::UCD, but until we remove them, we shouldn't
change their formats.

Not all other leading zeros are removed; just the ones that were
convenient to remove.
---
 lib/Unicode/UCD.t    | 58 +++++++++++++++++++++++++++++++++++++---------------
 lib/unicore/mktables | 32 +++++++++++++++++++----------
 2 files changed, 63 insertions(+), 27 deletions(-)
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index c21b7a9..2350ad8 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -1082,9 +1082,9 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of
         # Now construct a string from the list that should match the file.
         # The file gives ranges of code points with starting and ending values
         # in hex, like this:
-        # 0041\t005A
-        # 0061\t007A
-        # 00AA
+        # 41\t5A
+        # 61\t7A
+        # AA
         # Our list has even numbered elements start ranges that are in the
         # list, and odd ones that aren't in the list.  Therefore the odd
         # numbered ones are one beyond the end of the previous range, but
@@ -1095,10 +1095,10 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of
             my $start = $tested[$i];
             my $end = $tested[$i+1] - 1;
             if ($start == $end) {
-                $tested .= sprintf("%04X\n", $start);
+                $tested .= sprintf("%X\n", $start);
             }
             else {
-                $tested .= sprintf "%04X\t%04X\n", $start, $end;
+                $tested .= sprintf "%X\t%X\n", $start, $end;
             }
         }
 
@@ -1106,7 +1106,7 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of
         # whereas the prop_invlist() ones go as high as necessary.  The
         # comparison is only valid through max Unicode.
         if ($i == @tested - 1 && $tested[$i] <= 0x10FFFF) {
-            $tested .= sprintf("%04X\t10FFFF\n", $tested[$i]);
+            $tested .= sprintf("%X\t10FFFF\n", $tested[$i]);
         }
         local $/ = "\n";
         chomp $tested;
@@ -1183,6 +1183,12 @@ is(@list, 0, "prop_invmap('Perl_Charnames') returns <undef> since internal-Perl-
 @list = prop_invmap("Is_Is_Any");
 is(@list, 0, "prop_invmap('Is_Is_Any') returns <undef> since two is's");
 
+# The files for these properties shouldn't have their formats changed in case
+# applications use them (though such use is deprecated).
+my @legacy_file_format = qw( Bidi_Mirroring_Glyph
+                             NFKC_Casefold
+                           );
+
 # The set of properties to test on has already been compiled into %props by
 # the prop_aliases() tests.
 
@@ -1371,7 +1377,7 @@ foreach my $prop (sort keys %props) {
             {
                 # Translate the charblocks() data structure to what the file
                 # would like.
-                $official .= sprintf"%04X\t%04X\t%s\n",
+                $official .= sprintf"%X\t%X\t%s\n",
                              $range->[0][0],
                              $range->[0][1],
                              $range->[0][2];
@@ -1420,7 +1426,7 @@ foreach my $prop (sort keys %props) {
                     # easier below.
                     if ($end ne "") {
                         for my $i (hex($start) + 1 .. hex $end) {
-                            $official .= sprintf "%04X\t\t%s\n", $i, $value;
+                            $official .= sprintf "%X\t\t%s\n", $i, $value;
                         }
                     }
                 }
@@ -1447,6 +1453,19 @@ foreach my $prop (sort keys %props) {
             $file_format = $utf8::SwashInfo{$swash_name}{'format'};
         }
 
+        # Leading zeros used to be used with the values in the files that give,
+        # ranges, but these have been mostly stripped off, except for some
+        # files whose formats should not change in any way.
+        my $file_range_format = (grep { $full_name eq $_ } @legacy_file_format)
+                              ? "%04X"
+                              : "%X";
+        # Currently this property still has leading zeroes in the mapped-to
+        # values, but otherwise, those values follow the same rules as the
+        # ranges.
+        my $file_map_format = ($full_name eq 'Decomposition_Mapping')
+                              ? "%04X"
+                              : $file_range_format;
+
         # Certain of the proxy properties have to be adjusted to match the
         # real ones.
         if ($full_name =~ /^(Case_Folding|(Lower|Title|Upper)case_Mapping)/) {
@@ -1523,10 +1542,15 @@ foreach my $prop (sort keys %props) {
             for my $element (@list) {
                 $official .= "\n" if $official;
                 if ($element->[1] == $element->[0]) {
-                    $official .= sprintf "%04X\t\t%X", $element->[0], $element->[2];
+                    $official
+                        .= sprintf "$file_range_format\t\t$file_map_format",
+                                    $element->[0],        $element->[2];
                 }
                 else {
-                    $official .= sprintf "%04X\t%04X\t%X", $element->[0], $element->[1], $element->[2];
+                    $official .= sprintf "$file_range_format\t$file_range_format\t$file_map_format",
+                                         $element->[0],
+                                         $element->[1],
+                                         $element->[2];
                 }
             }
         }
@@ -1595,7 +1619,7 @@ foreach my $prop (sort keys %props) {
                         # other property; thus the special handling of the
                         # first line.
                         if (ref $invmap_ref->[$i]) {
-                            my $hex_cp = sprintf("%04X", $invlist_ref->[$i]);
+                            my $hex_cp = sprintf("%X", $invlist_ref->[$i]);
                             my $concatenated = $invmap_ref->[$i][0];
                             for (my $j = 1; $j < @{$invmap_ref->[$i]}; $j++) {
                                 $concatenated .= "\n$hex_cp\t\t" . $invmap_ref->[$i][$j];
@@ -1664,7 +1688,8 @@ foreach my $prop (sort keys %props) {
                     && $invmap_ref->[$i] != 0)
                 {
                     my $next = $invmap_ref->[$i] + 1;
-                    $invmap_ref->[$i] = sprintf("%04X", $invmap_ref->[$i]);
+                    $invmap_ref->[$i] = sprintf($file_map_format,
+                                                $invmap_ref->[$i]);
 
                     # If there are other elements in this range they need to
                     # be adjusted; they must individually be re-mapped.  Do
@@ -1746,15 +1771,16 @@ foreach my $prop (sort keys %props) {
             # be.  Append the line to the running string.
             my $start = $invlist_ref->[$i];
             my $end = $invlist_ref->[$i+1] - 1;
-            $end = ($start == $end) ? "" : sprintf("%04X", $end);
+            $end = ($start == $end) ? "" : sprintf($file_range_format, $end);
             if ($invmap_ref->[$i] ne "") {
-                $tested_map .= sprintf "%04X\t%s\t%s\n", $start, $end, $invmap_ref->[$i];
+                $tested_map .= sprintf "$file_range_format\t%s\t%s\n",
+                                        $start, $end, $invmap_ref->[$i];
             }
             elsif ($end ne "") {
-                $tested_map .= sprintf "%04X\t%s\n", $start, $end;
+                $tested_map .= sprintf "$file_range_format\t%s\n", $start, $end;
             }
             else {
-                $tested_map .= sprintf "%04X\n", $start;
+                $tested_map .= sprintf "$file_range_format\n", $start;
             }
         } # End of looping over all elements.
 
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index a335c90..0dd50ca 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -5607,6 +5607,13 @@ END
 
             my $output_value_in_hex = $self->isa('Map_Table')
                                       && $self->format eq $HEX_ADJUST_FORMAT;
+            # Use leading zeroes just for files whose format should not be
+            # changed from what it has been.  Otherwise, they just take up
+            # space and time to process.
+            my $hex_format = ($self->isa('Map_Table')
+                              && $self->to_output_map == $EXTERNAL_MAP)
+                             ? "%04X"
+                             : "%X";
 
             # Output each range as part of the here document.
             RANGE:
@@ -5703,10 +5710,11 @@ END
                     # If there is a range and doesn't need a single point range
                     # output
                     if ($start != $end && ! $range_size_1) {
-                        push @OUT, sprintf "%04X\t%04X", $start, $end;
+                        push @OUT, sprintf "$hex_format\t$hex_format",
+                                             $start,       $end;
                         if ($value ne "") {
                             if ($output_value_in_hex) {
-                                $OUT[-1] .= sprintf "\t%X", $value;
+                                $OUT[-1] .= sprintf "\t$hex_format", $value;
                             }
                             else {
                                 $OUT[-1] .= "\t$value";
@@ -5749,11 +5757,12 @@ END
                             # Here, caller is ok with default output.
                             for (my $i = $start; $i <= $end; $i++) {
                                 if ($output_value_in_hex) {
-                                    push @OUT, sprintf "%04X\t\t%X\n",
-                                                       $i, $value;
+                                    push @OUT,
+                                        sprintf "$hex_format\t\t$hex_format\n",
+                                                 $i,            $value;
                                 }
                                 else {
-                                    push @OUT, sprintf "%04X\t\t%s\n",
+                                    push @OUT, sprintf "$hex_format\t\t%s\n",
                                                         $i, $value;
                                 }
                             }
@@ -5791,9 +5800,9 @@ END
                                 # Here is to output a range.  We don't allow a
                                 # caller-specified output format--just use the
                                 # standard one.
-                                push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
-                                                                $range_end,
-                                                                $value;
+                                push @OUT, sprintf
+                                            "$hex_format\t$hex_format\t%s\t#",
+                                              $i,         $range_end,  $value;
                                 my $range_name = $viacode[$i];
 
                                 # For the code points which end in their hex
@@ -5801,7 +5810,7 @@ END
                                 # annotation, and capitalize only the first
                                 # letter of each word.
                                 if ($type == $CP_IN_NAME) {
-                                    my $hex = sprintf "%04X", $i;
+                                    my $hex = sprintf $hex_format, $i;
                                     $range_name =~ s/-$hex$//;
                                     my @words = split " ", $range_name;
                                     for my $word (@words) {
@@ -5901,7 +5910,8 @@ END
                                                 if $viacode[$output_value];
                                     }
 
-                                    $output_value = sprintf("%X", $output_value)
+                                    $output_value = sprintf($hex_format,
+                                                            $output_value)
                                         if  $format eq $HEX_ADJUST_FORMAT
                                             || ($format eq $HEX_FORMAT
                                                 && $self->full_name
@@ -5922,7 +5932,7 @@ END
                                     push @OUT, $base_part;
                                 }
                                 else {
-                                    push @OUT, sprintf "%04X\t\t%s",
+                                    push @OUT, sprintf "$hex_format\t\t%s",
                                                         $i, $output_value;
                                 }
 
-- 
2.7.4