From 2a59a71e7ec8027f4c08e367896eb56205f24b4c Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 28 Oct 2013 16:43:01 -0600 Subject: [PATCH] mktables: Stop generating most leading zeros Leading zeros were generated to conform with Unicode usage, but these are machine-read files so this just takes up some extra space and extra parsing cycles at run-time. It's a small matter, but we should design our files to be the most efficient possible. It is possible to get more human-readable files by using the -annotate option to mktables. Certain files whose existence has been published have their formats unchanged, in case some application is reading them. The files contain comments that their use is deprecated, but there is no warning generated if they are opened and read, nor is it really feasible to add such a warning. At some time in the future, we may feel it's ok to remove these files, as their contents have been available since v5.16 through a stable API in Unicode::UCD, but until we remove them, we shouldn't change their formats. Not all other leading zeros are removed; just the ones that were convenient to remove. --- lib/Unicode/UCD.t | 58 +++++++++++++++++++++++++++++++++++++--------------- lib/unicore/mktables | 32 +++++++++++++++++++---------- 2 files changed, 63 insertions(+), 27 deletions(-) diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t index c21b7a9..2350ad8 100644 --- a/lib/Unicode/UCD.t +++ b/lib/Unicode/UCD.t @@ -1082,9 +1082,9 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of # Now construct a string from the list that should match the file. # The file gives ranges of code points with starting and ending values # in hex, like this: - # 0041\t005A - # 0061\t007A - # 00AA + # 41\t5A + # 61\t7A + # AA # Our list has even numbered elements start ranges that are in the # list, and odd ones that aren't in the list. Therefore the odd # numbered ones are one beyond the end of the previous range, but @@ -1095,10 +1095,10 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of my $start = $tested[$i]; my $end = $tested[$i+1] - 1; if ($start == $end) { - $tested .= sprintf("%04X\n", $start); + $tested .= sprintf("%X\n", $start); } else { - $tested .= sprintf "%04X\t%04X\n", $start, $end; + $tested .= sprintf "%X\t%X\n", $start, $end; } } @@ -1106,7 +1106,7 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of # whereas the prop_invlist() ones go as high as necessary. The # comparison is only valid through max Unicode. if ($i == @tested - 1 && $tested[$i] <= 0x10FFFF) { - $tested .= sprintf("%04X\t10FFFF\n", $tested[$i]); + $tested .= sprintf("%X\t10FFFF\n", $tested[$i]); } local $/ = "\n"; chomp $tested; @@ -1183,6 +1183,12 @@ is(@list, 0, "prop_invmap('Perl_Charnames') returns since internal-Perl- @list = prop_invmap("Is_Is_Any"); is(@list, 0, "prop_invmap('Is_Is_Any') returns since two is's"); +# The files for these properties shouldn't have their formats changed in case +# applications use them (though such use is deprecated). +my @legacy_file_format = qw( Bidi_Mirroring_Glyph + NFKC_Casefold + ); + # The set of properties to test on has already been compiled into %props by # the prop_aliases() tests. @@ -1371,7 +1377,7 @@ foreach my $prop (sort keys %props) { { # Translate the charblocks() data structure to what the file # would like. - $official .= sprintf"%04X\t%04X\t%s\n", + $official .= sprintf"%X\t%X\t%s\n", $range->[0][0], $range->[0][1], $range->[0][2]; @@ -1420,7 +1426,7 @@ foreach my $prop (sort keys %props) { # easier below. if ($end ne "") { for my $i (hex($start) + 1 .. hex $end) { - $official .= sprintf "%04X\t\t%s\n", $i, $value; + $official .= sprintf "%X\t\t%s\n", $i, $value; } } } @@ -1447,6 +1453,19 @@ foreach my $prop (sort keys %props) { $file_format = $utf8::SwashInfo{$swash_name}{'format'}; } + # Leading zeros used to be used with the values in the files that give, + # ranges, but these have been mostly stripped off, except for some + # files whose formats should not change in any way. + my $file_range_format = (grep { $full_name eq $_ } @legacy_file_format) + ? "%04X" + : "%X"; + # Currently this property still has leading zeroes in the mapped-to + # values, but otherwise, those values follow the same rules as the + # ranges. + my $file_map_format = ($full_name eq 'Decomposition_Mapping') + ? "%04X" + : $file_range_format; + # Certain of the proxy properties have to be adjusted to match the # real ones. if ($full_name =~ /^(Case_Folding|(Lower|Title|Upper)case_Mapping)/) { @@ -1523,10 +1542,15 @@ foreach my $prop (sort keys %props) { for my $element (@list) { $official .= "\n" if $official; if ($element->[1] == $element->[0]) { - $official .= sprintf "%04X\t\t%X", $element->[0], $element->[2]; + $official + .= sprintf "$file_range_format\t\t$file_map_format", + $element->[0], $element->[2]; } else { - $official .= sprintf "%04X\t%04X\t%X", $element->[0], $element->[1], $element->[2]; + $official .= sprintf "$file_range_format\t$file_range_format\t$file_map_format", + $element->[0], + $element->[1], + $element->[2]; } } } @@ -1595,7 +1619,7 @@ foreach my $prop (sort keys %props) { # other property; thus the special handling of the # first line. if (ref $invmap_ref->[$i]) { - my $hex_cp = sprintf("%04X", $invlist_ref->[$i]); + my $hex_cp = sprintf("%X", $invlist_ref->[$i]); my $concatenated = $invmap_ref->[$i][0]; for (my $j = 1; $j < @{$invmap_ref->[$i]}; $j++) { $concatenated .= "\n$hex_cp\t\t" . $invmap_ref->[$i][$j]; @@ -1664,7 +1688,8 @@ foreach my $prop (sort keys %props) { && $invmap_ref->[$i] != 0) { my $next = $invmap_ref->[$i] + 1; - $invmap_ref->[$i] = sprintf("%04X", $invmap_ref->[$i]); + $invmap_ref->[$i] = sprintf($file_map_format, + $invmap_ref->[$i]); # If there are other elements in this range they need to # be adjusted; they must individually be re-mapped. Do @@ -1746,15 +1771,16 @@ foreach my $prop (sort keys %props) { # be. Append the line to the running string. my $start = $invlist_ref->[$i]; my $end = $invlist_ref->[$i+1] - 1; - $end = ($start == $end) ? "" : sprintf("%04X", $end); + $end = ($start == $end) ? "" : sprintf($file_range_format, $end); if ($invmap_ref->[$i] ne "") { - $tested_map .= sprintf "%04X\t%s\t%s\n", $start, $end, $invmap_ref->[$i]; + $tested_map .= sprintf "$file_range_format\t%s\t%s\n", + $start, $end, $invmap_ref->[$i]; } elsif ($end ne "") { - $tested_map .= sprintf "%04X\t%s\n", $start, $end; + $tested_map .= sprintf "$file_range_format\t%s\n", $start, $end; } else { - $tested_map .= sprintf "%04X\n", $start; + $tested_map .= sprintf "$file_range_format\n", $start; } } # End of looping over all elements. diff --git a/lib/unicore/mktables b/lib/unicore/mktables index a335c90..0dd50ca 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -5607,6 +5607,13 @@ END my $output_value_in_hex = $self->isa('Map_Table') && $self->format eq $HEX_ADJUST_FORMAT; + # Use leading zeroes just for files whose format should not be + # changed from what it has been. Otherwise, they just take up + # space and time to process. + my $hex_format = ($self->isa('Map_Table') + && $self->to_output_map == $EXTERNAL_MAP) + ? "%04X" + : "%X"; # Output each range as part of the here document. RANGE: @@ -5703,10 +5710,11 @@ END # If there is a range and doesn't need a single point range # output if ($start != $end && ! $range_size_1) { - push @OUT, sprintf "%04X\t%04X", $start, $end; + push @OUT, sprintf "$hex_format\t$hex_format", + $start, $end; if ($value ne "") { if ($output_value_in_hex) { - $OUT[-1] .= sprintf "\t%X", $value; + $OUT[-1] .= sprintf "\t$hex_format", $value; } else { $OUT[-1] .= "\t$value"; @@ -5749,11 +5757,12 @@ END # Here, caller is ok with default output. for (my $i = $start; $i <= $end; $i++) { if ($output_value_in_hex) { - push @OUT, sprintf "%04X\t\t%X\n", - $i, $value; + push @OUT, + sprintf "$hex_format\t\t$hex_format\n", + $i, $value; } else { - push @OUT, sprintf "%04X\t\t%s\n", + push @OUT, sprintf "$hex_format\t\t%s\n", $i, $value; } } @@ -5791,9 +5800,9 @@ END # Here is to output a range. We don't allow a # caller-specified output format--just use the # standard one. - push @OUT, sprintf "%04X\t%04X\t%s\t#", $i, - $range_end, - $value; + push @OUT, sprintf + "$hex_format\t$hex_format\t%s\t#", + $i, $range_end, $value; my $range_name = $viacode[$i]; # For the code points which end in their hex @@ -5801,7 +5810,7 @@ END # annotation, and capitalize only the first # letter of each word. if ($type == $CP_IN_NAME) { - my $hex = sprintf "%04X", $i; + my $hex = sprintf $hex_format, $i; $range_name =~ s/-$hex$//; my @words = split " ", $range_name; for my $word (@words) { @@ -5901,7 +5910,8 @@ END if $viacode[$output_value]; } - $output_value = sprintf("%X", $output_value) + $output_value = sprintf($hex_format, + $output_value) if $format eq $HEX_ADJUST_FORMAT || ($format eq $HEX_FORMAT && $self->full_name @@ -5922,7 +5932,7 @@ END push @OUT, $base_part; } else { - push @OUT, sprintf "%04X\t\t%s", + push @OUT, sprintf "$hex_format\t\t%s", $i, $output_value; } -- 2.7.4