mktables: Inline short tables

author Karl Williamson <public@khwilliamson.com>

Fri, 14 Mar 2014 20:23:21 +0000 (14:23 -0600)

committer Karl Williamson <public@khwilliamson.com>

Fri, 14 Mar 2014 20:58:06 +0000 (14:58 -0600)
author Karl Williamson <public@khwilliamson.com>
Fri, 14 Mar 2014 20:23:21 +0000 (14:23 -0600)
committer Karl Williamson <public@khwilliamson.com>
Fri, 14 Mar 2014 20:58:06 +0000 (14:58 -0600)
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t

index 5ed15c6..c6b50fd 100644 (file)
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -1051,9 +1051,19 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of
          }
          $tested_invlist{$file} = dclone \@tested;
  
-        # A leading '!' in the file name means that it is to be inverted.
-        my $invert = $file =~ s/^!//;
-        my $official = do "unicore/lib/$file.pl";
+        # A '!' in the file name means that it is to be inverted.
+        my $invert = $file =~ s/!//;
+        my $official;
+
+        # If the file's directory is '#', it is a special case where the
+        # contents are in-lined with semi-colons meaning new-lines, instead of
+        # it being an actual file to read.
+        if ($file =~ s!^#/!!) {
+            $official = $file =~ s/;/\n/gr;
+        }
+        else {
+            $official = do "unicore/lib/$file.pl";
+        }
  
          # Get rid of any trailing space and comments in the file.
          $official =~ s/\s*(#.*)?$//mg;
@@ -1475,13 +1485,19 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) {
              # property comes along without these characteristics
              if (!defined $base_file) {
                  $base_file = $utf8::loose_to_file_of{$proxy_prop};
-                $is_binary = ($base_file =~ s/^!//) ? -1 : 1;
-                $base_file = "lib/$base_file";
+                $is_binary = ($base_file =~ s/!//) ? -1 : 1;
+                $base_file = "lib/$base_file" unless $base_file =~ m!^#/!;
              }
  
-            # Read in the file
-            $file = "unicore/$base_file.pl";
-            $official = do $file;
+            # Read in the file.  If the file's directory is '#', it is a
+            # special case where the contents are in-lined with semi-colons
+            # meaning new-lines, instead of it being an actual file to read.
+            if ($base_file =~ s!^#/!!) {
+                $official = $base_file =~ s/;/\n/gr;
+            }
+            else {
+                $official = do "unicore/$base_file.pl";
+            }
  
              # Get rid of any trailing space and comments in the file.
              $official =~ s/\s*(#.*)?$//mg;
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index 840b15a..d92c69d 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -14823,8 +14823,9 @@ sub register_file_for_name($$$) {
      # Join all the file path components together, using slashes.
      my $full_filename = join('/', @$directory_ref, $file);
  
-    # All go in the same subdirectory of unicore
-    if ($directory_ref->[0] ne $matches_directory) {
+    # All go in the same subdirectory of unicore, or the special
+    # pseudo-directory '#'
+    if ($directory_ref->[0] !~ / ^ $matches_directory | \# $ /x) {
          Carp::my_carp("Unexpected directory in "
                  .  join('/', @{$directory_ref}, $file));
      }
@@ -17021,8 +17022,12 @@ sub write_all_tables() {
                      push @unhandled_properties, "$table";
                  }
  
-                # An empty table is just the complement of everything.
-                $table->set_complement($All) if $table != $property;
+                # The old way of expressing an empty match list was to
+                # complement the list that matches everything.  The new way is
+                # to create an empty inversion list, but this doesn't work for
+                # annotating, so use the old way then.
+                $table->set_complement($All) if $annotate
+                                                && $table != $property;
              }
              elsif ($expected_empty) {
                  my $because = "";
@@ -17310,8 +17315,45 @@ sub write_all_tables() {
          my $filename;
          my $property = $table->property;
          my $is_property = ($table == $property);
-        if (! $is_property) {
  
+        # For very short tables, instead of writing them out to actual files,
+        # we in-line their inversion list definitions into Heavy.pl.  The
+        # definition replaces the file name, and the special pseudo-directory
+        # '#' is used to signal this.  This significantly cuts down the number
+        # of files written at little extra cost to the hashes in Heavy.pl.
+        # And it means, no run-time files to read to get the definitions.
+        # But short deprecated tables are written anyway, because e.g.,
+        # Gc=Surrogate is the same exact code points as LB=Surrogate, and only
+        # the latter generates a deprecated warning, and so we want to have a
+        # way to distinguish the two.
+        if (! $is_property
+            && $table->status ne $DEPRECATED
+            && ! $annotate  # For annotation, we want to explicitly show
+                            # everything, so keep in files
+            && $table->ranges <= 1)
+        {
+            my @ranges = $table->ranges;
+            my $count = @ranges;
+            if ($count == 0) {
+                $filename = "V0";
+            }
+            else {
+                my $end = $ranges[0]->end;
+                if ($end < $MAX_WORKING_CODEPOINT) {
+                    $count++;
+                    $end = ";" . ($end + 1);
+                }
+                else {  # Extends to infinity, hence no 'end'
+                    $end = "";
+                }
+                $filename = "V$count;" . $ranges[0]->start . $end;
+            }
+            @directory = "#";
+            register_file_for_name($table, \@directory, $filename);
+            next;
+        }
+
+        if (! $is_property) {
              # Match tables for the property go in lib/$subdirectory, which is
              # the property's name.  Don't use the standard file name for this,
              # as may get an unfamiliar alias
diff --git a/lib/utf8_heavy.pl b/lib/utf8_heavy.pl

index fc42283..78586ec 100644 (file)
--- a/lib/utf8_heavy.pl
+++ b/lib/utf8_heavy.pl
@@ -407,9 +407,9 @@ sub _loose_name ($) {
                  # Add the constant and go fetch it in.
                  if (defined $file) {
  
-                    # A beginning ! means to invert.  The 0+ makes sure is
-                    # numeric
-                    $invert_it = 0 + $file =~ s/^!//;
+                    # If the file name contains a !, it means to invert.  The
+                    # 0+ makes sure result is numeric
+                    $invert_it = 0 + $file =~ s/!//;
  
                      if ($utf8::why_deprecated{$file}) {
                          warnings::warnif('deprecated', "Use of '$type' in \\p{} or \\P{} is deprecated because: $utf8::why_deprecated{$file};");
@@ -420,7 +420,11 @@ sub _loose_name ($) {
                      {
                          $file = $utf8::caseless_equivalent{$property_and_table};
                      }
-                    $file= "$unicore_dir/lib/$file.pl";
+
+                    # The pseudo-directory '#' means that there really isn't a
+                    # file to read, the data is in-line as part of the string;
+                    # we extract it below.
+                    $file = "$unicore_dir/lib/$file.pl" unless $file =~ m!^#/!;
                      last GETFILE;
                  }
                  print STDERR __LINE__, ": didn't find $property_and_table\n" if DEBUG;
@@ -482,8 +486,8 @@ sub _loose_name ($) {
                          $minbits = 1;
  
                          # The 0+ makes sure is numeric
-                        $invert_it = 0 + $file =~ s/^!//;
-                        $file = "$unicore_dir/lib/$file.pl";
+                        $invert_it = 0 + $file =~ s/!//;
+                        $file = "$unicore_dir/lib/$file.pl" unless $file =~ m!^#/!;
                          last GETFILE;
                      }
                  } }
@@ -504,10 +508,19 @@ sub _loose_name ($) {
                  ## If we reach here, it was due to a 'last GETFILE' above
                  ## (exception: user-defined properties and mappings), so we
                  ## have a filename, so now we load it if we haven't already.
-                ## If we have, return the cached results. The cache key is the
-                ## class and file to load, and whether the results need to be
-                ## inverted.
-                ##
+
+                # The pseudo-directory '#' means the result isn't really a
+                # file, but is in-line, with semi-colons to be turned into
+                # new-lines.  Since it is in-line there is no advantage to
+                # caching the result
+                if ($file =~ s!^#/!!) {
+                    $list = $file =~ s/;/\n/gr;
+                }
+                else {
+                    # Here, we have an actual file to read in and load, but it
+                    # may already have been read-in and cached.  The cache key
+                    # is the class and file to load, and whether the results
+                    # need to be inverted.
                  my $found = $Cache{$class, $file, $invert_it};
                  if ($found and ref($found) eq $class) {
                      print STDERR __LINE__, ": Returning cached swash for '$class,$file,$invert_it' for \\p{$type}\n" if DEBUG;
@@ -518,6 +531,8 @@ sub _loose_name ($) {
                  local $@;
                  local $!;
                  $list = do $file; die $@ if $@;
+                }
+
                  $list_is_from_mktables = 1;
              }
          } # End of $type is non-null
diff --git a/utf8.c b/utf8.c

index 727c125..7a30a63 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -3719,6 +3719,10 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
          /* The first number is a count of the rest */
          l++;
          elements = Strtoul((char *)l, &after_strtol, 10);
+        if (elements == 0) {
+            invlist = _new_invlist(0);
+        }
+        else {
          l = (U8 *) after_strtol;
  
          /* Get the 0th element, which is needed to setup the inversion list */
@@ -3735,6 +3739,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
              *other_elements_ptr++ = (UV) Strtoul((char *)l, &after_strtol, 10);
              l = (U8 *) after_strtol;
          }
+        }
      }
      else {
author	Karl Williamson <public@khwilliamson.com>
	Fri, 14 Mar 2014 20:23:21 +0000 (14:23 -0600)
committer	Karl Williamson <public@khwilliamson.com>
	Fri, 14 Mar 2014 20:58:06 +0000 (14:58 -0600)
lib/Unicode/UCD.t		patch \| blob \| history
lib/unicore/mktables		patch \| blob \| history
lib/utf8_heavy.pl		patch \| blob \| history
utf8.c		patch \| blob \| history