phash.ph: use a bipartite graph to reduce the storage requirements

author H. Peter Anvin <hpa@zytor.com>

Thu, 30 Aug 2007 23:42:39 +0000 (23:42 +0000)

committer H. Peter Anvin <hpa@zytor.com>

Thu, 30 Aug 2007 23:42:39 +0000 (23:42 +0000)
author H. Peter Anvin <hpa@zytor.com>
Thu, 30 Aug 2007 23:42:39 +0000 (23:42 +0000)
committer H. Peter Anvin <hpa@zytor.com>
Thu, 30 Aug 2007 23:42:39 +0000 (23:42 +0000)
diff --git a/Makefile.in b/Makefile.in

index 7cec801c9d264ede0f20182549839a6bf804f58d..0651b8e794476317100fb9a20e1a84ac0e0abdab 100644 (file)
--- a/Makefile.in
+++ b/Makefile.in
@@ -113,7 +113,7 @@ regs.h: regs.dat regs.pl
         $(PERL) $(srcdir)/regs.pl h $(srcdir)/regs.dat > regs.h
  
  # Token hash
-tokhash.c: insns.dat regs.dat tokens.dat tokhash.pl
+tokhash.c: insns.dat regs.dat tokens.dat tokhash.pl perllib/phash.ph
         $(PERL) $(srcdir)/tokhash.pl $(srcdir)/insns.dat $(srcdir)/regs.dat \
                 $(srcdir)/tokens.dat > tokhash.c
  
diff --git a/perllib/phash.ph b/perllib/phash.ph

index 438e46608bda2939e8e8ba8684fff97d78105d76..017155d7639cada33b56e050c4811719821ab5a9 100644 (file)
--- a/perllib/phash.ph
+++ b/perllib/phash.ph
@@ -3,7 +3,7 @@
  # Perfect Minimal Hash Generator written in Perl, which produces
  # C output.
  #
-# Requires the CPAN Graph module (tested against 0.83)
+# Requires the CPAN Graph module (tested against 0.81, 0.83, 0.84)
  
  use Graph::Undirected;
  
@@ -65,10 +65,15 @@ sub shuffle(@) {
  #
  # ffunc(N)
  #
-sub ffunc($) {
-    my($n) = @_;
+sub ffunc($$$) {
+    my($n,$s,$i) = @_;
+    my(@l) = ();
  
-    return shuffle(0..$n-1);
+    while ($n--) {
+       push(@l, $i);
+       $i += $s;
+    }
+    return shuffle(@l);
  }
  
  #
@@ -107,12 +112,13 @@ sub gen_hash_n($$$) {
      my $i, $sv, @f1, @f2, @g;
      my $gr;
      my $k, $v;
+    my $gsize = 2*$n;
  
-    @f1 = ffunc($n);
-    @f2 = ffunc($n);
+    @f1 = ffunc($n, 2, 0);
+    @f2 = ffunc($n, 2, 1);
  
      $gr = Graph::Undirected->new;
-    for ($i = 0; $i < $n; $i++) {
+    for ($i = 0; $i < $gsize; $i++) {
         $gr->add_vertex($i);
      }
  
@@ -149,7 +155,7 @@ sub gen_hash_n($$$) {
      # edge, the sum of the values for the two vertices give the value
      # for the edge (which is our hash index.)  Since the graph is
      # acyclic, this is always doable.
-    for ($i = 0; $i < $n; $i++) {
+    for ($i = 0; $i < $gsize; $i++) {
         if (!$gr->has_vertex_attribute($i, "val")) {
             walk_graph($gr,$i,0); # First vertex in a cluster
         }
@@ -160,7 +166,7 @@ sub gen_hash_n($$$) {
      #  print STDERR "Vertex ", $i, ": ", $g[$i], "\n";
      # }
  
-    print STDERR "Done: n = $n, sv = $sv\n";
+    print STDERR "Done: n = $n, sv = [", join(',', @$sv), "]\n";
  
      return ($n, $sv, \@f1, \@f2, \@g);
  }
@@ -184,8 +190,10 @@ sub gen_perfect_hash($) {
      my @hashinfo;
      my $n, $i, $j, $sv, $maxj;
  
-    # Minimal power of 2 value for N with enough wiggle room
-    my $room = scalar(@keys)*1.3;
+    # Minimal power of 2 value for N with enough wiggle room.
+    # The scaling constant must be larger than 0.5 in order for the
+    # algorithm to ever terminate.
+    my $room = scalar(@keys)*0.8;
      $n = 1;
      while ($n < $room) {
         $n <<= 1;
@@ -243,8 +251,6 @@ sub verify_hash_table($$)
      my $k;
      my $err = 0;
  
-    print STDERR "Verify: n = $n, sv = $sv\n";
-
      foreach $k (keys(%$href)) {
         my ($p1, $p2) = prehash($k, $n, $sv);
         my $pf1 = ${$f1}[$p1];
author	H. Peter Anvin <hpa@zytor.com>
	Thu, 30 Aug 2007 23:42:39 +0000 (23:42 +0000)
committer	H. Peter Anvin <hpa@zytor.com>
	Thu, 30 Aug 2007 23:42:39 +0000 (23:42 +0000)
Makefile.in		patch \| blob \| history
perllib/phash.ph		patch \| blob \| history