The vq book builder is mostly running now, but does not produce output yet.

author Monty <xiphmont@xiph.org>

Fri, 17 Dec 1999 13:19:36 +0000 (13:19 +0000)

committer Monty <xiphmont@xiph.org>

Fri, 17 Dec 1999 13:19:36 +0000 (13:19 +0000)
author Monty <xiphmont@xiph.org>
Fri, 17 Dec 1999 13:19:36 +0000 (13:19 +0000)
committer Monty <xiphmont@xiph.org>
Fri, 17 Dec 1999 13:19:36 +0000 (13:19 +0000)
diff --git a/vq/Makefile.in b/vq/Makefile.in

index f41ebe1..137f8cd 100644 (file)
--- a/vq/Makefile.in
+++ b/vq/Makefile.in
@@ -1,4 +1,4 @@
-# $Id: Makefile.in,v 1.1 1999/12/17 07:21:26 xiphmont Exp $
+# $Id: Makefile.in,v 1.2 1999/12/17 13:19:30 xiphmont Exp $
  
  ###############################################################################
  #                                                                             #
@@ -27,8 +27,8 @@ LIBS=@LIBS@ -lm
  
  HFILES =       vqgen.h vqext.h
  
-OFILES =       vqgen.o train.o 
-ALLOFILES =    $(OFILES) lspdata.o
+OFILES =       vqgen.o vqsplit.o 
+ALLOFILES =    $(OFILES) lspdata.o train.o build.o
  
  all:
         $(MAKE) target CFLAGS="$(OPT)"
@@ -39,9 +39,12 @@ debug:
  profile: 
         $(MAKE) target CFLAGS="$(PROFILE)"
  
-target:        lspvqtrain 
+target:        lspvqtrain vqbuild
  
-lspvqtrain:    $(OFILES) lspdata.o
+lspvqtrain:    $(OFILES) lspdata.o train.o
+       $(CC) $(CFLAGS) $(LDFLAGS) $^ -o $@ $(LIBS)
+
+vqbuild:       $(OFILES) build.o
         $(CC) $(CFLAGS) $(LDFLAGS) $^ -o $@ $(LIBS)
  
  $(ALLOFILES):  $(HFILES)
@@ -50,5 +53,5 @@ $(ALLOFILES):         $(HFILES)
         $(CC) $(CFLAGS) -c $<
  
  clean:
-       -rm -f *.o *.a test* *~ *.out *.m config.*\
+       -rm -f *.o *.a test* *~ *.out *.m config.* \
                 lspvqtrain
diff --git a/vq/build.c b/vq/build.c

index 0eac602..ed5ca71 100644 (file)
--- a/vq/build.c
+++ b/vq/build.c
@@ -14,7 +14,7 @@
   function: utility main for building codebooks from training sets
   author: Monty <xiphmont@mit.edu>
   modifications by: Monty
- last modification date: Dec 14 1999
+ last modification date: Dec 15 1999
  
   ********************************************************************/
  
@@ -23,209 +23,177 @@
  #include <math.h>
  #include <string.h>
  #include <errno.h>
-#include <signal.h>
  #include "vqgen.h"
  
-static int rline(FILE *in,FILE *out,char *line,int max,int pass){
-  while(fgets(line,160,in)){
-    if(line[0]=='#'){
-      if(pass)fprintf(out,"%s",line);
+static char *linebuffer=NULL;
+static int  lbufsize=0;
+static char *rline(FILE *in,FILE *out,int pass){
+  long sofar=0;
+  if(feof(in))return NULL;
+
+  while(1){
+    int gotline=0;
+
+    while(!gotline){
+      if(sofar>=lbufsize){
+       if(!lbufsize){  
+         lbufsize=1024;
+         linebuffer=malloc(lbufsize);
+       }else{
+         lbufsize*=2;
+         linebuffer=realloc(linebuffer,lbufsize);
+       }
+      }
+      {
+       long c=fgetc(in);
+       switch(c){
+       case '\n':
+       case EOF:
+         gotline=1;
+         break;
+       default:
+         linebuffer[sofar++]=c;
+         linebuffer[sofar]='\0';
+         break;
+       }
+      }
+    }
+    
+    if(linebuffer[0]=='#'){
+      if(pass)fprintf(out,"%s",linebuffer);
+      sofar=0;
      }else{
-      return(1);
+      return(linebuffer);
      }
    }
-  return(0);
  }
  
  /* command line:
-   buildvq vq=file out=file quant=n
+   buildvq file
  */
  
-int exiting=0;
-void setexit(int dummy){
-  fprintf(stderr,"\nexiting... please wait to finish this iteration\n");
-  exiting=1;
-}
-
  int main(int argc,char *argv[]){
    vqgen v;
-  int entries=-1,dim=-1;
-  char line[1024];
+  vqbook b;
+  quant_return q;
+  int *quantlist=NULL;
+
+  int entries=-1,dim=-1,quant=-1;
+  FILE *out=NULL;
+  FILE *in=NULL;
+  char *line,*name;
    long i,j,k;
  
-  int init=0;
-  while(*argv){
-
-    /* load the trained data */
-    if(!strncmp(*argv,"vq=",3)){
-      FILE *in=NULL;
-      char filename[80],*ptr;
-      if(sscanf(*argv,"vq=%70s",filename)!=1){
-       fprintf(stderr,"Syntax error in argument '%s'\n",*argv);
-       exit(1);
-      }
-
-      in=fopen(filename,"r");
-      ptr=strrchr(filename,'-');
-      if(ptr){
-       int num;
-       ptr++;
-       num=atoi(ptr);
-       sprintf(ptr,"%d.vqi",num+1);
-      }else
-       strcat(filename,"-0.vqi");
-      
-      out=fopen(filename,"w");
-      if(out==NULL){
-       fprintf(stderr,"Unable to open %s for writing\n",filename);
-       exit(1);
-      }
-      fprintf(out,"# OggVorbis VQ codebook trainer, intermediate file\n");
-
-      if(in){
-       /* we wish to suck in a preexisting book and continue to train it */
-       double a;
-           
-       rline(in,out,line,160,1);
-       if(sscanf(line,"%d %d %d",&entries,&dim,&met)!=3){
-         fprintf(stderr,"Syntax error reading book file\n");
-         exit(1);
-       }
-
-       metric=set_metric(met);
-       vqgen_init(&v,dim,entries,metric,0.);
-       init=1;
-
-       /* entries, bias, points */
-       i=0;
-       for(j=0;j<entries;j++){
-         for(k=0;k<dim;k++){
-           rline(in,out,line,160,0);
-           sscanf(line,"%lf",&a);
-           v.entrylist[i++]=a;
-         }
-       }
-
-       i=0;
-       for(j=0;j<entries;j++){
-         rline(in,out,line,160,0);
-         sscanf(line,"%lf",&a);
-         v.bias[i++]=a;
-       }
-
-       {
-         double b[80];
-         i=0;
-         v.entries=0; /* hack to avoid reseeding */
-         while(1){
-           for(k=0;k<dim && k<80;k++){
-             rline(in,out,line,160,0);
-             sscanf(line,"%lf",b+k);
-           }
-           if(feof(in))break;
-           vqgen_addpoint(&v,b);
-         }
-         v.entries=entries;
-       }
+  if(argv[1]==NULL){
+    fprintf(stderr,"Need a trained data set on the command line.\n");
+    exit(1);
+  }
  
-       fclose(in);
-      }
-    }
+  {
+    char *ptr;
+    char *filename=strdup(argv[1]);
  
-    /* set parameters if we're not loading a pre book */
-    if(!strncmp(*argv,"entries=",8)){
-      sscanf(*argv,"entries=%d",&entries);
+    in=fopen(filename,"r");
+    if(!in){
+      fprintf(stderr,"Could not open input file %s\n",filename);
+      exit(1);
      }
-    if(!strncmp(*argv,"desired=",8)){
-      sscanf(*argv,"desired=%lf",&desired);
-    }
-    if(!strncmp(*argv,"dim=",4)){
-      sscanf(*argv,"dim=%d",&dim);
+    
+    ptr=strrchr(filename,'.');
+    if(ptr){
+      *ptr='\0';
+      name=strdup(ptr);
+      sprintf(ptr,".h");
+    }else{
+      name=strdup(filename);
+      strcat(filename,".h");
      }
  
-    /* which error metric (0==euclidian distance default) */
-    if(!strncmp(*argv,"met=",4)){
-      sscanf(*argv,"met=%d",&met);
-      metric=set_metric(met);
+    out=fopen(filename,"w");
+    if(out==NULL){
+      fprintf(stderr,"Unable to open %s for writing\n",filename);
+      exit(1);
      }
+  }
  
-    if(!strncmp(*argv,"in=",3)){
-      int start;
-      char file[80];
-      FILE *in;
-
-      if(sscanf(*argv,"in=%79[^,],%d",file,&start)!=2)goto syner;
-      if(!out){
-       fprintf(stderr,"vq= must preceed in= arguments\n");
-       exit(1);
-      }
-      if(!init){
-       if(dim==-1 || entries==-1){
-         fprintf(stderr,"Must specify dimensionality and entries before"
-                 " first input file\n");
-         exit(1);
-       }
-       vqgen_init(&v,dim,entries,metric,0.);
-       init=1;
-      }
+  /* suck in the trained book */
  
-      in=fopen(file,"r");
-      if(in==NULL){
-       fprintf(stderr,"Could not open input file %s\n",file);
-       exit(1);
-      }
-      fprintf(out,"# training file entry: %s\n",file);
-
-      while(rline(in,out,line,1024,1)){
-       double b[16];
-       int n=sscanf(line,"%lf %lf %lf %lf %lf %lf %lf %lf "
-                    "%lf %lf %lf %lf %lf %lf %lf %lf",
-                    b,b+1,b+2,b+3,b+4,b+5,b+6,b+7,b+8,b+9,b+10,b+11,b+12,b+13,
-                    b+14,b+15);
-       if(start+dim>n){
-         fprintf(stderr,"ran out of columns reading %s\n",file);
-         exit(1);
-       }
-       vqgen_addpoint(&v,b+start);
+  /* read book type, but it doesn't matter */
+  line=rline(in,out,1);
+  
+  line=rline(in,out,1);
+  if(sscanf(line,"%d %d",&entries,&dim)!=2){
+    fprintf(stderr,"Syntax error reading book file\n");
+    exit(1);
+  }
+  
+  /* just use it to allocate mem */
+  vqgen_init(&v,dim,entries,NULL);
+  
+  /* quant */
+  line=rline(in,out,1);
+  if(sscanf(line,"%lf %lf %d %d",&q.minval,&q.delt,
+           &q.addtoquant,&quant)!=4){
+    fprintf(stderr,"Syntax error reading book file\n");
+    exit(1);
+  }
+  
+  /* quantized entries */
+  /* save quant data; we can't regen it because the quant details
+     are specific to the data type */
+  i=0;
+  quantlist=malloc(sizeof(int)*v.elements*v.entries);
+  for(j=0;j<entries;j++){
+    double a;
+    for(k=0;k<dim;k++){
+      line=rline(in,out,0);
+      sscanf(line,"%lf",&a);
+      quantlist[i++]=rint(a);
+    }
+  }    
+  
+  /* unquantized entries */
+  i=0;
+  for(j=0;j<entries;j++){
+    double a;
+    for(k=0;k<dim;k++){
+      line=rline(in,out,0);
+      sscanf(line,"%lf",&a);
+      v.entrylist[i++]=a;
+    }
+  }
+  
+  /* ignore bias */
+  for(j=0;j<entries;j++)line=rline(in,out,0);
+  free(v.bias);
+  v.bias=NULL;
+  
+  /* training points */
+  {
+    double b[80];
+    i=0;
+    v.entries=0; /* hack to avoid reseeding */
+    while(1){
+      for(k=0;k<dim && k<80;k++){
+       line=rline(in,out,0);
+       if(!line)break;
+       sscanf(line,"%lf",b+k);
        }
-
-      fclose(in);
+      if(feof(in))break;
+      vqgen_addpoint(&v,b);
      }
-    argv++;
+    v.entries=entries;
    }
+  
+  fclose(in);
  
-  /* train the book */
-  signal(SIGTERM,setexit);
-  signal(SIGINT,setexit);
-
-  for(i=0;i<iter && !exiting;i++){
-    if(vqgen_iterate(&v)<desired)break;
-  }
+  /* build the book */
+  vqsp_book(&v,&b);
  
-  /* save the book */
+  /* save the book in C header form */
  
-  fprintf(out,"%d %d %d\n",entries,dim,met);
  
-  i=0;
-  for(j=0;j<entries;j++)
-    for(k=0;k<dim;k++)
-      fprintf(out,"%f\n",v.entrylist[i++]);
-  
-  fprintf(out,"# biases---\n");
-  i=0;
-  for(j=0;j<entries;j++)
-    fprintf(out,"%f\n",v.bias[i++]);
  
-  fprintf(out,"# points---\n");
-  i=0;
-  for(j=0;j<v.points;j++)
-    for(k=0;k<dim && k<80;k++)
-      fprintf(out,"%f\n",v.pointlist[i++]);
  
-  fclose(out);
    exit(0);
-
-  syner:
-    fprintf(stderr,"Syntax error in argument '%s'\n",*argv);
-    exit(1);
  }
diff --git a/vq/train.c b/vq/train.c

index 64a9e92..887cdec 100644 (file)
--- a/vq/train.c
+++ b/vq/train.c
@@ -63,6 +63,7 @@ static char *rline(FILE *in,FILE *out,int pass){
      
      if(linebuffer[0]=='#'){
        if(pass)fprintf(out,"%s",linebuffer);
+      sofar=0;
      }else{
        return(linebuffer);
      }
@@ -125,7 +126,6 @@ int main(int argc,char *argv[]){
         double a;
  
         line=rline(in,out,1);
-       if(strlen(line)>0)line[strlen(line)-1]='\0';
         if(strcmp(line,vqext_booktype)){
           fprintf(stderr,"wrong book type; %s!=%s\n",line,vqext_booktype);
           exit(1);
@@ -149,7 +149,12 @@ int main(int argc,char *argv[]){
           exit(1);
         }
  
-       /* entries */
+       /* quantized entries */
+       for(j=0;j<entries;j++)
+         for(k=0;k<dim;k++)
+           line=rline(in,out,0);
+
+       /* unquantized entries */
         i=0;
         for(j=0;j<entries;j++){
           for(k=0;k<dim;k++){
@@ -159,9 +164,6 @@ int main(int argc,char *argv[]){
           }
         }
         
-       /* dequantize */
-       vqext_unquantize(&v,&q);
-
         /* bias, points */
         i=0;
         for(j=0;j<entries;j++){
@@ -177,6 +179,7 @@ int main(int argc,char *argv[]){
           while(1){
             for(k=0;k<dim && k<80;k++){
               line=rline(in,out,0);
+             if(!line)break;
               sscanf(line,"%lf",b+k);
             }
             if(feof(in))break;
@@ -265,6 +268,12 @@ int main(int argc,char *argv[]){
      argv++;
    }
  
+  if(!init){
+    fprintf(stderr,"No input files!\n");
+    exit(1);
+  }
+
+
    /* train the book */
    signal(SIGTERM,setexit);
    signal(SIGINT,setexit);
@@ -284,10 +293,23 @@ int main(int argc,char *argv[]){
    fprintf(out,"%d %d\n",entries,dim);
    fprintf(out,"%g %g %d %d\n",q.minval,q.delt,q.addtoquant,quant);
  
+  /* quantized entries */
+  fprintf(out,"# quantized entries---\n");
+  i=0;
+  for(j=0;j<entries;j++)
+    for(k=0;k<dim;k++)
+      fprintf(out,"%d\n",(int)(rint(v.entrylist[i++])));
+
+  /* dequantize */
+  vqext_unquantize(&v,&q);
+
+  fprintf(out,"# unquantized entries---\n");
    i=0;
    for(j=0;j<entries;j++)
      for(k=0;k<dim;k++)
        fprintf(out,"%f\n",v.entrylist[i++]);
+
+  /* unquantized entries */
    
    fprintf(out,"# biases---\n");
    i=0;
diff --git a/vq/vqext.h b/vq/vqext.h

index 88466d1..59f3447 100644 (file)
--- a/vq/vqext.h
+++ b/vq/vqext.h
@@ -21,11 +21,6 @@
  #include "vqgen.h"
  
  extern char *vqext_booktype;
-typedef struct {
-  double minval;
-  double delt;
-  int    addtoquant;
-} quant_return;
  
  extern double vqext_metric(vqgen *v,double *b, double *a);
  extern quant_return vqext_quantize(vqgen *v,int quantbits);
diff --git a/vq/vqgen.c b/vq/vqgen.c

index e4fc2a2..d192695 100644 (file)
--- a/vq/vqgen.c
+++ b/vq/vqgen.c
@@ -57,14 +57,6 @@
  /* internal helpers *****************************************************/
  #define vN(data,i) (data+v->elements*i)
  
-double *_point(vqgen *v,long ptr){
-  return v->pointlist+(v->elements*ptr);
-}
-
-double *_now(vqgen *v,long ptr){
-  return v->entrylist+(v->elements*ptr);
-}
-
  /* default metric; squared 'distance' from desired value. */
  double _dist_sq(vqgen *v,double *a, double *b){
    int i;
@@ -265,342 +257,3 @@ double vqgen_iterate(vqgen *v){
    return(asserror);
  }
  
-
-/* Building a codebook from trained set **********************************
-
-   The codebook in raw form is technically finished once it's trained.
-   However, we want to finalize the representative codebook values for
-   each entry and generate auxiliary information to optimize encoding.
-   We generate the auxiliary coding tree using collected data,
-   probably the same data as in the original training */
-
-/* At each recursion, the data set is split in half.  Cells with data
-   points on side A go into set A, same with set B.  The sets may
-   overlap.  If the cell overlaps the deviding line only very slightly
-   (provided parameter), we may choose to ignore the overlap in order
-   to pare the tree down */
-
-double *sortvals;
-int els;
-int iascsort(const void *a,const void *b){
-  double av=sortvals[*((long *)a) * els];
-  double bv=sortvals[*((long *)b) * els];
-  if(av<bv)return(-1);
-  return(1);
-}
-
-/* goes through the split, but just counts it and returns a metric*/
-void lp_count(vqgen *v,long *entryindex,long entries, 
-               long *pointindex,long points,
-               long *entryA,long *entryB,
-               double *n, double c, double slack,
-               long *entriesA,long *entriesB,long *entriesC){
-  long i,j,k;
-  long A=0,B=0,C=0;
-
-  memset(entryA,0,sizeof(long)*entries);
-  memset(entryB,0,sizeof(long)*entries);
-
-  for(i=0;i<points;i++){
-    double *ppt=_point(v,pointindex[i]);
-    long   firstentry=0;
-    double firstmetric=_dist_sq(v,_now(v,entryindex[0]),ppt);
-    double position=-c;
-    
-    for(j=1;j<entries;j++){
-      double thismetric=_dist_sq(v,_now(v,entryindex[j]),ppt);
-      if(thismetric<firstmetric){
-       firstmetric=thismetric;
-       firstentry=j;
-      }
-    }
-
-    /* count point split */
-    for(k=0;k<v->elements;k++)
-      position+=ppt[k]*n[k];
-    if(position>0.){
-      entryA[firstentry]++;
-    }else{
-      entryB[firstentry]++;
-    }
-  }
-
-  /* look to see if entries are in the slack zone */
-  /* The entry splitting isn't total, so that storage has to be
-     allocated for recursion.  Reuse the entryA/entryB vectors */
-  for(j=0;j<entries;j++){
-    long total=entryA[j]+entryB[j];
-    if((double)entryA[j]/total<slack){
-      entryA[j]=0;
-    }else if((double)entryB[j]/total<slack){
-      entryB[j]=0;
-    }
-    if(entryA[j] && entryB[j])C++;
-    if(entryA[j])entryA[A++]=entryindex[j];
-    if(entryB[j])entryB[B++]=entryindex[j];
-  }
-  *entriesA=A;
-  *entriesB=B;
-  *entriesC=C;
-}
-
-void pq_in_out(vqgen *v,double *n,double *c,double *p,double *q){
-  int k;
-  *c=0.;
-  for(k=0;k<v->elements;k++){
-    double center=(p[k]+q[k])/2.;
-    n[k]=(center-q[k])*2.;
-    *c+=center*n[k];
-  }
-}
-
-void pq_center_out(vqgen *v,double *n,double *c,double *center,double *q){
-  int k;
-  *c=0.;
-  for(k=0;k<v->elements;k++){
-    n[k]=(center[k]-q[k])*2.;
-    *c+=center[k]*n[k];
-  }
-}
-
-int lp_split(vqgen *v,vqbook *b,
-            long *entryindex,long entries, 
-            long *pointindex,long points,
-            long depth,double slack){
-
-  /* The encoder, regardless of book, will be using a straight
-     euclidian distance-to-point metric to determine closest point.
-     Thus we split the cells using the same (we've already trained the
-     codebook set spacing and distribution using special metrics and
-     even a midpoint division won't disturb the basic properties) */
-
-  long ret;
-  double *p;
-  double *q;
-  double *n;
-  double c;
-  long *entryA=calloc(entries,sizeof(long));
-  long *entryB=calloc(entries,sizeof(long));
-  long entriesA=0;
-  long entriesB=0;
-  long entriesC=0;
-  long pointsA=0;
-  long i,j,k;
-
-  p=alloca(sizeof(double)*v->elements);
-  q=alloca(sizeof(double)*v->elements);
-  n=alloca(sizeof(double)*v->elements);
-  memset(p,0,sizeof(double)*v->elements);
-
-  /* We need to find the dividing hyperplane. find the median of each
-     axis as the centerpoint and the normal facing farthest point */
-
-  /* more than one way to do this part.  For small sets, we can brute
-     force it. */
-
-  if(entries<32){
-    /* try every pair possibility */
-    double best=0;
-    long   besti=0;
-    long   bestj=0;
-    double this;
-    for(i=0;i<entries-1;i++){
-      for(j=i+1;j<entries;j++){
-       pq_in_out(v,n,&c,_now(v,entryindex[i]),_now(v,entryindex[j]));
-       lp_count(v,entryindex,entries, 
-                pointindex,points,
-                entryA,entryB,
-                n, c, slack,
-                &entriesA,&entriesB,&entriesC);
-       this=(entriesA-entriesC)*(entriesB-entriesC);
-
-       if(this>best){
-         best=this;
-         besti=i;
-         bestj=j;
-       }
-      }
-    }
-    pq_in_out(v,n,&c,_now(v,entryindex[besti]),_now(v,entryindex[bestj]));
-  }else{
-    double best=0.;
-    long   bestj=0;
-
-    /* try COG/normal and furthest pairs */
-    /* medianpoint */
-    for(k=0;k<v->elements;k++){
-      /* just sort the index array */
-      sortvals=v->pointlist+k;
-      els=v->elements;
-      qsort(pointindex,points,sizeof(long),iascsort);
-      if(points&0x1){
-       p[k]=v->pointlist[(pointindex[points/2])*v->elements+k];
-      }else{
-       p[k]=(v->pointlist[(pointindex[points/2])*v->elements+k]+
-             v->pointlist[(pointindex[points/2-1])*v->elements+k])/2.;
-      }
-    }
-    
-    /* try every normal, but just for distance */
-    for(j=0;j<entries;j++){
-      double *ppj=_now(v,entryindex[j]);
-      double this=_dist_sq(v,p,ppj);
-      if(this>best){
-       best=this;
-       bestj=j;
-      }
-    }
-
-    pq_center_out(v,n,&c,p,_point(v,pointindex[bestj]));
-
-
-  }
-
-  /* find cells enclosing points */
-  /* count A/B points */
-
-  lp_count(v,entryindex,entries, 
-          pointindex,points,
-          entryA,entryB,
-          n, c, slack,
-          &entriesA,&entriesB,&entriesC);
-
-  /* the point index is split evenly, so we do an Order n
-     rearrangement into A first/B last and just pass it on */
-  {
-    long Aptr=0;
-    long Bptr=points-1;
-    while(Aptr<=Bptr){
-      while(Aptr<=Bptr){
-       double position=-c;
-       for(k=0;k<v->elements;k++)
-         position+=_point(v,pointindex[Aptr])[k]*n[k];
-       if(position<=0.)break; /* not in A */
-       Aptr++;
-      }
-      while(Aptr<=Bptr){
-       double position=-c;
-       for(k=0;k<v->elements;k++)
-         position+=_point(v,pointindex[Bptr])[k]*n[k];
-       if(position>0.)break; /* not in B */
-       Bptr--;
-      }
-      if(Aptr<Bptr){
-       long temp=pointindex[Aptr];
-       pointindex[Aptr]=pointindex[Bptr];
-       pointindex[Bptr]=temp;
-      }
-      pointsA=Aptr;
-    }
-  }
-
-  fprintf(stderr,"split: total=%ld depth=%ld set A=%ld:%ld:%ld=B\n",
-         entries,depth,entriesA-entriesC,entriesC,entriesB-entriesC);
-  {
-    long thisaux=b->aux++;
-    if(b->aux>=b->alloc){
-      b->alloc*=2;
-      b->ptr0=realloc(b->ptr0,sizeof(long)*b->alloc);
-      b->ptr1=realloc(b->ptr1,sizeof(long)*b->alloc);
-      b->n=realloc(b->n,sizeof(double)*b->elements*b->alloc);
-      b->c=realloc(b->c,sizeof(double)*b->alloc);
-    }
-    
-    memcpy(b->n+b->elements*thisaux,n,sizeof(double)*v->elements);
-    b->c[thisaux]=c;
-
-    if(entriesA==1){
-      ret=1;
-      b->ptr0[thisaux]=entryA[0];
-    }else{
-      b->ptr0[thisaux]= -b->aux;
-      ret=lp_split(v,b,entryA,entriesA,pointindex,pointsA,depth+1,slack); 
-    }
-    if(entriesB==1){
-      ret++;
-      b->ptr1[thisaux]=entryB[0];
-    }else{
-      b->ptr1[thisaux]= -b->aux;
-      ret+=lp_split(v,b,entryB,entriesB,pointindex+pointsA,points-pointsA,
-                  depth+1,slack); 
-    }
-  }
-  free(entryA);
-  free(entryB);
-  return(ret);
-}
-
-int vqenc_entry(vqbook *b,double *val){
-  int ptr=0,k;
-  while(1){
-    double c= -b->c[ptr];
-    double *nptr=b->n+b->elements*ptr;
-    for(k=0;k<b->elements;k++)
-      c+=nptr[k]*val[k];
-    if(c>0.) /* in A */
-      ptr= -b->ptr0[ptr];
-    else     /* in B */
-      ptr= -b->ptr1[ptr];
-    if(ptr<=0)break;
-  }
-  return(-ptr);
-}
-
-void vqgen_book(vqgen *v,vqbook *b){
-  long i;
-  long *entryindex=malloc(sizeof(double)*v->entries);
-  long *pointindex=malloc(sizeof(double)*v->points);
-
-  memset(b,0,sizeof(vqbook));
-  for(i=0;i<v->entries;i++)entryindex[i]=i;
-  for(i=0;i<v->points;i++)pointindex[i]=i;
-  b->elements=v->elements;
-  b->entries=v->entries;
-  b->alloc=4096;
-  b->ptr0=malloc(sizeof(long)*b->alloc);
-  b->ptr1=malloc(sizeof(long)*b->alloc);
-  b->n=malloc(sizeof(double)*b->elements*b->alloc);
-  b->c=malloc(sizeof(double)*b->alloc);
-
-  b->valuelist=malloc(sizeof(double)*b->elements*b->entries);
-  b->codelist=malloc(sizeof(long)*b->entries);
-  b->lengthlist=malloc(b->entries*sizeof(long));
-  
-  /* first, generate the encoding decision heirarchy */
-  fprintf(stderr,"Total leaves: %d\n",
-         lp_split(v,b,entryindex,v->entries, pointindex,v->points,0,0));
-  
-  /* run all training points through the decision tree to get a final
-     probability count */
-  {
-    long *probability=calloc(b->entries*2,sizeof(long));
-    for(i=0;i<v->points;i++){
-      int ret=vqenc_entry(b,v->pointlist+i*v->elements);
-      probability[ret]++;
-    }
-
-    for(i=0;i<b->entries;i++){
-      fprintf(stderr,"point %ld: %ld\n",i,probability[i]);
-    }
-
-  }
-
-  /* generate the codewords (short to long) */
-
-  free(entryindex);
-  free(pointindex);
-}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/vq/vqgen.h b/vq/vqgen.h

index 3c20761..a592f78 100644 (file)
--- a/vq/vqgen.h
+++ b/vq/vqgen.h
@@ -36,6 +36,7 @@ typedef struct vqgen{
  typedef struct vqbook{
    long elements;
    long entries;
+
    double *valuelist;
    long   *codelist;
    long   *lengthlist;
@@ -52,14 +53,28 @@ typedef struct vqbook{
  
  } vqbook;
  
+typedef struct {
+  double minval;
+  double delt;
+  int    addtoquant;
+} quant_return;
+
+static inline double *_point(vqgen *v,long ptr){
+  return v->pointlist+(v->elements*ptr);
+}
+
+static inline double *_now(vqgen *v,long ptr){
+  return v->entrylist+(v->elements*ptr);
+}
+
  extern void vqgen_init(vqgen *v,int elements,int entries,
                        double (*metric)(vqgen *,double *, double *));
  extern void vqgen_addpoint(vqgen *v, double *p);
  extern double *vqgen_midpoint(vqgen *v);
  extern double vqgen_iterate(vqgen *v);
+
+extern void vqsp_book(vqgen *v,vqbook *b);
  extern int vqenc_entry(vqbook *b,double *val);
-extern void vqgen_book(vqgen *v,vqbook *b);
-extern double *_now(vqgen *v,long ptr);
  
  #endif
  
diff --git a/vq/vqsplit.c b/vq/vqsplit.c

new file mode 100644 (file)

index 0000000..f453d0c
--- /dev/null
+++ b/vq/vqsplit.c
@@ -0,0 +1,432 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE Ogg Vorbis SOFTWARE CODEC SOURCE CODE.  *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS SOURCE IS GOVERNED BY *
+ * THE GNU PUBLIC LICENSE 2, WHICH IS INCLUDED WITH THIS SOURCE.    *
+ * PLEASE READ THESE TERMS DISTRIBUTING.                            *
+ *                                                                  *
+ * THE OggSQUISH SOURCE CODE IS (C) COPYRIGHT 1994-1999             *
+ * by 1999 Monty <monty@xiph.org> and The XIPHOPHORUS Company       *
+ * http://www.xiph.org/                                             *
+ *                                                                  *
+ ********************************************************************
+
+ function: build a VQ codebook 
+ author: Monty <xiphmont@mit.edu>
+ modifications by: Monty
+ last modification date: Dec 10 1999
+
+ ********************************************************************/
+
+/* This code is *not* part of libvorbis.  It is used to generate
+   trained codebooks offline and then spit the results into a
+   pregenerated codebook that is compiled into libvorbis.  It is an
+   expensive (but good) algorithm.  Run it on big iron. */
+
+/* There are so many optimizations to explore in *both* stages that
+   considering the undertaking is almost withering.  For now, we brute
+   force it all */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <string.h>
+#include "vqgen.h"
+
+/* Codebook generation happens in two steps: 
+
+   1) Train the codebook with data collected from the encoder: We use
+   one of a few error metrics (which represent the distance between a
+   given data point and a candidate point in the training set) to
+   divide the training set up into cells representing roughly equal
+   probability of occurring. 
+
+   2) Generate the codebook and auxiliary data from the trained data set
+*/
+
+/* Building a codebook from trained set **********************************
+
+   The codebook in raw form is technically finished once it's trained.
+   However, we want to finalize the representative codebook values for
+   each entry and generate auxiliary information to optimize encoding.
+   We generate the auxiliary coding tree using collected data,
+   probably the same data as in the original training */
+
+/* At each recursion, the data set is split in half.  Cells with data
+   points on side A go into set A, same with set B.  The sets may
+   overlap.  If the cell overlaps the deviding line only very slightly
+   (provided parameter), we may choose to ignore the overlap in order
+   to pare the tree down */
+
+double *sortvals;
+int els;
+int iascsort(const void *a,const void *b){
+  double av=sortvals[*((long *)a) * els];
+  double bv=sortvals[*((long *)b) * els];
+  if(av<bv)return(-1);
+  return(1);
+}
+
+extern double _dist_sq(vqgen *v,double *a, double *b);
+
+/* goes through the split, but just counts it and returns a metric*/
+void vqsp_count(vqgen *v,long *entryindex,long entries, 
+               long *pointindex,long points,
+               long *entryA,long *entryB,
+               double *n, double c, double slack,
+               long *entriesA,long *entriesB,long *entriesC){
+  long i,j,k;
+  long A=0,B=0,C=0;
+
+  memset(entryA,0,sizeof(long)*entries);
+  memset(entryB,0,sizeof(long)*entries);
+
+  for(i=0;i<points;i++){
+    double *ppt=_point(v,pointindex[i]);
+    long   firstentry=0;
+    double firstmetric=_dist_sq(v,_now(v,entryindex[0]),ppt);
+    double position=-c;
+    
+    for(j=1;j<entries;j++){
+      double thismetric=_dist_sq(v,_now(v,entryindex[j]),ppt);
+      if(thismetric<firstmetric){
+       firstmetric=thismetric;
+       firstentry=j;
+      }
+    }
+
+    /* count point split */
+    for(k=0;k<v->elements;k++)
+      position+=ppt[k]*n[k];
+    if(position>0.){
+      entryA[firstentry]++;
+    }else{
+      entryB[firstentry]++;
+    }
+  }
+
+  /* look to see if entries are in the slack zone */
+  /* The entry splitting isn't total, so that storage has to be
+     allocated for recursion.  Reuse the entryA/entryB vectors */
+  for(j=0;j<entries;j++){
+    long total=entryA[j]+entryB[j];
+    if((double)entryA[j]/total<slack){
+      entryA[j]=0;
+    }else if((double)entryB[j]/total<slack){
+      entryB[j]=0;
+    }
+    if(entryA[j] && entryB[j])C++;
+    if(entryA[j])entryA[A++]=entryindex[j];
+    if(entryB[j])entryB[B++]=entryindex[j];
+  }
+  *entriesA=A;
+  *entriesB=B;
+  *entriesC=C;
+}
+
+void pq_in_out(vqgen *v,double *n,double *c,double *p,double *q){
+  int k;
+  *c=0.;
+  for(k=0;k<v->elements;k++){
+    double center=(p[k]+q[k])/2.;
+    n[k]=(center-q[k])*2.;
+    *c+=center*n[k];
+  }
+}
+
+void pq_center_out(vqgen *v,double *n,double *c,double *center,double *q){
+  int k;
+  *c=0.;
+  for(k=0;k<v->elements;k++){
+    n[k]=(center[k]-q[k])*2.;
+    *c+=center[k]*n[k];
+  }
+}
+
+int lp_split(vqgen *v,vqbook *b,
+            long *entryindex,long entries, 
+            long *pointindex,long points,
+            long depth,double slack){
+
+  /* The encoder, regardless of book, will be using a straight
+     euclidian distance-to-point metric to determine closest point.
+     Thus we split the cells using the same (we've already trained the
+     codebook set spacing and distribution using special metrics and
+     even a midpoint division won't disturb the basic properties) */
+
+  long ret;
+  double *p;
+  double *q;
+  double *n;
+  double c;
+  long *entryA=calloc(entries,sizeof(long));
+  long *entryB=calloc(entries,sizeof(long));
+  long entriesA=0;
+  long entriesB=0;
+  long entriesC=0;
+  long pointsA=0;
+  long i,j,k;
+
+  p=alloca(sizeof(double)*v->elements);
+  q=alloca(sizeof(double)*v->elements);
+  n=alloca(sizeof(double)*v->elements);
+  memset(p,0,sizeof(double)*v->elements);
+
+  /* We need to find the dividing hyperplane. find the median of each
+     axis as the centerpoint and the normal facing farthest point */
+
+  /* more than one way to do this part.  For small sets, we can brute
+     force it. */
+
+  if(entries<32){
+    /* try every pair possibility */
+    double best=0;
+    long   besti=0;
+    long   bestj=0;
+    double this;
+    for(i=0;i<entries-1;i++){
+      for(j=i+1;j<entries;j++){
+       pq_in_out(v,n,&c,_now(v,entryindex[i]),_now(v,entryindex[j]));
+       vqsp_count(v,entryindex,entries, 
+                pointindex,points,
+                entryA,entryB,
+                n, c, slack,
+                &entriesA,&entriesB,&entriesC);
+       this=(entriesA-entriesC)*(entriesB-entriesC);
+
+       if(this>best){
+         best=this;
+         besti=i;
+         bestj=j;
+       }
+      }
+    }
+    pq_in_out(v,n,&c,_now(v,entryindex[besti]),_now(v,entryindex[bestj]));
+  }else{
+    double best=0.;
+    long   bestj=0;
+
+    /* try COG/normal and furthest pairs */
+    /* medianpoint */
+    for(k=0;k<v->elements;k++){
+      /* just sort the index array */
+      sortvals=v->pointlist+k;
+      els=v->elements;
+      qsort(pointindex,points,sizeof(long),iascsort);
+      if(points&0x1){
+       p[k]=v->pointlist[(pointindex[points/2])*v->elements+k];
+      }else{
+       p[k]=(v->pointlist[(pointindex[points/2])*v->elements+k]+
+             v->pointlist[(pointindex[points/2-1])*v->elements+k])/2.;
+      }
+    }
+    
+    /* try every normal, but just for distance */
+    for(j=0;j<entries;j++){
+      double *ppj=_now(v,entryindex[j]);
+      double this=_dist_sq(v,p,ppj);
+      if(this>best){
+       best=this;
+       bestj=j;
+      }
+    }
+
+    pq_center_out(v,n,&c,p,_point(v,pointindex[bestj]));
+
+
+  }
+
+  /* find cells enclosing points */
+  /* count A/B points */
+
+  vqsp_count(v,entryindex,entries, 
+          pointindex,points,
+          entryA,entryB,
+          n, c, slack,
+          &entriesA,&entriesB,&entriesC);
+
+  /* the point index is split evenly, so we do an Order n
+     rearrangement into A first/B last and just pass it on */
+  {
+    long Aptr=0;
+    long Bptr=points-1;
+    while(Aptr<=Bptr){
+      while(Aptr<=Bptr){
+       double position=-c;
+       for(k=0;k<v->elements;k++)
+         position+=_point(v,pointindex[Aptr])[k]*n[k];
+       if(position<=0.)break; /* not in A */
+       Aptr++;
+      }
+      while(Aptr<=Bptr){
+       double position=-c;
+       for(k=0;k<v->elements;k++)
+         position+=_point(v,pointindex[Bptr])[k]*n[k];
+       if(position>0.)break; /* not in B */
+       Bptr--;
+      }
+      if(Aptr<Bptr){
+       long temp=pointindex[Aptr];
+       pointindex[Aptr]=pointindex[Bptr];
+       pointindex[Bptr]=temp;
+      }
+      pointsA=Aptr;
+    }
+  }
+
+  fprintf(stderr,"split: total=%ld depth=%ld set A=%ld:%ld:%ld=B\n",
+         entries,depth,entriesA-entriesC,entriesC,entriesB-entriesC);
+  {
+    long thisaux=b->aux++;
+    if(b->aux>=b->alloc){
+      b->alloc*=2;
+      b->ptr0=realloc(b->ptr0,sizeof(long)*b->alloc);
+      b->ptr1=realloc(b->ptr1,sizeof(long)*b->alloc);
+      b->n=realloc(b->n,sizeof(double)*b->elements*b->alloc);
+      b->c=realloc(b->c,sizeof(double)*b->alloc);
+    }
+    
+    memcpy(b->n+b->elements*thisaux,n,sizeof(double)*v->elements);
+    b->c[thisaux]=c;
+
+    if(entriesA==1){
+      ret=1;
+      b->ptr0[thisaux]=entryA[0];
+    }else{
+      b->ptr0[thisaux]= -b->aux;
+      ret=lp_split(v,b,entryA,entriesA,pointindex,pointsA,depth+1,slack); 
+    }
+    if(entriesB==1){
+      ret++;
+      b->ptr1[thisaux]=entryB[0];
+    }else{
+      b->ptr1[thisaux]= -b->aux;
+      ret+=lp_split(v,b,entryB,entriesB,pointindex+pointsA,points-pointsA,
+                  depth+1,slack); 
+    }
+  }
+  free(entryA);
+  free(entryB);
+  return(ret);
+}
+
+void vqsp_book(vqgen *v, vqbook *b){
+  long *entryindex=malloc(sizeof(double)*v->entries);
+  long *pointindex=malloc(sizeof(double)*v->points);
+  long i;
+
+  memset(b,0,sizeof(vqbook));
+  for(i=0;i<v->entries;i++)entryindex[i]=i;
+  for(i=0;i<v->points;i++)pointindex[i]=i;
+  
+  b->elements=v->elements;
+  b->entries=v->entries;
+  b->alloc=4096;
+  b->ptr0=malloc(sizeof(long)*b->alloc);
+  b->ptr1=malloc(sizeof(long)*b->alloc);
+  b->n=malloc(sizeof(double)*b->elements*b->alloc);
+  b->c=malloc(sizeof(double)*b->alloc);
+  
+  b->valuelist=v->entrylist;
+  b->codelist=malloc(sizeof(long)*b->entries);
+  b->lengthlist=calloc(b->entries,sizeof(long));
+
+  /* first, generate the encoding decision heirarchy */
+  fprintf(stderr,"Total leaves: %d\n",
+         lp_split(v,b,entryindex,v->entries, pointindex,v->points,0,0));
+  
+  /* run all training points through the decision tree to get a final
+     probability count */
+  {
+    long *probability=calloc(b->entries,sizeof(long));
+    long *membership=malloc(b->entries*sizeof(long));
+    long j;
+
+    for(i=0;i<v->points;i++){
+      int ret=vqenc_entry(b,v->pointlist+i*v->elements);
+      probability[ret]++;
+    }
+    for(i=0;i<v->entries;i++)membership[i]=i;
+
+    /* find codeword lengths */
+    /* much more elegant means exist.  Brute force n^2, minimum thought */
+    for(i=v->entries;i>1;i--){
+      int first=-1,second=-1;
+      long least=v->points+1;
+
+      /* find the two nodes to join */
+      for(j=0;j<v->entries;j++)
+       if(probability[j]<least){
+         least=probability[j];
+         first=membership[j];
+       }
+      least=v->points+1;
+      for(j=0;j<v->entries;j++)
+       if(probability[j]<least && membership[j]!=first){
+         least=probability[j];
+         second=membership[j];
+       }
+      if(first==-1 || second==-1){
+       fprintf(stderr,"huffman fault; no free branch\n");
+       exit(1);
+      }
+
+      /* join them */
+      least=probability[first]+probability[second];
+      for(j=0;j<v->entries;j++)
+       if(membership[j]==first || membership[j]==second){
+         membership[j]=first;
+         probability[j]=least;
+         b->lengthlist[j]++;
+       }
+    }
+    for(i=0;i<v->entries-1;i++)
+      if(membership[i]!=membership[i+1]){
+       fprintf(stderr,"huffman fault; failed to build single tree\n");
+       exit(1);
+      }
+
+    /* unneccessary metric */
+    memset(probability,0,sizeof(long)*v->entries);
+    for(i=0;i<v->points;i++){
+      int ret=vqenc_entry(b,v->pointlist+i*v->elements);
+      probability[ret]++;
+    }
+    
+    /* print the results */
+    for(i=0;i<v->entries;i++)
+      fprintf(stderr,"%d: count=%ld codelength=%d\n",i,probability[i],b->lengthlist[i]);
+
+    free(probability);
+    free(membership);
+  }
+
+  /* rearrange, sort by codelength */
+
+
+  /* generate the codewords (short to long) */
+  
+  
+  free(entryindex);
+  free(pointindex);
+}
+
+int vqenc_entry(vqbook *b,double *val){
+  int ptr=0,k;
+  while(1){
+    double c= -b->c[ptr];
+    double *nptr=b->n+b->elements*ptr;
+    for(k=0;k<b->elements;k++)
+      c+=nptr[k]*val[k];
+    if(c>0.) /* in A */
+      ptr= -b->ptr0[ptr];
+    else     /* in B */
+      ptr= -b->ptr1[ptr];
+    if(ptr<=0)break;
+  }
+  return(-ptr);
+}
+
+
+
+
author	Monty <xiphmont@xiph.org>
	Fri, 17 Dec 1999 13:19:36 +0000 (13:19 +0000)
committer	Monty <xiphmont@xiph.org>
	Fri, 17 Dec 1999 13:19:36 +0000 (13:19 +0000)
vq/Makefile.in		patch \| blob \| history
vq/build.c		patch \| blob \| history
vq/train.c		patch \| blob \| history
vq/vqext.h		patch \| blob \| history
vq/vqgen.c		patch \| blob \| history
vq/vqgen.h		patch \| blob \| history
vq/vqsplit.c	[new file with mode: 0644]	patch \| blob