7b7c6903a6b9253a79038f33f0185f325d49b39a
[platform/upstream/libvorbis.git] / vq / latticepare.c
1 /********************************************************************
2  *                                                                  *
3  * THIS FILE IS PART OF THE Ogg Vorbis SOFTWARE CODEC SOURCE CODE.  *
4  * USE, DISTRIBUTION AND REPRODUCTION OF THIS SOURCE IS GOVERNED BY *
5  * THE GNU PUBLIC LICENSE 2, WHICH IS INCLUDED WITH THIS SOURCE.    *
6  * PLEASE READ THESE TERMS DISTRIBUTING.                            *
7  *                                                                  *
8  * THE OggSQUISH SOURCE CODE IS (C) COPYRIGHT 1994-2000             *
9  * by Monty <monty@xiph.org> and The XIPHOPHORUS Company            *
10  * http://www.xiph.org/                                             *
11  *                                                                  *
12  ********************************************************************
13
14  function: utility for paring low hit count cells from lattice codebook
15  last mod: $Id: latticepare.c,v 1.4 2000/07/12 09:36:18 xiphmont Exp $
16
17  ********************************************************************/
18
19 #include <stdlib.h>
20 #include <stdio.h>
21 #include <math.h>
22 #include <string.h>
23 #include <errno.h>
24 #include "vorbis/codebook.h"
25 #include "../lib/sharedbook.h"
26 #include "../lib/scales.h"
27 #include "bookutil.h"
28 #include "vqgen.h"
29 #include "vqsplit.h"
30 #include "../lib/os.h"
31
32 /* Lattice codebooks have two strengths: important fetaures that are
33    poorly modelled by global error minimization training (eg, strong
34    peaks) are not neglected 2) compact quantized representation.
35
36    A fully populated lattice codebook, however, swings point 1 too far
37    in the opposite direction; rare features need not be modelled quite
38    so religiously and as such, we waste bits unless we eliminate the
39    least common cells.  The codebook rep supports unused cells, so we
40    need to tag such cells and build an auxiliary (non-thresh) search
41    mechanism to find the proper match quickly */
42
43 /* two basic steps; first is pare the cell for which dispersal creates
44    the least additional error.  This will naturally choose
45    low-population cells and cells that have not taken on points from
46    neighboring paring (but does not result in the lattice collapsing
47    inward and leaving low population ares totally unmodelled).  After
48    paring has removed the desired number of cells, we need to build an
49    auxiliary search for each culled point */
50
51 /* Although lattice books (due to threshhold-based matching) do not
52    actually use error to make cell selections (in fact, it need not
53    bear any relation), the 'secondbest' entry finder here is in fact
54    error/distance based, so latticepare is only useful on such books */
55
56 /* command line:
57    latticepare latticebook.vqh input_data.vqd <target_cells>
58
59    produces a new output book on stdout 
60 */
61
62 static double _dist(int el,double *a, double *b){
63   int i;
64   double acc=0.;
65   for(i=0;i<el;i++){
66     double val=(a[i]-b[i]);
67     acc+=val*val;
68   }
69   return(acc);
70 }
71
72 static double *pointlist;
73 static long points=0;
74
75 void add_vector(codebook *b,double *vec,long n){
76   int dim=b->dim,i,j;
77   int step=n/dim;
78   for(i=0;i<step;i++){
79     for(j=i;j<n;j+=step){
80       pointlist[points++]=vec[j];
81     }
82   }
83 }
84
85 static int bestm(codebook *b,double *vec){
86   encode_aux_threshmatch *tt=b->c->thresh_tree;
87   int dim=b->dim;
88   int i,k,o;
89   int best=0;
90   
91   /* what would be the closest match if the codebook was fully
92      populated? */
93   
94   for(k=0,o=dim-1;k<dim;k++,o--){
95     int i;
96     for(i=0;i<tt->threshvals-1;i++)
97       if(vec[o]<tt->quantthresh[i])break;
98     best=(best*tt->quantvals)+tt->quantmap[i];
99   }
100   return(best);
101 }
102
103 static int closest(codebook *b,double *vec,int current){
104   encode_aux_threshmatch *tt=b->c->thresh_tree;
105   int dim=b->dim;
106   int i,k,o;
107
108   double bestmetric=0;
109   int bestentry=-1;
110   int best=bestm(b,vec);
111
112   if(current<0 && b->c->lengthlist[best]>0)return best;
113
114   for(i=0;i<b->entries;i++){
115     if(b->c->lengthlist[i]>0 && i!=best && i!=current){
116       double thismetric=_dist(dim, vec, b->valuelist+i*dim);
117       if(bestentry==-1 || thismetric<bestmetric){
118         bestentry=i;
119         bestmetric=thismetric;
120       }
121     }
122   }
123
124   return(bestentry);
125 }
126
127 static double _heuristic(codebook *b,double *ppt,int secondbest){
128   double *secondcell=b->valuelist+secondbest*b->dim;
129   int best=bestm(b,ppt);
130   double *firstcell=b->valuelist+best*b->dim;
131   double error=_dist(b->dim,firstcell,secondcell);
132   double *zero=alloca(b->dim*sizeof(double));
133   double fromzero;
134   
135   memset(zero,0,b->dim*sizeof(double));
136   fromzero=sqrt(_dist(b->dim,firstcell,zero));
137
138   return(error/fromzero);
139 }
140
141 static int longsort(const void *a, const void *b){
142   return **(long **)b-**(long **)a;
143 }
144
145 void usage(void){
146   fprintf(stderr,"Ogg/Vorbis lattice codebook paring utility\n\n"
147           "usage: latticepare book.vqh data.vqd <target_cells> <protected_cells> base\n"
148           "where <target_cells> is the desired number of final cells (or -1\n"
149           "                     for no change)\n"
150           "      <protected_cells> is the number of highest-hit count cells\n"
151           "                     to protect from dispersal\n"
152           "      base is the base name (not including .vqh) of the new\n"
153           "                     book\n\n");
154   exit(1);
155 }
156
157 int main(int argc,char *argv[]){
158   char *basename;
159   codebook *b=NULL;
160   int entries=0;
161   int dim=0;
162   long i,j,target=-1,protect=-1;
163   FILE *out=NULL;
164
165   int argnum=0;
166
167   argv++;
168   if(*argv==NULL){
169     usage();
170     exit(1);
171   }
172
173   while(*argv){
174     if(*argv[0]=='-'){
175
176       argv++;
177         
178     }else{
179       switch (argnum++){
180       case 0:case 1:
181         {
182           /* yes, this is evil.  However, it's very convenient to parse file
183              extentions */
184           
185           /* input file.  What kind? */
186           char *dot;
187           char *ext=NULL;
188           char *name=strdup(*argv++);
189           dot=strrchr(name,'.');
190           if(dot)
191             ext=dot+1;
192           else{
193             ext="";
194             
195           }
196           
197           
198           /* codebook */
199           if(!strcmp(ext,"vqh")){
200             
201             basename=strrchr(name,'/');
202             if(basename)
203               basename=strdup(basename)+1;
204             else
205               basename=strdup(name);
206             dot=strrchr(basename,'.');
207             if(dot)*dot='\0';
208             
209             b=codebook_load(name);
210             dim=b->dim;
211             entries=b->entries;
212           }
213           
214           /* data file; we do actually need to suck it into memory */
215           /* we're dealing with just one book, so we can de-interleave */ 
216           if(!strcmp(ext,"vqd") && !points){
217             int cols;
218             long lines=0;
219             char *line;
220             double *vec;
221             FILE *in=fopen(name,"r");
222             if(!in){
223               fprintf(stderr,"Could not open input file %s\n",name);
224               exit(1);
225             }
226             
227             reset_next_value();
228             line=setup_line(in);
229             /* count cols before we start reading */
230             {
231               char *temp=line;
232               while(*temp==' ')temp++;
233               for(cols=0;*temp;cols++){
234                 while(*temp>32)temp++;
235                 while(*temp==' ')temp++;
236               }
237             }
238             vec=alloca(cols*sizeof(double));
239             /* count, then load, to avoid fragmenting the hell out of
240                memory */
241             while(line){
242               lines++;
243               for(j=0;j<cols;j++)
244                 if(get_line_value(in,vec+j)){
245                   fprintf(stderr,"Too few columns on line %ld in data file\n",lines);
246                   exit(1);
247                 }
248               if((lines&0xff)==0)spinnit("counting samples...",lines*cols);
249               line=setup_line(in);
250             }
251             pointlist=malloc((cols*lines+entries*dim)*sizeof(double));
252             
253             rewind(in);
254             line=setup_line(in);
255             while(line){
256               lines--;
257               for(j=0;j<cols;j++)
258                 if(get_line_value(in,vec+j)){
259                   fprintf(stderr,"Too few columns on line %ld in data file\n",lines);
260                   exit(1);
261                 }
262               /* deinterleave, add to heap */
263               add_vector(b,vec,cols);
264               if((lines&0xff)==0)spinnit("loading samples...",lines*cols);
265               
266               line=setup_line(in);
267             }
268             fclose(in);
269           }
270         }
271         break;
272       case 2:
273         target=atol(*argv++);
274         if(target==0)target=entries;
275         break;
276       case 3:
277         protect=atol(*argv++);
278         break;
279       case 4:
280         {
281           char *buff=alloca(strlen(*argv)+5);
282           sprintf(buff,"%s.vqh",*argv);
283           basename=*argv++;
284
285           out=fopen(buff,"w");
286           if(!out){
287             fprintf(stderr,"unable ot open %s for output",buff);
288             exit(1);
289           }
290         }
291         break;
292       default:
293         usage();
294       }
295     }
296   }
297   if(!entries || !points || !out)usage();
298   if(target==-1)usage();
299
300   /* add guard points */
301   for(i=0;i<entries;i++)
302     for(j=0;j<dim;j++)
303       pointlist[points++]=b->valuelist[i*dim+j];
304   
305   points/=dim;
306
307   /* set up auxiliary vectors for error tracking */
308   {
309     encode_aux_nearestmatch *nt=NULL;
310     long pointssofar=0;
311     long *pointindex;
312     long indexedpoints=0;
313     long *entryindex;
314     long *reventry;
315     long *membership=malloc(points*sizeof(long));
316     long *firsthead=malloc(entries*sizeof(long));
317     long *secondary=malloc(points*sizeof(long));
318     long *secondhead=malloc(entries*sizeof(long));
319
320     long *cellcount=calloc(entries,sizeof(long));
321     long *cellcount2=calloc(entries,sizeof(long));
322     double *cellerror=calloc(entries,sizeof(double));
323     double *cellerrormax=calloc(entries,sizeof(double));
324     long cellsleft=entries;
325     for(i=0;i<points;i++)membership[i]=-1;
326     for(i=0;i<entries;i++)firsthead[i]=-1;
327     for(i=0;i<points;i++)secondary[i]=-1;
328     for(i=0;i<entries;i++)secondhead[i]=-1;
329
330     for(i=0;i<points;i++){
331       /* assign vectors to the nearest cell.  Also keep track of second
332          nearest for error statistics */
333       double *ppt=pointlist+i*dim;
334       int    firstentry=closest(b,ppt,-1);
335       int    secondentry=closest(b,ppt,firstentry);
336       double firstmetric=_dist(dim,b->valuelist+dim*firstentry,ppt);
337       double secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
338       
339       if(!(i&0xff))spinnit("initializing... ",points-i);
340     
341       membership[i]=firsthead[firstentry];
342       firsthead[firstentry]=i;
343       secondary[i]=secondhead[secondentry];
344       secondhead[secondentry]=i;
345
346       if(i<points-entries){
347         cellerror[firstentry]+=secondmetric-firstmetric;
348         cellerrormax[firstentry]=max(cellerrormax[firstentry],
349                                      _heuristic(b,ppt,secondentry));
350         cellcount[firstentry]++;
351         cellcount2[secondentry]++;
352       }
353     }
354
355     /* which cells are most heavily populated?  Protect as many from
356        dispersal as the user has requested */
357     {
358       long **countindex=calloc(entries,sizeof(long *));
359       for(i=0;i<entries;i++)countindex[i]=cellcount+i;
360       qsort(countindex,entries,sizeof(long *),longsort);
361       for(i=0;i<protect;i++){
362         int ptr=countindex[i]-cellcount;
363         cellerrormax[ptr]=9e50;
364       }
365     }
366
367     {
368       fprintf(stderr,"\r");
369       for(i=0;i<entries;i++){
370         /* decompose index */
371         int entry=i;
372         for(j=0;j<dim;j++){
373           fprintf(stderr,"%d:",entry%b->c->thresh_tree->quantvals);
374           entry/=b->c->thresh_tree->quantvals;
375         }
376         
377         fprintf(stderr,":%ld/%ld, ",cellcount[i],cellcount2[i]);
378       }
379       fprintf(stderr,"\n");
380     }
381
382     /* do the automatic cull request */
383     while(cellsleft>target){
384       int bestcell=-1;
385       double besterror=0;
386       double besterror2=0;
387       long head=-1;
388       char spinbuf[80];
389       sprintf(spinbuf,"cells left to eliminate: %ld : ",cellsleft-target);
390
391       /* find the cell with lowest removal impact */
392       for(i=0;i<entries;i++){
393         if(b->c->lengthlist[i]>0){
394           if(bestcell==-1 || cellerrormax[i]<=besterror2){
395             if(bestcell==-1 || cellerrormax[i]<besterror2 || 
396                besterror>cellerror[i]){
397               besterror=cellerror[i];
398               besterror2=cellerrormax[i];
399               bestcell=i;
400             }
401           }
402         }
403       }
404
405       fprintf(stderr,"\reliminating cell %d                              \n"
406               "     dispersal error of %g max/%g total (%ld hits)\n",
407               bestcell,besterror2,besterror,cellcount[bestcell]);
408
409       /* disperse it.  move each point out, adding it (properly) to
410          the second best */
411       b->c->lengthlist[bestcell]=0;
412       head=firsthead[bestcell];
413       firsthead[bestcell]=-1;
414       while(head!=-1){
415         /* head is a point number */
416         double *ppt=pointlist+head*dim;
417         int firstentry=closest(b,ppt,-1);
418         int secondentry=closest(b,ppt,firstentry);
419         double firstmetric=_dist(dim,b->valuelist+dim*firstentry,ppt);
420         double secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
421         long next=membership[head];
422
423         if(head<points-entries){
424           cellcount[firstentry]++;
425           cellcount[bestcell]--;
426           cellerror[firstentry]+=secondmetric-firstmetric;
427           cellerrormax[firstentry]=max(cellerrormax[firstentry],
428                                        _heuristic(b,ppt,secondentry));
429         }
430
431         membership[head]=firsthead[firstentry];
432         firsthead[firstentry]=head;
433         head=next;
434         if(cellcount[bestcell]%128==0)
435           spinnit(spinbuf,cellcount[bestcell]+cellcount2[bestcell]);
436
437       }
438
439       /* now see that all points that had the dispersed cell as second
440          choice have second choice reassigned */
441       head=secondhead[bestcell];
442       secondhead[bestcell]=-1;
443       while(head!=-1){
444         double *ppt=pointlist+head*dim;
445         /* who are we assigned to now? */
446         int firstentry=closest(b,ppt,-1);
447         /* what is the new second closest match? */
448         int secondentry=closest(b,ppt,firstentry);
449         /* old second closest is the cell being disbanded */
450         double oldsecondmetric=_dist(dim,b->valuelist+dim*bestcell,ppt);
451         /* new second closest error */
452         double secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
453         long next=secondary[head];
454
455         if(head<points-entries){
456           cellcount2[secondentry]++;
457           cellcount2[bestcell]--;
458           cellerror[firstentry]+=secondmetric-oldsecondmetric;
459           cellerrormax[firstentry]=max(cellerrormax[firstentry],
460                                        _heuristic(b,ppt,secondentry));
461         }
462         
463         secondary[head]=secondhead[secondentry];
464         secondhead[secondentry]=head;
465         head=next;
466
467         if(cellcount2[bestcell]%128==0)
468           spinnit(spinbuf,cellcount2[bestcell]);
469       }
470
471       cellsleft--;
472     }
473
474     /* paring is over.  Build decision trees using points that now fall
475        through the thresh matcher. */
476     /* we don't free membership; we flatten it in order to use in lp_split */
477
478     for(i=0;i<entries;i++){
479       long head=firsthead[i];
480       spinnit("rearranging membership cache... ",entries-i);
481       while(head!=-1){
482         long next=membership[head];
483         membership[head]=i;
484         head=next;
485       }
486     }
487
488     free(secondhead);
489     free(firsthead);
490     free(cellerror);
491     free(cellerrormax);
492     free(secondary);
493
494     pointindex=malloc(points*sizeof(long));
495     /* make a point index of fall-through points */
496     for(i=0;i<points;i++){
497       int best=_best(b,pointlist+i*dim,1);
498       if(best==-1)
499         pointindex[indexedpoints++]=i;
500       spinnit("finding orphaned points... ",points-i);
501     }
502
503     /* make an entry index */
504     entryindex=malloc(entries*sizeof(long));
505     target=0;
506     for(i=0;i<entries;i++){
507       if(b->c->lengthlist[i]>0)
508         entryindex[target++]=i;
509     }
510
511     /* make working space for a reverse entry index */
512     reventry=malloc(entries*sizeof(long));
513
514     /* do the split */
515     nt=b->c->nearest_tree=
516       calloc(1,sizeof(encode_aux_nearestmatch));
517
518     nt->alloc=4096;
519     nt->ptr0=malloc(sizeof(long)*nt->alloc);
520     nt->ptr1=malloc(sizeof(long)*nt->alloc);
521     nt->p=malloc(sizeof(long)*nt->alloc);
522     nt->q=malloc(sizeof(long)*nt->alloc);
523     nt->aux=0;
524
525     fprintf(stderr,"Leaves added: %d              \n",
526             lp_split(pointlist,points,
527                      b,entryindex,target,
528                      pointindex,indexedpoints,
529                      membership,reventry,
530                      0,&pointssofar));
531     free(membership);
532     free(reventry);
533     free(pointindex);
534
535     /* hack alert.  I should just change the damned splitter and
536        codebook writer */
537     for(i=0;i<nt->aux;i++)nt->p[i]*=dim;
538     for(i=0;i<nt->aux;i++)nt->q[i]*=dim;
539     
540     /* recount hits.  Build new lengthlist. reuse entryindex storage */
541     for(i=0;i<entries;i++)entryindex[i]=1;
542     for(i=0;i<points-entries;i++){
543       int best=_best(b,pointlist+i*dim,1);
544       double *a=pointlist+i*dim;
545       if(!(i&0xff))spinnit("counting hits...",i);
546       if(best==-1){
547         fprintf(stderr,"\nINTERNAL ERROR; a point count not be matched to a\n"
548                 "codebook entry.  The new decision tree is broken.\n");
549         exit(1);
550       }
551       entryindex[best]++;
552     }
553     for(i=0;i<nt->aux;i++)nt->p[i]/=dim;
554     for(i=0;i<nt->aux;i++)nt->q[i]/=dim;
555     
556     /* the lengthlist builder doesn't actually deal with 0 hit entries.
557        So, we pack the 'sparse' hit list into a dense list, then unpack
558        the lengths after the build */
559     {
560       int upper=0;
561       long *lengthlist=calloc(entries,sizeof(long));
562       for(i=0;i<entries;i++){
563         if(b->c->lengthlist[i]>0)
564           entryindex[upper++]=entryindex[i];
565         else{
566           if(entryindex[i]>1){
567             fprintf(stderr,"\nINTERNAL ERROR; _best matched to unused entry\n");
568             exit(1);
569           }
570         }
571       }
572       
573       /* sanity check */
574       if(upper != target){
575         fprintf(stderr,"\nINTERNAL ERROR; packed the wrong number of entries\n");
576         exit(1);
577       }
578     
579       build_tree_from_lengths(upper,entryindex,lengthlist);
580       
581       upper=0;
582       for(i=0;i<entries;i++){
583         if(b->c->lengthlist[i]>0)
584           b->c->lengthlist[i]=lengthlist[upper++];
585       }
586
587     }
588   }
589   /* we're done.  write it out. */
590   write_codebook(out,basename,b->c);
591
592   fprintf(stderr,"\r                                        \nDone.\n");
593   return(0);
594 }
595
596
597
598