1 /********************************************************************
3 * THIS FILE IS PART OF THE Ogg Vorbis SOFTWARE CODEC SOURCE CODE. *
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS SOURCE IS GOVERNED BY *
5 * THE GNU PUBLIC LICENSE 2, WHICH IS INCLUDED WITH THIS SOURCE. *
6 * PLEASE READ THESE TERMS DISTRIBUTING. *
8 * THE OggSQUISH SOURCE CODE IS (C) COPYRIGHT 1994-2000 *
9 * by Monty <monty@xiph.org> and The XIPHOPHORUS Company *
10 * http://www.xiph.org/ *
12 ********************************************************************
14 function: utility for paring low hit count cells from lattice codebook
15 last mod: $Id: latticepare.c,v 1.5 2000/10/12 03:13:01 xiphmont Exp $
17 ********************************************************************/
24 #include "vorbis/codebook.h"
25 #include "../lib/sharedbook.h"
26 #include "../lib/scales.h"
30 #include "../lib/os.h"
32 /* Lattice codebooks have two strengths: important fetaures that are
33 poorly modelled by global error minimization training (eg, strong
34 peaks) are not neglected 2) compact quantized representation.
36 A fully populated lattice codebook, however, swings point 1 too far
37 in the opposite direction; rare features need not be modelled quite
38 so religiously and as such, we waste bits unless we eliminate the
39 least common cells. The codebook rep supports unused cells, so we
40 need to tag such cells and build an auxiliary (non-thresh) search
41 mechanism to find the proper match quickly */
43 /* two basic steps; first is pare the cell for which dispersal creates
44 the least additional error. This will naturally choose
45 low-population cells and cells that have not taken on points from
46 neighboring paring (but does not result in the lattice collapsing
47 inward and leaving low population ares totally unmodelled). After
48 paring has removed the desired number of cells, we need to build an
49 auxiliary search for each culled point */
51 /* Although lattice books (due to threshhold-based matching) do not
52 actually use error to make cell selections (in fact, it need not
53 bear any relation), the 'secondbest' entry finder here is in fact
54 error/distance based, so latticepare is only useful on such books */
57 latticepare latticebook.vqh input_data.vqd <target_cells>
59 produces a new output book on stdout
62 static float _dist(int el,float *a, float *b){
66 float val=(a[i]-b[i]);
72 static float *pointlist;
75 void add_vector(codebook *b,float *vec,long n){
80 pointlist[points++]=vec[j];
85 static int bestm(codebook *b,float *vec){
86 encode_aux_threshmatch *tt=b->c->thresh_tree;
91 /* what would be the closest match if the codebook was fully
94 for(k=0,o=dim-1;k<dim;k++,o--){
96 for(i=0;i<tt->threshvals-1;i++)
97 if(vec[o]<tt->quantthresh[i])break;
98 best=(best*tt->quantvals)+tt->quantmap[i];
103 static int closest(codebook *b,float *vec,int current){
104 encode_aux_threshmatch *tt=b->c->thresh_tree;
110 int best=bestm(b,vec);
112 if(current<0 && b->c->lengthlist[best]>0)return best;
114 for(i=0;i<b->entries;i++){
115 if(b->c->lengthlist[i]>0 && i!=best && i!=current){
116 float thismetric=_dist(dim, vec, b->valuelist+i*dim);
117 if(bestentry==-1 || thismetric<bestmetric){
119 bestmetric=thismetric;
127 static float _heuristic(codebook *b,float *ppt,int secondbest){
128 float *secondcell=b->valuelist+secondbest*b->dim;
129 int best=bestm(b,ppt);
130 float *firstcell=b->valuelist+best*b->dim;
131 float error=_dist(b->dim,firstcell,secondcell);
132 float *zero=alloca(b->dim*sizeof(float));
135 memset(zero,0,b->dim*sizeof(float));
136 fromzero=sqrt(_dist(b->dim,firstcell,zero));
138 return(error/fromzero);
141 static int longsort(const void *a, const void *b){
142 return **(long **)b-**(long **)a;
146 fprintf(stderr,"Ogg/Vorbis lattice codebook paring utility\n\n"
147 "usage: latticepare book.vqh data.vqd <target_cells> <protected_cells> base\n"
148 "where <target_cells> is the desired number of final cells (or -1\n"
150 " <protected_cells> is the number of highest-hit count cells\n"
151 " to protect from dispersal\n"
152 " base is the base name (not including .vqh) of the new\n"
157 int main(int argc,char *argv[]){
162 long i,j,target=-1,protect=-1;
182 /* yes, this is evil. However, it's very convenient to parse file
185 /* input file. What kind? */
188 char *name=strdup(*argv++);
189 dot=strrchr(name,'.');
199 if(!strcmp(ext,"vqh")){
201 basename=strrchr(name,'/');
203 basename=strdup(basename)+1;
205 basename=strdup(name);
206 dot=strrchr(basename,'.');
209 b=codebook_load(name);
214 /* data file; we do actually need to suck it into memory */
215 /* we're dealing with just one book, so we can de-interleave */
216 if(!strcmp(ext,"vqd") && !points){
221 FILE *in=fopen(name,"r");
223 fprintf(stderr,"Could not open input file %s\n",name);
229 /* count cols before we start reading */
232 while(*temp==' ')temp++;
233 for(cols=0;*temp;cols++){
234 while(*temp>32)temp++;
235 while(*temp==' ')temp++;
238 vec=alloca(cols*sizeof(float));
239 /* count, then load, to avoid fragmenting the hell out of
244 if(get_line_value(in,vec+j)){
245 fprintf(stderr,"Too few columns on line %ld in data file\n",lines);
248 if((lines&0xff)==0)spinnit("counting samples...",lines*cols);
251 pointlist=malloc((cols*lines+entries*dim)*sizeof(float));
258 if(get_line_value(in,vec+j)){
259 fprintf(stderr,"Too few columns on line %ld in data file\n",lines);
262 /* deinterleave, add to heap */
263 add_vector(b,vec,cols);
264 if((lines&0xff)==0)spinnit("loading samples...",lines*cols);
273 target=atol(*argv++);
274 if(target==0)target=entries;
277 protect=atol(*argv++);
281 char *buff=alloca(strlen(*argv)+5);
282 sprintf(buff,"%s.vqh",*argv);
287 fprintf(stderr,"unable ot open %s for output",buff);
297 if(!entries || !points || !out)usage();
298 if(target==-1)usage();
300 /* add guard points */
301 for(i=0;i<entries;i++)
303 pointlist[points++]=b->valuelist[i*dim+j];
307 /* set up auxiliary vectors for error tracking */
309 encode_aux_nearestmatch *nt=NULL;
312 long indexedpoints=0;
315 long *membership=malloc(points*sizeof(long));
316 long *firsthead=malloc(entries*sizeof(long));
317 long *secondary=malloc(points*sizeof(long));
318 long *secondhead=malloc(entries*sizeof(long));
320 long *cellcount=calloc(entries,sizeof(long));
321 long *cellcount2=calloc(entries,sizeof(long));
322 float *cellerror=calloc(entries,sizeof(float));
323 float *cellerrormax=calloc(entries,sizeof(float));
324 long cellsleft=entries;
325 for(i=0;i<points;i++)membership[i]=-1;
326 for(i=0;i<entries;i++)firsthead[i]=-1;
327 for(i=0;i<points;i++)secondary[i]=-1;
328 for(i=0;i<entries;i++)secondhead[i]=-1;
330 for(i=0;i<points;i++){
331 /* assign vectors to the nearest cell. Also keep track of second
332 nearest for error statistics */
333 float *ppt=pointlist+i*dim;
334 int firstentry=closest(b,ppt,-1);
335 int secondentry=closest(b,ppt,firstentry);
336 float firstmetric=_dist(dim,b->valuelist+dim*firstentry,ppt);
337 float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
339 if(!(i&0xff))spinnit("initializing... ",points-i);
341 membership[i]=firsthead[firstentry];
342 firsthead[firstentry]=i;
343 secondary[i]=secondhead[secondentry];
344 secondhead[secondentry]=i;
346 if(i<points-entries){
347 cellerror[firstentry]+=secondmetric-firstmetric;
348 cellerrormax[firstentry]=max(cellerrormax[firstentry],
349 _heuristic(b,ppt,secondentry));
350 cellcount[firstentry]++;
351 cellcount2[secondentry]++;
355 /* which cells are most heavily populated? Protect as many from
356 dispersal as the user has requested */
358 long **countindex=calloc(entries,sizeof(long *));
359 for(i=0;i<entries;i++)countindex[i]=cellcount+i;
360 qsort(countindex,entries,sizeof(long *),longsort);
361 for(i=0;i<protect;i++){
362 int ptr=countindex[i]-cellcount;
363 cellerrormax[ptr]=9e50;
368 fprintf(stderr,"\r");
369 for(i=0;i<entries;i++){
370 /* decompose index */
373 fprintf(stderr,"%d:",entry%b->c->thresh_tree->quantvals);
374 entry/=b->c->thresh_tree->quantvals;
377 fprintf(stderr,":%ld/%ld, ",cellcount[i],cellcount2[i]);
379 fprintf(stderr,"\n");
382 /* do the automatic cull request */
383 while(cellsleft>target){
389 sprintf(spinbuf,"cells left to eliminate: %ld : ",cellsleft-target);
391 /* find the cell with lowest removal impact */
392 for(i=0;i<entries;i++){
393 if(b->c->lengthlist[i]>0){
394 if(bestcell==-1 || cellerrormax[i]<=besterror2){
395 if(bestcell==-1 || cellerrormax[i]<besterror2 ||
396 besterror>cellerror[i]){
397 besterror=cellerror[i];
398 besterror2=cellerrormax[i];
405 fprintf(stderr,"\reliminating cell %d \n"
406 " dispersal error of %g max/%g total (%ld hits)\n",
407 bestcell,besterror2,besterror,cellcount[bestcell]);
409 /* disperse it. move each point out, adding it (properly) to
411 b->c->lengthlist[bestcell]=0;
412 head=firsthead[bestcell];
413 firsthead[bestcell]=-1;
415 /* head is a point number */
416 float *ppt=pointlist+head*dim;
417 int firstentry=closest(b,ppt,-1);
418 int secondentry=closest(b,ppt,firstentry);
419 float firstmetric=_dist(dim,b->valuelist+dim*firstentry,ppt);
420 float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
421 long next=membership[head];
423 if(head<points-entries){
424 cellcount[firstentry]++;
425 cellcount[bestcell]--;
426 cellerror[firstentry]+=secondmetric-firstmetric;
427 cellerrormax[firstentry]=max(cellerrormax[firstentry],
428 _heuristic(b,ppt,secondentry));
431 membership[head]=firsthead[firstentry];
432 firsthead[firstentry]=head;
434 if(cellcount[bestcell]%128==0)
435 spinnit(spinbuf,cellcount[bestcell]+cellcount2[bestcell]);
439 /* now see that all points that had the dispersed cell as second
440 choice have second choice reassigned */
441 head=secondhead[bestcell];
442 secondhead[bestcell]=-1;
444 float *ppt=pointlist+head*dim;
445 /* who are we assigned to now? */
446 int firstentry=closest(b,ppt,-1);
447 /* what is the new second closest match? */
448 int secondentry=closest(b,ppt,firstentry);
449 /* old second closest is the cell being disbanded */
450 float oldsecondmetric=_dist(dim,b->valuelist+dim*bestcell,ppt);
451 /* new second closest error */
452 float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
453 long next=secondary[head];
455 if(head<points-entries){
456 cellcount2[secondentry]++;
457 cellcount2[bestcell]--;
458 cellerror[firstentry]+=secondmetric-oldsecondmetric;
459 cellerrormax[firstentry]=max(cellerrormax[firstentry],
460 _heuristic(b,ppt,secondentry));
463 secondary[head]=secondhead[secondentry];
464 secondhead[secondentry]=head;
467 if(cellcount2[bestcell]%128==0)
468 spinnit(spinbuf,cellcount2[bestcell]);
474 /* paring is over. Build decision trees using points that now fall
475 through the thresh matcher. */
476 /* we don't free membership; we flatten it in order to use in lp_split */
478 for(i=0;i<entries;i++){
479 long head=firsthead[i];
480 spinnit("rearranging membership cache... ",entries-i);
482 long next=membership[head];
494 pointindex=malloc(points*sizeof(long));
495 /* make a point index of fall-through points */
496 for(i=0;i<points;i++){
497 int best=_best(b,pointlist+i*dim,1);
499 pointindex[indexedpoints++]=i;
500 spinnit("finding orphaned points... ",points-i);
503 /* make an entry index */
504 entryindex=malloc(entries*sizeof(long));
506 for(i=0;i<entries;i++){
507 if(b->c->lengthlist[i]>0)
508 entryindex[target++]=i;
511 /* make working space for a reverse entry index */
512 reventry=malloc(entries*sizeof(long));
515 nt=b->c->nearest_tree=
516 calloc(1,sizeof(encode_aux_nearestmatch));
519 nt->ptr0=malloc(sizeof(long)*nt->alloc);
520 nt->ptr1=malloc(sizeof(long)*nt->alloc);
521 nt->p=malloc(sizeof(long)*nt->alloc);
522 nt->q=malloc(sizeof(long)*nt->alloc);
525 fprintf(stderr,"Leaves added: %d \n",
526 lp_split(pointlist,points,
528 pointindex,indexedpoints,
535 /* hack alert. I should just change the damned splitter and
537 for(i=0;i<nt->aux;i++)nt->p[i]*=dim;
538 for(i=0;i<nt->aux;i++)nt->q[i]*=dim;
540 /* recount hits. Build new lengthlist. reuse entryindex storage */
541 for(i=0;i<entries;i++)entryindex[i]=1;
542 for(i=0;i<points-entries;i++){
543 int best=_best(b,pointlist+i*dim,1);
544 float *a=pointlist+i*dim;
545 if(!(i&0xff))spinnit("counting hits...",i);
547 fprintf(stderr,"\nINTERNAL ERROR; a point count not be matched to a\n"
548 "codebook entry. The new decision tree is broken.\n");
553 for(i=0;i<nt->aux;i++)nt->p[i]/=dim;
554 for(i=0;i<nt->aux;i++)nt->q[i]/=dim;
556 /* the lengthlist builder doesn't actually deal with 0 hit entries.
557 So, we pack the 'sparse' hit list into a dense list, then unpack
558 the lengths after the build */
561 long *lengthlist=calloc(entries,sizeof(long));
562 for(i=0;i<entries;i++){
563 if(b->c->lengthlist[i]>0)
564 entryindex[upper++]=entryindex[i];
567 fprintf(stderr,"\nINTERNAL ERROR; _best matched to unused entry\n");
575 fprintf(stderr,"\nINTERNAL ERROR; packed the wrong number of entries\n");
579 build_tree_from_lengths(upper,entryindex,lengthlist);
582 for(i=0;i<entries;i++){
583 if(b->c->lengthlist[i]>0)
584 b->c->lengthlist[i]=lengthlist[upper++];
589 /* we're done. write it out. */
590 write_codebook(out,basename,b->c);
592 fprintf(stderr,"\r \nDone.\n");