gst/monoscope/convolve.c

   1 /* Karatsuba convolution
   2  *
   3  *  Copyright (C) 1999 Ralph Loader <suckfish@ihug.co.nz>
   4  *
   5  *  This program is free software; you can redistribute it and/or modify
   6  *  it under the terms of the GNU General Public License as published by
   7  *  the Free Software Foundation; either version 2 of the License, or
   8  *  (at your option) any later version.
   9  *
  10  *  This program is distributed in the hope that it will be useful,
  11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  *  GNU General Public License for more details.
  14  *
  15  *  You should have received a copy of the GNU General Public License
  16  *  along with this program; if not, write to the Free Software
  17  *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  18  *
  19  *
  20  *  $Id$
  21  *
  22  */
  23
/* The algorithm is based on the following.  For the convolution of a pair
 * of pairs, (a,b) * (c,d) = (0, a.c, a.d+b.c, b.d), we can reduce the four
 * multiplications to three, by the formula a.d+b.c = (a+b).(c+d) - a.c -
 * b.d.  A similar relation enables us to compute a 2n by 2n convolution
 * using 3 n by n convolutions, and thus a 2^n by 2^n convolution using 3^n
 * multiplications (as opposed to the 4^n that the quadratic algorithm
 * takes). */
  31
  32 /* For large n, this is slower than the O(n log n) that the FFT method
  33  * takes, but we avoid using complex numbers, and we only have to compute
  34  * one convolution, as opposed to 3 FFTs.  We have good locality-of-
  35  * reference as well, which will help on CPUs with tiny caches.  */
  36
/* E.g., for a 512 x 512 convolution, the FFT method takes 55 * 512 = 28160
 * (real) multiplications, as opposed to 3^9 = 19683 for the Karatsuba
 * algorithm.  We actually want 257 outputs of a 256 x 512 convolution;
 * that doesn't appear to give an easy advantage for the FFT algorithm, but
 * for the Karatsuba algorithm, it's easy to use two 256 x 256
 * convolutions, taking 2 x 3^8 = 13122 multiplications.  [The difference
 * is that the FFT method "wraps" the arrays, doing a 2^n x 2^n -> 2^n,
 * while the Karatsuba algorithm pads with zeros, doing 2^n x 2^n -> 2.2^n
 * - 1]. */
  46
  47 /* There's a big lie above, actually... for a 4x4 convolution, it's quicker
  48  * to do it using 16 multiplications than the more complex Karatsuba
  49  * algorithm...  So the recursion bottoms out at 4x4s.  This increases the
  50  * number of multiplications by a factor of 16/9, but reduces the overheads
  51  * dramatically. */
  52
  53 /* The convolution algorithm is implemented as a stack machine.  We have a
  54  * stack of commands, each in one of the forms "do a 2^n x 2^n
  55  * convolution", or "combine these three length 2^n outputs into one
  56  * 2^{n+1} output." */
  57
  58 #ifdef HAVE_CONFIG_H
  59 #include "config.h"
  60 #endif
  61
  62 #include <stdlib.h>
  63 #include "convolve.h"
  64
  65 typedef union stack_entry_s
  66 {
  67   struct
  68   {
  69     const double *left, *right;
  70     double *out;
  71   } v;
  72   struct
  73   {
  74     double *main, *null;
  75   } b;
  76
  77 } stack_entry;
  78
  79 #define STACK_SIZE (CONVOLVE_DEPTH * 3)
  80
  81 struct _struct_convolve_state
  82 {
  83   double left[CONVOLVE_BIG];
  84   double right[CONVOLVE_SMALL * 3];
  85   double scratch[CONVOLVE_SMALL * 3];
  86   stack_entry stack[STACK_SIZE];
  87 };
  88
  89 /*
  90  * Initialisation routine - sets up tables and space to work in.
  91  * Returns a pointer to internal state, to be used when performing calls.
  92  * On error, returns NULL.
  93  * The pointer should be freed when it is finished with, by convolve_close().
  94  */
  95 convolve_state *
  96 convolve_init (void)
  97 {
  98   return (convolve_state *) malloc (sizeof (convolve_state));
  99 }
 100
 101 /*
 102  * Free the state allocated with convolve_init().
 103  */
 104 void
 105 convolve_close (convolve_state * state)
 106 {
 107   if (state)
 108     free (state);
 109 }
 110
 111 static void
 112 convolve_4 (double *out, const double *left, const double *right)
 113 /* This does a 4x4 -> 7 convolution.  For what it's worth, the slightly odd
 114  * ordering gives about a 1% speed up on my Pentium II. */
 115 {
 116   double l0, l1, l2, l3, r0, r1, r2, r3;
 117   double a;
 118
 119   l0 = left[0];
 120   r0 = right[0];
 121   a = l0 * r0;
 122   l1 = left[1];
 123   r1 = right[1];
 124   out[0] = a;
 125   a = (l0 * r1) + (l1 * r0);
 126   l2 = left[2];
 127   r2 = right[2];
 128   out[1] = a;
 129   a = (l0 * r2) + (l1 * r1) + (l2 * r0);
 130   l3 = left[3];
 131   r3 = right[3];
 132   out[2] = a;
 133
 134   out[3] = (l0 * r3) + (l1 * r2) + (l2 * r1) + (l3 * r0);
 135   out[4] = (l1 * r3) + (l2 * r2) + (l3 * r1);
 136   out[5] = (l2 * r3) + (l3 * r2);
 137   out[6] = l3 * r3;
 138 }
 139
 140 static void
 141 convolve_run (stack_entry * top, unsigned size, double *scratch)
 142 /* Interpret a stack of commands.  The stack starts with two entries; the
 143  * convolution to do, and an illegal entry used to mark the stack top.  The
 144  * size is the number of entries in each input, and must be a power of 2,
 145  * and at least 8.  It is OK to have out equal to left and/or right.
 146  * scratch must have length 3*size.  The number of stack entries needed is
 147  * 3n-4 where size=2^n. */
 148 {
 149   do {
 150     const double *left;
 151     const double *right;
 152     double *out;
 153
 154     /* When we get here, the stack top is always a convolve,
 155      * with size > 4.  So we will split it.  We repeatedly split
 156      * the top entry until we get to size = 4. */
 157
 158     left = top->v.left;
 159     right = top->v.right;
 160     out = top->v.out;
 161     top++;
 162
 163     do {
 164       double *s_left, *s_right;
 165       int i;
 166
 167       /* Halve the size. */
 168       size >>= 1;
 169
 170       /* Allocate the scratch areas. */
 171       s_left = scratch + size * 3;
 172       /* s_right is a length 2*size buffer also used for
 173        * intermediate output. */
 174       s_right = scratch + size * 4;
 175
 176       /* Create the intermediate factors. */
 177       for (i = 0; i < size; i++) {
 178         double l = left[i] + left[i + size];
 179         double r = right[i] + right[i + size];
 180
 181         s_left[i + size] = r;
 182         s_left[i] = l;
 183       }
 184
 185       /* Push the combine entry onto the stack. */
 186       top -= 3;
 187       top[2].b.main = out;
 188       top[2].b.null = NULL;
 189
 190       /* Push the low entry onto the stack.  This must be
 191        * the last of the three sub-convolutions, because
 192        * it may overwrite the arguments. */
 193       top[1].v.left = left;
 194       top[1].v.right = right;
 195       top[1].v.out = out;
 196
 197       /* Push the mid entry onto the stack. */
 198       top[0].v.left = s_left;
 199       top[0].v.right = s_right;
 200       top[0].v.out = s_right;
 201
 202       /* Leave the high entry in variables. */
 203       left += size;
 204       right += size;
 205       out += size * 2;
 206
 207     } while (size > 4);
 208
 209     /* When we get here, the stack top is a group of 3
 210      * convolves, with size = 4, followed by some combines.  */
 211     convolve_4 (out, left, right);
 212     convolve_4 (top[0].v.out, top[0].v.left, top[0].v.right);
 213     convolve_4 (top[1].v.out, top[1].v.left, top[1].v.right);
 214     top += 2;
 215
 216     /* Now process combines. */
 217     do {
 218       /* b.main is the output buffer, mid is the middle
 219        * part which needs to be adjusted in place, and
 220        * then folded back into the output.  We do this in
 221        * a slightly strange way, so as to avoid having
 222        * two loops. */
 223       double *out = top->b.main;
 224       double *mid = scratch + size * 4;
 225       unsigned int i;
 226
 227       top++;
 228       out[size * 2 - 1] = 0;
 229       for (i = 0; i < size - 1; i++) {
 230         double lo;
 231         double hi;
 232
 233         lo = mid[0] - (out[0] + out[2 * size]) + out[size];
 234         hi = mid[size] - (out[size] + out[3 * size]) + out[2 * size];
 235         out[size] = lo;
 236         out[2 * size] = hi;
 237         out++;
 238         mid++;
 239       }
 240       size <<= 1;
 241     } while (top->b.null == NULL);
 242   } while (top->b.main != NULL);
 243 }
 244
 245 int
 246 convolve_match (const int *lastchoice,
 247     const short *input, convolve_state * state)
 248 /* lastchoice is a 256 sized array.  input is a 512 array.  We find the
 249  * contiguous length 256 sub-array of input that best matches lastchoice.
 250  * A measure of how good a sub-array is compared with the lastchoice is
 251  * given by the sum of the products of each pair of entries.  We maximise
 252  * that, by taking an appropriate convolution, and then finding the maximum
 253  * entry in the convolutions.  state is a (non-NULL) pointer returned by
 254  * convolve_init.  */
 255 {
 256   double avg;
 257   double best;
 258   int p = 0;
 259   int i;
 260   double *left = state->left;
 261   double *right = state->right;
 262   double *scratch = state->scratch;
 263   stack_entry *top = state->stack + STACK_SIZE - 1;
 264
 265 #if 1
 266   for (i = 0; i < 512; i++)
 267     left[i] = input[i];
 268
 269   avg = 0;
 270   for (i = 0; i < 256; i++) {
 271     double a = lastchoice[255 - i];
 272
 273     right[i] = a;
 274     avg += a;
 275   }
 276 #endif
 277   /* We adjust the smaller of the two input arrays to have average
 278    * value 0.  This makes the eventual result insensitive to both
 279    * constant offsets and positive multipliers of the inputs. */
 280   avg /= 256;
 281   for (i = 0; i < 256; i++)
 282     right[i] -= avg;
 283   /* End-of-stack marker. */
 284 #if     0                       /* The following line produces a CRASH, need to figure out why?!! */
 285   top[1].b.null = scratch;
 286 #endif
 287   top[1].b.main = NULL;
 288   /* The low 256x256, of which we want the high 256 outputs. */
 289   top->v.left = left;
 290   top->v.right = right;
 291   top->v.out = right + 256;
 292   convolve_run (top, 256, scratch);
 293
 294   /* The high 256x256, of which we want the low 256 outputs. */
 295   top->v.left = left + 256;
 296   top->v.right = right;
 297   top->v.out = right;
 298   convolve_run (top, 256, scratch);
 299
 300   /* Now find the best position amoungs this.  Apart from the first
 301    * and last, the required convolution outputs are formed by adding
 302    * outputs from the two convolutions above. */
 303   best = right[511];
 304   right[767] = 0;
 305   p = -1;
 306   for (i = 0; i < 256; i++) {
 307     double a = right[i] + right[i + 512];
 308
 309     if (a > best) {
 310       best = a;
 311       p = i;
 312     }
 313   }
 314   p++;
 315
 316 #if 0
 317   {
 318     /* This is some debugging code... */
 319     int bad = 0;
 320
 321     best = 0;
 322     for (i = 0; i < 256; i++)
 323       best += ((double) input[i + p]) * ((double) lastchoice[i] - avg);
 324
 325     for (i = 0; i < 257; i++) {
 326       double tot = 0;
 327       unsigned int j;
 328
 329       for (j = 0; j < 256; j++)
 330         tot += ((double) input[i + j]) * ((double) lastchoice[j] - avg);
 331       if (tot > best)
 332         printf ("(%i)", i);
 333       if (tot != left[i + 255])
 334         printf ("!");
 335     }
 336
 337     printf ("%i\n", p);
 338   }
 339 #endif
 340
 341   return p;
 342 }