1 /* Karatsuba convolution
3 * Copyright (C) 1999 Ralph Loader <suckfish@ihug.co.nz>
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
15 * You should have received a copy of the GNU Library General Public
16 * License along with this library; if not, write to the
17 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
21 * Note: 7th December 2004: This file used to be licensed under the GPL,
22 * but we got permission from Ralp Loader to relicense it to LGPL.
28 /* The algorithm is based on the following. For the convolution of a pair
29 * of pairs, (a,b) * (c,d) = (0, a.c, a.d+b.c, b.d), we can reduce the four
30 * multiplications to three, by the formulae a.d+b.c = (a+b).(c+d) - a.c -
31 * b.d. A similar relation enables us to compute a 2n by 2n convolution
32 * using 3 n by n convolutions, and thus a 2^n by 2^n convolution using 3^n
33 * multiplications (as opposed to the 4^n that the quadratic algorithm
36 /* For large n, this is slower than the O(n log n) that the FFT method
37 * takes, but we avoid using complex numbers, and we only have to compute
38 * one convolution, as opposed to 3 FFTs. We have good locality-of-
39 * reference as well, which will help on CPUs with tiny caches. */
41 /* E.g., for a 512 x 512 convolution, the FFT method takes 55 * 512 = 28160
42 * (real) multiplications, as opposed to 3^9 = 19683 for the Karatsuba
43 * algorithm. We actually want 257 outputs of a 256 x 512 convolution;
44 * that doesn't appear to give an easy advantage for the FFT algorithm, but
45 * for the Karatsuba algorithm, it's easy to use two 256 x 256
46 * convolutions, taking 2 x 3^8 = 12312 multiplications. [This difference
47 * is that the FFT method "wraps" the arrays, doing a 2^n x 2^n -> 2^n,
48 * while the Karatsuba algorithm pads with zeros, doing 2^n x 2^n -> 2.2^n
51 /* There's a big lie above, actually... for a 4x4 convolution, it's quicker
52 * to do it using 16 multiplications than the more complex Karatsuba
53 * algorithm... So the recursion bottoms out at 4x4s. This increases the
54 * number of multiplications by a factor of 16/9, but reduces the overheads
57 /* The convolution algorithm is implemented as a stack machine. We have a
58 * stack of commands, each in one of the forms "do a 2^n x 2^n
59 * convolution", or "combine these three length 2^n outputs into one
69 typedef union stack_entry_s
73 const double *left, *right;
86 #define STACK_SIZE (CONVOLVE_DEPTH * 3)
88 struct _struct_convolve_state
90 double left[CONVOLVE_BIG];
91 double right[CONVOLVE_SMALL * 3];
92 double scratch[CONVOLVE_SMALL * 3];
93 stack_entry stack[STACK_SIZE + 1];
97 * Initialisation routine - sets up tables and space to work in.
98 * Returns a pointer to internal state, to be used when performing calls.
99 * On error, returns NULL.
100 * The pointer should be freed when it is finished with, by convolve_close().
105 return (convolve_state *) calloc (1, sizeof (convolve_state));
109 * Free the state allocated with convolve_init().
112 convolve_close (convolve_state * state)
118 convolve_4 (double *out, const double *left, const double *right)
119 /* This does a 4x4 -> 7 convolution. For what it's worth, the slightly odd
120 * ordering gives about a 1% speed up on my Pentium II. */
122 double l0, l1, l2, l3, r0, r1, r2, r3;
131 a = (l0 * r1) + (l1 * r0);
135 a = (l0 * r2) + (l1 * r1) + (l2 * r0);
140 out[3] = (l0 * r3) + (l1 * r2) + (l2 * r1) + (l3 * r0);
141 out[4] = (l1 * r3) + (l2 * r2) + (l3 * r1);
142 out[5] = (l2 * r3) + (l3 * r2);
147 convolve_run (stack_entry * top, unsigned size, double *scratch)
148 /* Interpret a stack of commands. The stack starts with two entries; the
149 * convolution to do, and an illegal entry used to mark the stack top. The
150 * size is the number of entries in each input, and must be a power of 2,
151 * and at least 8. It is OK to have out equal to left and/or right.
152 * scratch must have length 3*size. The number of stack entries needed is
153 * 3n-4 where size=2^n. */
160 /* When we get here, the stack top is always a convolve,
161 * with size > 4. So we will split it. We repeatedly split
162 * the top entry until we get to size = 4. */
165 right = top->v.right;
170 double *s_left, *s_right;
173 /* Halve the size. */
176 /* Allocate the scratch areas. */
177 s_left = scratch + size * 3;
178 /* s_right is a length 2*size buffer also used for
179 * intermediate output. */
180 s_right = scratch + size * 4;
182 /* Create the intermediate factors. */
183 for (i = 0; i < size; i++) {
184 double l = left[i] + left[i + size];
185 double r = right[i] + right[i + size];
187 s_left[i + size] = r;
191 /* Push the combine entry onto the stack. */
194 top[2].b.null = NULL;
196 /* Push the low entry onto the stack. This must be
197 * the last of the three sub-convolutions, because
198 * it may overwrite the arguments. */
199 top[1].v.left = left;
200 top[1].v.right = right;
203 /* Push the mid entry onto the stack. */
204 top[0].v.left = s_left;
205 top[0].v.right = s_right;
206 top[0].v.out = s_right;
208 /* Leave the high entry in variables. */
215 /* When we get here, the stack top is a group of 3
216 * convolves, with size = 4, followed by some combines. */
217 convolve_4 (out, left, right);
218 convolve_4 (top[0].v.out, top[0].v.left, top[0].v.right);
219 convolve_4 (top[1].v.out, top[1].v.left, top[1].v.right);
222 /* Now process combines. */
224 /* b.main is the output buffer, mid is the middle
225 * part which needs to be adjusted in place, and
226 * then folded back into the output. We do this in
227 * a slightly strange way, so as to avoid having
229 double *out = top->b.main;
230 double *mid = scratch + size * 4;
234 out[size * 2 - 1] = 0;
235 for (i = 0; i < size - 1; i++) {
239 lo = mid[0] - (out[0] + out[2 * size]) + out[size];
240 hi = mid[size] - (out[size] + out[3 * size]) + out[2 * size];
247 } while (top->b.null == NULL);
248 } while (top->b.main != NULL);
253 * @lastchoice: an array of size SMALL.
254 * @input: an array of siue BIG (2*SMALL)
255 * @state: a (non-NULL) pointer returned by convolve_init.
257 * We find the contiguous SMALL-size sub-array of input that best matches
258 * lastchoice. A measure of how good a sub-array is compared with the lastchoice
259 * is given by the sum of the products of each pair of entries. We maximise
260 * that, by taking an appropriate convolution, and then finding the maximum
261 * entry in the convolutions.
263 * Return: the position of the best match
266 convolve_match (const int *lastchoice, const short *input,
267 convolve_state * state)
273 double *left = state->left;
274 double *right = state->right;
275 double *scratch = state->scratch;
276 stack_entry *top = state->stack + (STACK_SIZE - 1);
278 for (i = 0; i < CONVOLVE_BIG; i++)
281 for (i = 0; i < CONVOLVE_SMALL; i++) {
282 double a = lastchoice[(CONVOLVE_SMALL - 1) - i];
288 /* We adjust the smaller of the two input arrays to have average
289 * value 0. This makes the eventual result insensitive to both
290 * constant offsets and positive multipliers of the inputs. */
291 avg /= CONVOLVE_SMALL;
292 for (i = 0; i < CONVOLVE_SMALL; i++)
294 /* End-of-stack marker. */
295 top[1].b.null = scratch;
296 top[1].b.main = NULL;
297 /* The low SMALLxSMALL, of which we want the high outputs. */
299 top->v.right = right;
300 top->v.out = right + CONVOLVE_SMALL;
301 convolve_run (top, CONVOLVE_SMALL, scratch);
303 /* The high SMALLxSMALL, of which we want the low outputs. */
304 top->v.left = left + CONVOLVE_SMALL;
305 top->v.right = right;
307 convolve_run (top, CONVOLVE_SMALL, scratch);
309 /* Now find the best position amoungs this. Apart from the first
310 * and last, the required convolution outputs are formed by adding
311 * outputs from the two convolutions above. */
312 best = right[CONVOLVE_BIG - 1];
313 right[CONVOLVE_BIG + CONVOLVE_SMALL + 1] = 0;
315 for (i = 0; i < CONVOLVE_SMALL; i++) {
316 double a = right[i] + right[i + CONVOLVE_BIG];
327 /* This is some debugging code... */
329 for (i = 0; i < CONVOLVE_SMALL; i++)
330 best += ((double) input[i + p]) * ((double) lastchoice[i] - avg);
332 for (i = 0; i <= CONVOLVE_SMALL; i++) {
336 for (j = 0; j < CONVOLVE_SMALL; j++)
337 tot += ((double) input[i + j]) * ((double) lastchoice[j] - avg);
340 if (tot != left[i + (CONVOLVE_SMALL - 1)])