2 Copyright (c) 2003-2004, Mark Borgerding
6 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
8 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
9 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
10 * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
12 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
18 #include "_kiss_fft_guts_s16.h"
19 /* The guts header contains all the multiplication and addition macros that are defined for
20 fixed or floating point complex numbers. It also delares the kf_ internal functions.
23 static kiss_fft_s16_cpx *scratchbuf = NULL;
24 static size_t nscratchbuf = 0;
25 static kiss_fft_s16_cpx *tmpbuf = NULL;
26 static size_t ntmpbuf = 0;
28 #define CHECKBUF(buf,nbuf,n) \
30 if ( nbuf < (size_t)(n) ) {\
32 buf = (kiss_fft_s16_cpx*)KISS_FFT_S16_MALLOC(sizeof(kiss_fft_s16_cpx)*(n)); \
39 kf_bfly2 (kiss_fft_s16_cpx * Fout,
40 const size_t fstride, const kiss_fft_s16_cfg st, int m)
42 kiss_fft_s16_cpx *Fout2;
43 kiss_fft_s16_cpx *tw1 = st->twiddles;
51 C_MUL (t, *Fout2, *tw1);
53 C_SUB (*Fout2, *Fout, t);
61 kf_bfly4 (kiss_fft_s16_cpx * Fout,
62 const size_t fstride, const kiss_fft_s16_cfg st, const size_t m)
64 kiss_fft_s16_cpx *tw1, *tw2, *tw3;
65 kiss_fft_s16_cpx scratch[6];
67 const size_t m2 = 2 * m;
68 const size_t m3 = 3 * m;
70 tw3 = tw2 = tw1 = st->twiddles;
74 C_FIXDIV (Fout[m], 4);
75 C_FIXDIV (Fout[m2], 4);
76 C_FIXDIV (Fout[m3], 4);
78 C_MUL (scratch[0], Fout[m], *tw1);
79 C_MUL (scratch[1], Fout[m2], *tw2);
80 C_MUL (scratch[2], Fout[m3], *tw3);
82 C_SUB (scratch[5], *Fout, scratch[1]);
83 C_ADDTO (*Fout, scratch[1]);
84 C_ADD (scratch[3], scratch[0], scratch[2]);
85 C_SUB (scratch[4], scratch[0], scratch[2]);
86 C_SUB (Fout[m2], *Fout, scratch[3]);
90 C_ADDTO (*Fout, scratch[3]);
93 Fout[m].r = scratch[5].r - scratch[4].i;
94 Fout[m].i = scratch[5].i + scratch[4].r;
95 Fout[m3].r = scratch[5].r + scratch[4].i;
96 Fout[m3].i = scratch[5].i - scratch[4].r;
98 Fout[m].r = scratch[5].r + scratch[4].i;
99 Fout[m].i = scratch[5].i - scratch[4].r;
100 Fout[m3].r = scratch[5].r - scratch[4].i;
101 Fout[m3].i = scratch[5].i + scratch[4].r;
108 kf_bfly3 (kiss_fft_s16_cpx * Fout,
109 const size_t fstride, const kiss_fft_s16_cfg st, size_t m)
112 const size_t m2 = 2 * m;
113 kiss_fft_s16_cpx *tw1, *tw2;
114 kiss_fft_s16_cpx scratch[5];
115 kiss_fft_s16_cpx epi3;
117 epi3 = st->twiddles[fstride * m];
119 tw1 = tw2 = st->twiddles;
123 C_FIXDIV (Fout[m], 3);
124 C_FIXDIV (Fout[m2], 3);
126 C_MUL (scratch[1], Fout[m], *tw1);
127 C_MUL (scratch[2], Fout[m2], *tw2);
129 C_ADD (scratch[3], scratch[1], scratch[2]);
130 C_SUB (scratch[0], scratch[1], scratch[2]);
134 Fout[m].r = Fout->r - HALF_OF (scratch[3].r);
135 Fout[m].i = Fout->i - HALF_OF (scratch[3].i);
137 C_MULBYSCALAR (scratch[0], epi3.i);
139 C_ADDTO (*Fout, scratch[3]);
141 Fout[m2].r = Fout[m].r + scratch[0].i;
142 Fout[m2].i = Fout[m].i - scratch[0].r;
144 Fout[m].r -= scratch[0].i;
145 Fout[m].i += scratch[0].r;
152 kf_bfly5 (kiss_fft_s16_cpx * Fout,
153 const size_t fstride, const kiss_fft_s16_cfg st, int m)
155 kiss_fft_s16_cpx *Fout0, *Fout1, *Fout2, *Fout3, *Fout4;
157 kiss_fft_s16_cpx scratch[13];
158 kiss_fft_s16_cpx *twiddles = st->twiddles;
159 kiss_fft_s16_cpx *tw;
160 kiss_fft_s16_cpx ya, yb;
162 ya = twiddles[fstride * m];
163 yb = twiddles[fstride * 2 * m];
167 Fout2 = Fout0 + 2 * m;
168 Fout3 = Fout0 + 3 * m;
169 Fout4 = Fout0 + 4 * m;
172 for (u = 0; u < m; ++u) {
173 C_FIXDIV (*Fout0, 5);
174 C_FIXDIV (*Fout1, 5);
175 C_FIXDIV (*Fout2, 5);
176 C_FIXDIV (*Fout3, 5);
177 C_FIXDIV (*Fout4, 5);
180 C_MUL (scratch[1], *Fout1, tw[u * fstride]);
181 C_MUL (scratch[2], *Fout2, tw[2 * u * fstride]);
182 C_MUL (scratch[3], *Fout3, tw[3 * u * fstride]);
183 C_MUL (scratch[4], *Fout4, tw[4 * u * fstride]);
185 C_ADD (scratch[7], scratch[1], scratch[4]);
186 C_SUB (scratch[10], scratch[1], scratch[4]);
187 C_ADD (scratch[8], scratch[2], scratch[3]);
188 C_SUB (scratch[9], scratch[2], scratch[3]);
190 Fout0->r += scratch[7].r + scratch[8].r;
191 Fout0->i += scratch[7].i + scratch[8].i;
194 scratch[0].r + S_MUL (scratch[7].r, ya.r) + S_MUL (scratch[8].r, yb.r);
196 scratch[0].i + S_MUL (scratch[7].i, ya.r) + S_MUL (scratch[8].i, yb.r);
198 scratch[6].r = S_MUL (scratch[10].i, ya.i) + S_MUL (scratch[9].i, yb.i);
199 scratch[6].i = -S_MUL (scratch[10].r, ya.i) - S_MUL (scratch[9].r, yb.i);
201 C_SUB (*Fout1, scratch[5], scratch[6]);
202 C_ADD (*Fout4, scratch[5], scratch[6]);
205 scratch[0].r + S_MUL (scratch[7].r, yb.r) + S_MUL (scratch[8].r, ya.r);
207 scratch[0].i + S_MUL (scratch[7].i, yb.r) + S_MUL (scratch[8].i, ya.r);
208 scratch[12].r = -S_MUL (scratch[10].i, yb.i) + S_MUL (scratch[9].i, ya.i);
209 scratch[12].i = S_MUL (scratch[10].r, yb.i) - S_MUL (scratch[9].r, ya.i);
211 C_ADD (*Fout2, scratch[11], scratch[12]);
212 C_SUB (*Fout3, scratch[11], scratch[12]);
222 /* perform the butterfly for one stage of a mixed radix FFT */
224 kf_bfly_generic (kiss_fft_s16_cpx * Fout,
225 const size_t fstride, const kiss_fft_s16_cfg st, int m, int p)
228 kiss_fft_s16_cpx *twiddles = st->twiddles;
230 int Norig = st->nfft;
232 CHECKBUF (scratchbuf, nscratchbuf, p);
234 for (u = 0; u < m; ++u) {
236 for (q1 = 0; q1 < p; ++q1) {
237 scratchbuf[q1] = Fout[k];
238 C_FIXDIV (scratchbuf[q1], p);
243 for (q1 = 0; q1 < p; ++q1) {
246 Fout[k] = scratchbuf[0];
247 for (q = 1; q < p; ++q) {
248 twidx += fstride * k;
251 C_MUL (t, scratchbuf[q], twiddles[twidx]);
252 C_ADDTO (Fout[k], t);
260 kf_work (kiss_fft_s16_cpx * Fout,
261 const kiss_fft_s16_cpx * f,
262 const size_t fstride,
263 int in_stride, int *factors, const kiss_fft_s16_cfg st)
265 kiss_fft_s16_cpx *Fout_beg = Fout;
266 const int p = *factors++; /* the radix */
267 const int m = *factors++; /* stage's fft length/p */
268 const kiss_fft_s16_cpx *Fout_end = Fout + p * m;
271 // use openmp extensions at the
272 // top-level (not recursive)
276 // execute the p different work units in different threads
277 # pragma omp parallel for
278 for (k = 0; k < p; ++k)
279 kf_work (Fout + k * m, f + fstride * in_stride * k, fstride * p,
280 in_stride, factors, st);
281 // all threads have joined by this point
285 kf_bfly2 (Fout, fstride, st, m);
288 kf_bfly3 (Fout, fstride, st, m);
291 kf_bfly4 (Fout, fstride, st, m);
294 kf_bfly5 (Fout, fstride, st, m);
297 kf_bfly_generic (Fout, fstride, st, m, p);
307 f += fstride * in_stride;
308 } while (++Fout != Fout_end);
312 // DFT of size m*p performed by doing
313 // p instances of smaller DFTs of size m,
314 // each one takes a decimated version of the input
315 kf_work (Fout, f, fstride * p, in_stride, factors, st);
316 f += fstride * in_stride;
317 } while ((Fout += m) != Fout_end);
322 // recombine the p smaller DFTs
325 kf_bfly2 (Fout, fstride, st, m);
328 kf_bfly3 (Fout, fstride, st, m);
331 kf_bfly4 (Fout, fstride, st, m);
334 kf_bfly5 (Fout, fstride, st, m);
337 kf_bfly_generic (Fout, fstride, st, m, p);
342 /* facbuf is populated by p1,m1,p2,m2, ...
347 kf_factor (int n, int *facbuf)
352 floor_sqrt = floor (sqrt ((double) n));
354 /*factor out powers of 4, powers of 2, then any remaining primes */
369 p = n; /* no more factors, skip to end */
379 * User-callable function to allocate all necessary storage space for the fft.
381 * The return value is a contiguous block of memory, allocated with malloc. As such,
382 * It can be freed with free(), rather than a kiss_fft-specific function.
385 kiss_fft_s16_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem)
387 kiss_fft_s16_cfg st = NULL;
388 size_t memneeded = sizeof (struct kiss_fft_s16_state)
389 + sizeof (kiss_fft_s16_cpx) * (nfft - 1); /* twiddle factors */
391 if (lenmem == NULL) {
392 st = (kiss_fft_s16_cfg) KISS_FFT_S16_MALLOC (memneeded);
394 if (mem != NULL && *lenmem >= memneeded)
395 st = (kiss_fft_s16_cfg) mem;
402 st->inverse = inverse_fft;
404 for (i = 0; i < nfft; ++i) {
406 3.141592653589793238462643383279502884197169399375105820974944;
407 double phase = -2 * pi * i / nfft;
411 kf_cexp (st->twiddles + i, phase);
414 kf_factor (nfft, st->factors);
423 kiss_fft_s16_stride (kiss_fft_s16_cfg st, const kiss_fft_s16_cpx * fin,
424 kiss_fft_s16_cpx * fout, int in_stride)
427 CHECKBUF (tmpbuf, ntmpbuf, st->nfft);
428 kf_work (tmpbuf, fin, 1, in_stride, st->factors, st);
429 memcpy (fout, tmpbuf, sizeof (kiss_fft_s16_cpx) * st->nfft);
431 kf_work (fout, fin, 1, in_stride, st->factors, st);
436 kiss_fft_s16 (kiss_fft_s16_cfg cfg, const kiss_fft_s16_cpx * fin,
437 kiss_fft_s16_cpx * fout)
439 kiss_fft_s16_stride (cfg, fin, fout, 1);
443 /* not really necessary to call, but if someone is doing in-place ffts, they may want to free the
444 buffers from CHECKBUF
447 kiss_fft_s16_cleanup (void)
458 kiss_fft_s16_next_fast_size (int n)
470 break; /* n is completely factorable by twos, threes, and fives */