From: Sebastian Dröge Date: Mon, 24 May 2010 09:27:36 +0000 (+0200) Subject: fft: Merge kissfft 1.2.8 X-Git-Tag: RELEASE-0.10.30~169 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=fdfb70e2623fcc077f07f464cfb0319e6e0cdebc;p=platform%2Fupstream%2Fgst-plugins-base.git fft: Merge kissfft 1.2.8 This reduces memory footprint for the FFT and adds OpenMP support (but we don't use it). --- diff --git a/gst-libs/gst/fft/kiss_fft_f32.c b/gst-libs/gst/fft/kiss_fft_f32.c index f53d1d7..693a899 100644 --- a/gst-libs/gst/fft/kiss_fft_f32.c +++ b/gst-libs/gst/fft/kiss_fft_f32.c @@ -265,6 +265,40 @@ kf_work (kiss_fft_f32_cpx * Fout, const int m = *factors++; /* stage's fft length/p */ const kiss_fft_f32_cpx *Fout_end = Fout + p * m; +#ifdef _OPENMP + // use openmp extensions at the + // top-level (not recursive) + if (fstride == 1) { + int k; + + // execute the p different work units in different threads +# pragma omp parallel for + for (k = 0; k < p; ++k) + kf_work (Fout + k * m, f + fstride * in_stride * k, fstride * p, + in_stride, factors, st); + // all threads have joined by this point + + switch (p) { + case 2: + kf_bfly2 (Fout, fstride, st, m); + break; + case 3: + kf_bfly3 (Fout, fstride, st, m); + break; + case 4: + kf_bfly4 (Fout, fstride, st, m); + break; + case 5: + kf_bfly5 (Fout, fstride, st, m); + break; + default: + kf_bfly_generic (Fout, fstride, st, m, p); + break; + } + return; + } +#endif + if (m == 1) { do { *Fout = *f; @@ -272,6 +306,10 @@ kf_work (kiss_fft_f32_cpx * Fout, } while (++Fout != Fout_end); } else { do { + // recursive call: + // DFT of size m*p performed by doing + // p instances of smaller DFTs of size m, + // each one takes a decimated version of the input kf_work (Fout, f, fstride * p, in_stride, factors, st); f += fstride * in_stride; } while ((Fout += m) != Fout_end); @@ -279,6 +317,7 @@ kf_work (kiss_fft_f32_cpx * Fout, Fout = Fout_beg; + // recombine the p smaller DFTs switch (p) { case 2: kf_bfly2 (Fout, fstride, st, m); diff --git a/gst-libs/gst/fft/kiss_fft_f32.h b/gst-libs/gst/fft/kiss_fft_f32.h index 06a893d..098b89d 100644 --- a/gst-libs/gst/fft/kiss_fft_f32.h +++ b/gst-libs/gst/fft/kiss_fft_f32.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #ifdef __cplusplus @@ -92,6 +92,10 @@ void kiss_fft_f32_cleanup(void); */ int kiss_fft_f32_next_fast_size(int n); +/* for real ffts, we need an even size */ +#define kiss_fftr_next_fast_size_real(n) \ + (kiss_fft_next_fast_size( ((n)+1)>>1)<<1) + #ifdef __cplusplus } #endif diff --git a/gst-libs/gst/fft/kiss_fft_f64.c b/gst-libs/gst/fft/kiss_fft_f64.c index 4f0cbc5..b311711 100644 --- a/gst-libs/gst/fft/kiss_fft_f64.c +++ b/gst-libs/gst/fft/kiss_fft_f64.c @@ -265,6 +265,40 @@ kf_work (kiss_fft_f64_cpx * Fout, const int m = *factors++; /* stage's fft length/p */ const kiss_fft_f64_cpx *Fout_end = Fout + p * m; +#ifdef _OPENMP + // use openmp extensions at the + // top-level (not recursive) + if (fstride == 1) { + int k; + + // execute the p different work units in different threads +# pragma omp parallel for + for (k = 0; k < p; ++k) + kf_work (Fout + k * m, f + fstride * in_stride * k, fstride * p, + in_stride, factors, st); + // all threads have joined by this point + + switch (p) { + case 2: + kf_bfly2 (Fout, fstride, st, m); + break; + case 3: + kf_bfly3 (Fout, fstride, st, m); + break; + case 4: + kf_bfly4 (Fout, fstride, st, m); + break; + case 5: + kf_bfly5 (Fout, fstride, st, m); + break; + default: + kf_bfly_generic (Fout, fstride, st, m, p); + break; + } + return; + } +#endif + if (m == 1) { do { *Fout = *f; @@ -272,6 +306,10 @@ kf_work (kiss_fft_f64_cpx * Fout, } while (++Fout != Fout_end); } else { do { + // recursive call: + // DFT of size m*p performed by doing + // p instances of smaller DFTs of size m, + // each one takes a decimated version of the input kf_work (Fout, f, fstride * p, in_stride, factors, st); f += fstride * in_stride; } while ((Fout += m) != Fout_end); @@ -279,6 +317,7 @@ kf_work (kiss_fft_f64_cpx * Fout, Fout = Fout_beg; + // recombine the p smaller DFTs switch (p) { case 2: kf_bfly2 (Fout, fstride, st, m); diff --git a/gst-libs/gst/fft/kiss_fft_f64.h b/gst-libs/gst/fft/kiss_fft_f64.h index 572b33a..90792bb 100644 --- a/gst-libs/gst/fft/kiss_fft_f64.h +++ b/gst-libs/gst/fft/kiss_fft_f64.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #ifdef __cplusplus @@ -92,6 +92,10 @@ void kiss_fft_f64_cleanup(void); */ int kiss_fft_f64_next_fast_size(int n); +/* for real ffts, we need an even size */ +#define kiss_fftr_next_fast_size_real(n) \ + (kiss_fft_next_fast_size( ((n)+1)>>1)<<1) + #ifdef __cplusplus } #endif diff --git a/gst-libs/gst/fft/kiss_fft_s16.c b/gst-libs/gst/fft/kiss_fft_s16.c index ca20004..ee2de4e 100644 --- a/gst-libs/gst/fft/kiss_fft_s16.c +++ b/gst-libs/gst/fft/kiss_fft_s16.c @@ -265,6 +265,40 @@ kf_work (kiss_fft_s16_cpx * Fout, const int m = *factors++; /* stage's fft length/p */ const kiss_fft_s16_cpx *Fout_end = Fout + p * m; +#ifdef _OPENMP + // use openmp extensions at the + // top-level (not recursive) + if (fstride == 1) { + int k; + + // execute the p different work units in different threads +# pragma omp parallel for + for (k = 0; k < p; ++k) + kf_work (Fout + k * m, f + fstride * in_stride * k, fstride * p, + in_stride, factors, st); + // all threads have joined by this point + + switch (p) { + case 2: + kf_bfly2 (Fout, fstride, st, m); + break; + case 3: + kf_bfly3 (Fout, fstride, st, m); + break; + case 4: + kf_bfly4 (Fout, fstride, st, m); + break; + case 5: + kf_bfly5 (Fout, fstride, st, m); + break; + default: + kf_bfly_generic (Fout, fstride, st, m, p); + break; + } + return; + } +#endif + if (m == 1) { do { *Fout = *f; @@ -272,6 +306,10 @@ kf_work (kiss_fft_s16_cpx * Fout, } while (++Fout != Fout_end); } else { do { + // recursive call: + // DFT of size m*p performed by doing + // p instances of smaller DFTs of size m, + // each one takes a decimated version of the input kf_work (Fout, f, fstride * p, in_stride, factors, st); f += fstride * in_stride; } while ((Fout += m) != Fout_end); @@ -279,6 +317,7 @@ kf_work (kiss_fft_s16_cpx * Fout, Fout = Fout_beg; + // recombine the p smaller DFTs switch (p) { case 2: kf_bfly2 (Fout, fstride, st, m); diff --git a/gst-libs/gst/fft/kiss_fft_s16.h b/gst-libs/gst/fft/kiss_fft_s16.h index 9e22905..4303127 100644 --- a/gst-libs/gst/fft/kiss_fft_s16.h +++ b/gst-libs/gst/fft/kiss_fft_s16.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #ifdef __cplusplus @@ -95,6 +95,10 @@ void kiss_fft_s16_cleanup(void); */ int kiss_fft_s16_next_fast_size(int n); +/* for real ffts, we need an even size */ +#define kiss_fftr_next_fast_size_real(n) \ + (kiss_fft_next_fast_size( ((n)+1)>>1)<<1) + #ifdef __cplusplus } #endif diff --git a/gst-libs/gst/fft/kiss_fft_s32.c b/gst-libs/gst/fft/kiss_fft_s32.c index 760013c..5acb2ed 100644 --- a/gst-libs/gst/fft/kiss_fft_s32.c +++ b/gst-libs/gst/fft/kiss_fft_s32.c @@ -265,6 +265,40 @@ kf_work (kiss_fft_s32_cpx * Fout, const int m = *factors++; /* stage's fft length/p */ const kiss_fft_s32_cpx *Fout_end = Fout + p * m; +#ifdef _OPENMP + // use openmp extensions at the + // top-level (not recursive) + if (fstride == 1) { + int k; + + // execute the p different work units in different threads +# pragma omp parallel for + for (k = 0; k < p; ++k) + kf_work (Fout + k * m, f + fstride * in_stride * k, fstride * p, + in_stride, factors, st); + // all threads have joined by this point + + switch (p) { + case 2: + kf_bfly2 (Fout, fstride, st, m); + break; + case 3: + kf_bfly3 (Fout, fstride, st, m); + break; + case 4: + kf_bfly4 (Fout, fstride, st, m); + break; + case 5: + kf_bfly5 (Fout, fstride, st, m); + break; + default: + kf_bfly_generic (Fout, fstride, st, m, p); + break; + } + return; + } +#endif + if (m == 1) { do { *Fout = *f; @@ -272,6 +306,10 @@ kf_work (kiss_fft_s32_cpx * Fout, } while (++Fout != Fout_end); } else { do { + // recursive call: + // DFT of size m*p performed by doing + // p instances of smaller DFTs of size m, + // each one takes a decimated version of the input kf_work (Fout, f, fstride * p, in_stride, factors, st); f += fstride * in_stride; } while ((Fout += m) != Fout_end); @@ -279,6 +317,7 @@ kf_work (kiss_fft_s32_cpx * Fout, Fout = Fout_beg; + // recombine the p smaller DFTs switch (p) { case 2: kf_bfly2 (Fout, fstride, st, m); diff --git a/gst-libs/gst/fft/kiss_fft_s32.h b/gst-libs/gst/fft/kiss_fft_s32.h index b1c1d70..8edd664 100644 --- a/gst-libs/gst/fft/kiss_fft_s32.h +++ b/gst-libs/gst/fft/kiss_fft_s32.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #ifdef __cplusplus @@ -96,6 +96,10 @@ void kiss_fft_s32_cleanup(void); */ int kiss_fft_s32_next_fast_size(int n); +/* for real ffts, we need an even size */ +#define kiss_fftr_next_fast_size_real(n) \ + (kiss_fft_next_fast_size( ((n)+1)>>1)<<1) + #ifdef __cplusplus } #endif diff --git a/gst-libs/gst/fft/kiss_fftr_f32.c b/gst-libs/gst/fft/kiss_fftr_f32.c index ea45206..ab11c04 100644 --- a/gst-libs/gst/fft/kiss_fftr_f32.c +++ b/gst-libs/gst/fft/kiss_fftr_f32.c @@ -40,7 +40,7 @@ kiss_fftr_f32_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem) kiss_fft_f32_alloc (nfft, inverse_fft, NULL, &subsize); memneeded = ALIGN_STRUCT (sizeof (struct kiss_fftr_f32_state)) - + ALIGN_STRUCT (subsize) + sizeof (kiss_fft_f32_cpx) * (nfft * 2); + + ALIGN_STRUCT (subsize) + sizeof (kiss_fft_f32_cpx) * (nfft * 3 / 2); if (lenmem == NULL) { st = (kiss_fftr_f32_cfg) KISS_FFT_F32_MALLOC (memneeded); @@ -58,8 +58,9 @@ kiss_fftr_f32_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem) st->super_twiddles = st->tmpbuf + nfft; kiss_fft_f32_alloc (nfft, inverse_fft, st->substate, &subsize); - for (i = 0; i < nfft; ++i) { - double phase = -3.14159265358979323846264338327 * ((double) i / nfft + .5); + for (i = 0; i < nfft / 2; ++i) { + double phase = + -3.14159265358979323846264338327 * ((double) (i + 1) / nfft + .5); if (inverse_fft) phase *= -1; @@ -117,7 +118,7 @@ kiss_fftr_f32 (kiss_fftr_f32_cfg st, const kiss_fft_f32_scalar * timedata, C_ADD (f1k, fpk, fpnk); C_SUB (f2k, fpk, fpnk); - C_MUL (tw, f2k, st->super_twiddles[k]); + C_MUL (tw, f2k, st->super_twiddles[k - 1]); freqdata[k].r = HALF_OF (f1k.r + tw.r); freqdata[k].i = HALF_OF (f1k.i + tw.i); @@ -155,7 +156,7 @@ kiss_fftri_f32 (kiss_fftr_f32_cfg st, const kiss_fft_f32_cpx * freqdata, C_ADD (fek, fk, fnkc); C_SUB (tmp, fk, fnkc); - C_MUL (fok, tmp, st->super_twiddles[k]); + C_MUL (fok, tmp, st->super_twiddles[k - 1]); C_ADD (st->tmpbuf[k], fek, fok); C_SUB (st->tmpbuf[ncfft - k], fek, fok); #ifdef USE_SIMD diff --git a/gst-libs/gst/fft/kiss_fftr_f64.c b/gst-libs/gst/fft/kiss_fftr_f64.c index 5958714..740be27 100644 --- a/gst-libs/gst/fft/kiss_fftr_f64.c +++ b/gst-libs/gst/fft/kiss_fftr_f64.c @@ -41,7 +41,7 @@ kiss_fftr_f64_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem) kiss_fft_f64_alloc (nfft, inverse_fft, NULL, &subsize); memneeded = ALIGN_STRUCT (sizeof (struct kiss_fftr_f64_state)) + ALIGN_STRUCT (subsize) - + sizeof (kiss_fft_f64_cpx) * (nfft * 2); + + sizeof (kiss_fft_f64_cpx) * (nfft * 3 / 2); if (lenmem == NULL) { st = (kiss_fftr_f64_cfg) KISS_FFT_F64_MALLOC (memneeded); @@ -59,8 +59,9 @@ kiss_fftr_f64_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem) st->super_twiddles = st->tmpbuf + nfft; kiss_fft_f64_alloc (nfft, inverse_fft, st->substate, &subsize); - for (i = 0; i < nfft; ++i) { - double phase = -3.14159265358979323846264338327 * ((double) i / nfft + .5); + for (i = 0; i < nfft / 2; ++i) { + double phase = + -3.14159265358979323846264338327 * ((double) (i + 1) / nfft + .5); if (inverse_fft) phase *= -1; @@ -118,7 +119,7 @@ kiss_fftr_f64 (kiss_fftr_f64_cfg st, const kiss_fft_f64_scalar * timedata, C_ADD (f1k, fpk, fpnk); C_SUB (f2k, fpk, fpnk); - C_MUL (tw, f2k, st->super_twiddles[k]); + C_MUL (tw, f2k, st->super_twiddles[k - 1]); freqdata[k].r = HALF_OF (f1k.r + tw.r); freqdata[k].i = HALF_OF (f1k.i + tw.i); @@ -156,7 +157,7 @@ kiss_fftri_f64 (kiss_fftr_f64_cfg st, const kiss_fft_f64_cpx * freqdata, C_ADD (fek, fk, fnkc); C_SUB (tmp, fk, fnkc); - C_MUL (fok, tmp, st->super_twiddles[k]); + C_MUL (fok, tmp, st->super_twiddles[k - 1]); C_ADD (st->tmpbuf[k], fek, fok); C_SUB (st->tmpbuf[ncfft - k], fek, fok); #ifdef USE_SIMD diff --git a/gst-libs/gst/fft/kiss_fftr_s16.c b/gst-libs/gst/fft/kiss_fftr_s16.c index 803fa63..f1ec990 100644 --- a/gst-libs/gst/fft/kiss_fftr_s16.c +++ b/gst-libs/gst/fft/kiss_fftr_s16.c @@ -41,7 +41,7 @@ kiss_fftr_s16_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem) kiss_fft_s16_alloc (nfft, inverse_fft, NULL, &subsize); memneeded = ALIGN_STRUCT (sizeof (struct kiss_fftr_s16_state)) + ALIGN_STRUCT (subsize) - + sizeof (kiss_fft_s16_cpx) * (nfft * 2); + + sizeof (kiss_fft_s16_cpx) * (nfft * 3 / 2); if (lenmem == NULL) { st = (kiss_fftr_s16_cfg) KISS_FFT_S16_MALLOC (memneeded); @@ -59,8 +59,9 @@ kiss_fftr_s16_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem) st->super_twiddles = st->tmpbuf + nfft; kiss_fft_s16_alloc (nfft, inverse_fft, st->substate, &subsize); - for (i = 0; i < nfft; ++i) { - double phase = -3.14159265358979323846264338327 * ((double) i / nfft + .5); + for (i = 0; i < nfft / 2; ++i) { + double phase = + -3.14159265358979323846264338327 * ((double) (i + 1) / nfft + .5); if (inverse_fft) phase *= -1; @@ -118,7 +119,7 @@ kiss_fftr_s16 (kiss_fftr_s16_cfg st, const kiss_fft_s16_scalar * timedata, C_ADD (f1k, fpk, fpnk); C_SUB (f2k, fpk, fpnk); - C_MUL (tw, f2k, st->super_twiddles[k]); + C_MUL (tw, f2k, st->super_twiddles[k - 1]); freqdata[k].r = HALF_OF (f1k.r + tw.r); freqdata[k].i = HALF_OF (f1k.i + tw.i); @@ -156,7 +157,7 @@ kiss_fftri_s16 (kiss_fftr_s16_cfg st, const kiss_fft_s16_cpx * freqdata, C_ADD (fek, fk, fnkc); C_SUB (tmp, fk, fnkc); - C_MUL (fok, tmp, st->super_twiddles[k]); + C_MUL (fok, tmp, st->super_twiddles[k - 1]); C_ADD (st->tmpbuf[k], fek, fok); C_SUB (st->tmpbuf[ncfft - k], fek, fok); #ifdef USE_SIMD diff --git a/gst-libs/gst/fft/kiss_fftr_s32.c b/gst-libs/gst/fft/kiss_fftr_s32.c index e2a8541..260da29 100644 --- a/gst-libs/gst/fft/kiss_fftr_s32.c +++ b/gst-libs/gst/fft/kiss_fftr_s32.c @@ -41,7 +41,7 @@ kiss_fftr_s32_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem) kiss_fft_s32_alloc (nfft, inverse_fft, NULL, &subsize); memneeded = ALIGN_STRUCT (sizeof (struct kiss_fftr_s32_state)) + ALIGN_STRUCT (subsize) - + sizeof (kiss_fft_s32_cpx) * (nfft * 2); + + sizeof (kiss_fft_s32_cpx) * (nfft * 3 / 2); if (lenmem == NULL) { st = (kiss_fftr_s32_cfg) KISS_FFT_S32_MALLOC (memneeded); @@ -59,8 +59,9 @@ kiss_fftr_s32_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem) st->super_twiddles = st->tmpbuf + nfft; kiss_fft_s32_alloc (nfft, inverse_fft, st->substate, &subsize); - for (i = 0; i < nfft; ++i) { - double phase = -3.14159265358979323846264338327 * ((double) i / nfft + .5); + for (i = 0; i < nfft / 2; ++i) { + double phase = + -3.14159265358979323846264338327 * ((double) (i + 1) / nfft + .5); if (inverse_fft) phase *= -1; @@ -118,7 +119,7 @@ kiss_fftr_s32 (kiss_fftr_s32_cfg st, const kiss_fft_s32_scalar * timedata, C_ADD (f1k, fpk, fpnk); C_SUB (f2k, fpk, fpnk); - C_MUL (tw, f2k, st->super_twiddles[k]); + C_MUL (tw, f2k, st->super_twiddles[k - 1]); freqdata[k].r = HALF_OF (f1k.r + tw.r); freqdata[k].i = HALF_OF (f1k.i + tw.i); @@ -156,7 +157,7 @@ kiss_fftri_s32 (kiss_fftr_s32_cfg st, const kiss_fft_s32_cpx * freqdata, C_ADD (fek, fk, fnkc); C_SUB (tmp, fk, fnkc); - C_MUL (fok, tmp, st->super_twiddles[k]); + C_MUL (fok, tmp, st->super_twiddles[k - 1]); C_ADD (st->tmpbuf[k], fek, fok); C_SUB (st->tmpbuf[ncfft - k], fek, fok); #ifdef USE_SIMD