Enable Fixed-Point Non-power-of-2 FFT.
author    Phil.Wang <phil.wang@arm.com>
Fri, 23 Jan 2015 07:19:44 +0000 (15:19 +0800)
committer Phil Wang <phil.wang@arm.com>
Tue, 27 Jan 2015 10:30:38 +0000 (10:30 +0000)
For Cortex-A53 (AArch64)
LLVM 3.5, -O2
Time: in ms
SNR:  in dB

|         |Forward  |Backward |
|Impl|Size| Time|SNR| Time|SNR|
|   C|   8|   26| 92|   26| 93|
|   C|  16|   51| 90|   51| 90|
|   C|  32|  130| 89|  132| 91|
|   C|  60|  452| 81|  469| 83|
|   C|  64|  304| 88|  305| 88|
|   C| 120| 1070| 82| 1149| 82|
|   C| 128|  727| 88|  735| 89|
|   C| 240| 2197| 81| 2312| 82|
|   C| 256| 1659| 88| 1659| 88|
|   C| 480| 5127| 82| 5520| 82|
|   C| 512| 3819| 88| 3855| 88|
|   C| 900|11621| 80|12190| 81|
|   C| 960|10640| 82|11246| 82|
|NEON|   8|   10| 93|   10| 95|
|NEON|  16|   18| 97|   18| 97|
|NEON|  32|   54| 88|   55| 89|
|NEON|  60|  163| 88|  169| 88|
|NEON|  64|  133| 85|  133| 86|
|NEON| 120|  346| 88|  358| 90|
|NEON| 128|  263| 87|  264| 87|
|NEON| 240|  668| 89|  704| 88|
|NEON| 256|  635| 85|  635| 85|
|NEON| 480| 1526| 89| 1595| 89|
|NEON| 512| 1300| 86| 1299| 87|
|NEON| 900| 3207| 88| 3372| 89|
|NEON| 960| 3107| 89| 3394| 89|
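
The new path is reached through the existing int32 c2c API. A minimal usage
sketch for one of the non-power-of-2 sizes above (480) might look as follows;
the NE10.h umbrella header and the ne10_fft_destroy_c2c_int32() cleanup call
are assumptions by analogy with the float32 path, while the alloc/execute
signatures are taken from the diff below.

#include "NE10.h"

static void example_c2c_int32_480 (void)
{
    ne10_fft_cpx_int32_t in[480], out[480];   /* Q31 complex samples */

    /* ... fill 'in' ... */

    ne10_fft_cfg_int32_t cfg = ne10_fft_alloc_c2c_int32_neon (480);
    if (cfg == NULL)
        return;   /* allocation or factorization failed */

    /* Forward, scaled transform (scaled_flag = 1 keeps Q31 values in range). */
    ne10_fft_c2c_1d_int32_neon (out, in, cfg, 0 /* inverse_fft */, 1 /* scaled_flag */);

    ne10_fft_destroy_c2c_int32 (cfg);   /* assumed counterpart of the alloc */
}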

Change-Id: I256d1e4eff40ff20e19fe941f3222a3f7d2944f6

inc/NE10_dsp.h
inc/NE10_types.h
modules/CMakeLists.txt
modules/dsp/NE10_fft.c
modules/dsp/NE10_fft.h
modules/dsp/NE10_fft_common_varibles.h
modules/dsp/NE10_fft_generic_int32.cpp
modules/dsp/NE10_fft_generic_int32.h
modules/dsp/NE10_fft_generic_int32.neonintrinsic.cpp [new file with mode: 0644]
modules/dsp/NE10_fft_generic_int32.neonintrinsic.h [new file with mode: 0644]
modules/dsp/NE10_fft_int32.neonintrinsic.c

diff --git a/inc/NE10_dsp.h b/inc/NE10_dsp.h
index 8c26085..8b53038 100644
@@ -166,6 +166,8 @@ extern "C" {
             ne10_fft_cpx_float32_t *fin,
             ne10_fft_r2c_cfg_float32_t cfg);
 
+    extern ne10_fft_cfg_int32_t ne10_fft_alloc_c2c_int32_neon (ne10_int32_t nfft);
+
     extern void ne10_fft_c2c_1d_int32_neon (ne10_fft_cpx_int32_t *fout,
                                             ne10_fft_cpx_int32_t *fin,
                                             ne10_fft_cfg_int32_t cfg,
diff --git a/inc/NE10_types.h b/inc/NE10_types.h
index 20b30fb..577444d 100644
@@ -298,6 +298,7 @@ typedef struct
     ne10_int32_t *factors;
     ne10_fft_cpx_int32_t *twiddles;
     ne10_fft_cpx_int32_t *buffer;
+    ne10_fft_cpx_int32_t *last_twiddles;
 } ne10_fft_state_int32_t;
 
 typedef ne10_fft_state_int32_t* ne10_fft_cfg_int32_t;
diff --git a/modules/CMakeLists.txt b/modules/CMakeLists.txt
index f8855b4..03b571c 100644
@@ -178,6 +178,7 @@ if(NE10_ENABLE_DSP)
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_float32.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_generic_float32.c
+        ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_generic_int32.cpp
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_rfft_float32.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_int32.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_int16.c
@@ -203,6 +204,7 @@ if(NE10_ENABLE_DSP)
            ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_int32.neon.c
            ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_int16.neon.c
            ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_generic_float32.neonintrinsic.cpp
+           ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_generic_int32.neonintrinsic.cpp
            )
 
       foreach(intrinsic_file ${NE10_DSP_INTRINSIC_SRCS})
@@ -225,6 +227,7 @@ if(NE10_ENABLE_DSP)
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_int32.neonintrinsic.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_int16.neonintrinsic.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_generic_float32.neonintrinsic.cpp
+        ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_generic_int32.neonintrinsic.cpp
         )
     endif()
 
diff --git a/modules/dsp/NE10_fft.c b/modules/dsp/NE10_fft.c
index 324d5b6..3fb9c89 100644
@@ -359,5 +359,92 @@ ne10_fft_cfg_float32_t ne10_fft_alloc_c2c_float32_neon (ne10_int32_t nfft)
 }
 
 /**
+ * @brief User-callable function to allocate all necessary storage space for the FFT.
+ * @param[in]   nfft             length of FFT
+ * @return      st               pointer to the FFT config memory. This memory is allocated with malloc.
+ * The function allocates all necessary storage space for the FFT. It also factorizes the FFT length and generates the twiddle coefficients.
+ */
+ne10_fft_cfg_int32_t ne10_fft_alloc_c2c_int32_neon (ne10_int32_t nfft)
+{
+    ne10_fft_cfg_int32_t st = NULL;
+    ne10_uint32_t memneeded = sizeof (ne10_fft_state_int32_t)
+                              + sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2) /* factors*/
+                              + sizeof (ne10_fft_cpx_int32_t) * nfft        /* twiddle*/
+                              + sizeof (ne10_fft_cpx_int32_t) * nfft        /* buffer*/
+                              + NE10_FFT_BYTE_ALIGNMENT;     /* 64-bit alignment*/
+
+    st = (ne10_fft_cfg_int32_t) NE10_MALLOC (memneeded);
+
+    // Bad allocation.
+    if (st == NULL)
+    {
+        return st;
+    }
+
+    uintptr_t address = (uintptr_t) st + sizeof (ne10_fft_state_int32_t);
+    NE10_BYTE_ALIGNMENT (address, NE10_FFT_BYTE_ALIGNMENT);
+    st->factors = (ne10_int32_t*) address;
+    st->twiddles = (ne10_fft_cpx_int32_t*) (st->factors + (NE10_MAXFACTORS * 2));
+    st->buffer = st->twiddles + nfft;
+
+    // st->last_twiddles defaults to NULL.
+    // Whether fft_c or fft_neon is called is decided by this pointer.
+    st->last_twiddles = NULL;
+
+    st->nfft = nfft;
+    if (nfft % NE10_FFT_PARA_LEVEL == 0)
+    {
+        // Size of FFT satisfies requirement of NEON optimization.
+        st->nfft /= NE10_FFT_PARA_LEVEL;
+        st->last_twiddles = st->twiddles + nfft / NE10_FFT_PARA_LEVEL;
+    }
+
+    ne10_int32_t result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_DEFAULT);
+
+    // Can not factor.
+    if (result == NE10_ERR)
+    {
+        NE10_FREE (st);
+        return st;
+    }
+
+    // Check if radix-8 can be enabled
+    ne10_int32_t stage_count    = st->factors[0];
+    ne10_int32_t algorithm_flag = st->factors[2 * (stage_count + 1)];
+
+    // Enable radix-8.
+    if (algorithm_flag == NE10_FFT_ALG_ANY)
+    {
+        result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_EIGHT);
+        if (result == NE10_ERR)
+        {
+            NE10_FREE (st);
+            return st;
+        }
+        ne10_fft_generate_twiddles_int32 (st->twiddles, st->factors, st->nfft);
+    }
+    else
+    {
+        st->last_twiddles = NULL;
+        st->nfft = nfft;
+        result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_DEFAULT);
+        ne10_fft_generate_twiddles_int32 (st->twiddles, st->factors, st->nfft);
+        return st;
+    }
+
+    // Generate super twiddles for the last stage.
+    if (nfft % NE10_FFT_PARA_LEVEL == 0)
+    {
+        // Size of FFT satisfies requirement of NEON optimization.
+        ne10_fft_generate_twiddles_line_int32 (st->last_twiddles,
+                st->nfft,
+                1,
+                NE10_FFT_PARA_LEVEL,
+                nfft);
+    }
+    return st;
+}
+
+/**
  * @}
  */ //end of C2C_FFT_IFFT group
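
A worked example of the allocator above, assuming NE10_FFT_PARA_LEVEL == 4
(consistent with the "nfft * 4" actual-length comment in the NEON dispatcher
added later in this patch); NE10.h and ne10_fft_destroy_c2c_int32() are the
same assumptions as in the earlier sketch.

#include <assert.h>
#include "NE10.h"

static void show_layout_480 (void)
{
    ne10_fft_cfg_int32_t cfg = ne10_fft_alloc_c2c_int32_neon (480);
    assert (cfg != NULL);

    /* 480 % 4 == 0, so the NEON-friendly layout is chosen:
     *   cfg->nfft          == 120, the length of each of the four sub-FFTs
     *   cfg->last_twiddles == cfg->twiddles + 120, the per-bin "super"
     *                         twiddles consumed by the final radix-4 stage
     * For a size not divisible by 4, last_twiddles stays NULL and the
     * C generic kernel is selected instead of the NEON one. */
    assert (cfg->nfft == 120);
    assert (cfg->last_twiddles == cfg->twiddles + 120);

    ne10_fft_destroy_c2c_int32 (cfg);   /* assumed counterpart of the alloc */
}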
diff --git a/modules/dsp/NE10_fft.h b/modules/dsp/NE10_fft.h
index 5912f80..3c91c20 100644
@@ -135,6 +135,20 @@ extern "C" {
             ne10_fft_cpx_int32_t * buffer,
             const ne10_int32_t scaled_flag);
 
+    extern void ne10_mixed_radix_generic_butterfly_int32_neon (ne10_fft_cpx_int32_t * Fout,
+            const ne10_fft_cpx_int32_t * Fin,
+            const ne10_int32_t * factors,
+            const ne10_fft_cpx_int32_t * twiddles,
+            ne10_fft_cpx_int32_t * buffer,
+            const ne10_int32_t scaled_flag);
+
+    extern void ne10_mixed_radix_generic_butterfly_inverse_int32_neon (ne10_fft_cpx_int32_t * Fout,
+            const ne10_fft_cpx_int32_t * Fin,
+            const ne10_int32_t * factors,
+            const ne10_fft_cpx_int32_t * twiddles,
+            ne10_fft_cpx_int32_t * buffer,
+            const ne10_int32_t scaled_flag);
+
     extern void ne10_mixed_radix_fft_forward_int32_unscaled_neon (ne10_fft_cpx_int32_t * Fout,
             ne10_fft_cpx_int32_t *fin,
             ne10_int32_t * factors,
diff --git a/modules/dsp/NE10_fft_common_varibles.h b/modules/dsp/NE10_fft_common_varibles.h
index 2e1799a..763f0f2 100644
@@ -69,4 +69,6 @@ const static ne10_fft_cpx_int32_t TW_5B_S32 =
 const static ne10_float32_t TW_3I_F32  =   0.866025403784439; // sqrt (3) / 2
 const static ne10_float32_t TW_3IN_F32 = - 0.866025403784439; // - TW_3IN_F32
 const static ne10_int32_t TW_3I_S32 = 1859775393; // round (TW_3I_F32 * 2^31)
+const static ne10_int32_t TW_3IN_S32 = -1859775393; // round (TW_3IN_F32 * 2^31)
+
 #endif // NE10_FFT_COMMON_VARIBLES_H
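
The new constant follows the Q31 convention of TW_3I_S32 above; a standalone
check of the arithmetic, i.e. round(-sqrt(3)/2 * 2^31) == -1859775393:

#include <assert.h>
#include <math.h>
#include <stdint.h>

int main (void)
{
    /* -0.866025403784439 * 2^31 ~= -1859775393.37, rounded to -1859775393 */
    int32_t tw_3in_s32 = (int32_t) lround (-0.866025403784439 * 2147483648.0);
    assert (tw_3in_s32 == -1859775393);
    return 0;
}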
diff --git a/modules/dsp/NE10_fft_generic_int32.cpp b/modules/dsp/NE10_fft_generic_int32.cpp
index c82e5b1..b408582 100644
@@ -50,93 +50,6 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 #include "NE10_fft_generic_int32.h"
 
 /**
- * @brief Conjugate a fix-point complex array.
- * @tparam RADIX Length of given fix-point complex array
- * @param[out] in Given array
- */
-template<int RADIX>
-inline void NE10_CONJ (ne10_fft_cpx_int32_t in[RADIX])
-{
-    NE10_CONJ<RADIX - 1> (in);
-    in[RADIX - 1].i = -in[RADIX - 1].i;
-}
-
-template<>
-inline void NE10_CONJ<1> (ne10_fft_cpx_int32_t in[1])
-{
-    in[0].i = -in[0].i;
-}
-
-/**
- * @brief Load a fixed-size array from given buffer, by given step.
- * @tparam RADIX Length of array.
- * @param[out] out      Array to which data are loaded
- * @param[in]  Fin      Pointing to buffer from which data are loaded
- * @param[in]  in_step  Step between loaded data in Fin
- */
-template<int RADIX>
-inline void NE10_LOAD_BY_STEP (ne10_fft_cpx_int32_t out[RADIX],
-        const ne10_fft_cpx_int32_t *Fin,
-        const ne10_int32_t in_step)
-{
-    out[0] = Fin[0];
-    NE10_LOAD_BY_STEP<RADIX - 1> (out + 1, Fin + in_step, in_step);
-}
-
-template<>
-inline void NE10_LOAD_BY_STEP<1> (ne10_fft_cpx_int32_t out[0],
-        const ne10_fft_cpx_int32_t *Fin,
-        const ne10_int32_t)
-{
-    out[0] = Fin[0];
-}
-
-/**
- * @brief Store a fixed-size array to given buffer, by given step.
- * @tparam RADIX Length of array.
- * @param[out] Fout         Pointing to buffer to which data are stored
- * @param[in]  out          Array to from data are stored
- * @param[in]  out_step     Step between stored data in Fout
- */
-template<int RADIX>
-inline void NE10_STORE_BY_STEP (ne10_fft_cpx_int32_t *Fout,
-        const ne10_fft_cpx_int32_t in[RADIX],
-        const ne10_int32_t out_step)
-{
-    Fout[0] = in[0];
-    NE10_STORE_BY_STEP<RADIX - 1> (Fout + out_step, in + 1, out_step);
-}
-
-template<>
-inline void NE10_STORE_BY_STEP<1> (ne10_fft_cpx_int32_t *Fout,
-        const ne10_fft_cpx_int32_t in[1],
-        const ne10_int32_t)
-{
-    Fout[0] = in[0];
-}
-
-/**
- * @brief Scale a fixed-size array by given divider.
- * @tparam          RADIX        Length of array.
- * @param[out]      out          Array whose elements are scaled
- * @param[in]       scaling      Divider by which array is divided
- */
-template<int RADIX>
-inline void NE10_SCALED (ne10_fft_cpx_int32_t out[RADIX],
-        const ne10_int32_t scaling)
-{
-    NE10_F2I32_FIXDIV (out[0], scaling);
-    NE10_SCALED<RADIX - 1> (out + 1, scaling);
-}
-
-template<>
-inline void NE10_SCALED<1> (ne10_fft_cpx_int32_t out[1],
-        const ne10_int32_t scaling)
-{
-    NE10_F2I32_FIXDIV (out[0], scaling);
-}
-
-/**
  * @brief Generic butterfly function for 32-bit fixed point.
  * @tparam RADIX            Radix of this stage. One among {2, 3, 4, 5 }
  * @tparam is_first_stage   Whether this stags is the first. If it is, multiplication
diff --git a/modules/dsp/NE10_fft_generic_int32.h b/modules/dsp/NE10_fft_generic_int32.h
index ec3d825..90793af 100644
@@ -257,4 +257,122 @@ inline void FFT_FCU<5> (ne10_fft_cpx_int32_t Fout[5],
     Fout[4] = scratch_in[4];
 }
 
+/**
+ * @brief Conjugate a fixed-point complex scalar/NEON vector.
+ */
+template<class T>
+inline void NE10_CONJ_S (T &);
+
+template<>
+inline void NE10_CONJ_S<ne10_fft_cpx_int32_t> (ne10_fft_cpx_int32_t &scalar)
+{
+    scalar.i = -scalar.i;
+}
+
+/**
+ * @brief Conjugate a fixed-point complex array.
+ * @tparam RADIX Length of given fix-point complex array
+ * @param[out] in Given array
+ */
+template<int RADIX, class T = ne10_fft_cpx_int32_t>
+inline void NE10_CONJ (T in[RADIX])
+{
+    NE10_CONJ<RADIX - 1> (in);
+    NE10_CONJ_S<T> (in[RADIX - 1]);
+}
+
+template<>
+inline void NE10_CONJ<1, ne10_fft_cpx_int32_t> (ne10_fft_cpx_int32_t in[1])
+{
+    NE10_CONJ_S<ne10_fft_cpx_int32_t> (in[0]);
+}
+
+template<class T>
+inline T NE10_CPX_LOAD_S (const T *ptr)
+{
+    return *ptr;
+}
+
+template<class T>
+inline void NE10_CPX_STORE_S (T *Fout, const T in)
+{
+    *Fout = in;
+}
+
+/**
+ * @brief Load a fixed-size array from given buffer, by given step.
+ * @tparam RADIX Length of array.
+ * @param[out] out      Array to which data are loaded
+ * @param[in]  Fin      Pointer to the buffer from which data are loaded
+ * @param[in]  in_step  Step between loaded data in Fin
+ */
+template<int RADIX, class T = ne10_fft_cpx_int32_t>
+inline void NE10_LOAD_BY_STEP (T out[RADIX],
+        const T *Fin,
+        const ne10_int32_t in_step);
+
+template<>
+inline void NE10_LOAD_BY_STEP<1, ne10_fft_cpx_int32_t> (
+        ne10_fft_cpx_int32_t out[0],
+        const ne10_fft_cpx_int32_t *Fin,
+        const ne10_int32_t)
+{
+    out[0] = NE10_CPX_LOAD_S<ne10_fft_cpx_int32_t> (Fin);
+}
+
+template<int RADIX, class T>
+inline void NE10_LOAD_BY_STEP (T out[RADIX],
+        const T *Fin,
+        const ne10_int32_t in_step)
+{
+    out[0] = NE10_CPX_LOAD_S<T> (Fin);
+    NE10_LOAD_BY_STEP<RADIX - 1, T> (out + 1, Fin + in_step, in_step);
+}
+
+/**
+ * @brief Store a fixed-size array to given buffer, by given step.
+ * @tparam RADIX Length of array.
+ * @param[out] Fout         Pointer to the buffer to which data are stored
+ * @param[in]  in           Array from which data are stored
+ * @param[in]  out_step     Step between stored data in Fout
+ */
+template<int RADIX, class T = ne10_fft_cpx_int32_t>
+inline void NE10_STORE_BY_STEP (T *Fout,
+        const T in[RADIX],
+        const ne10_int32_t out_step)
+{
+    NE10_CPX_STORE_S<T> (Fout, in[0]);
+    NE10_STORE_BY_STEP<RADIX - 1, T> (Fout + out_step, in + 1, out_step);
+}
+
+template<>
+inline void NE10_STORE_BY_STEP<1, ne10_fft_cpx_int32_t> (
+        ne10_fft_cpx_int32_t *Fout,
+        const ne10_fft_cpx_int32_t in[1],
+        const ne10_int32_t)
+{
+    Fout[0] = in[0];
+}
+
+/**
+ * @brief Scale a fixed-size array by a given divisor.
+ * @tparam          RADIX        Length of array.
+ * @param[out]      out          Array whose elements are scaled
+ * @param[in]       scaling      Divisor by which the array is divided
+ */
+template<int RADIX>
+inline void NE10_SCALED (ne10_fft_cpx_int32_t out[RADIX],
+        const ne10_int32_t scaling)
+{
+    NE10_F2I32_FIXDIV (out[0], scaling);
+    NE10_SCALED<RADIX - 1> (out + 1, scaling);
+}
+
+template<>
+inline void NE10_SCALED<1> (ne10_fft_cpx_int32_t out[1],
+        const ne10_int32_t scaling)
+{
+    NE10_F2I32_FIXDIV (out[0], scaling);
+}
+
 #endif // NE10_FFT_GENERIC_INT32_H
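
The strided load/store and conjugate helpers above are written as recursive
templates over RADIX so that every butterfly body unrolls completely at compile
time and, through the NE10_CPX_LOAD_S/NE10_CPX_STORE_S specializations, works
for both the scalar type and the NEON vector type. A stripped-down, standalone
sketch of the pattern (hypothetical names, plain scalars instead of the NE10
types):

#include <cstdint>

struct cpx { int32_t r, i; };   // stand-in for ne10_fft_cpx_int32_t

// General case: handle element 0, then recurse on the remaining RADIX - 1.
template <int RADIX>
inline void load_by_step (cpx out[RADIX], const cpx *fin, int32_t step)
{
    out[0] = fin[0];
    load_by_step<RADIX - 1> (out + 1, fin + step, step);
}

// Base case terminates the recursion; the compiler flattens the whole chain.
template <>
inline void load_by_step<1> (cpx out[1], const cpx *fin, int32_t)
{
    out[0] = fin[0];
}

int main ()
{
    cpx buf[16] = {};
    cpx reg[4];
    load_by_step<4> (reg, buf, 4);   // expands to loads of buf[0], buf[4], buf[8], buf[12]
    return 0;
}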
diff --git a/modules/dsp/NE10_fft_generic_int32.neonintrinsic.cpp b/modules/dsp/NE10_fft_generic_int32.neonintrinsic.cpp
new file mode 100644
index 0000000..eb0d544
--- /dev/null
+++ b/modules/dsp/NE10_fft_generic_int32.neonintrinsic.cpp
@@ -0,0 +1,134 @@
+/*
+ *  Copyright 2015 ARM Limited
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *    * Neither the name of ARM Limited nor the
+ *      names of its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
+ *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
+ *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* license of Kiss FFT */
+/*
+Copyright (c) 2003-2010, Mark Borgerding
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+    * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+ * NE10 Library : dsp/NE10_fft_generic_int32.neonintrinsic.cpp
+ *
+ * This file must be compiled by a C++ toolchain because some functions are
+ * written as templates, which makes it easier for the compiler to reduce
+ * branching.
+ */
+
+#include "NE10_fft_generic_int32.neonintrinsic.h"
+
+template<bool is_inverse>
+inline void ne10_mixed_radix_generic_butterfly_int32_neon_dispatch (
+        ne10_fft_cpx_int32_t *Fout,
+        const ne10_fft_cpx_int32_t *Fin,
+        const ne10_int32_t *factors,
+        const ne10_fft_cpx_int32_t *twiddles,
+        ne10_fft_cpx_int32_t *buffer,
+        ne10_int32_t is_scaled_flag)
+{
+    ne10_int32_t stage_count = factors[0];
+    ne10_int32_t fstride = factors[1];
+    ne10_int32_t radix = factors[stage_count << 1]; // radix of first stage
+
+    // nfft below is not the actual FFT length; it is 1/4 of the actual
+    // length.
+    ne10_int32_t nfft = fstride * radix;
+
+    void (*ne10_mixed_butterfly_f) (CPLX *, const CPLX *, const ne10_int32_t *,
+            const ne10_fft_cpx_int32_t *, CPLX *) = NULL;
+
+    void (*ne10_last_stage_f) (CPLX *, const CPLX *, const ne10_fft_cpx_int32_t *,
+            ne10_int32_t, ne10_int32_t, ne10_int32_t) = NULL;
+
+    if (is_scaled_flag == 1)
+    {
+        ne10_mixed_butterfly_f =
+            ne10_mixed_radix_generic_butterfly_int32_neon_impl<is_inverse, true>;
+    }
+    else
+    {
+        ne10_mixed_butterfly_f =
+            ne10_mixed_radix_generic_butterfly_int32_neon_impl<is_inverse, false>;
+    }
+
+    if (is_scaled_flag == 1)
+    {
+        ne10_last_stage_f =
+            ne10_c2c_1d_last_stage_neon<is_inverse, true>;
+    }
+    else
+    {
+        ne10_last_stage_f =
+            ne10_c2c_1d_last_stage_neon<is_inverse, false>;
+    }
+
+    ne10_mixed_butterfly_f ((CPLX *) buffer,
+            (const CPLX *) Fin, // From Fin to buffer
+            factors,
+            twiddles,
+            (CPLX *) Fout); // Fout is "buffer" for these stages.
+
+    ne10_last_stage_f ((CPLX *) Fout,
+            (const CPLX *) buffer, // From buffer to Fout
+            twiddles + nfft,
+            1, // out_step == fstride == 1
+            nfft, // in_step == mstride == nfft
+            nfft * 4); // Actual length of FFT
+}
+
+void ne10_mixed_radix_generic_butterfly_int32_neon (
+        ne10_fft_cpx_int32_t *Fout,
+        const ne10_fft_cpx_int32_t *Fin,
+        const ne10_int32_t *factors,
+        const ne10_fft_cpx_int32_t *twiddles,
+        ne10_fft_cpx_int32_t *buffer,
+        ne10_int32_t is_scaled_flag)
+{
+    ne10_mixed_radix_generic_butterfly_int32_neon_dispatch <false> (
+            Fout, Fin, factors, twiddles, buffer, is_scaled_flag);
+}
+
+void ne10_mixed_radix_generic_butterfly_inverse_int32_neon (
+        ne10_fft_cpx_int32_t *Fout,
+        const ne10_fft_cpx_int32_t *Fin,
+        const ne10_int32_t *factors,
+        const ne10_fft_cpx_int32_t *twiddles,
+        ne10_fft_cpx_int32_t *buffer,
+        ne10_int32_t is_scaled_flag)
+{
+    ne10_mixed_radix_generic_butterfly_int32_neon_dispatch <true> (
+            Fout, Fin, factors, twiddles, buffer, is_scaled_flag);
+}
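
The dispatcher above splits an N-point transform into four interleaved
N/4-point sub-transforms (handled by the generic mixed-radix stages) plus one
radix-4 last stage that applies the per-bin "super" twiddles. A plain
floating-point reference of one standard way to realize that split is sketched
below; the names are mine, std::complex<double> stands in for the Q31
arithmetic, and the library's actual data layout and twiddle ordering differ.

#include <complex>
#include <vector>

static std::vector<std::complex<double>> naive_dft (const std::vector<std::complex<double>> &x)
{
    const double pi = 3.14159265358979323846;
    const size_t n = x.size ();
    std::vector<std::complex<double>> y (n);
    for (size_t k = 0; k < n; k++)
        for (size_t m = 0; m < n; m++)
            y[k] += x[m] * std::polar (1.0, -2.0 * pi * (double) (k * m) / (double) n);
    return y;
}

std::vector<std::complex<double>> fft_four_step (const std::vector<std::complex<double>> &x)
{
    const double pi = 3.14159265358979323846;
    const size_t N = x.size (), M = N / 4;          // assumes N % 4 == 0
    std::vector<std::complex<double>> sub[4], X (N);

    for (int p = 0; p < 4; p++)                     // decimate in time by 4 and
    {                                               // run the four sub-FFTs
        std::vector<std::complex<double>> y (M);
        for (size_t m = 0; m < M; m++)
            y[m] = x[4 * m + p];
        sub[p] = naive_dft (y);
    }

    for (size_t k = 0; k < M; k++)                  // radix-4 last stage
        for (int j = 0; j < 4; j++)
            for (int p = 0; p < 4; p++)
                X[k + j * M] += sub[p][k]
                    * std::polar (1.0, -2.0 * pi * (double) (p * k) / (double) N)  // "super" twiddle
                    * std::polar (1.0, -2.0 * pi * (double) (p * j) / 4.0);        // radix-4 butterfly
    return X;
}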
diff --git a/modules/dsp/NE10_fft_generic_int32.neonintrinsic.h b/modules/dsp/NE10_fft_generic_int32.neonintrinsic.h
new file mode 100644
index 0000000..8b244cb
--- /dev/null
+++ b/modules/dsp/NE10_fft_generic_int32.neonintrinsic.h
@@ -0,0 +1,671 @@
+/*
+ *  Copyright 2015 ARM Limited
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *    * Neither the name of ARM Limited nor the
+ *      names of its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
+ *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
+ *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* license of Kiss FFT */
+/*
+Copyright (c) 2003-2010, Mark Borgerding
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+    * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+ * NE10 Library : dsp/NE10_fft_generic_int32.neonintrinsic.h
+ *
+ * This file must be compiled by a C++ toolchain because some functions are
+ * written as templates, which makes it easier for the compiler to reduce
+ * branching.
+ */
+
+#ifndef NE10_FFT_GENERIC_INT32_NEONINTRINSIC_H
+#define NE10_FFT_GENERIC_INT32_NEONINTRINSIC_H
+
+#include "NE10_types.h"
+#include "NE10_macros.h"
+#include "NE10_fft.neonintrinsic.h"
+#include "NE10_fft_generic_int32.h"
+
+typedef int32x4x2_t CPLX;
+typedef int32x4_t   REAL;
+#define NE10_REAL_DUP_NEON_S32 vdupq_n_s32
+#define NE10_CPLX_LOAD(PTR) vld2q_s32 ((ne10_int32_t*) (PTR))
+#define NE10_CPLX_STORE(PTR,OUT) \
+    do { \
+        vst2q_s32 ((ne10_int32_t*) (PTR), OUT); \
+    } while (0)
+
+template<>
+inline CPLX NE10_CPX_LOAD_S<CPLX> (const CPLX *ptr)
+{
+    return NE10_CPLX_LOAD(ptr);
+}
+
+template<>
+inline void NE10_CPX_STORE_S<CPLX> (CPLX *ptr, const CPLX out)
+{
+    NE10_CPLX_STORE (ptr, out);
+}
+
+template<>
+inline void NE10_LOAD_BY_STEP<1, CPLX> (CPLX out[1],
+        const CPLX *Fin,
+        const ne10_int32_t)
+{
+    out[0] = NE10_CPX_LOAD_S (Fin);
+}
+
+template<>
+inline void NE10_STORE_BY_STEP<1, CPLX> (CPLX *Fout,
+        const CPLX out[1],
+        const ne10_int32_t)
+{
+    NE10_CPX_STORE_S (Fout, out[0]);
+}
+
+static inline REAL NE10_S_MUL_NEON_S32 (const REAL vec,
+        const ne10_int32_t scalar)
+{
+    REAL scalar_neon = NE10_REAL_DUP_NEON_S32 (scalar);
+    REAL result = vqrdmulhq_s32 (scalar_neon, vec);
+    return result;
+}
+
+static inline void NE10_CPX_MUL_NEON_S32 (CPLX &result, const CPLX A, const CPLX B)
+{
+        REAL ARBR = vqrdmulhq_s32 (A.val[0], B.val[0]);
+        REAL ARBI = vqrdmulhq_s32 (A.val[0], B.val[1]);
+        REAL AIBR = vqrdmulhq_s32 (A.val[1], B.val[0]);
+        REAL AIBI = vqrdmulhq_s32 (A.val[1], B.val[1]);
+        result.val[0] = ARBR - AIBI;
+        result.val[1] = ARBI + AIBR;
+}
+
+template<int RADIX>
+inline void NE10_LOAD_TW_AND_MUL (CPLX scratch_in[RADIX],
+        const ne10_fft_cpx_int32_t *ptr_in,
+        const ne10_int32_t step)
+{
+    CPLX scratch_tw;
+    int32x2_t d2_tmp = vld1_s32 ((ne10_int32_t *)(ptr_in + (RADIX - 2) * step));
+
+    scratch_tw.val[0] = NE10_REAL_DUP_NEON_S32 (d2_tmp[0]);
+    scratch_tw.val[1] = NE10_REAL_DUP_NEON_S32 (d2_tmp[1]);
+    NE10_CPX_MUL_NEON_S32 (scratch_in[RADIX - 1], scratch_in[RADIX - 1], scratch_tw);
+
+    NE10_LOAD_TW_AND_MUL<RADIX - 1> (scratch_in, ptr_in, step);
+}
+
+template<>
+inline void NE10_LOAD_TW_AND_MUL<1> (CPLX [1],
+        const ne10_fft_cpx_int32_t *,
+        const ne10_int32_t)
+{
+}
+
+////////////////
+// Conj inplace.
+////////////////
+template<>
+inline void NE10_CONJ_S<CPLX> (CPLX &cplx)
+{
+    cplx.val[1] = -cplx.val[1];
+}
+
+template<>
+inline void NE10_CONJ<1, CPLX> (CPLX in[1])
+{
+    NE10_CONJ_S<CPLX> (in[0]);
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// Scaling
+// If Macro NE10_DSP_CFFT_SCALING is not defined, these functions do nothing.
+/////////////////////////////////////////////////////////////////////////////
+template<int RADIX, int SIZE = RADIX>
+struct NE10_FFT_SCALING {
+    inline void operator() (CPLX scratch_out[RADIX])
+    {
+#ifdef NE10_DSP_CFFT_SCALING
+        const static int32x4_t one_by_RADIX =
+        {
+            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
+            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
+            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
+            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f)
+        };
+        scratch_out[SIZE - 1].val[0] = vqrdmulhq_s32 (scratch_out[SIZE - 1].val[0], one_by_RADIX);
+        scratch_out[SIZE - 1].val[1] = vqrdmulhq_s32 (scratch_out[SIZE - 1].val[1], one_by_RADIX);
+        NE10_FFT_SCALING<RADIX, SIZE - 1> () (scratch_out);
+#endif
+    }
+};
+
+template<int RADIX>
+struct NE10_FFT_SCALING<RADIX, 1> {
+    inline void operator () (CPLX scratch_out[1])
+    {
+#ifdef NE10_DSP_CFFT_SCALING
+        const static int32x4_t one_by_RADIX =
+        {
+            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
+            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
+            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
+            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f)
+        };
+        scratch_out[0].val[0] = vqrdmulhq_s32 (scratch_out[0].val[0], one_by_RADIX);
+        scratch_out[0].val[1] = vqrdmulhq_s32 (scratch_out[0].val[1], one_by_RADIX);
+#endif
+    }
+};
+
+inline void NE10_CPX_ADD_NEON_S32 (CPLX &result, const CPLX a, const CPLX b)
+{
+    result.val[0] = vaddq_s32 (a.val[0], b.val[0]);
+    result.val[1] = vaddq_s32 (a.val[1], b.val[1]);
+}
+
+inline void NE10_CPX_SUB_NEON_S32 (CPLX &result, const CPLX a, const CPLX b)
+{
+    result.val[0] = vsubq_s32 (a.val[0], b.val[0]);
+    result.val[1] = vsubq_s32 (a.val[1], b.val[1]);
+}
+
+inline REAL NE10_HALF (REAL src)
+{
+    const static int32x4_t CONST_HALF_NEON = { -1, -1, -1, -1};
+    src = vshlq_s32 (src, CONST_HALF_NEON);
+    return src;
+}
+
+///////////////////
+// FFT Kernel
+// F: Forward
+// C: Complex
+// U: Unscaled
+//////////////////
+template<int RADIX>
+inline void NE10_FFT_FCU_NEON_S32 (CPLX [RADIX], const CPLX [RADIX]);
+
+template<>
+inline void NE10_FFT_FCU_NEON_S32<2> (CPLX scratch_out[2],
+        const CPLX scratch_in[2])
+{
+    NE10_CPX_ADD_NEON_S32 (scratch_out[0], scratch_in[0], scratch_in[1]);
+    NE10_CPX_SUB_NEON_S32 (scratch_out[1], scratch_in[0], scratch_in[1]);
+}
+
+template<>
+inline void NE10_FFT_FCU_NEON_S32<3> (CPLX Fout[3],
+        const CPLX Fin[3])
+{
+    CPLX scratch[4];
+
+    Fout[0] = Fin[0];
+    Fout[1] = Fin[1];
+    Fout[2] = Fin[2];
+
+    scratch[1] = Fout[1];
+    scratch[2] = Fout[2];
+
+    NE10_CPX_ADD_NEON_S32 (scratch[3], scratch[1], scratch[2]);
+    NE10_CPX_SUB_NEON_S32 (scratch[0], scratch[1], scratch[2]);
+
+    Fout[1].val[0] = Fout[0].val[0] - NE10_HALF (scratch[3].val[0]);
+    Fout[1].val[1] = Fout[0].val[1] - NE10_HALF (scratch[3].val[1]);
+
+    scratch[0].val[0] = NE10_S_MUL_NEON_S32 (scratch[0].val[0], TW_3IN_S32);
+    scratch[0].val[1] = NE10_S_MUL_NEON_S32 (scratch[0].val[1], TW_3IN_S32);
+
+    Fout[0].val[0] += scratch[3].val[0];
+    Fout[0].val[1] += scratch[3].val[1];
+
+    Fout[2].val[0] = Fout[1].val[0] + scratch[0].val[1];
+    Fout[2].val[1] = Fout[1].val[1] - scratch[0].val[0];
+
+    Fout[1].val[0] -= scratch[0].val[1];
+    Fout[1].val[1] += scratch[0].val[0];
+}
+
+template<>
+inline void NE10_FFT_FCU_NEON_S32<4> (CPLX scratch_out[4],
+        const CPLX scratch_in[4])
+{
+    CPLX scratch[4];
+
+    NE10_CPX_ADD_NEON_S32 (scratch[0], scratch_in[0], scratch_in[2]);
+    NE10_CPX_SUB_NEON_S32 (scratch[1], scratch_in[0], scratch_in[2]);
+    NE10_CPX_ADD_NEON_S32 (scratch[2], scratch_in[1], scratch_in[3]);
+    NE10_CPX_SUB_NEON_S32 (scratch[3], scratch_in[1], scratch_in[3]);
+
+    NE10_CPX_SUB_NEON_S32 (scratch_out[2], scratch[0], scratch[2]);
+    NE10_CPX_ADD_NEON_S32 (scratch_out[0], scratch[0], scratch[2]);
+
+    scratch_out[1].val[0] = scratch[1].val[0] + scratch[3].val[1];
+    scratch_out[1].val[1] = scratch[1].val[1] - scratch[3].val[0];
+    scratch_out[3].val[0] = scratch[1].val[0] - scratch[3].val[1];
+    scratch_out[3].val[1] = scratch[1].val[1] + scratch[3].val[0];
+}
+
+template<>
+inline void NE10_FFT_FCU_NEON_S32<5> (CPLX Fout[5],
+        const CPLX Fin[5])
+{
+    CPLX scratch[13], scratch_in[5];
+
+    scratch_in[0] = Fin[0];
+    scratch_in[1] = Fin[1];
+    scratch_in[2] = Fin[2];
+    scratch_in[3] = Fin[3];
+    scratch_in[4] = Fin[4];
+
+    scratch[0] = scratch_in[0];
+    scratch[1] = scratch_in[1];
+    scratch[2] = scratch_in[2];
+    scratch[3] = scratch_in[3];
+    scratch[4] = scratch_in[4];
+
+    NE10_CPX_ADD_NEON_S32 (scratch[ 7], scratch[1], scratch[4]);
+    NE10_CPX_SUB_NEON_S32 (scratch[10], scratch[1], scratch[4]);
+    NE10_CPX_ADD_NEON_S32 (scratch[ 8], scratch[2], scratch[3]);
+    NE10_CPX_SUB_NEON_S32 (scratch[ 9], scratch[2], scratch[3]);
+
+    scratch_in[0].val[0] += scratch[7].val[0] + scratch[8].val[0];
+    scratch_in[0].val[1] += scratch[7].val[1] + scratch[8].val[1];
+
+    scratch[5].val[0] = scratch[0].val[0]
+        + NE10_S_MUL_NEON_S32 (scratch[7].val[0], TW_5A_S32.r)
+        + NE10_S_MUL_NEON_S32 (scratch[8].val[0], TW_5B_S32.r);
+    scratch[5].val[1] = scratch[0].val[1]
+        + NE10_S_MUL_NEON_S32 (scratch[7].val[1], TW_5A_S32.r)
+        + NE10_S_MUL_NEON_S32 (scratch[8].val[1], TW_5B_S32.r);
+
+    scratch[6].val[0] = NE10_S_MUL_NEON_S32 (scratch[10].val[1], TW_5A_S32.i)
+        + NE10_S_MUL_NEON_S32 (scratch[9].val[1], TW_5B_S32.i);
+    scratch[6].val[1] = -NE10_S_MUL_NEON_S32 (scratch[10].val[0], TW_5A_S32.i)
+        - NE10_S_MUL_NEON_S32 (scratch[9].val[0], TW_5B_S32.i);
+
+    NE10_CPX_SUB_NEON_S32 (scratch_in[1], scratch[5], scratch[6]);
+    NE10_CPX_ADD_NEON_S32 (scratch_in[4], scratch[5], scratch[6]);
+
+    scratch[11].val[0] = scratch[0].val[0]
+        + NE10_S_MUL_NEON_S32 (scratch[7].val[0], TW_5B_S32.r)
+        + NE10_S_MUL_NEON_S32 (scratch[8].val[0], TW_5A_S32.r);
+    scratch[11].val[1] = scratch[0].val[1]
+        + NE10_S_MUL_NEON_S32 (scratch[7].val[1], TW_5B_S32.r)
+        + NE10_S_MUL_NEON_S32 (scratch[8].val[1], TW_5A_S32.r);
+
+    scratch[12].val[0] = -NE10_S_MUL_NEON_S32 (scratch[10].val[1], TW_5B_S32.i)
+        + NE10_S_MUL_NEON_S32 (scratch[9].val[1], TW_5A_S32.i);
+    scratch[12].val[1] = NE10_S_MUL_NEON_S32 (scratch[10].val[0], TW_5B_S32.i)
+        - NE10_S_MUL_NEON_S32 (scratch[9].val[0], TW_5A_S32.i);
+
+    NE10_CPX_ADD_NEON_S32 (scratch_in[2], scratch[11], scratch[12]);
+    NE10_CPX_SUB_NEON_S32 (scratch_in[3], scratch[11], scratch[12]);
+
+    Fout[0] = scratch_in[0];
+    Fout[1] = scratch_in[1];
+    Fout[2] = scratch_in[2];
+    Fout[3] = scratch_in[3];
+    Fout[4] = scratch_in[4];
+}
+
+////////////////////////////////////
+// Following are butterfly functions
+////////////////////////////////////
+template<ne10_int32_t RADIX, bool is_first_stage, bool is_inverse, bool is_scaled>
+static __attribute__ ((noinline)) void ne10_radix_butterfly_int32_neon (
+        CPLX *Fout,
+        const CPLX *Fin,
+        const ne10_fft_cpx_int32_t *twiddles,
+        const ne10_int32_t fstride,
+        const ne10_int32_t out_step,
+        const ne10_int32_t nfft)
+{
+    PRINT_HIT;
+    const ne10_int32_t in_step = nfft / RADIX;
+    ne10_int32_t f_count;
+    ne10_int32_t m_count;
+
+    for (f_count = fstride; f_count > 0; f_count--)
+    {
+        for (m_count = out_step; m_count > 0; m_count--)
+        {
+            CPLX in[RADIX];
+            CPLX out[RADIX];
+
+            NE10_LOAD_BY_STEP<RADIX, CPLX> (in, Fin, in_step);
+
+            if (is_inverse)
+            {
+                NE10_CONJ<RADIX> (in);
+            }
+
+            if (!is_first_stage)
+            {
+                NE10_LOAD_TW_AND_MUL<RADIX> (in, twiddles, out_step);
+            }
+
+            NE10_FFT_FCU_NEON_S32<RADIX> (out, in);
+
+            if (is_inverse)
+            {
+                NE10_CONJ<RADIX> (out);
+            }
+
+            if (is_scaled)
+            {
+                NE10_FFT_SCALING<RADIX> () (out);
+            }
+
+            NE10_STORE_BY_STEP<RADIX, CPLX> (Fout, out, out_step);
+
+            Fin++;
+
+            if (!is_first_stage)
+            {
+                Fout++;
+                twiddles++;
+            }
+            else
+            {
+                Fout += RADIX;
+            }
+        }
+        if (!is_first_stage)
+        {
+            twiddles -= out_step;
+            Fout += (RADIX - 1) * out_step;
+        }
+    }
+}
+
+template<bool is_inverse, bool is_scaled>
+static void ne10_mixed_radix_generic_butterfly_int32_neon_impl (CPLX *Fout,
+        const CPLX *Fin,
+        const ne10_int32_t *factors,
+        const ne10_fft_cpx_int32_t *twiddles,
+        CPLX *buffer)
+{
+    ne10_int32_t fstride, mstride, radix;
+    ne10_int32_t stage_count;
+    ne10_int32_t nfft;
+
+    // init fstride, mstride, radix, nfft
+    stage_count = factors[0];
+    fstride = factors[1];
+    mstride = 1;
+    radix = factors[ stage_count << 1 ]; // radix of first stage
+    nfft = fstride * radix;
+
+    // swap to make sure output to Fout
+    if (stage_count % 2 == 0)
+    {
+        ne10_swap_ptr (buffer, Fout);
+    }
+
+    // first stage
+    switch (radix)
+    {
+    case 2:
+        ne10_radix_butterfly_int32_neon<2, true, is_inverse, is_scaled> (Fout, Fin,
+                NULL,
+                fstride, 1, nfft);
+        break;
+    case 4:
+        ne10_radix_butterfly_int32_neon<4, true, is_inverse, is_scaled> (Fout, Fin,
+                NULL,
+                fstride, 1, nfft);
+        break;
+    case 3:
+        ne10_radix_butterfly_int32_neon<3, true, is_inverse, is_scaled> (Fout, Fin,
+                NULL,
+                fstride, 1, nfft);
+        break;
+    case 5:
+        ne10_radix_butterfly_int32_neon<5, true, is_inverse, is_scaled> (Fout, Fin,
+                NULL,
+                fstride, 1, nfft);
+        break;
+    }
+
+    stage_count--;
+    if (!stage_count) // finish
+    {
+        return;
+    }
+
+    mstride *= radix;
+
+    // update radix
+    if (radix % 2)
+    {
+        twiddles += radix;
+    }
+    radix = factors[ stage_count << 1 ];
+
+    // other stages
+    while (stage_count > 0)
+    {
+        // radix of this stage; should be one of {2, 3, 4, 5}
+        assert ((radix > 1) && (radix < 6));
+
+        ne10_swap_ptr (buffer, Fout);
+
+        fstride /= radix;
+        switch (radix)
+        {
+        case 2:
+            ne10_radix_butterfly_int32_neon<2, false, is_inverse, is_scaled> (Fout, buffer,
+                    twiddles,
+                    fstride, mstride, nfft);
+            break;
+        case 3:
+            ne10_radix_butterfly_int32_neon<3, false, is_inverse, is_scaled> (Fout, buffer,
+                    twiddles,
+                    fstride, mstride, nfft);
+            break;
+        case 4:
+            ne10_radix_butterfly_int32_neon<4, false, is_inverse, is_scaled> (Fout, buffer,
+                    twiddles,
+                    fstride, mstride, nfft);
+            break;
+        case 5:
+            ne10_radix_butterfly_int32_neon<5, false, is_inverse, is_scaled> (Fout, buffer,
+                    twiddles, fstride, mstride, nfft);
+            break;
+        } // switch (radix)
+
+        twiddles += mstride * (radix - 1);
+        mstride *= radix;
+
+        stage_count--;
+        radix = factors[ stage_count << 1 ];
+    } // while (stage_count)
+}
+
+template<bool is_inverse, bool is_scaled>
+static void ne10_c2c_1d_last_stage_neon (CPLX *Fout,
+        const CPLX *Fin,
+        const ne10_fft_cpx_int32_t *twiddles,
+        const ne10_int32_t fstride,
+        const ne10_int32_t out_step,
+        const ne10_int32_t)
+{
+    ne10_int32_t f_count;
+    ne10_int32_t m_count;
+
+    for (f_count = fstride; f_count > 0; f_count--)
+    {
+        CPLX scratch_in[4];
+        CPLX scratch_out[4];
+
+        for (m_count = out_step / NE10_FFT_PARA_LEVEL; m_count > 0; m_count--)
+        {
+            scratch_in[0] = NE10_CPLX_LOAD (Fin + 0);
+            scratch_in[1] = NE10_CPLX_LOAD (Fin + 1);
+            scratch_in[2] = NE10_CPLX_LOAD (Fin + 2);
+            scratch_in[3] = NE10_CPLX_LOAD (Fin + 3);
+
+            // Transpose
+            {
+                float32x4x2_t scratch0, scratch_in0;
+                float32x4x2_t scratch1, scratch_in1;
+                float32x4x2_t scratch2, scratch_in2;
+                float32x4x2_t scratch3, scratch_in3;
+
+                scratch_in0.val[0] = vreinterpretq_f32_s32 (scratch_in[0].val[0]);
+                scratch_in1.val[0] = vreinterpretq_f32_s32 (scratch_in[1].val[0]);
+                scratch_in2.val[0] = vreinterpretq_f32_s32 (scratch_in[2].val[0]);
+                scratch_in3.val[0] = vreinterpretq_f32_s32 (scratch_in[3].val[0]);
+                scratch_in0.val[1] = vreinterpretq_f32_s32 (scratch_in[0].val[1]);
+                scratch_in1.val[1] = vreinterpretq_f32_s32 (scratch_in[1].val[1]);
+                scratch_in2.val[1] = vreinterpretq_f32_s32 (scratch_in[2].val[1]);
+                scratch_in3.val[1] = vreinterpretq_f32_s32 (scratch_in[3].val[1]);
+
+                NE10_RADIX4X4C_TRANSPOSE_NEON (scratch, scratch_in);
+
+                scratch_in[0].val[0] = vreinterpretq_s32_f32 (scratch0.val[0]);
+                scratch_in[1].val[0] = vreinterpretq_s32_f32 (scratch1.val[0]);
+                scratch_in[2].val[0] = vreinterpretq_s32_f32 (scratch2.val[0]);
+                scratch_in[3].val[0] = vreinterpretq_s32_f32 (scratch3.val[0]);
+                scratch_in[0].val[1] = vreinterpretq_s32_f32 (scratch0.val[1]);
+                scratch_in[1].val[1] = vreinterpretq_s32_f32 (scratch1.val[1]);
+                scratch_in[2].val[1] = vreinterpretq_s32_f32 (scratch2.val[1]);
+                scratch_in[3].val[1] = vreinterpretq_s32_f32 (scratch3.val[1]);
+            }
+
+            if (is_inverse)
+            {
+                NE10_CONJ<4, CPLX> (scratch_in);
+            }
+
+            // Not first stage
+            {
+                CPLX scratch_tw[3];
+
+                scratch_tw[0] = NE10_CPLX_LOAD (twiddles + 0 * out_step);
+                scratch_tw[1] = NE10_CPLX_LOAD (twiddles + 1 * out_step);
+                scratch_tw[2] = NE10_CPLX_LOAD (twiddles + 2 * out_step);
+
+                NE10_CPX_MUL_NEON_S32 (scratch_in[1], scratch_in[1], scratch_tw[0]);
+                NE10_CPX_MUL_NEON_S32 (scratch_in[2], scratch_in[2], scratch_tw[1]);
+                NE10_CPX_MUL_NEON_S32 (scratch_in[3], scratch_in[3], scratch_tw[2]);
+            }
+
+            NE10_FFT_FCU_NEON_S32<4> (scratch_out, scratch_in);
+
+            if (is_inverse)
+            {
+                NE10_CONJ<4, CPLX> (scratch_out);
+            }
+
+            if (is_scaled)
+            {
+                NE10_FFT_SCALING<4> () (scratch_out);
+            }
+
+            // Store.
+            {
+                ne10_fft_cpx_int32_t *Fout_cpx;
+                Fout_cpx = (ne10_fft_cpx_int32_t *) Fout;
+
+                NE10_CPLX_STORE (Fout_cpx + 0 * out_step, scratch_out[0]);
+                NE10_CPLX_STORE (Fout_cpx + 1 * out_step, scratch_out[1]);
+                NE10_CPLX_STORE (Fout_cpx + 2 * out_step, scratch_out[2]);
+                NE10_CPLX_STORE (Fout_cpx + 3 * out_step, scratch_out[3]);
+            }
+
+            Fin += 4;
+            Fout += 1;
+            twiddles += 4;
+        }
+    }
+
+    ne10_int32_t left_over = out_step % 4;
+    if (left_over == 0)
+    {
+        return;
+    }
+
+    // Left over.
+    const ne10_fft_cpx_int32_t *Fin_s  = (ne10_fft_cpx_int32_t *) Fin;
+    ne10_fft_cpx_int32_t *Fout_s = (ne10_fft_cpx_int32_t *) Fout;
+    for (m_count = out_step % 4; m_count > 0; m_count--)
+    {
+        ne10_fft_cpx_int32_t scratch_in[4];
+        ne10_fft_cpx_int32_t scratch_tw[4];
+
+        scratch_in[0] = Fin_s[0];
+        scratch_in[1] = Fin_s[1];
+        scratch_in[2] = Fin_s[2];
+        scratch_in[3] = Fin_s[3];
+
+        if (is_inverse)
+        {
+            scratch_in[0].i = -scratch_in[0].i;
+            scratch_in[1].i = -scratch_in[1].i;
+            scratch_in[2].i = -scratch_in[2].i;
+            scratch_in[3].i = -scratch_in[3].i;
+        }
+
+        scratch_tw[0] = twiddles[0 * out_step];
+        scratch_tw[1] = twiddles[1 * out_step];
+        scratch_tw[2] = twiddles[2 * out_step];
+
+        NE10_CPX_MUL_S32 (scratch_in[1], scratch_in[1], scratch_tw[0]);
+        NE10_CPX_MUL_S32 (scratch_in[2], scratch_in[2], scratch_tw[1]);
+        NE10_CPX_MUL_S32 (scratch_in[3], scratch_in[3], scratch_tw[2]);
+
+        FFT_FCU<4> (scratch_in, scratch_in);
+
+        if (is_inverse)
+        {
+            scratch_in[0].i = -scratch_in[0].i;
+            scratch_in[1].i = -scratch_in[1].i;
+            scratch_in[2].i = -scratch_in[2].i;
+            scratch_in[3].i = -scratch_in[3].i;
+        }
+
+        Fout_s[0 * out_step] = scratch_in[0];
+        Fout_s[1 * out_step] = scratch_in[1];
+        Fout_s[2 * out_step] = scratch_in[2];
+        Fout_s[3 * out_step] = scratch_in[3];
+
+        Fin_s += 4;
+        Fout_s += 1;
+        twiddles += 1;
+    }
+}
+
+#endif
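
The complex multiplies in this header (NE10_S_MUL_NEON_S32,
NE10_CPX_MUL_NEON_S32) are built on vqrdmulhq_s32, the saturating rounding
doubling multiply returning the high half, which acts as a Q31 x Q31 -> Q31
product. A scalar model of one lane, as a sketch rather than the intrinsic's
formal definition:

#include <stdint.h>

/* One lane of vqrdmulhq_s32: saturate((2*a*b + 2^31) >> 32).
 * The equivalent form below avoids 64-bit overflow for a == b == INT32_MIN. */
static int32_t qrdmulh_s32_model (int32_t a, int32_t b)
{
    int64_t p = ((int64_t) a * b + (1LL << 30)) >> 31;   /* round, rescale Q62 -> Q31 */
    if (p > INT32_MAX)                                    /* only hit by INT32_MIN * INT32_MIN */
        p = INT32_MAX;
    return (int32_t) p;
}

/* With that, NE10_CPX_MUL_NEON_S32 above computes, per lane and in Q31,
 *   result.r = a.r*b.r - a.i*b.i   and   result.i = a.r*b.i + a.i*b.r.  */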
diff --git a/modules/dsp/NE10_fft_int32.neonintrinsic.c b/modules/dsp/NE10_fft_int32.neonintrinsic.c
index a6f9809..51cda0e 100644
@@ -1742,6 +1742,29 @@ void ne10_fft_c2c_1d_int32_neon (ne10_fft_cpx_int32_t *fout,
                                  ne10_int32_t inverse_fft,
                                  ne10_int32_t scaled_flag)
 {
+    ne10_int32_t stage_count = cfg->factors[0];
+    ne10_int32_t algorithm_flag = cfg->factors[2 * (stage_count + 1)];
+
+    assert ((algorithm_flag == NE10_FFT_ALG_24)
+            || (algorithm_flag == NE10_FFT_ALG_ANY));
+
+    // For NE10_FFT_ALG_ANY (generic sizes), the function returns from
+    // inside this branch.
+    if (algorithm_flag == NE10_FFT_ALG_ANY)
+    {
+        if (inverse_fft)
+        {
+            ne10_mixed_radix_generic_butterfly_inverse_int32_neon (fout, fin,
+                    cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
+        }
+        else
+        {
+            ne10_mixed_radix_generic_butterfly_int32_neon (fout, fin,
+                    cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
+        }
+        return;
+    }
+
     if (scaled_flag)
     {
         if (inverse_fft)