From 20b1896fd6532336e6a46608778bd6e0396dc4dc Mon Sep 17 00:00:00 2001
From: "Phil.Wang"
Date: Wed, 17 Dec 2014 11:52:11 +0800
Subject: [PATCH] NE10/FFT/complex-non-power-of-2 NEON

ARM 64-bit (Cortex-A57) complex forward float
LLVM 3.5
Time in ms
      |kiss|opus|pffft|NE10|
      |   C|   C| NEON|NEON|
    60| 129| 113|   NA|  44|
   120| 148| 127|   NA|  49|
   240| 151| 128|   55|  47|
   480| 169| 142|   60|  55|
   960| 183| 149|   65|  58|
  1920| 193| 167|   71|  66|
  3840| 217| 175|   76|  71|
SNR > 100dB

ARM 64-bit (Cortex-A53) complex forward float
LLVM 3.5
Time in ms
      |kiss|opus|pffft|NE10|
      |   C|   C| NEON|NEON|
    60| 295| 311|   NA|  72|
   120| 368| 375|   NA|  79|
   240| 345| 342|  104|  77|
   480| 415| 407|  115|  87|
   960| 406| 378|  121|  95|
  1920| 476| 441|  138| 113|
  3840| 497| 424|  161| 126|
SNR > 100dB

ARM 32-bit (Cortex-A9) complex forward float
LLVM 3.5
Time in ms
      |kiss|opus|pffft|NE10|
      |   C|   C| NEON|NEON|
    60| 224| 211|   NA|  98|
   120| 265| 245|   NA| 104|
   240| 262| 240|  130| 106|
   480| 302| 274|  150| 122|
   960| 305| 271|  162| 153|
  1920| 369| 356|  230| 206|
  3840| 415| 440|  282| 239|
SNR > 100dB

Change-Id: If9418041b01eed49dbdc8d6a18dd03f2c5684da8
---
 CMakeLists.txt                                     |   18 +-
 inc/NE10_dsp.h                                     |    7 +-
 inc/NE10_types.h                                   |    1 +
 modules/CMakeLists.txt                             |    1 +
 modules/dsp/NE10_fft.c                             |  253 ++++-
 modules/dsp/NE10_fft.h                             |   22 +-
 modules/dsp/NE10_fft.neonintrinsic.h               |   15 +-
 modules/dsp/NE10_fft_cplx_ops.h                    |   25 +-
 modules/dsp/NE10_fft_float32.c                     |   50 +-
 modules/dsp/NE10_fft_float32.neon.c                |   30 +-
 modules/dsp/NE10_fft_float32.neonintrinsic.c       |   31 +-
 modules/dsp/NE10_fft_generic_float32.c             |  265 ++---
 modules/dsp/NE10_fft_generic_float32.h             |  309 ++++++
 .../dsp/NE10_fft_generic_float32.neonintrinsic.cpp | 1108 ++++++++++++++++++++
 modules/dsp/NE10_fft_int16.c                       |    4 +-
 modules/dsp/NE10_fft_int32.c                       |    4 +-
 modules/dsp/NE10_fft_int32.neon.c                  |    1 +
 modules/dsp/NE10_fft_int32.neonintrinsic.c         |    1 +
 modules/dsp/NE10_init_dsp.c                        |   40 +-
 modules/dsp/NE10_rfft_float32.c                    |    4 +-
 modules/dsp/test/test_suite_fft_float32.c          |   73 +-
 21 files changed, 1946 insertions(+), 316 deletions(-)
 create mode 100644 modules/dsp/NE10_fft_generic_float32.h
 create mode 100644 modules/dsp/NE10_fft_generic_float32.neonintrinsic.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bd29984..fdf01ec 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,7 +26,7 @@
 #
 cmake_minimum_required(VERSION 2.6)
-project(NE10 C ASM)
+project(NE10 C CXX ASM)
 option(NE10_BUILD_SHARED "Build NE10 shared libraries" OFF)
 option(NE10_BUILD_STATIC "Build NE10 static libraries" ON)
@@ -83,9 +83,6 @@ if(ANDROID_PLATFORM)
         ${ANDROID_TOOLCHAIN_PATH}/arm-linux-androideabi-as
         ${ANDROID_TOOLCHAIN_PATH}/arm-linux-androideabi-ar
         ${ANDROID_TOOLCHAIN_PATH}/arm-linux-androideabi-ranlib")
-    if(ANDROID_PLATFORM AND ANDROID_DEMO)
-        add_subdirectory(android/NE10Demo/jni)
-    endif()
 elseif(GNULINUX_PLATFORM)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mthumb-interwork -mthumb -march=armv7-a -mfpu=vfp3")
     set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -mthumb-interwork -mthumb -march=armv7-a -mfpu=neon")
@@ -102,11 +99,10 @@ elseif(IOS_PLATFORM)
     string(REPLACE ";" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
     string(REPLACE ";" "" CMAKE_ASM_FLAGS ${CMAKE_ASM_FLAGS})
-    if(IOS_DEMO)
-        add_subdirectory(ios)
-    endif()
 endif()
+set(CMAKE_CXX_FLAGS "${CMAKE_ASM_FLAGS}")
+
 # The NE10 library.
add_subdirectory(modules) @@ -117,3 +113,11 @@ endif() if(NE10_BUILD_UNIT_TEST) add_subdirectory(test) endif() + +if(ANDROID_PLATFORM AND ANDROID_DEMO) + add_subdirectory(android/NE10Demo/jni) +endif() + +if(IOS_PLATFORM AND IOS_DEMO) + add_subdirectory(ios) +endif() diff --git a/inc/NE10_dsp.h b/inc/NE10_dsp.h index d370be7..6200881 100644 --- a/inc/NE10_dsp.h +++ b/inc/NE10_dsp.h @@ -46,6 +46,8 @@ extern "C" { /* fft functions*/ /* function pointers*/ + extern ne10_fft_cfg_float32_t (*ne10_fft_alloc_c2c_float32) (ne10_int32_t nfft); + extern void (*ne10_fft_c2c_1d_float32) (ne10_fft_cpx_float32_t *fout, ne10_fft_cpx_float32_t *fin, ne10_fft_cfg_float32_t cfg, @@ -92,7 +94,6 @@ extern "C" { ne10_int32_t scaled_flag); /* init functions*/ - extern ne10_fft_cfg_float32_t ne10_fft_alloc_c2c_float32 (ne10_int32_t nfft); extern ne10_fft_cfg_int32_t ne10_fft_alloc_c2c_int32 (ne10_int32_t nfft); extern ne10_fft_cfg_int16_t ne10_fft_alloc_c2c_int16 (ne10_int32_t nfft); @@ -101,6 +102,8 @@ extern "C" { extern ne10_fft_r2c_cfg_int16_t ne10_fft_alloc_r2c_int16 (ne10_int32_t nfft); /* C version*/ + extern ne10_fft_cfg_float32_t ne10_fft_alloc_c2c_float32_c (ne10_int32_t nfft); + extern void ne10_fft_c2c_1d_float32_c (ne10_fft_cpx_float32_t *fout, ne10_fft_cpx_float32_t *fin, ne10_fft_cfg_float32_t cfg, @@ -148,6 +151,8 @@ extern "C" { /* NEON version*/ + extern ne10_fft_cfg_float32_t ne10_fft_alloc_c2c_float32_neon (ne10_int32_t nfft); + extern void ne10_fft_c2c_1d_float32_neon (ne10_fft_cpx_float32_t *fout, ne10_fft_cpx_float32_t *fin, ne10_fft_cfg_float32_t cfg, diff --git a/inc/NE10_types.h b/inc/NE10_types.h index bd8781f..09760bc 100644 --- a/inc/NE10_types.h +++ b/inc/NE10_types.h @@ -223,6 +223,7 @@ typedef struct ne10_int32_t *factors; ne10_fft_cpx_float32_t *twiddles; ne10_fft_cpx_float32_t *buffer; + ne10_fft_cpx_float32_t *last_twiddles; } ne10_fft_state_float32_t; typedef ne10_fft_state_float32_t* ne10_fft_cfg_float32_t; diff --git a/modules/CMakeLists.txt b/modules/CMakeLists.txt index 035d514..02c6a2e 100644 --- a/modules/CMakeLists.txt +++ b/modules/CMakeLists.txt @@ -190,6 +190,7 @@ if(NE10_ENABLE_DSP) ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_float32.neon.c ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_int32.neon.c ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_int16.neon.c + ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_generic_float32.neonintrinsic.cpp #${PROJECT_SOURCE_DIR}/modules/dsp/NE10_rfft_float32.neonintrinsic.c #${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_float32.neonintrinsic.c #${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_int32.neonintrinsic.c diff --git a/modules/dsp/NE10_fft.c b/modules/dsp/NE10_fft.c index b719d50..72ad82f 100644 --- a/modules/dsp/NE10_fft.c +++ b/modules/dsp/NE10_fft.c @@ -51,28 +51,82 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND /* factors buffer: * 0: stage number * 1: stride for the first stage - * 2*(stage number+1): algorithm flag - * others: factors */ -ne10_int32_t ne10_factor (ne10_int32_t n, ne10_int32_t * facbuf) + * 2*stage number+2: algorithm flag to imply whether the generic algorithm works. + * others: factors + * + * Only the leading 42 int32 is used to store factors. + * The left can be used as algorithm flags, or status flags. + * Even the leading bits of stage number can be reused. + * */ +ne10_int32_t ne10_factor (ne10_int32_t n, + ne10_int32_t * facbuf, + ne10_int32_t ne10_factor_flags) { + // This is a walk around. We need to "return" some flags. + // Otherwise, we need to modify signature of ne10_factor. 
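    /*
     * Illustrative sketch only (not part of this patch): how a caller reads
     * the factorisation back out of facbuf once ne10_factor() returns. The
     * local names below are examples, not existing NE10 identifiers.
     *
     *     ne10_int32_t facbuf[NE10_MAXFACTORS * 2];
     *     if (ne10_factor (nfft, facbuf, NE10_FACTOR_DEFAULT) == NE10_OK)
     *     {
     *         ne10_int32_t stage_count  = facbuf[0];
     *         ne10_int32_t first_stride = facbuf[1];
     *         ne10_int32_t alg_flag     = facbuf[2 * (stage_count + 1)];
     *         // alg_flag == NE10_FFT_ALG_24 : only radix-2/4 stages produced
     *         // alg_flag == NE10_FFT_ALG_ANY: factors such as 3 or 5 appeared,
     *         //                               so the generic butterflies are used
     *     }
     */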
+ assert (NE10_MAXFACTORS >= 32); + + if ((facbuf == NULL) + || (n <= 0)) + { + return NE10_ERR; + } + ne10_int32_t p; ne10_int32_t i = 1; ne10_int32_t stage_num = 0; ne10_int32_t stride_max = n; - /* factor out powers of 4, 2, 5, 3, and other */ + // Default algorithm flag is NE10_FFT_ALG_24 + ne10_int32_t alg_flag = NE10_FFT_ALG_24; + + // Factor out powers of 4, 2, 5, 3, and other. do { - if ( (n % 4) == 0) + // If NE10_FACTOR_FLAGS has enable NE10_FACTOR_EIGHT. + // Try to combine one radix-4 and one radix-2 stages + // into one radix-8 stage. + if ((ne10_factor_flags & NE10_FACTOR_EIGHT) + && ((n==8) || (n==40) || (n==24))) + { + switch (n) + { + case 8: + p = 8; + break; + case 40: + p = 5; + alg_flag = NE10_FFT_ALG_ANY; + break; + case 24: + p = 3; + alg_flag = NE10_FFT_ALG_ANY; + break; + } + } + else if ((n % 4) == 0) + { p = 4; - else if ( (n % 2) == 0) + } + else if ((n % 2) == 0) + { p = 2; - else if ( (n % 5) == 0) + } + else if ((n % 5) == 0) + { p = 5; - else if ( (n % 3) == 0) + alg_flag = NE10_FFT_ALG_ANY; + } + else if ((n % 3) == 0) + { p = 3; + alg_flag = NE10_FFT_ALG_ANY; + } else // stop factoring + { p = n; + alg_flag = NE10_FFT_ALG_ANY; + } n /= p; facbuf[2 * i] = p; @@ -84,76 +138,167 @@ ne10_int32_t ne10_factor (ne10_int32_t n, ne10_int32_t * facbuf) facbuf[0] = stage_num; facbuf[1] = stride_max / p; - /* set flag for algorithm */ - if ( ((p % 2) == 0) && (stride_max >= 4) ) - { - // last factor is 4 or 2 - // and nfft >= 4 - facbuf[2 * i] = NE10_FFT_ALG_24; - } - else - { - facbuf[2 * i] = NE10_FFT_ALG_ANY; - } - - if ( stride_max < 3 ) + if (stage_num > 21) { - // not support yet + // Since nfft is ne10_int32_t, stage_num can never be greater than 21, + // because 3^21 > 2^32 return NE10_ERR; } - else + + facbuf[2 * i] = alg_flag; + return NE10_OK; +} + +// Twiddles matrix [mstride][radix-1] +// First column (k == 0)is ignored because phase == 1, and +// twiddle = (1.0, 0.0). 
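// Worked example (illustrative note, not part of the patch): for one stage
// with radix R, twiddle stride fstride and output stride mstride = m, the
// line generated below stores
//
//     twiddles[m * (k - 1) + j].r = cos(phase)
//     twiddles[m * (k - 1) + j].i = sin(phase)
//     phase = -2 * pi * fstride * k * j / nfft,   k = 1..R-1,  j = 0..m-1
//
// i.e. rows are indexed by the radix index k and columns by the butterfly
// index j, and the k = 0 row is skipped because it is always (1.0, 0.0).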
+static void ne10_fft_generate_twiddles_line_float32 (ne10_fft_cpx_float32_t * twiddles, + const ne10_int32_t mstride, + const ne10_int32_t fstride, + const ne10_int32_t radix, + const ne10_int32_t nfft) +{ + ne10_int32_t j, k; + ne10_float32_t phase; + const ne10_float64_t pi = NE10_PI; + + for (j = 0; j < mstride; j++) { - return NE10_OK; - } + for (k = 1; k < radix; k++) // phase = 1 when k = 0 + { + phase = -2 * pi * fstride * k * j / nfft; + twiddles[mstride * (k - 1) + j].r = (ne10_float32_t) cos (phase); + twiddles[mstride * (k - 1) + j].i = (ne10_float32_t) sin (phase); + } // radix + } // mstride } ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_float32 (ne10_fft_cpx_float32_t * twiddles, const ne10_int32_t * factors, const ne10_int32_t nfft ) { - ne10_int32_t i, j, k; - ne10_fft_cpx_float32_t *tw; ne10_int32_t stage_count = factors[0]; ne10_int32_t fstride = factors[1]; ne10_int32_t mstride; ne10_int32_t cur_radix; // current radix - const ne10_float64_t pi = NE10_PI; - ne10_float32_t phase; - // for first stage - i = stage_count; - cur_radix = factors[2 * i]; - if (cur_radix%2) // current radix is not 4 or 2 + cur_radix = factors[2 * stage_count]; + if (cur_radix % 2) // current radix is not 4 or 2 { - for (k = 0; k < cur_radix; k++) - { - phase = -2 * pi * k / cur_radix; - twiddles[k].r = (ne10_float32_t) cos (phase); - twiddles[k].i = (ne10_float32_t) sin (phase); - } - twiddles += cur_radix; + twiddles[0].r = 1.0; + twiddles[0].i = 0.0; + twiddles += 1; + ne10_fft_generate_twiddles_line_float32 (twiddles, 1, fstride, cur_radix, nfft); + twiddles += cur_radix - 1; } + stage_count --; // for other stage - for (i = stage_count - 1; i > 0; i--) + for (; stage_count > 0; stage_count --) { - cur_radix = factors[2 * i]; + cur_radix = factors[2 * stage_count]; fstride /= cur_radix; - mstride = factors[2 * i + 1]; - tw = twiddles; - for (j = 0; j < mstride; j++) - { - for (k = 1; k < cur_radix; k++ ) // phase = 1 when k = 0 - { - phase = -2 * pi * fstride * k * j / nfft; - tw[mstride * ( k - 1 )].r = (ne10_float32_t) cos (phase); - tw[mstride * ( k - 1 )].i = (ne10_float32_t) sin (phase); - } // cur_radix - tw ++; - } // mstride + mstride = factors[2 * stage_count + 1]; + ne10_fft_generate_twiddles_line_float32 (twiddles, mstride, fstride, cur_radix, nfft); twiddles += mstride * (cur_radix - 1); } // stage_count return twiddles; } + +/** + * @addtogroup C2C_FFT_IFFT + * @{ + */ + +/** + * @brief User-callable function to allocate all necessary storage space for the fft. + * @param[in] nfft length of FFT + * @return st point to the FFT config memory. This memory is allocated with malloc. + * The function allocate all necessary storage space for the fft. It also factors out the length of FFT and generates the twiddle coeff. + */ +ne10_fft_cfg_float32_t ne10_fft_alloc_c2c_float32_neon (ne10_int32_t nfft) +{ + ne10_fft_cfg_float32_t st = NULL; + ne10_uint32_t memneeded = sizeof (ne10_fft_state_float32_t) + + sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2) /* factors*/ + + sizeof (ne10_fft_cpx_float32_t) * nfft /* twiddle*/ + + sizeof (ne10_fft_cpx_float32_t) * nfft /* buffer*/ + + NE10_FFT_BYTE_ALIGNMENT; /* 64-bit alignment*/ + + st = (ne10_fft_cfg_float32_t) NE10_MALLOC (memneeded); + + // Bad allocation. 
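    /*
     * Layout sketch of the single allocation above (illustrative; offsets
     * follow the memneeded computation, the padding comes from
     * NE10_BYTE_ALIGNMENT below):
     *
     *     st -> | state struct | pad | factors: 2 * NE10_MAXFACTORS int32 |
     *           | twiddles: nfft cpx | buffer: nfft cpx |
     *
     * When nfft is a multiple of NE10_FFT_PARA_LEVEL (4), the code below
     * reconfigures the state as an nfft/4-point transform plus one final
     * radix-4 stage whose coefficients are pointed to by last_twiddles.
     */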
+ if (st == NULL) + { + return st; + } + + uintptr_t address = (uintptr_t) st + sizeof (ne10_fft_state_float32_t); + NE10_BYTE_ALIGNMENT (address, NE10_FFT_BYTE_ALIGNMENT); + st->factors = (ne10_int32_t*) address; + st->twiddles = (ne10_fft_cpx_float32_t*) (st->factors + (NE10_MAXFACTORS * 2)); + st->buffer = st->twiddles + nfft; + + // st->last_twiddles is default NULL. + // Calling fft_c or fft_neon is decided by this pointers. + st->last_twiddles = NULL; + + st->nfft = nfft; + if (nfft % NE10_FFT_PARA_LEVEL == 0) + { + // Size of FFT satisfies requirement of NEON optimization. + st->nfft /= NE10_FFT_PARA_LEVEL; + st->last_twiddles = st->twiddles + nfft / NE10_FFT_PARA_LEVEL; + } + + ne10_int32_t result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_DEFAULT); + + // Can not factor. + if (result == NE10_ERR) + { + NE10_FREE (st); + return st; + } + + // Check if radix-8 can be enabled + ne10_int32_t stage_count = st->factors[0]; + ne10_int32_t algorithm_flag = st->factors[2 * (stage_count + 1)]; + + // Enable radix-8. + if (algorithm_flag == NE10_FFT_ALG_ANY) + { + result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_EIGHT); + if (result == NE10_ERR) + { + NE10_FREE (st); + return st; + } + ne10_fft_generate_twiddles_float32 (st->twiddles, st->factors, st->nfft); + } + else + { + st->last_twiddles = NULL; + st->nfft = nfft; + result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_DEFAULT); + ne10_fft_generate_twiddles_float32 (st->twiddles, st->factors, st->nfft); + return st; + } + + // Generate super twiddles for the last stage. + if (nfft % NE10_FFT_PARA_LEVEL == 0) + { + // Size of FFT satisfies requirement of NEON optimization. + ne10_fft_generate_twiddles_line_float32 (st->last_twiddles, + st->nfft, + 1, + NE10_FFT_PARA_LEVEL, + nfft); + } + return st; +} + +/** + * @} + */ //end of C2C_FFT_IFFT group diff --git a/modules/dsp/NE10_fft.h b/modules/dsp/NE10_fft.h index 1ad6461..0d29c5a 100644 --- a/modules/dsp/NE10_fft.h +++ b/modules/dsp/NE10_fft.h @@ -49,10 +49,18 @@ #define NE10_FFT_ALG_24 0 #define NE10_FFT_ALG_ANY 1 +/* NE10_FACTOR_FLAGS */ +// Only factors into 2, 3, 4, 5 +#define NE10_FACTOR_DEFAULT 0 +// Factors into 2, 3, 4, 5, 8 +#define NE10_FACTOR_EIGHT 1 + // Comment when do not want to scale output result #define NE10_DSP_RFFT_SCALING #define NE10_DSP_CFFT_SCALING +#define NE10_FFT_PARA_LEVEL 4 + #ifdef __cplusplus extern "C" { #endif @@ -62,7 +70,7 @@ extern "C" { /////////////////////////// /*common fft functions */ - extern ne10_int32_t ne10_factor (ne10_int32_t n, ne10_int32_t * facbuf); + extern ne10_int32_t ne10_factor (ne10_int32_t n, ne10_int32_t * facbuf, ne10_int32_t ne10_factor_flags); extern ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_float32 (ne10_fft_cpx_float32_t * twiddles, const ne10_int32_t * factors, @@ -81,6 +89,18 @@ extern "C" { const ne10_fft_cpx_float32_t * twiddles, ne10_fft_cpx_float32_t * buffer); + extern void ne10_mixed_radix_generic_butterfly_float32_neon (ne10_fft_cpx_float32_t * Fout, + const ne10_fft_cpx_float32_t * Fin, + const ne10_int32_t * factors, + const ne10_fft_cpx_float32_t * twiddles, + ne10_fft_cpx_float32_t * buffer); + + extern void ne10_mixed_radix_generic_butterfly_inverse_float32_neon (ne10_fft_cpx_float32_t * Fout, + const ne10_fft_cpx_float32_t * Fin, + const ne10_int32_t * factors, + const ne10_fft_cpx_float32_t * twiddles, + ne10_fft_cpx_float32_t * buffer); + extern void ne10_mixed_radix_fft_forward_float32_neon (ne10_fft_cpx_float32_t * Fout, ne10_fft_cpx_float32_t * Fin, ne10_int32_t * factors, diff 
--git a/modules/dsp/NE10_fft.neonintrinsic.h b/modules/dsp/NE10_fft.neonintrinsic.h index d27f137..4e2983c 100644 --- a/modules/dsp/NE10_fft.neonintrinsic.h +++ b/modules/dsp/NE10_fft.neonintrinsic.h @@ -32,25 +32,24 @@ #ifndef NE10_FFT_NEONINTRINSIC_H #define NE10_FFT_NEONINTRINSIC_H +#include "NE10_fft.h" #include #define NE10_CPX_ADD_NEON_F32(Z,A,B) do { \ - Z.val[0] = vaddq_f32( A.val[0] , B.val[0] ); \ - Z.val[1] = vaddq_f32( A.val[1] , B.val[1] ); \ + Z.val[0] = A.val[0] + B.val[0]; \ + Z.val[1] = A.val[1] + B.val[1]; \ } while (0); #define NE10_CPX_SUB_NEON_F32(Z,A,B) do { \ - Z.val[0] = vsubq_f32( A.val[0] , B.val[0] ); \ - Z.val[1] = vsubq_f32( A.val[1] , B.val[1] ); \ + Z.val[0] = A.val[0] - B.val[0]; \ + Z.val[1] = A.val[1] - B.val[1]; \ } while (0); #define NE10_CPX_MUL_NEON_F32(Z,A,B) do { \ float32x4_t ARBR = vmulq_f32( A.val[0], B.val[0] ); \ - float32x4_t AIBI = vmulq_f32( A.val[1], B.val[1] ); \ float32x4_t ARBI = vmulq_f32( A.val[0], B.val[1] ); \ - float32x4_t AIBR = vmulq_f32( A.val[1], B.val[0] ); \ - Z.val[0] = vsubq_f32(ARBR,AIBI); \ - Z.val[1] = vaddq_f32(AIBR,ARBI); \ + Z.val[0] = vmlsq_f32(ARBR, A.val[1], B.val[1]); \ + Z.val[1] = vmlaq_f32(ARBI, A.val[1], B.val[0]); \ } while (0); #define NE10_CPX_MUL_INV_NEON_F32(Z,A,B) do { \ diff --git a/modules/dsp/NE10_fft_cplx_ops.h b/modules/dsp/NE10_fft_cplx_ops.h index 3eefe49..759f4e8 100644 --- a/modules/dsp/NE10_fft_cplx_ops.h +++ b/modules/dsp/NE10_fft_cplx_ops.h @@ -46,26 +46,25 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND #include -#ifdef __cplusplus -#include -#endif - #ifndef NE10_FFT_CPLX_OPS_H #define NE10_FFT_CPLX_OPS_H #ifdef __cplusplus +template +static inline void ne10_swap_ptr(T *&ptr_a, T *&ptr_b) +{ + ptr_a = (T *)((intptr_t)(ptr_a) ^ (intptr_t)(ptr_b)); + ptr_b = (T *)((intptr_t)(ptr_a) ^ (intptr_t)(ptr_b)); + ptr_a = (T *)((intptr_t)(ptr_a) ^ (intptr_t)(ptr_b)); +} +#else #define ne10_swap_ptr(X,Y) \ do { \ - std::swap((X),(Y)); \ + X = (void *)((intptr_t)(X) ^ (intptr_t)(Y)); \ + Y = (void *)((intptr_t)(X) ^ (intptr_t)(Y)); \ + X = (void *)((intptr_t)(X) ^ (intptr_t)(Y)); \ } while (0) -#else // __cplusplus -#define ne10_swap_ptr(X,Y) \ - do { \ - void *ptr = (X); \ - (X) = (Y); \ - (Y) = ptr; \ - } while (0) -#endif // __cplusplus +#endif // Multiply scalar X by scalar Y #define NE10_S_MUL(X,Y) ((X) * (Y)) diff --git a/modules/dsp/NE10_fft_float32.c b/modules/dsp/NE10_fft_float32.c index 5b776f8..f03cd40 100644 --- a/modules/dsp/NE10_fft_float32.c +++ b/modules/dsp/NE10_fft_float32.c @@ -996,7 +996,7 @@ static void ne10_fft_split_c2r_1d_float32 (ne10_fft_cpx_float32_t *dst, * @return st point to the FFT config memory. This memory is allocated with malloc. * The function allocate all necessary storage space for the fft. It also factors out the length of FFT and generates the twiddle coeff. 
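 *
 * A minimal usage sketch (illustrative only; error handling is trimmed, see
 * test/test_suite_fft_float32.c for the complete usage):
 *
 *     ne10_fft_cfg_float32_t cfg = ne10_fft_alloc_c2c_float32 (nfft);
 *     if (cfg != NULL)
 *     {
 *         // src and dst are caller-provided arrays of nfft complex floats
 *         ne10_fft_c2c_1d_float32 (dst, src, cfg, 0); // 0: FFT, 1: IFFT
 *         NE10_FREE (cfg);
 *     }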
*/ -ne10_fft_cfg_float32_t ne10_fft_alloc_c2c_float32 (ne10_int32_t nfft) +ne10_fft_cfg_float32_t ne10_fft_alloc_c2c_float32_c (ne10_int32_t nfft) { ne10_fft_cfg_float32_t st = NULL; ne10_uint32_t memneeded = sizeof (ne10_fft_state_float32_t) @@ -1007,25 +1007,45 @@ ne10_fft_cfg_float32_t ne10_fft_alloc_c2c_float32 (ne10_int32_t nfft) st = (ne10_fft_cfg_float32_t) NE10_MALLOC (memneeded); - if (st) + if (st == NULL) { - uintptr_t address = (uintptr_t) st + sizeof (ne10_fft_state_float32_t); - NE10_BYTE_ALIGNMENT (address, NE10_FFT_BYTE_ALIGNMENT); - st->factors = (ne10_int32_t*) address; - st->twiddles = (ne10_fft_cpx_float32_t*) (st->factors + (NE10_MAXFACTORS * 2)); - st->buffer = st->twiddles + nfft; - st->nfft = nfft; + return st; + } - ne10_int32_t result = ne10_factor (nfft, st->factors); - if (result == NE10_ERR) + uintptr_t address = (uintptr_t) st + sizeof (ne10_fft_state_float32_t); + NE10_BYTE_ALIGNMENT (address, NE10_FFT_BYTE_ALIGNMENT); + st->factors = (ne10_int32_t*) address; + st->twiddles = (ne10_fft_cpx_float32_t*) (st->factors + (NE10_MAXFACTORS * 2)); + st->buffer = st->twiddles + nfft; + st->nfft = nfft; + + ne10_int32_t result = ne10_factor (nfft, st->factors, NE10_FACTOR_DEFAULT); + if (result == NE10_ERR) + { + NE10_FREE (st); + return st; + } + + // Check if ALGORITHM FLAG is NE10_FFT_ALG_ANY. + { + ne10_int32_t stage_count = st->factors[0]; + ne10_int32_t algorithm_flag = st->factors[2 * (stage_count + 1)]; + + // Enable radix-8. + if (algorithm_flag == NE10_FFT_ALG_ANY) { - NE10_FREE (st); - return st; + result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_EIGHT); + if (result == NE10_ERR) + { + PRINT_HIT; + NE10_FREE (st); + return st; + } } + } - ne10_fft_generate_twiddles_float32 (st->twiddles, st->factors, nfft); + ne10_fft_generate_twiddles_float32 (st->twiddles, st->factors, nfft); - } return st; } @@ -1194,7 +1214,7 @@ ne10_fft_r2c_cfg_float32_t ne10_fft_alloc_r2c_float32 (ne10_int32_t nfft) st->buffer = st->super_twiddles + (ncfft / 2); st->ncfft = ncfft; - ne10_int32_t result = ne10_factor (ncfft, st->factors); + ne10_int32_t result = ne10_factor (ncfft, st->factors, NE10_FACTOR_DEFAULT); if (result == NE10_ERR) { NE10_FREE (st); diff --git a/modules/dsp/NE10_fft_float32.neon.c b/modules/dsp/NE10_fft_float32.neon.c index 96fd9e0..8247e0b 100644 --- a/modules/dsp/NE10_fft_float32.neon.c +++ b/modules/dsp/NE10_fft_float32.neon.c @@ -34,6 +34,7 @@ #include "NE10_types.h" #include "NE10_macros.h" #include "NE10_fft.h" +#include "NE10_dsp.h" static inline void ne10_fft4_forward_float32 (ne10_fft_cpx_float32_t * Fout, ne10_fft_cpx_float32_t * Fin) @@ -661,13 +662,13 @@ static void ne10_fft_split_c2r_1d_float32_neon (ne10_fft_cpx_float32_t *dst, */ /** - * @brief Mixed radix-2/4 complex FFT/IFFT of float(32-bit) data. + * @brief Mixed radix-2/3/4/5 complex FFT/IFFT of float(32-bit) data. * @param[out] *fout point to the output buffer (out-of-place) * @param[in] *fin point to the input buffer (out-of-place) * @param[in] cfg point to the config struct * @param[in] inverse_fft the flag of IFFT, 0: FFT, 1: IFFT * @return none. - * The function implements a mixed radix-2/4 complex FFT/IFFT. The length of 2^N(N is 1, 2, 3, 4, 5, 6 ....etc) is supported. + * The function implements a mixed radix-2/3/4/5 complex FFT/IFFT. The length of 2^N*3^M*5^K(N,M,K are 1, 2, 3, 4, 5, 6 ....etc, and length >= 4) is supported. * Otherwise, this FFT is an out-of-place algorithm. 
When you want to get an in-place FFT, it creates a temp buffer as * output buffer and then copies the temp buffer back to input buffer. For the usage of this function, please check test/test_suite_fft_float32.c */ @@ -677,6 +678,31 @@ void ne10_fft_c2c_1d_float32_neon (ne10_fft_cpx_float32_t *fout, ne10_fft_cfg_float32_t cfg, ne10_int32_t inverse_fft) { + ne10_int32_t stage_count = cfg->factors[0]; + ne10_int32_t algorithm_flag = cfg->factors[2 * (stage_count + 1)]; + + assert ((algorithm_flag == NE10_FFT_ALG_24) + || (algorithm_flag == NE10_FFT_ALG_ANY)); + + // For NE10_FFT_ALG_ANY. + // Function will return inside this branch. + if (algorithm_flag == NE10_FFT_ALG_ANY) + { + if (inverse_fft) + { + ne10_mixed_radix_generic_butterfly_inverse_float32_neon (fout, fin, + cfg->factors, cfg->twiddles, cfg->buffer); + } + else + { + ne10_mixed_radix_generic_butterfly_float32_neon (fout, fin, + cfg->factors, cfg->twiddles, cfg->buffer); + } + return; + } + + // Since function goes pass assertion and skips branch above, algorithm_flag + // must be NE10_FFT_ALG_24. if (inverse_fft) { switch (cfg->nfft) diff --git a/modules/dsp/NE10_fft_float32.neonintrinsic.c b/modules/dsp/NE10_fft_float32.neonintrinsic.c index f1c8eb2..f7b576c 100644 --- a/modules/dsp/NE10_fft_float32.neonintrinsic.c +++ b/modules/dsp/NE10_fft_float32.neonintrinsic.c @@ -34,6 +34,7 @@ #include "NE10_types.h" #include "NE10_macros.h" #include "NE10_fft.h" +#include "NE10_dsp.h" static inline void ne10_fft4_forward_float32 (ne10_fft_cpx_float32_t * Fout, ne10_fft_cpx_float32_t * Fin) @@ -1401,7 +1402,6 @@ void ne10_mixed_radix_fft_backward_float32_neon (ne10_fft_cpx_float32_t * Fout, // end of first stage } - // others but the last one for (; stage_count > 1 ; stage_count--) { @@ -1446,13 +1446,13 @@ void ne10_mixed_radix_fft_backward_float32_neon (ne10_fft_cpx_float32_t * Fout, */ /** - * @brief Mixed radix-2/4 complex FFT/IFFT of float(32-bit) data. + * @brief Mixed radix-2/3/4/5 complex FFT/IFFT of float(32-bit) data. * @param[out] *fout point to the output buffer (out-of-place) * @param[in] *fin point to the input buffer (out-of-place) * @param[in] cfg point to the config struct * @param[in] inverse_fft the flag of IFFT, 0: FFT, 1: IFFT * @return none. - * The function implements a mixed radix-2/4 complex FFT/IFFT. The length of 2^N(N is 1, 2, 3, 4, 5, 6 ....etc) is supported. + * The function implements a mixed radix-2/3/4/5 complex FFT/IFFT. The length of 2^N*3^M*5^K(N,M,K are 1, 2, 3, 4, 5, 6 ....etc, and length >= 4) is supported. * Otherwise, this FFT is an out-of-place algorithm. When you want to get an in-place FFT, it creates a temp buffer as * output buffer and then copies the temp buffer back to input buffer. For the usage of this function, please check test/test_suite_fft_float32.c */ @@ -1461,6 +1461,31 @@ void ne10_fft_c2c_1d_float32_neon (ne10_fft_cpx_float32_t *fout, ne10_fft_cfg_float32_t cfg, ne10_int32_t inverse_fft) { + ne10_int32_t stage_count = cfg->factors[0]; + ne10_int32_t algorithm_flag = cfg->factors[2 * (stage_count + 1)]; + + assert ((algorithm_flag == NE10_FFT_ALG_24) + || (algorithm_flag == NE10_FFT_ALG_ANY)); + + // For NE10_FFT_ALG_ANY. + // Function will return inside this branch. 
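    // NE10_FFT_ALG_ANY dispatches to the generic mixed-radix NEON butterflies
    // added by this patch, which process four complex samples per vector
    // (CPLX is float32x4x2_t); NE10_FFT_ALG_24 falls through to the existing
    // radix-2/4 kernels below.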
+ if (algorithm_flag == NE10_FFT_ALG_ANY) + { + if (inverse_fft) + { + ne10_mixed_radix_generic_butterfly_inverse_float32_neon (fout, fin, + cfg->factors, cfg->twiddles, cfg->buffer); + } + else + { + ne10_mixed_radix_generic_butterfly_float32_neon (fout, fin, + cfg->factors, cfg->twiddles, cfg->buffer); + } + return; + } + + // Since function goes pass assertion and skips branch above, algorithm_flag + // must be NE10_FFT_ALG_24. if (inverse_fft) { switch (cfg->nfft) diff --git a/modules/dsp/NE10_fft_generic_float32.c b/modules/dsp/NE10_fft_generic_float32.c index 699fd4f..f6171a8 100644 --- a/modules/dsp/NE10_fft_generic_float32.c +++ b/modules/dsp/NE10_fft_generic_float32.c @@ -47,171 +47,7 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND #include "NE10_types.h" #include "NE10_macros.h" #include "NE10_fft.h" - -/////////////////////////////// -// Multiply input with twiddles -/////////////////////////////// -static inline void FFT2_MUL_TW (ne10_fft_cpx_float32_t scratch_out[2], - const ne10_fft_cpx_float32_t scratch_in[2], - const ne10_fft_cpx_float32_t scratch_tw[1]) -{ - scratch_out[0] = scratch_in[0]; - NE10_CPX_MUL_F32 (scratch_out[1], scratch_in[1], scratch_tw[0]); -} - -static inline void FFT3_MUL_TW (ne10_fft_cpx_float32_t scratch_out[3], - const ne10_fft_cpx_float32_t scratch_in[3], - const ne10_fft_cpx_float32_t scratch_tw[2]) -{ - FFT2_MUL_TW (scratch_out, scratch_in, scratch_tw); - NE10_CPX_MUL_F32 (scratch_out[2], scratch_in[2], scratch_tw[1]); -} - -static inline void FFT4_MUL_TW (ne10_fft_cpx_float32_t scratch_out[4], - const ne10_fft_cpx_float32_t scratch_in[4], - const ne10_fft_cpx_float32_t scratch_tw[3]) -{ - FFT3_MUL_TW (scratch_out, scratch_in, scratch_tw); - NE10_CPX_MUL_F32 (scratch_out[3], scratch_in[3], scratch_tw[2]); -} - -static inline void FFT5_MUL_TW (ne10_fft_cpx_float32_t scratch_out[5], - const ne10_fft_cpx_float32_t scratch_in[5], - const ne10_fft_cpx_float32_t scratch_tw[4]) -{ - FFT4_MUL_TW (scratch_out, scratch_in, scratch_tw); - NE10_CPX_MUL_F32 (scratch_out[4], scratch_in[4], scratch_tw[3]); -} - -/////////////////// -// FFT Kernel -// F: Forward -// C: Complex -// U: Unscaled -////////////////// -static inline void FFT2_FCU (ne10_fft_cpx_float32_t scratch_out[2], - const ne10_fft_cpx_float32_t scratch_in[2]) -{ - NE10_CPX_ADD (scratch_out[0], scratch_in[0], scratch_in[1]); - NE10_CPX_SUB (scratch_out[1], scratch_in[0], scratch_in[1]); -} - -static inline void FFT3_FCU (ne10_fft_cpx_float32_t Fout[3], - const ne10_fft_cpx_float32_t Fin[3]) -{ - ne10_fft_cpx_float32_t scratch[4]; - ne10_fft_cpx_float32_t scratch_in[3]; - - scratch_in[0] = Fin[0]; - scratch_in[1] = Fin[1]; - scratch_in[2] = Fin[2]; - - scratch[1] = scratch_in[1]; - scratch[2] = scratch_in[2]; - - NE10_CPX_ADD (scratch[3], scratch[1], scratch[2]); - NE10_CPX_SUB (scratch[0], scratch[1], scratch[2]); - - scratch_in[1].r = scratch_in[0].r - scratch[3].r * 0.5; - scratch_in[1].i = scratch_in[0].i - scratch[3].i * 0.5; - - scratch[0].r *= -TW_3I_F32; - scratch[0].i *= -TW_3I_F32; - - scratch_in[0].r += scratch[3].r; - scratch_in[0].i += scratch[3].i; - - scratch_in[2].r = scratch_in[1].r + scratch[0].i; - scratch_in[2].i = scratch_in[1].i - scratch[0].r; - - scratch_in[1].r -= scratch[0].i; - scratch_in[1].i += scratch[0].r; - - Fout[0] = scratch_in[0]; - Fout[1] = scratch_in[1]; - Fout[2] = scratch_in[2]; -} - -static inline void FFT4_FCU (ne10_fft_cpx_float32_t scratch_out[4], - const ne10_fft_cpx_float32_t scratch_in[4]) -{ - ne10_fft_cpx_float32_t 
scratch[4]; - - NE10_CPX_ADD (scratch[0], scratch_in[0], scratch_in[2]); - NE10_CPX_SUB (scratch[1], scratch_in[0], scratch_in[2]); - NE10_CPX_ADD (scratch[2], scratch_in[1], scratch_in[3]); - NE10_CPX_SUB (scratch[3], scratch_in[1], scratch_in[3]); - - NE10_CPX_SUB (scratch_out[2], scratch[0], scratch[2]); - NE10_CPX_ADD (scratch_out[0], scratch[0], scratch[2]); - - scratch_out[1].r = scratch[1].r + scratch[3].i; - scratch_out[1].i = scratch[1].i - scratch[3].r; - scratch_out[3].r = scratch[1].r - scratch[3].i; - scratch_out[3].i = scratch[1].i + scratch[3].r; -} - -static inline void FFT5_FCU (ne10_fft_cpx_float32_t Fout[5], - const ne10_fft_cpx_float32_t Fin[5]) -{ - ne10_fft_cpx_float32_t scratch[13], scratch_in[5]; - - scratch_in[0] = Fin[0]; - scratch_in[1] = Fin[1]; - scratch_in[2] = Fin[2]; - scratch_in[3] = Fin[3]; - scratch_in[4] = Fin[4]; - - scratch[0] = scratch_in[0]; - scratch[1] = scratch_in[1]; - scratch[2] = scratch_in[2]; - scratch[3] = scratch_in[3]; - scratch[4] = scratch_in[4]; - - NE10_CPX_ADD (scratch[ 7], scratch[1], scratch[4]); - NE10_CPX_SUB (scratch[10], scratch[1], scratch[4]); - NE10_CPX_ADD (scratch[ 8], scratch[2], scratch[3]); - NE10_CPX_SUB (scratch[ 9], scratch[2], scratch[3]); - - scratch_in[0].r += scratch[7].r + scratch[8].r; - scratch_in[0].i += scratch[7].i + scratch[8].i; - - scratch[5].r = scratch[0].r - + NE10_S_MUL (scratch[7].r, TW_5A_F32.r) - + NE10_S_MUL (scratch[8].r, TW_5B_F32.r); - scratch[5].i = scratch[0].i - + NE10_S_MUL (scratch[7].i, TW_5A_F32.r) - + NE10_S_MUL (scratch[8].i, TW_5B_F32.r); - - scratch[6].r = NE10_S_MUL (scratch[10].i, TW_5A_F32.i) - + NE10_S_MUL (scratch[9].i, TW_5B_F32.i); - scratch[6].i = -NE10_S_MUL (scratch[10].r, TW_5A_F32.i) - - NE10_S_MUL (scratch[9].r, TW_5B_F32.i); - - NE10_CPX_SUB (scratch_in[1], scratch[5], scratch[6]); - NE10_CPX_ADD (scratch_in[4], scratch[5], scratch[6]); - - scratch[11].r = scratch[0].r - + NE10_S_MUL (scratch[7].r, TW_5B_F32.r) - + NE10_S_MUL (scratch[8].r, TW_5A_F32.r); - scratch[11].i = scratch[0].i - + NE10_S_MUL (scratch[7].i, TW_5B_F32.r) - + NE10_S_MUL (scratch[8].i, TW_5A_F32.r); - - scratch[12].r = -NE10_S_MUL (scratch[10].i, TW_5B_F32.i) - + NE10_S_MUL (scratch[9].i, TW_5A_F32.i); - scratch[12].i = NE10_S_MUL (scratch[10].r, TW_5B_F32.i) - - NE10_S_MUL (scratch[9].r, TW_5A_F32.i); - - NE10_CPX_ADD (scratch_in[2], scratch[11], scratch[12]); - NE10_CPX_SUB (scratch_in[3], scratch[11], scratch[12]); - - Fout[0] = scratch_in[0]; - Fout[1] = scratch_in[1]; - Fout[2] = scratch_in[2]; - Fout[3] = scratch_in[3]; - Fout[4] = scratch_in[4]; -} +#include "NE10_fft_generic_float32.h" //////////////////////////////////// // Following are butterfly functions @@ -402,6 +238,102 @@ static inline void ne10_radix_4_butterfly_float32_c (ne10_fft_cpx_float32_t *Fou } } +static inline void ne10_radix_8_butterfly_float32_c (ne10_fft_cpx_float32_t *Fout, + const ne10_fft_cpx_float32_t *Fin, + const ne10_fft_cpx_float32_t *twiddles, + const ne10_int32_t fstride, + const ne10_int32_t out_step, + const ne10_int32_t nfft, + const ne10_int32_t is_first_stage, + const ne10_int32_t is_inverse) +{ + assert (is_first_stage == 1); + + ne10_fft_cpx_float32_t scratch_in[8]; + ne10_fft_cpx_float32_t scratch_out[8]; + + const ne10_int32_t in_step = nfft / 8; + ne10_int32_t f_count; + ne10_int32_t m_count; + + for (f_count = fstride; f_count > 0; f_count--) + { + for (m_count = out_step; m_count > 0; m_count--) + { + scratch_in[0] = Fin[0 * in_step]; + scratch_in[1] = Fin[1 * in_step]; + scratch_in[2] = Fin[2 * 
in_step]; + scratch_in[3] = Fin[3 * in_step]; + scratch_in[4] = Fin[4 * in_step]; + scratch_in[5] = Fin[5 * in_step]; + scratch_in[6] = Fin[6 * in_step]; + scratch_in[7] = Fin[7 * in_step]; + + if (is_inverse) + { + scratch_in[0].i = -scratch_in[0].i; + scratch_in[1].i = -scratch_in[1].i; + scratch_in[2].i = -scratch_in[2].i; + scratch_in[3].i = -scratch_in[3].i; + scratch_in[4].i = -scratch_in[4].i; + scratch_in[5].i = -scratch_in[5].i; + scratch_in[6].i = -scratch_in[6].i; + scratch_in[7].i = -scratch_in[7].i; + } + +#ifdef NE10_DSP_CFFT_SCALING + if (is_inverse) + { + const ne10_float32_t one_by_nfft = 1.0 / nfft; + + scratch_in[0].r *= one_by_nfft; + scratch_in[0].i *= one_by_nfft; + scratch_in[1].r *= one_by_nfft; + scratch_in[1].i *= one_by_nfft; + scratch_in[2].r *= one_by_nfft; + scratch_in[2].i *= one_by_nfft; + scratch_in[3].r *= one_by_nfft; + scratch_in[3].i *= one_by_nfft; + scratch_in[4].r *= one_by_nfft; + scratch_in[4].i *= one_by_nfft; + scratch_in[5].r *= one_by_nfft; + scratch_in[5].i *= one_by_nfft; + scratch_in[6].r *= one_by_nfft; + scratch_in[6].i *= one_by_nfft; + scratch_in[7].r *= one_by_nfft; + scratch_in[7].i *= one_by_nfft; + } +#endif + + FFT8_FCU (scratch_out, scratch_in); + + if (is_inverse) + { + scratch_out[0].i = -scratch_out[0].i; + scratch_out[1].i = -scratch_out[1].i; + scratch_out[2].i = -scratch_out[2].i; + scratch_out[3].i = -scratch_out[3].i; + scratch_out[4].i = -scratch_out[4].i; + scratch_out[5].i = -scratch_out[5].i; + scratch_out[6].i = -scratch_out[6].i; + scratch_out[7].i = -scratch_out[7].i; + } + + Fout[0*out_step] = scratch_out[0]; + Fout[1*out_step] = scratch_out[1]; + Fout[2*out_step] = scratch_out[2]; + Fout[3*out_step] = scratch_out[3]; + Fout[4*out_step] = scratch_out[4]; + Fout[5*out_step] = scratch_out[5]; + Fout[6*out_step] = scratch_out[6]; + Fout[7*out_step] = scratch_out[7]; + + Fin++; + Fout += 8; + } + } +} + static inline void ne10_radix_3_butterfly_float32_c (ne10_fft_cpx_float32_t *Fout, const ne10_fft_cpx_float32_t *Fin, const ne10_fft_cpx_float32_t *twiddles, @@ -705,6 +637,9 @@ static inline void ne10_mixed_radix_generic_butterfly_float32_impl_c (ne10_fft_c case 5: ne10_radix_5_butterfly_float32_c (Fout, Fin, NULL, fstride, 1, nfft, 1, is_inverse); break; + break; + case 8: + ne10_radix_8_butterfly_float32_c (Fout, Fin, NULL, fstride, 1, nfft, 1, is_inverse); default: ne10_radix_generic_butterfly_float32_c (Fout, Fin, twiddles, radix, fstride, 1, is_inverse); diff --git a/modules/dsp/NE10_fft_generic_float32.h b/modules/dsp/NE10_fft_generic_float32.h new file mode 100644 index 0000000..60f18b0 --- /dev/null +++ b/modules/dsp/NE10_fft_generic_float32.h @@ -0,0 +1,309 @@ +/* + * Copyright 2014 ARM Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of ARM Limited nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* license of Kiss FFT */ +/* +Copyright (c) 2003-2010, Mark Borgerding + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +/* + * NE10 Library : dsp/NE10_fft_generic_float32.h + */ + +#ifndef NE10_FFT_GENERIC_FLOAT32_H +#define NE10_FFT_GENERIC_FLOAT32_H + +#include "NE10_types.h" +#include "NE10_macros.h" +#include "NE10_fft.h" + +/////////////////////////////// +// Multiply input with twiddles +/////////////////////////////// +static inline void FFT2_MUL_TW (ne10_fft_cpx_float32_t scratch_out[2], + const ne10_fft_cpx_float32_t scratch_in[2], + const ne10_fft_cpx_float32_t scratch_tw[1]) +{ + scratch_out[0] = scratch_in[0]; + NE10_CPX_MUL_F32 (scratch_out[1], scratch_in[1], scratch_tw[0]); +} + +static inline void FFT3_MUL_TW (ne10_fft_cpx_float32_t scratch_out[3], + const ne10_fft_cpx_float32_t scratch_in[3], + const ne10_fft_cpx_float32_t scratch_tw[2]) +{ + FFT2_MUL_TW (scratch_out, scratch_in, scratch_tw); + NE10_CPX_MUL_F32 (scratch_out[2], scratch_in[2], scratch_tw[1]); +} + +static inline void FFT4_MUL_TW (ne10_fft_cpx_float32_t scratch_out[4], + const ne10_fft_cpx_float32_t scratch_in[4], + const ne10_fft_cpx_float32_t scratch_tw[3]) +{ + FFT3_MUL_TW (scratch_out, scratch_in, scratch_tw); + NE10_CPX_MUL_F32 (scratch_out[3], scratch_in[3], scratch_tw[2]); +} + +static inline void FFT8_FCU (ne10_fft_cpx_float32_t out[8], + const ne10_fft_cpx_float32_t in[8]) +{ + ne10_fft_cpx_float32_t s[8]; + + const static ne10_fft_cpx_float32_t TW_8[4] = { + { 1.00000, 0.00000 }, + { 0.70711, -0.70711 }, + { 0.00000, -1.00000 }, + { -0.70711, -0.70711 }, + }; + +#define NE10_BUTTERFLY_INDEX_F32(OUT,IN,OUT_I,OUT_J,IN_I,IN_J) \ + do { \ + NE10_CPX_ADD (OUT[OUT_I],IN[IN_I],IN[IN_J]); \ + NE10_CPX_SUB (OUT[OUT_J],IN[IN_I],IN[IN_J]); \ + } while (0) + + // STAGE - 1 + // in -> s + { + NE10_BUTTERFLY_INDEX_F32 (s,in,0,4,0,4); + NE10_BUTTERFLY_INDEX_F32 (s,in,1,5,1,5); + NE10_BUTTERFLY_INDEX_F32 (s,in,2,6,2,6); + NE10_BUTTERFLY_INDEX_F32 (s,in,3,7,3,7); + } + + // STAGE - 2 + // s -> out + { + // TW +#define NE10_CPX_MUL_TW8_F32(OUT,TW_8_TABLE,OUT_I,TW_J) \ + do { \ + ne10_fft_cpx_float32_t TW_TMP = TW_8_TABLE[TW_J]; \ + NE10_CPX_MUL_F32 (OUT[OUT_I],OUT[OUT_I],TW_TMP); \ + } while (0) + + NE10_CPX_MUL_TW8_F32 (s,TW_8,4,0); + NE10_CPX_MUL_TW8_F32 (s,TW_8,5,1); + NE10_CPX_MUL_TW8_F32 (s,TW_8,6,2); + NE10_CPX_MUL_TW8_F32 (s,TW_8,7,3); + + NE10_BUTTERFLY_INDEX_F32 (out,s,0,2,0,2); + NE10_BUTTERFLY_INDEX_F32 (out,s,1,3,1,3); + NE10_BUTTERFLY_INDEX_F32 (out,s,4,6,4,6); + NE10_BUTTERFLY_INDEX_F32 (out,s,5,7,5,7); + } + // STAGE - 3 + // out -> s + { + // TW + NE10_CPX_MUL_TW8_F32 (out,TW_8,2,0); + NE10_CPX_MUL_TW8_F32 (out,TW_8,3,2); + NE10_CPX_MUL_TW8_F32 (out,TW_8,6,0); + NE10_CPX_MUL_TW8_F32 (out,TW_8,7,2); +#undef NE10_CPX_MUL_TW8_F32 + + NE10_BUTTERFLY_INDEX_F32 (s,out,0,4,0,1); + NE10_BUTTERFLY_INDEX_F32 (s,out,2,6,2,3); + NE10_BUTTERFLY_INDEX_F32 (s,out,1,5,4,5); + NE10_BUTTERFLY_INDEX_F32 (s,out,3,7,6,7); + } + + out[0] = s[0]; + out[1] = s[1]; + out[2] = s[2]; + out[3] = s[3]; + out[4] = s[4]; + out[5] = s[5]; + out[6] = s[6]; + out[7] = s[7]; +} + +static inline void FFT5_MUL_TW (ne10_fft_cpx_float32_t scratch_out[5], + const ne10_fft_cpx_float32_t scratch_in[5], + const ne10_fft_cpx_float32_t scratch_tw[4]) +{ + FFT4_MUL_TW (scratch_out, scratch_in, scratch_tw); + NE10_CPX_MUL_F32 (scratch_out[4], scratch_in[4], scratch_tw[3]); +} + +/////////////////// +// FFT Kernel +// F: Forward +// C: Complex +// U: Unscaled +////////////////// +static inline void FFT2_FCU (ne10_fft_cpx_float32_t scratch_out[2], + const ne10_fft_cpx_float32_t scratch_in[2]) +{ + NE10_CPX_ADD (scratch_out[0], scratch_in[0], scratch_in[1]); + 
NE10_CPX_SUB (scratch_out[1], scratch_in[0], scratch_in[1]); +} + +static inline void FFT3_FCU (ne10_fft_cpx_float32_t Fout[3], + const ne10_fft_cpx_float32_t Fin[3]) +{ + ne10_fft_cpx_float32_t scratch[4]; + ne10_fft_cpx_float32_t scratch_in[3]; + + scratch_in[0] = Fin[0]; + scratch_in[1] = Fin[1]; + scratch_in[2] = Fin[2]; + + scratch[1] = scratch_in[1]; + scratch[2] = scratch_in[2]; + + NE10_CPX_ADD (scratch[3], scratch[1], scratch[2]); + NE10_CPX_SUB (scratch[0], scratch[1], scratch[2]); + + scratch_in[1].r = scratch_in[0].r - scratch[3].r * 0.5; + scratch_in[1].i = scratch_in[0].i - scratch[3].i * 0.5; + + scratch[0].r *= -TW_3I_F32; + scratch[0].i *= -TW_3I_F32; + + scratch_in[0].r += scratch[3].r; + scratch_in[0].i += scratch[3].i; + + scratch_in[2].r = scratch_in[1].r + scratch[0].i; + scratch_in[2].i = scratch_in[1].i - scratch[0].r; + + scratch_in[1].r -= scratch[0].i; + scratch_in[1].i += scratch[0].r; + + Fout[0] = scratch_in[0]; + Fout[1] = scratch_in[1]; + Fout[2] = scratch_in[2]; +} + +static inline void FFT4_FCU (ne10_fft_cpx_float32_t scratch_out[4], + const ne10_fft_cpx_float32_t scratch_in[4]) +{ + ne10_fft_cpx_float32_t scratch[4]; + + NE10_CPX_ADD (scratch[0], scratch_in[0], scratch_in[2]); + NE10_CPX_SUB (scratch[1], scratch_in[0], scratch_in[2]); + NE10_CPX_ADD (scratch[2], scratch_in[1], scratch_in[3]); + NE10_CPX_SUB (scratch[3], scratch_in[1], scratch_in[3]); + + NE10_CPX_SUB (scratch_out[2], scratch[0], scratch[2]); + NE10_CPX_ADD (scratch_out[0], scratch[0], scratch[2]); + + scratch_out[1].r = scratch[1].r + scratch[3].i; + scratch_out[1].i = scratch[1].i - scratch[3].r; + scratch_out[3].r = scratch[1].r - scratch[3].i; + scratch_out[3].i = scratch[1].i + scratch[3].r; +} + +static inline void FFT4_FCU_INPLACE (ne10_fft_cpx_float32_t scratch_out[4]) +{ + ne10_fft_cpx_float32_t scratch[4]; + + NE10_CPX_ADD (scratch[0], scratch_out[0], scratch_out[2]); + NE10_CPX_SUB (scratch[1], scratch_out[0], scratch_out[2]); + NE10_CPX_ADD (scratch[2], scratch_out[1], scratch_out[3]); + NE10_CPX_SUB (scratch[3], scratch_out[1], scratch_out[3]); + + NE10_CPX_SUB (scratch_out[2], scratch[0], scratch[2]); + NE10_CPX_ADD (scratch_out[0], scratch[0], scratch[2]); + + scratch_out[1].r = scratch[1].r + scratch[3].i; + scratch_out[1].i = scratch[1].i - scratch[3].r; + scratch_out[3].r = scratch[1].r - scratch[3].i; + scratch_out[3].i = scratch[1].i + scratch[3].r; +} + +static inline void FFT5_FCU (ne10_fft_cpx_float32_t Fout[5], + const ne10_fft_cpx_float32_t Fin[5]) +{ + ne10_fft_cpx_float32_t scratch[13], scratch_in[5]; + + scratch_in[0] = Fin[0]; + scratch_in[1] = Fin[1]; + scratch_in[2] = Fin[2]; + scratch_in[3] = Fin[3]; + scratch_in[4] = Fin[4]; + + scratch[0] = scratch_in[0]; + scratch[1] = scratch_in[1]; + scratch[2] = scratch_in[2]; + scratch[3] = scratch_in[3]; + scratch[4] = scratch_in[4]; + + NE10_CPX_ADD (scratch[ 7], scratch[1], scratch[4]); + NE10_CPX_SUB (scratch[10], scratch[1], scratch[4]); + NE10_CPX_ADD (scratch[ 8], scratch[2], scratch[3]); + NE10_CPX_SUB (scratch[ 9], scratch[2], scratch[3]); + + scratch_in[0].r += scratch[7].r + scratch[8].r; + scratch_in[0].i += scratch[7].i + scratch[8].i; + + scratch[5].r = scratch[0].r + + NE10_S_MUL (scratch[7].r, TW_5A_F32.r) + + NE10_S_MUL (scratch[8].r, TW_5B_F32.r); + scratch[5].i = scratch[0].i + + NE10_S_MUL (scratch[7].i, TW_5A_F32.r) + + NE10_S_MUL (scratch[8].i, TW_5B_F32.r); + + scratch[6].r = NE10_S_MUL (scratch[10].i, TW_5A_F32.i) + + NE10_S_MUL (scratch[9].i, TW_5B_F32.i); + scratch[6].i = -NE10_S_MUL 
(scratch[10].r, TW_5A_F32.i) + - NE10_S_MUL (scratch[9].r, TW_5B_F32.i); + + NE10_CPX_SUB (scratch_in[1], scratch[5], scratch[6]); + NE10_CPX_ADD (scratch_in[4], scratch[5], scratch[6]); + + scratch[11].r = scratch[0].r + + NE10_S_MUL (scratch[7].r, TW_5B_F32.r) + + NE10_S_MUL (scratch[8].r, TW_5A_F32.r); + scratch[11].i = scratch[0].i + + NE10_S_MUL (scratch[7].i, TW_5B_F32.r) + + NE10_S_MUL (scratch[8].i, TW_5A_F32.r); + + scratch[12].r = -NE10_S_MUL (scratch[10].i, TW_5B_F32.i) + + NE10_S_MUL (scratch[9].i, TW_5A_F32.i); + scratch[12].i = NE10_S_MUL (scratch[10].r, TW_5B_F32.i) + - NE10_S_MUL (scratch[9].r, TW_5A_F32.i); + + NE10_CPX_ADD (scratch_in[2], scratch[11], scratch[12]); + NE10_CPX_SUB (scratch_in[3], scratch[11], scratch[12]); + + Fout[0] = scratch_in[0]; + Fout[1] = scratch_in[1]; + Fout[2] = scratch_in[2]; + Fout[3] = scratch_in[3]; + Fout[4] = scratch_in[4]; +} +#endif // NE10_FFT_GENERIC_FLOAT32_H diff --git a/modules/dsp/NE10_fft_generic_float32.neonintrinsic.cpp b/modules/dsp/NE10_fft_generic_float32.neonintrinsic.cpp new file mode 100644 index 0000000..804e6d6 --- /dev/null +++ b/modules/dsp/NE10_fft_generic_float32.neonintrinsic.cpp @@ -0,0 +1,1108 @@ +/* + * Copyright 2014 ARM Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of ARM Limited nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* license of Kiss FFT */ +/* +Copyright (c) 2003-2010, Mark Borgerding + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * NE10 Library : dsp/NE10_fft_generic_float32.neonintrisic.cpp + * + * This file must be compiled by C++ toolchain because some functions are + * written as template functions to make it easier for compiler to + * reduce branch jump. + */ + +#include "NE10_types.h" +#include "NE10_macros.h" +#include "NE10_fft.neonintrinsic.h" +#include "NE10_fft_generic_float32.h" + +typedef float32x4x2_t CPLX; +typedef float32x4_t REAL; +#define NE10_REAL_DUP_NEON_F32 vdupq_n_f32 +#define NE10_CPLX_LOAD(PTR) vld2q_f32 ((ne10_float32_t*) (PTR)) +#define NE10_CPLX_STORE(PTR,OUT) \ + do { \ + vst2q_f32 ((ne10_float32_t*) (PTR), OUT); \ + } while (0) + +static inline void NE10_LOAD_TW_AND_MUL (CPLX &scratch_in, + const ne10_fft_cpx_float32_t *ptr_in) +{ + CPLX scratch_tw; + float32x2_t d2_tmp = vld1_f32 ((ne10_float32_t *)ptr_in); + scratch_tw.val[0] = NE10_REAL_DUP_NEON_F32 (d2_tmp[0]); + scratch_tw.val[1] = NE10_REAL_DUP_NEON_F32 (d2_tmp[1]); + NE10_CPX_MUL_NEON_F32 (scratch_in, scratch_in, scratch_tw); +} + +static inline REAL NE10_S_MUL_NEON_F32 (const REAL vec, + const ne10_float32_t scalar) +{ + REAL scalar_neon = NE10_REAL_DUP_NEON_F32 (scalar); + REAL result = scalar_neon * vec; + return result; +} + +static inline REAL NE10_S_MLA_NEON_F32 (const REAL dst, + const REAL src, + const ne10_float32_t scalar) +{ + REAL scalar_neon = NE10_REAL_DUP_NEON_F32 (scalar); + return vmlaq_f32 (dst, src, scalar_neon); +} + +static inline REAL NE10_S_MLS_NEON_F32 (const REAL dst, + const REAL src, + const ne10_float32_t scalar) +{ + REAL scalar_neon = NE10_REAL_DUP_NEON_F32 (scalar); + return vmlsq_f32 (dst, src, scalar_neon); +} + +/////////////////////////////// +// Multiply input with twiddles +/////////////////////////////// +static inline void NE10_FFT2_MUL_TW_NEON (CPLX scratch_out[2], + const CPLX scratch_in[2], + const CPLX scratch_tw[1]) +{ + scratch_out[0] = scratch_in[0]; + NE10_CPX_MUL_NEON_F32 (scratch_out[1], scratch_in[1], scratch_tw[0]); +} + +static inline void NE10_FFT3_MUL_TW_NEON (CPLX scratch_out[3], + const CPLX scratch_in[3], + const CPLX scratch_tw[2]) +{ + NE10_FFT2_MUL_TW_NEON (scratch_out, scratch_in, scratch_tw); + NE10_CPX_MUL_NEON_F32 (scratch_out[2], scratch_in[2], scratch_tw[1]); +} + +static inline void NE10_FFT4_MUL_TW_NEON (CPLX scratch_out[4], + const CPLX scratch_in[4], + const CPLX scratch_tw[3]) +{ + NE10_FFT3_MUL_TW_NEON (scratch_out, scratch_in, scratch_tw); + NE10_CPX_MUL_NEON_F32 (scratch_out[3], scratch_in[3], scratch_tw[2]); +} + +static inline void NE10_FFT5_MUL_TW_NEON (CPLX scratch_out[5], + const CPLX scratch_in[5], + const CPLX scratch_tw[4]) +{ + NE10_FFT4_MUL_TW_NEON (scratch_out, scratch_in, scratch_tw); + NE10_CPX_MUL_NEON_F32 (scratch_out[4], scratch_in[4], scratch_tw[3]); +} + +//////////////// +// Conj 
inplace. +//////////////// +static inline void NE10_FFT2_CONJ (CPLX scratch_out[2]) +{ + scratch_out[0].val[1] = -scratch_out[0].val[1]; + scratch_out[1].val[1] = -scratch_out[1].val[1]; +} + +static inline void NE10_FFT3_CONJ (CPLX scratch_out[3]) +{ + NE10_FFT2_CONJ (scratch_out); + scratch_out[2].val[1] = -scratch_out[2].val[1]; +} + +static inline void NE10_FFT4_CONJ (CPLX scratch_out[4]) +{ + NE10_FFT3_CONJ (scratch_out); + scratch_out[3].val[1] = -scratch_out[3].val[1]; +} + +static inline void NE10_FFT5_CONJ (CPLX scratch_out[5]) +{ + NE10_FFT4_CONJ (scratch_out); + scratch_out[4].val[1] = -scratch_out[4].val[1]; +} + +static inline void NE10_FFT8_CONJ (CPLX scratch_out[8]) +{ + NE10_FFT5_CONJ (scratch_out); + scratch_out[5].val[1] = -scratch_out[5].val[1]; + scratch_out[6].val[1] = -scratch_out[6].val[1]; + scratch_out[7].val[1] = -scratch_out[7].val[1]; +} + +///////////////////////////////////////////////////////////////////////////// +// Scaling +// If Macro NE10_DSP_CFFT_SCALING is not defined, these functions do nothing. +///////////////////////////////////////////////////////////////////////////// +static inline void NE10_FFT2_SCALING (CPLX scratch_out[2], + const REAL one_by_fft_neon) +{ +#ifdef NE10_DSP_CFFT_SCALING + scratch_out[0].val[0] *= one_by_fft_neon; + scratch_out[0].val[1] *= one_by_fft_neon; + scratch_out[1].val[0] *= one_by_fft_neon; + scratch_out[1].val[1] *= one_by_fft_neon; +#endif +} + +static inline void NE10_FFT3_SCALING (CPLX scratch_out[3], + const REAL one_by_fft_neon) +{ +#ifdef NE10_DSP_CFFT_SCALING + NE10_FFT2_SCALING (scratch_out, one_by_fft_neon); + scratch_out[2].val[0] *= one_by_fft_neon; + scratch_out[2].val[1] *= one_by_fft_neon; +#endif +} + +static inline void NE10_FFT4_SCALING (CPLX scratch_out[4], + const REAL one_by_fft_neon) +{ +#ifdef NE10_DSP_CFFT_SCALING + NE10_FFT3_SCALING (scratch_out, one_by_fft_neon); + scratch_out[3].val[0] *= one_by_fft_neon; + scratch_out[3].val[1] *= one_by_fft_neon; +#endif +} + +static inline void NE10_FFT5_SCALING (CPLX scratch_out[5], + const REAL one_by_fft_neon) +{ +#ifdef NE10_DSP_CFFT_SCALING + NE10_FFT4_SCALING (scratch_out, one_by_fft_neon); + scratch_out[4].val[0] *= one_by_fft_neon; + scratch_out[4].val[1] *= one_by_fft_neon; +#endif +} + +static inline void NE10_FFT8_SCALING (CPLX scratch_out[8], + const REAL one_by_fft_neon) +{ +#ifdef NE10_DSP_CFFT_SCALING + NE10_FFT5_SCALING (scratch_out, one_by_fft_neon); + scratch_out[5].val[0] *= one_by_fft_neon; + scratch_out[5].val[1] *= one_by_fft_neon; + scratch_out[6].val[0] *= one_by_fft_neon; + scratch_out[6].val[1] *= one_by_fft_neon; + scratch_out[7].val[0] *= one_by_fft_neon; + scratch_out[7].val[1] *= one_by_fft_neon; +#endif +} + +/////////////////// +// FFT Kernel +// F: Forward +// C: Complex +// U: Unscaled +////////////////// +static inline void NE10_FFT2_FUC_NEON_F32 (CPLX scratch_out[2], + const CPLX scratch_in[2]) +{ + NE10_CPX_ADD_NEON_F32 (scratch_out[0], scratch_in[0], scratch_in[1]); + NE10_CPX_SUB_NEON_F32 (scratch_out[1], scratch_in[0], scratch_in[1]); +} + +static inline void NE10_FFT3_FUC_NEON_F32 (CPLX Fout[3], + const CPLX Fin[3]) +{ + const float32x4_t TW_3IN_NEON_F32 = vdupq_n_f32 (TW_3IN_F32); + const float32x4_t HALF_NEON_F32 = vdupq_n_f32 (0.5f); + + CPLX scratch[4]; + + Fout[0] = Fin[0]; + Fout[1] = Fin[1]; + Fout[2] = Fin[2]; + + scratch[1] = Fout[1]; + scratch[2] = Fout[2]; + + NE10_CPX_ADD_NEON_F32 (scratch[3], scratch[1], scratch[2]); + NE10_CPX_SUB_NEON_F32 (scratch[0], scratch[1], scratch[2]); + + Fout[1].val[0] = 
Fout[0].val[0] - scratch[3].val[0] * HALF_NEON_F32; + Fout[1].val[1] = Fout[0].val[1] - scratch[3].val[1] * HALF_NEON_F32; + + scratch[0].val[0] = scratch[0].val[0] * TW_3IN_NEON_F32; + scratch[0].val[1] = scratch[0].val[1] * TW_3IN_NEON_F32; + + Fout[0].val[0] += scratch[3].val[0]; + Fout[0].val[1] += scratch[3].val[1]; + + Fout[2].val[0] = Fout[1].val[0] + scratch[0].val[1]; + Fout[2].val[1] = Fout[1].val[1] - scratch[0].val[0]; + + Fout[1].val[0] -= scratch[0].val[1]; + Fout[1].val[1] += scratch[0].val[0]; +} + +static inline void NE10_FFT4_FUC_NEON_F32 (CPLX scratch_out[4], + const CPLX scratch_in[4]) +{ + CPLX scratch[4]; + + NE10_CPX_ADD_NEON_F32 (scratch[0], scratch_in[0], scratch_in[2]); + NE10_CPX_SUB_NEON_F32 (scratch[1], scratch_in[0], scratch_in[2]); + NE10_CPX_ADD_NEON_F32 (scratch[2], scratch_in[1], scratch_in[3]); + NE10_CPX_SUB_NEON_F32 (scratch[3], scratch_in[1], scratch_in[3]); + + NE10_CPX_SUB_NEON_F32 (scratch_out[2], scratch[0], scratch[2]); + NE10_CPX_ADD_NEON_F32 (scratch_out[0], scratch[0], scratch[2]); + + scratch_out[1].val[0] = scratch[1].val[0] + scratch[3].val[1]; + scratch_out[1].val[1] = scratch[1].val[1] - scratch[3].val[0]; + scratch_out[3].val[0] = scratch[1].val[0] - scratch[3].val[1]; + scratch_out[3].val[1] = scratch[1].val[1] + scratch[3].val[0]; +} + +static inline void NE10_FFT4_FUC_INPLACE_NEON_F32 (CPLX scratch_out[4]) +{ + CPLX scratch[4]; + + NE10_CPX_ADD_NEON_F32 (scratch[0], scratch_out[0], scratch_out[2]); + NE10_CPX_SUB_NEON_F32 (scratch[1], scratch_out[0], scratch_out[2]); + NE10_CPX_ADD_NEON_F32 (scratch[2], scratch_out[1], scratch_out[3]); + NE10_CPX_SUB_NEON_F32 (scratch[3], scratch_out[1], scratch_out[3]); + + NE10_CPX_SUB_NEON_F32 (scratch_out[2], scratch[0], scratch[2]); + NE10_CPX_ADD_NEON_F32 (scratch_out[0], scratch[0], scratch[2]); + + scratch_out[1].val[0] = scratch[1].val[0] + scratch[3].val[1]; + scratch_out[1].val[1] = scratch[1].val[1] - scratch[3].val[0]; + scratch_out[3].val[0] = scratch[1].val[0] - scratch[3].val[1]; + scratch_out[3].val[1] = scratch[1].val[1] + scratch[3].val[0]; +} + +static inline void NE10_FFT5_FUC_INPLACE_NEON_F32 (CPLX Fout[5]) +{ + CPLX s[6]; + + NE10_CPX_ADD_NEON_F32 (s[1], Fout[1], Fout[4]); + NE10_CPX_ADD_NEON_F32 (s[2], Fout[2], Fout[3]); + + s[0] = Fout[0]; + s[5] = Fout[0]; + + Fout[0].val[0] = Fout[0].val[0] + s[1].val[0] + s[2].val[0]; + Fout[0].val[1] = Fout[0].val[1] + s[1].val[1] + s[2].val[1]; + + s[0].val[0] = NE10_S_MLA_NEON_F32 (s[0].val[0], s[1].val[0], TW_5A_F32.r); + s[0].val[1] = NE10_S_MLA_NEON_F32 (s[0].val[1], s[1].val[1], TW_5A_F32.r); + s[5].val[0] = NE10_S_MLA_NEON_F32 (s[5].val[0], s[1].val[0], TW_5B_F32.r); + s[5].val[1] = NE10_S_MLA_NEON_F32 (s[5].val[1], s[1].val[1], TW_5B_F32.r); + + s[0].val[0] = NE10_S_MLA_NEON_F32 (s[0].val[0], s[2].val[0], TW_5B_F32.r); + s[0].val[1] = NE10_S_MLA_NEON_F32 (s[0].val[1], s[2].val[1], TW_5B_F32.r); + s[5].val[0] = NE10_S_MLA_NEON_F32 (s[5].val[0], s[2].val[0], TW_5A_F32.r); + s[5].val[1] = NE10_S_MLA_NEON_F32 (s[5].val[1], s[2].val[1], TW_5A_F32.r); + + NE10_CPX_SUB_NEON_F32 (s[4], Fout[1], Fout[4]); + NE10_CPX_SUB_NEON_F32 (s[3], Fout[2], Fout[3]); + + s[1].val[0] = NE10_S_MUL_NEON_F32 (s[4].val[1], TW_5A_F32.i); + s[1].val[1] = -NE10_S_MUL_NEON_F32 (s[4].val[0], TW_5A_F32.i); + s[2].val[0] = -NE10_S_MUL_NEON_F32 (s[4].val[1], TW_5B_F32.i); + s[2].val[1] = NE10_S_MUL_NEON_F32 (s[4].val[0], TW_5B_F32.i); + + s[1].val[0] = NE10_S_MLA_NEON_F32 (s[1].val[0], s[3].val[1], TW_5B_F32.i); + s[1].val[1] = NE10_S_MLS_NEON_F32 (s[1].val[1], 
s[3].val[0], TW_5B_F32.i); + s[2].val[0] = NE10_S_MLA_NEON_F32 (s[2].val[0], s[3].val[1], TW_5A_F32.i); + s[2].val[1] = NE10_S_MLS_NEON_F32 (s[2].val[1], s[3].val[0], TW_5A_F32.i); + + NE10_CPX_SUB_NEON_F32 (Fout[1], s[0], s[1]); + NE10_CPX_ADD_NEON_F32 (Fout[4], s[0], s[1]); + NE10_CPX_ADD_NEON_F32 (Fout[2], s[5], s[2]); + NE10_CPX_SUB_NEON_F32 (Fout[3], s[5], s[2]); +} + +#define NE10_BUTTERFLY_INDEX_NEON_F32(OUT,IN,OUT_I,OUT_J,IN_I,IN_J) \ + do { \ + NE10_CPX_ADD_NEON_F32 (OUT[OUT_I],IN[IN_I],IN[IN_J]); \ + NE10_CPX_SUB_NEON_F32 (OUT[OUT_J],IN[IN_I],IN[IN_J]); \ + } while (0) + +static inline void NE10_FFT8_FUC_NEON_F32 (CPLX out[8], + const CPLX in[8]) +{ + PRINT_HIT; + + CPLX s[8]; + const static ne10_fft_cpx_float32_t TW_8[4] = + { + { 1.00000, 0.00000 }, + { 0.70711, -0.70711 }, + { 0.00000, -1.00000 }, + { -0.70711, -0.70711 }, + }; + + // STAGE - 1 + // in -> s + { + NE10_BUTTERFLY_INDEX_NEON_F32 (s, in, 0, 4, 0, 4); + NE10_BUTTERFLY_INDEX_NEON_F32 (s, in, 1, 5, 1, 5); + NE10_BUTTERFLY_INDEX_NEON_F32 (s, in, 2, 6, 2, 6); + NE10_BUTTERFLY_INDEX_NEON_F32 (s, in, 3, 7, 3, 7); + } + + // STAGE - 2 + // s -> out + { + // TW +#define NE10_CPX_MUL_TW8_NEON_F32(OUT,TW_8_TABLE,OUT_I,TW_J) \ + do { \ + ne10_fft_cpx_float32_t TW_TMP = TW_8_TABLE[TW_J]; \ + CPLX TW_TMP_NEON; \ + TW_TMP_NEON.val[0] = NE10_REAL_DUP_NEON_F32 (TW_TMP.r); \ + TW_TMP_NEON.val[1] = NE10_REAL_DUP_NEON_F32 (TW_TMP.i); \ + NE10_CPX_MUL_NEON_F32 (OUT[OUT_I],OUT[OUT_I],TW_TMP_NEON); \ + } while (0) + + NE10_CPX_MUL_TW8_NEON_F32 (s, TW_8, 4, 0); + NE10_CPX_MUL_TW8_NEON_F32 (s, TW_8, 5, 1); + NE10_CPX_MUL_TW8_NEON_F32 (s, TW_8, 6, 2); + NE10_CPX_MUL_TW8_NEON_F32 (s, TW_8, 7, 3); + + NE10_BUTTERFLY_INDEX_NEON_F32 (out, s, 0, 2, 0, 2); + NE10_BUTTERFLY_INDEX_NEON_F32 (out, s, 1, 3, 1, 3); + NE10_BUTTERFLY_INDEX_NEON_F32 (out, s, 4, 6, 4, 6); + NE10_BUTTERFLY_INDEX_NEON_F32 (out, s, 5, 7, 5, 7); + } + // STAGE - 3 + // out -> s + { + // TW + NE10_CPX_MUL_TW8_NEON_F32 (out, TW_8, 2, 0); + NE10_CPX_MUL_TW8_NEON_F32 (out, TW_8, 3, 2); + NE10_CPX_MUL_TW8_NEON_F32 (out, TW_8, 6, 0); + NE10_CPX_MUL_TW8_NEON_F32 (out, TW_8, 7, 2); +#undef NE10_CPX_MUL_TW8_NEON_F32 + + NE10_BUTTERFLY_INDEX_NEON_F32 (s, out, 0, 4, 0, 1); + NE10_BUTTERFLY_INDEX_NEON_F32 (s, out, 2, 6, 2, 3); + NE10_BUTTERFLY_INDEX_NEON_F32 (s, out, 1, 5, 4, 5); + NE10_BUTTERFLY_INDEX_NEON_F32 (s, out, 3, 7, 6, 7); + } + + out[0] = s[0]; + out[1] = s[1]; + out[2] = s[2]; + out[3] = s[3]; + out[4] = s[4]; + out[5] = s[5]; + out[6] = s[6]; + out[7] = s[7]; +} + +//////////////////////////////////// +// Following are butterfly functions +//////////////////////////////////// +template +static void ne10_radix_2_butterfly_float32_neon (CPLX *Fout, + const CPLX *Fin, + const ne10_fft_cpx_float32_t *twiddles, + const ne10_int32_t fstride, + const ne10_int32_t out_step, + const ne10_int32_t nfft) +{ + PRINT_HIT; + + CPLX in[2]; + CPLX out[2]; + + const ne10_int32_t in_step = nfft / 2; + ne10_int32_t f_count; + ne10_int32_t m_count; + + const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft); + + for (f_count = fstride; f_count > 0; f_count--) + { + for (m_count = out_step; m_count > 0; m_count--) + { + in[0] = NE10_CPLX_LOAD (Fin + 0 * in_step); + in[1] = NE10_CPLX_LOAD (Fin + 1 * in_step); + + if (is_inverse == 1) + { + NE10_FFT2_CONJ (in); + } + + if (is_first_stage == 0) + { + NE10_LOAD_TW_AND_MUL (in[1], twiddles); + } + + NE10_FFT2_FUC_NEON_F32 (out, in); + + if (is_inverse == 1) + { + NE10_FFT2_CONJ (out); + + if (is_first_stage == 1) + { + NE10_FFT2_SCALING 
(out, one_by_fft_neon); + } + } + + NE10_CPLX_STORE (Fout + 0 * out_step, out[0]); + NE10_CPLX_STORE (Fout + 1 * out_step, out[1]); + + Fin++; + + if (is_first_stage == 0) + { + Fout++; + twiddles++; + } + else + { + Fout += 2; + } + } + if (is_first_stage == 0) + { + twiddles -= out_step; + Fout += (2 - 1) * out_step; + } + } +} +template +static void ne10_radix_4_butterfly_float32_neon (CPLX *Fout, + const CPLX *Fin, + const ne10_fft_cpx_float32_t *twiddles, + const ne10_int32_t fstride, + const ne10_int32_t out_step, + const ne10_int32_t nfft) +{ + CPLX in[4]; + + const ne10_int32_t in_step = nfft / 4; + ne10_int32_t f_count; + ne10_int32_t m_count; + + const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft); + + for (f_count = fstride; f_count > 0; f_count--) + { + for (m_count = out_step; m_count > 0; m_count--) + { + in[0] = NE10_CPLX_LOAD (Fin + 0 * in_step); + in[1] = NE10_CPLX_LOAD (Fin + 1 * in_step); + in[2] = NE10_CPLX_LOAD (Fin + 2 * in_step); + in[3] = NE10_CPLX_LOAD (Fin + 3 * in_step); + + if (is_inverse == 1) + { + NE10_FFT4_CONJ (in); + } + + if (is_first_stage == 0) + { + NE10_LOAD_TW_AND_MUL (in[1], twiddles); + NE10_LOAD_TW_AND_MUL (in[2], twiddles + out_step); + NE10_LOAD_TW_AND_MUL (in[3], twiddles + out_step * 2); + } + + NE10_FFT4_FUC_INPLACE_NEON_F32 (in); + + if (is_inverse == 1) + { + NE10_FFT4_CONJ (in); + if (is_first_stage == 1) + { + NE10_FFT4_SCALING (in, one_by_fft_neon); + } + } + + NE10_CPLX_STORE (Fout + 0 * out_step, in[0]); + NE10_CPLX_STORE (Fout + 1 * out_step, in[1]); + NE10_CPLX_STORE (Fout + 2 * out_step, in[2]); + NE10_CPLX_STORE (Fout + 3 * out_step, in[3]); + + Fin++; + + if (is_first_stage == 0) + { + Fout++; + twiddles++; + } + else + { + Fout += 4; + } + } + if (is_first_stage == 0) + { + twiddles -= out_step; + Fout += (4 - 1) * out_step; + } + } +} + +template +static void ne10_radix_3_butterfly_float32_neon (CPLX *Fout, + const CPLX *Fin, + const ne10_fft_cpx_float32_t *twiddles, + const ne10_int32_t fstride, + const ne10_int32_t out_step, + const ne10_int32_t nfft) +{ + CPLX in[3]; + CPLX out[3]; + + const ne10_int32_t in_step = nfft / 3; + ne10_int32_t f_count; + ne10_int32_t m_count; + + const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft); + + for (f_count = fstride; f_count > 0; f_count--) + { + for (m_count = out_step; m_count > 0; m_count--) + { + in[0] = NE10_CPLX_LOAD (Fin + 0 * in_step); + in[1] = NE10_CPLX_LOAD (Fin + 1 * in_step); + in[2] = NE10_CPLX_LOAD (Fin + 2 * in_step); + + if (is_inverse == 1) + { + NE10_FFT3_CONJ (in); + } + + if (is_first_stage == 0) + { + NE10_LOAD_TW_AND_MUL (in[1], twiddles); + NE10_LOAD_TW_AND_MUL (in[2], twiddles + out_step); + } + + NE10_FFT3_FUC_NEON_F32 (out, in); + + if (is_inverse == 1) + { + NE10_FFT3_CONJ (out); + if (is_first_stage == 1) + { + NE10_FFT3_SCALING (out, one_by_fft_neon); + } + } + + NE10_CPLX_STORE (Fout + 0 * out_step, out[0]); + NE10_CPLX_STORE (Fout + 1 * out_step, out[1]); + NE10_CPLX_STORE (Fout + 2 * out_step, out[2]); + + Fin++; + + if (is_first_stage == 0) + { + Fout++; + twiddles++; + } + else + { + Fout += 3; + } + } + if (is_first_stage == 0) + { + twiddles -= out_step; + Fout += (3 - 1) * out_step; + } + } +} + +template +static void ne10_radix_5_butterfly_float32_neon (CPLX *Fout, + const CPLX *Fin, + const ne10_fft_cpx_float32_t *twiddles, + const ne10_int32_t fstride, + const ne10_int32_t out_step, + const ne10_int32_t nfft) +{ + CPLX in[5]; + + const ne10_int32_t in_step = nfft / 5; + ne10_int32_t f_count; + ne10_int32_t m_count; + + 
const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft); + + for (f_count = fstride; f_count > 0; f_count--) + { + for (m_count = out_step; m_count > 0; m_count--) + { + in[0] = NE10_CPLX_LOAD (Fin + 0 * in_step); + in[1] = NE10_CPLX_LOAD (Fin + 1 * in_step); + in[2] = NE10_CPLX_LOAD (Fin + 2 * in_step); + in[3] = NE10_CPLX_LOAD (Fin + 3 * in_step); + in[4] = NE10_CPLX_LOAD (Fin + 4 * in_step); + + if (is_inverse == 1) + { + NE10_FFT5_CONJ (in); + } + + if (is_first_stage == 0) + { + NE10_LOAD_TW_AND_MUL (in[1], twiddles); + NE10_LOAD_TW_AND_MUL (in[2], twiddles + out_step); + NE10_LOAD_TW_AND_MUL (in[3], twiddles + out_step * 2); + NE10_LOAD_TW_AND_MUL (in[4], twiddles + out_step * 3); + } + + NE10_FFT5_FUC_INPLACE_NEON_F32 (in); + + if (is_inverse == 1) + { + NE10_FFT5_CONJ (in); + if (is_first_stage == 1) + { + NE10_FFT5_SCALING (in, one_by_fft_neon); + } + } + + NE10_CPLX_STORE (Fout + 0 * out_step, in[0]); + NE10_CPLX_STORE (Fout + 1 * out_step, in[1]); + NE10_CPLX_STORE (Fout + 2 * out_step, in[2]); + NE10_CPLX_STORE (Fout + 3 * out_step, in[3]); + NE10_CPLX_STORE (Fout + 4 * out_step, in[4]); + + Fin++; + + if (is_first_stage == 0) + { + Fout++; + twiddles++; + } + else + { + Fout += 5; + } + } + if (is_first_stage == 0) + { + twiddles -= out_step; + Fout += (5 - 1) * out_step; + } + } +} + +template +static void ne10_radix_8_butterfly_float32_neon (CPLX *Fout, + const CPLX *Fin, + const ne10_fft_cpx_float32_t *twiddles, + const ne10_int32_t fstride, + const ne10_int32_t out_step, + const ne10_int32_t nfft) +{ + PRINT_HIT; + + CPLX in[8]; + CPLX out[8]; + + const ne10_int32_t in_step = nfft / 8; + ne10_int32_t f_count; + ne10_int32_t m_count; + + const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft); + + for (f_count = fstride; f_count > 0; f_count--) + { + for (m_count = out_step; m_count > 0; m_count--) + { + in[0] = NE10_CPLX_LOAD (Fin + 0 * in_step); + in[1] = NE10_CPLX_LOAD (Fin + 1 * in_step); + in[2] = NE10_CPLX_LOAD (Fin + 2 * in_step); + in[3] = NE10_CPLX_LOAD (Fin + 3 * in_step); + in[4] = NE10_CPLX_LOAD (Fin + 4 * in_step); + in[5] = NE10_CPLX_LOAD (Fin + 5 * in_step); + in[6] = NE10_CPLX_LOAD (Fin + 6 * in_step); + in[7] = NE10_CPLX_LOAD (Fin + 7 * in_step); + + if (is_inverse == 1) + { + NE10_FFT8_CONJ (in); + } + + NE10_FFT8_FUC_NEON_F32 (out, in); + + if (is_inverse == 1) + { + NE10_FFT8_CONJ (out); + NE10_FFT8_SCALING (out, one_by_fft_neon); + } + + NE10_CPLX_STORE (Fout + 0 * out_step, out[0]); + NE10_CPLX_STORE (Fout + 1 * out_step, out[1]); + NE10_CPLX_STORE (Fout + 2 * out_step, out[2]); + NE10_CPLX_STORE (Fout + 3 * out_step, out[3]); + NE10_CPLX_STORE (Fout + 4 * out_step, out[4]); + NE10_CPLX_STORE (Fout + 5 * out_step, out[5]); + NE10_CPLX_STORE (Fout + 6 * out_step, out[6]); + NE10_CPLX_STORE (Fout + 7 * out_step, out[7]); + + Fin++; + Fout += 8; + } + } +} + +template +static void ne10_mixed_radix_generic_butterfly_float32_neon_impl (CPLX *Fout, + const CPLX *Fin, + const ne10_int32_t *factors, + const ne10_fft_cpx_float32_t *twiddles, + CPLX *buffer) +{ + PRINT_HIT; + + ne10_int32_t fstride, mstride, radix; + ne10_int32_t stage_count; + ne10_int32_t nfft; + + // init fstride, mstride, radix, nfft + stage_count = factors[0]; + fstride = factors[1]; + mstride = 1; + radix = factors[ stage_count << 1 ]; // radix of first stage + nfft = fstride * radix; + + PRINT_HIT; + + // swap to make sure output to Fout + if (stage_count % 2 == 0) + { + ne10_swap_ptr (buffer, Fout); + } + + // first stage + switch (radix) + { + case 2: + PRINT_HIT; + 
ne10_radix_2_butterfly_float32_neon<1, is_inverse> (Fout, Fin, NULL, + fstride, 1, nfft); + break; + case 4: + PRINT_HIT; + ne10_radix_4_butterfly_float32_neon<1, is_inverse> (Fout, Fin, NULL, + fstride, 1, nfft); + break; + case 3: + PRINT_HIT; + ne10_radix_3_butterfly_float32_neon<1, is_inverse> (Fout, Fin, NULL, + fstride, 1, nfft); + break; + case 5: + PRINT_HIT; + ne10_radix_5_butterfly_float32_neon<1, is_inverse> (Fout, Fin, NULL, + fstride, 1, nfft); + break; + case 8: + PRINT_HIT; + ne10_radix_8_butterfly_float32_neon<1, is_inverse> (Fout, Fin, NULL, + fstride, 1, nfft); + break; + } + + stage_count--; + if (! stage_count) // finish + { + return; + } + + mstride *= radix; + + // update radix + if (radix % 2) + { + twiddles += radix; + } + radix = factors[ stage_count << 1 ]; + + // radix of first stage, should be one of {2,3,5,4} + assert ((radix > 1) && (radix < 6)); + + // other stages + while (stage_count > 0) + { + ne10_swap_ptr (buffer, Fout); + + fstride /= radix; + switch (radix) + { + case 2: + ne10_radix_2_butterfly_float32_neon<0, is_inverse> (Fout, buffer, + twiddles, fstride, mstride, nfft); + break; + case 3: + ne10_radix_3_butterfly_float32_neon<0, is_inverse> (Fout, buffer, + twiddles, fstride, mstride, nfft); + break; + case 4: + ne10_radix_4_butterfly_float32_neon<0, is_inverse> (Fout, buffer, + twiddles, fstride, mstride, nfft); + break; + case 5: + ne10_radix_5_butterfly_float32_neon<0, is_inverse> (Fout, buffer, + twiddles, fstride, mstride, nfft); + break; + } // switch (radix) + + twiddles += mstride * (radix - 1); + mstride *= radix; + + stage_count--; + radix = factors[ stage_count << 1 ]; + + assert ((radix > 1) && (radix < 6)); + } // while (stage_count) +} + +template +static void ne10_c2c_1d_last_stage_neon (CPLX *Fout, + const CPLX *Fin, + const ne10_fft_cpx_float32_t *twiddles, + const ne10_int32_t fstride, + const ne10_int32_t out_step, + const ne10_int32_t nfft) +{ +#ifdef NE10_VERBOSE + // Clear Fout + int i; + for (i = 0; i < nfft; i++) + { + ((ne10_fft_cpx_float32_t *) Fout)[i].r = 0.0; + ((ne10_fft_cpx_float32_t *) Fout)[i].i = 0.0; + } +#endif + + ne10_int32_t f_count; + ne10_int32_t m_count; + + for (f_count = fstride; f_count > 0; f_count--) + { + CPLX scratch_in[4]; + CPLX scratch_out[4]; + + for (m_count = out_step / NE10_FFT_PARA_LEVEL; m_count > 0; m_count--) + { + scratch_in[0] = NE10_CPLX_LOAD (Fin + 0); + scratch_in[1] = NE10_CPLX_LOAD (Fin + 1); + scratch_in[2] = NE10_CPLX_LOAD (Fin + 2); + scratch_in[3] = NE10_CPLX_LOAD (Fin + 3); + + // Transpose + { + CPLX scratch0, scratch_in0; + CPLX scratch1, scratch_in1; + CPLX scratch2, scratch_in2; + CPLX scratch3, scratch_in3; + + scratch_in0 = scratch_in[0]; + scratch_in1 = scratch_in[1]; + scratch_in2 = scratch_in[2]; + scratch_in3 = scratch_in[3]; + + NE10_RADIX4X4C_TRANSPOSE_NEON (scratch, scratch_in); + + scratch_in[0] = scratch0; + scratch_in[1] = scratch1; + scratch_in[2] = scratch2; + scratch_in[3] = scratch3; + } + + if (is_inverse) + { + NE10_FFT4_CONJ (scratch_in); + } + + // Not first stage + { + CPLX scratch_tw[3]; + + scratch_tw[0] = NE10_CPLX_LOAD (twiddles + 0 * out_step); + scratch_tw[1] = NE10_CPLX_LOAD (twiddles + 1 * out_step); + scratch_tw[2] = NE10_CPLX_LOAD (twiddles + 2 * out_step); + + NE10_FFT4_MUL_TW_NEON (scratch_in, scratch_in, scratch_tw); + } + + NE10_FFT4_FUC_NEON_F32 (scratch_out, scratch_in); + + if (is_inverse == 1) + { + NE10_FFT4_CONJ (scratch_out); + } + + // Store. 
+ { + ne10_fft_cpx_float32_t *Fout_cpx; + Fout_cpx = (ne10_fft_cpx_float32_t *) Fout; + + NE10_CPLX_STORE (Fout_cpx + 0 * out_step, scratch_out[0]); + NE10_CPLX_STORE (Fout_cpx + 1 * out_step, scratch_out[1]); + NE10_CPLX_STORE (Fout_cpx + 2 * out_step, scratch_out[2]); + NE10_CPLX_STORE (Fout_cpx + 3 * out_step, scratch_out[3]); + } + + Fin += 4; + Fout += 1; + twiddles += 4; + } + } + + ne10_int32_t left_over = out_step % 4; + if (left_over == 0) + { + return; + } + + // Left over. + const ne10_fft_cpx_float32_t *Fin_s = (ne10_fft_cpx_float32_t *) Fin; + ne10_fft_cpx_float32_t *Fout_s = (ne10_fft_cpx_float32_t *) Fout; + for (m_count = out_step % 4; m_count > 0; m_count--) + { + ne10_fft_cpx_float32_t scratch_in[4]; + ne10_fft_cpx_float32_t scratch_tw[4]; + + scratch_in[0] = Fin_s[0]; + scratch_in[1] = Fin_s[1]; + scratch_in[2] = Fin_s[2]; + scratch_in[3] = Fin_s[3]; + + if (is_inverse) + { + scratch_in[0].i = -scratch_in[0].i; + scratch_in[1].i = -scratch_in[1].i; + scratch_in[2].i = -scratch_in[2].i; + scratch_in[3].i = -scratch_in[3].i; + } + + scratch_tw[0] = twiddles[0 * out_step]; + scratch_tw[1] = twiddles[1 * out_step]; + scratch_tw[2] = twiddles[2 * out_step]; + + FFT4_MUL_TW (scratch_in, scratch_in, scratch_tw); + + FFT4_FCU_INPLACE (scratch_in); + + if (is_inverse) + { + scratch_in[0].i = -scratch_in[0].i; + scratch_in[1].i = -scratch_in[1].i; + scratch_in[2].i = -scratch_in[2].i; + scratch_in[3].i = -scratch_in[3].i; + } + + Fout_s[0 * out_step] = scratch_in[0]; + Fout_s[1 * out_step] = scratch_in[1]; + Fout_s[2 * out_step] = scratch_in[2]; + Fout_s[3 * out_step] = scratch_in[3]; + + Fin_s += 4; + Fout_s += 1; + twiddles += 1; + } +} + +void ne10_mixed_radix_generic_butterfly_float32_neon ( + ne10_fft_cpx_float32_t *Fout, + const ne10_fft_cpx_float32_t *Fin, + const ne10_int32_t *factors, + const ne10_fft_cpx_float32_t *twiddles, + ne10_fft_cpx_float32_t *buffer) +{ + PRINT_HIT; + + ne10_int32_t stage_count = factors[0]; + ne10_int32_t fstride = factors[1]; + ne10_int32_t radix = factors[stage_count << 1]; // radix of first stage + + // nfft below is not the actual length of FFT, it is 1/4 of the actual one + // instead. + ne10_int32_t nfft = fstride * radix; + + ne10_mixed_radix_generic_butterfly_float32_neon_impl<0> ((CPLX *) buffer, + (const CPLX *) Fin, // From Fin to buffer + factors, + twiddles, + (CPLX *) Fout); // Fout is "buffer" for these stages. + + ne10_c2c_1d_last_stage_neon<0> ((CPLX *) Fout, + (const CPLX *) buffer, // From buffer to Fout + twiddles + nfft, + 1, // out_step == fstride == 1 + nfft, // in_step == mstride == nfft + nfft * 4); // Actual length of FFT + + PRINT_HIT; +} + +void ne10_mixed_radix_generic_butterfly_inverse_float32_neon ( + ne10_fft_cpx_float32_t *Fout, + const ne10_fft_cpx_float32_t *Fin, + const ne10_int32_t *factors, + const ne10_fft_cpx_float32_t *twiddles, + ne10_fft_cpx_float32_t *buffer) +{ + PRINT_HIT; + + ne10_int32_t stage_count = factors[0]; + ne10_int32_t fstride = factors[1]; + ne10_int32_t radix = factors[stage_count << 1]; // radix of first stage + + // nfft below is not the actual length of FFT, it is 1/4 of the actual one + // instead. + ne10_int32_t nfft = fstride * radix; + + ne10_mixed_radix_generic_butterfly_float32_neon_impl<1> ((CPLX *) buffer, + (const CPLX *) Fin, // From Fin to buffer + factors, + twiddles, + (CPLX *) Fout); // Fout is "buffer" for these stages. 
+ + ne10_c2c_1d_last_stage_neon<1> ((CPLX *) Fout, + (const CPLX *) buffer, // From buffer to Fout + twiddles + nfft, + 1, // out_step == fstride == 1 + nfft, // in_step == mstride == nfft + nfft * 4); // Actual length of FFT + + PRINT_HIT; +} diff --git a/modules/dsp/NE10_fft_int16.c b/modules/dsp/NE10_fft_int16.c index c44eef9..f35719a 100644 --- a/modules/dsp/NE10_fft_int16.c +++ b/modules/dsp/NE10_fft_int16.c @@ -1049,7 +1049,7 @@ ne10_fft_cfg_int16_t ne10_fft_alloc_c2c_int16 (ne10_int32_t nfft) st->buffer = st->twiddles + nfft; st->nfft = nfft; - ne10_int32_t result = ne10_factor (nfft, st->factors); + ne10_int32_t result = ne10_factor (nfft, st->factors, NE10_FACTOR_DEFAULT); if (result == NE10_ERR) { NE10_FREE (st); @@ -1163,7 +1163,7 @@ ne10_fft_r2c_cfg_int16_t ne10_fft_alloc_r2c_int16 (ne10_int32_t nfft) st->buffer = st->super_twiddles + (ncfft / 2); st->ncfft = ncfft; - ne10_int32_t result = ne10_factor (ncfft, st->factors); + ne10_int32_t result = ne10_factor (ncfft, st->factors, NE10_FACTOR_DEFAULT); if (result == NE10_ERR) { NE10_FREE (st); diff --git a/modules/dsp/NE10_fft_int32.c b/modules/dsp/NE10_fft_int32.c index f8bcc96..64dbbdf 100644 --- a/modules/dsp/NE10_fft_int32.c +++ b/modules/dsp/NE10_fft_int32.c @@ -1045,7 +1045,7 @@ ne10_fft_cfg_int32_t ne10_fft_alloc_c2c_int32 (ne10_int32_t nfft) st->buffer = st->twiddles + nfft; st->nfft = nfft; - ne10_int32_t result = ne10_factor (nfft, st->factors); + ne10_int32_t result = ne10_factor (nfft, st->factors, NE10_FACTOR_DEFAULT); if (result == NE10_ERR) { NE10_FREE (st); @@ -1157,7 +1157,7 @@ ne10_fft_r2c_cfg_int32_t ne10_fft_alloc_r2c_int32 (ne10_int32_t nfft) st->buffer = st->super_twiddles + (ncfft / 2); st->ncfft = ncfft; - ne10_int32_t result = ne10_factor (ncfft, st->factors); + ne10_int32_t result = ne10_factor (ncfft, st->factors, NE10_FACTOR_DEFAULT); if (result == NE10_ERR) { NE10_FREE (st); diff --git a/modules/dsp/NE10_fft_int32.neon.c b/modules/dsp/NE10_fft_int32.neon.c index 41c7f46..0d40a0b 100644 --- a/modules/dsp/NE10_fft_int32.neon.c +++ b/modules/dsp/NE10_fft_int32.neon.c @@ -34,6 +34,7 @@ #include "NE10_types.h" #include "NE10_macros.h" #include "NE10_fft.h" +#include "NE10_dsp.h" static inline void ne10_fft4_forward_int32_unscaled (ne10_fft_cpx_int32_t * Fout, ne10_fft_cpx_int32_t * Fin) diff --git a/modules/dsp/NE10_fft_int32.neonintrinsic.c b/modules/dsp/NE10_fft_int32.neonintrinsic.c index 13a0cbc..c559e19 100644 --- a/modules/dsp/NE10_fft_int32.neonintrinsic.c +++ b/modules/dsp/NE10_fft_int32.neonintrinsic.c @@ -34,6 +34,7 @@ #include "NE10_types.h" #include "NE10_macros.h" #include "NE10_fft.h" +#include "NE10_dsp.h" #define FFT4_FS_START \ ne10_int32_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i; \ diff --git a/modules/dsp/NE10_init_dsp.c b/modules/dsp/NE10_init_dsp.c index 1da07db..49c1f2a 100644 --- a/modules/dsp/NE10_init_dsp.c +++ b/modules/dsp/NE10_init_dsp.c @@ -33,17 +33,18 @@ ne10_result_t ne10_init_dsp (ne10_int32_t is_NEON_available) { if (NE10_OK == is_NEON_available) { - //ne10_fft_c2c_1d_float32 = ne10_fft_c2c_1d_float32_neon; - //ne10_fft_r2c_1d_float32 = ne10_fft_r2c_1d_float32_neon; - //ne10_fft_c2r_1d_float32 = ne10_fft_c2r_1d_float32_neon; + ne10_fft_alloc_c2c_float32 = ne10_fft_alloc_c2c_float32_neon; + ne10_fft_c2c_1d_float32 = ne10_fft_c2c_1d_float32_neon; + ne10_fft_r2c_1d_float32 = ne10_fft_r2c_1d_float32_neon; + ne10_fft_c2r_1d_float32 = ne10_fft_c2r_1d_float32_neon; - //ne10_fft_c2c_1d_int32 = ne10_fft_c2c_1d_int32_neon; - //ne10_fft_r2c_1d_int32 = ne10_fft_r2c_1d_int32_neon; - 
//ne10_fft_c2r_1d_int32 = ne10_fft_c2r_1d_int32_neon; + ne10_fft_c2c_1d_int32 = ne10_fft_c2c_1d_int32_neon; + ne10_fft_r2c_1d_int32 = ne10_fft_r2c_1d_int32_neon; + ne10_fft_c2r_1d_int32 = ne10_fft_c2r_1d_int32_neon; - //ne10_fft_c2c_1d_int16 = ne10_fft_c2c_1d_int16_neon; - //ne10_fft_c2r_1d_int16 = ne10_fft_c2r_1d_int16_neon; - //ne10_fft_r2c_1d_int16 = ne10_fft_r2c_1d_int16_neon; + ne10_fft_c2c_1d_int16 = ne10_fft_c2c_1d_int16_neon; + ne10_fft_c2r_1d_int16 = ne10_fft_c2r_1d_int16_neon; + ne10_fft_r2c_1d_int16 = ne10_fft_r2c_1d_int16_neon; ne10_fir_float = ne10_fir_float_neon; ne10_fir_decimate_float = ne10_fir_decimate_float_neon; @@ -55,17 +56,18 @@ ne10_result_t ne10_init_dsp (ne10_int32_t is_NEON_available) } else { - //ne10_fft_c2c_1d_float32 = ne10_fft_c2c_1d_float32_c; - //ne10_fft_r2c_1d_float32 = ne10_fft_r2c_1d_float32_c; - //ne10_fft_c2r_1d_float32 = ne10_fft_c2r_1d_float32_c; + ne10_fft_alloc_c2c_float32 = ne10_fft_alloc_c2c_float32_c; + ne10_fft_c2c_1d_float32 = ne10_fft_c2c_1d_float32_c; + ne10_fft_r2c_1d_float32 = ne10_fft_r2c_1d_float32_c; + ne10_fft_c2r_1d_float32 = ne10_fft_c2r_1d_float32_c; - //ne10_fft_c2c_1d_int32 = ne10_fft_c2c_1d_int32_c; - //ne10_fft_r2c_1d_int32 = ne10_fft_r2c_1d_int32_c; - //ne10_fft_c2r_1d_int32 = ne10_fft_c2r_1d_int32_c; + ne10_fft_c2c_1d_int32 = ne10_fft_c2c_1d_int32_c; + ne10_fft_r2c_1d_int32 = ne10_fft_r2c_1d_int32_c; + ne10_fft_c2r_1d_int32 = ne10_fft_c2r_1d_int32_c; - //ne10_fft_c2c_1d_int16 = ne10_fft_c2c_1d_int16_c; - //ne10_fft_r2c_1d_int16 = ne10_fft_r2c_1d_int16_c; - //ne10_fft_c2r_1d_int16 = ne10_fft_c2r_1d_int16_c; + ne10_fft_c2c_1d_int16 = ne10_fft_c2c_1d_int16_c; + ne10_fft_r2c_1d_int16 = ne10_fft_r2c_1d_int16_c; + ne10_fft_c2r_1d_int16 = ne10_fft_c2r_1d_int16_c; ne10_fir_float = ne10_fir_float_c; ne10_fir_decimate_float = ne10_fir_decimate_float_c; @@ -79,6 +81,8 @@ ne10_result_t ne10_init_dsp (ne10_int32_t is_NEON_available) } // These are actual definitions of our function pointers that are declared in inc/NE10_dsp.h +ne10_fft_cfg_float32_t (*ne10_fft_alloc_c2c_float32) (ne10_int32_t nfft); + void (*ne10_fft_c2c_1d_float32) (ne10_fft_cpx_float32_t *fout, ne10_fft_cpx_float32_t *fin, ne10_fft_cfg_float32_t cfg, diff --git a/modules/dsp/NE10_rfft_float32.c b/modules/dsp/NE10_rfft_float32.c index 038bc5f..9570d87 100644 --- a/modules/dsp/NE10_rfft_float32.c +++ b/modules/dsp/NE10_rfft_float32.c @@ -786,13 +786,13 @@ ne10_fft_r2c_cfg_float32_t ne10_fft_alloc_r2c_float32 (ne10_int32_t nfft) } // factors and twiddles for rfft C - ne10_factor (nfft, st->r_factors); + ne10_factor (nfft, st->r_factors, NE10_FACTOR_DEFAULT); // backward twiddles pointers st->r_twiddles_backward = ne10_fft_generate_twiddles_float32 (st->r_twiddles, st->r_factors, nfft); // factors and twiddles for rfft neon - result = ne10_factor (nfft/4, st->r_factors_neon); + result = ne10_factor (nfft/4, st->r_factors_neon, NE10_FACTOR_DEFAULT); if (result == NE10_ERR) { return st; diff --git a/modules/dsp/test/test_suite_fft_float32.c b/modules/dsp/test/test_suite_fft_float32.c index 5e57793..aa49b45 100644 --- a/modules/dsp/test/test_suite_fft_float32.c +++ b/modules/dsp/test/test_suite_fft_float32.c @@ -35,6 +35,7 @@ #include #include "NE10_dsp.h" +#include "NE10_macros.h" #include "seatest.h" #include "unit_test_common.h" @@ -79,34 +80,37 @@ static ne10_int64_t time_neon = 0; static ne10_float32_t time_speedup = 0.0f; static ne10_float32_t time_savings = 0.0f; +static ne10_fft_cfg_float32_t cfg_c; +static ne10_fft_cfg_float32_t cfg_neon; + +static ne10_int32_t 
test_c2c_alloc (ne10_int32_t fftSize); + void test_fft_c2c_1d_float32_conformance() { - ne10_int32_t i = 0; ne10_int32_t fftSize = 0; - ne10_fft_cfg_float32_t cfg; + ne10_int32_t flag_result = NE10_OK; fprintf (stdout, "----------%30s start\n", __FUNCTION__); for (fftSize = MIN_LENGTH_SAMPLES_CPX; fftSize <= TEST_LENGTH_SAMPLES; fftSize *= 2) { fprintf (stdout, "FFT size %d\n", fftSize); + flag_result = test_c2c_alloc (fftSize); + if (flag_result == NE10_ERR) + { + return; + } /* FFT test */ memcpy (in_c, testInput_f32, 2 * fftSize * sizeof (ne10_float32_t)); memcpy (in_neon, testInput_f32, 2 * fftSize * sizeof (ne10_float32_t)); - cfg = ne10_fft_alloc_c2c_float32 (fftSize); - if (cfg == NULL) - { - fprintf (stdout, "======ERROR, FFT alloc fails\n"); - return; - } GUARD_ARRAY (out_c, fftSize * 2); GUARD_ARRAY (out_neon, fftSize * 2); - ne10_fft_c2c_1d_float32_c ( (ne10_fft_cpx_float32_t*) out_c, (ne10_fft_cpx_float32_t*) in_c, cfg, 0); - ne10_fft_c2c_1d_float32_neon ( (ne10_fft_cpx_float32_t*) out_neon, (ne10_fft_cpx_float32_t*) in_neon, cfg, 0); + ne10_fft_c2c_1d_float32_c ( (ne10_fft_cpx_float32_t*) out_c, (ne10_fft_cpx_float32_t*) in_c, cfg_c, 0); + ne10_fft_c2c_1d_float32_neon ( (ne10_fft_cpx_float32_t*) out_neon, (ne10_fft_cpx_float32_t*) in_neon, cfg_neon, 0); CHECK_ARRAY_GUARD (out_c, fftSize * 2); CHECK_ARRAY_GUARD (out_neon, fftSize * 2); @@ -122,8 +126,8 @@ void test_fft_c2c_1d_float32_conformance() GUARD_ARRAY (out_c, fftSize * 2); GUARD_ARRAY (out_neon, fftSize * 2); - ne10_fft_c2c_1d_float32_c ( (ne10_fft_cpx_float32_t*) out_c, (ne10_fft_cpx_float32_t*) in_c, cfg, 1); - ne10_fft_c2c_1d_float32_neon ( (ne10_fft_cpx_float32_t*) out_neon, (ne10_fft_cpx_float32_t*) in_neon, cfg, 1); + ne10_fft_c2c_1d_float32_c ( (ne10_fft_cpx_float32_t*) out_c, (ne10_fft_cpx_float32_t*) in_c, cfg_c, 1); + ne10_fft_c2c_1d_float32_neon ( (ne10_fft_cpx_float32_t*) out_neon, (ne10_fft_cpx_float32_t*) in_neon, cfg_neon, 1); CHECK_ARRAY_GUARD (out_c, fftSize * 2); CHECK_ARRAY_GUARD (out_neon, fftSize * 2); @@ -132,16 +136,16 @@ void test_fft_c2c_1d_float32_conformance() snr = CAL_SNR_FLOAT32 (out_c, out_neon, fftSize * 2); assert_false ( (snr < SNR_THRESHOLD)); - NE10_FREE (cfg); + NE10_FREE (cfg_c); + NE10_FREE (cfg_neon); } } void test_fft_c2c_1d_float32_performance() { - ne10_int32_t i = 0; ne10_int32_t fftSize = 0; - ne10_fft_cfg_float32_t cfg; + ne10_int32_t flag_result = NE10_OK; ne10_int32_t test_loop = 0; fprintf (stdout, "----------%30s start\n", __FUNCTION__); @@ -154,12 +158,12 @@ void test_fft_c2c_1d_float32_performance() /* FFT test */ memcpy (in_c, testInput_f32, 2 * fftSize * sizeof (ne10_float32_t)); memcpy (in_neon, testInput_f32, 2 * fftSize * sizeof (ne10_float32_t)); - cfg = ne10_fft_alloc_c2c_float32 (fftSize); - if (cfg == NULL) + flag_result = test_c2c_alloc (fftSize); + if (flag_result == NE10_ERR) { - fprintf (stdout, "======ERROR, FFT alloc fails\n"); return; } + test_loop = TEST_COUNT / fftSize; GET_TIME @@ -167,7 +171,7 @@ void test_fft_c2c_1d_float32_performance() time_c, { for (i = 0; i < test_loop; i++) - ne10_fft_c2c_1d_float32_c ( (ne10_fft_cpx_float32_t*) out_c, (ne10_fft_cpx_float32_t*) in_c, cfg, 0); + ne10_fft_c2c_1d_float32_c ( (ne10_fft_cpx_float32_t*) out_c, (ne10_fft_cpx_float32_t*) in_c, cfg_c, 0); } ); GET_TIME @@ -175,7 +179,7 @@ void test_fft_c2c_1d_float32_performance() time_neon, { for (i = 0; i < test_loop; i++) - ne10_fft_c2c_1d_float32_neon ( (ne10_fft_cpx_float32_t*) out_neon, (ne10_fft_cpx_float32_t*) in_neon, cfg, 0); + ne10_fft_c2c_1d_float32_neon ( 
(ne10_fft_cpx_float32_t*) out_neon, (ne10_fft_cpx_float32_t*) in_neon, cfg_neon, 0); } ); @@ -192,7 +196,7 @@ void test_fft_c2c_1d_float32_performance() time_c, { for (i = 0; i < test_loop; i++) - ne10_fft_c2c_1d_float32_c ( (ne10_fft_cpx_float32_t*) out_c, (ne10_fft_cpx_float32_t*) in_c, cfg, 1); + ne10_fft_c2c_1d_float32_c ( (ne10_fft_cpx_float32_t*) out_c, (ne10_fft_cpx_float32_t*) in_c, cfg_c, 1); } ); GET_TIME @@ -200,7 +204,7 @@ void test_fft_c2c_1d_float32_performance() time_neon, { for (i = 0; i < test_loop; i++) - ne10_fft_c2c_1d_float32_neon ( (ne10_fft_cpx_float32_t*) out_neon, (ne10_fft_cpx_float32_t*) in_neon, cfg, 1); + ne10_fft_c2c_1d_float32_neon ( (ne10_fft_cpx_float32_t*) out_neon, (ne10_fft_cpx_float32_t*) in_neon, cfg_neon, 1); } ); @@ -208,7 +212,8 @@ void test_fft_c2c_1d_float32_performance() time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100; ne10_log (__FUNCTION__, "Float FFT%21d%20lld%20lld%19.2f%%%18.2f:1\n", fftSize, time_c, time_neon, time_savings, time_speedup); - NE10_FREE (cfg); + NE10_FREE (cfg_c); + NE10_FREE (cfg_neon); } } @@ -454,3 +459,25 @@ void test_fixture_fft_r2c_1d_float32 (void) test_fixture_end(); // ends a fixture } + +ne10_int32_t test_c2c_alloc (ne10_int32_t fftSize) +{ + NE10_FREE (cfg_c); + NE10_FREE (cfg_neon); + + cfg_c = ne10_fft_alloc_c2c_float32_c (fftSize); + if (cfg_c == NULL) + { + fprintf (stdout, "======ERROR, FFT alloc fails\n"); + return NE10_ERR; + } + + cfg_neon = ne10_fft_alloc_c2c_float32_neon (fftSize); + if (cfg_neon == NULL) + { + NE10_FREE (cfg_c); + fprintf (stdout, "======ERROR, FFT alloc fails\n"); + return NE10_ERR; + } + return NE10_OK; +} -- 2.7.4
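
Usage note (illustrative, not part of the patch): with the hunks above, ne10_fft_alloc_c2c_float32 becomes a function pointer that ne10_init_dsp() binds to either ne10_fft_alloc_c2c_float32_c or ne10_fft_alloc_c2c_float32_neon, and the float32 c2c path accepts non-power-of-2 lengths whose factors the generic kernels cover. A minimal caller sketch follows; it assumes the usual NE10 umbrella header NE10.h and the public entry point ne10_init(), which are not shown in this patch, and it omits error handling beyond the allocation check.

    #include "NE10.h"   /* assumed umbrella header: NE10_types.h, NE10_macros.h, NE10_dsp.h */

    int run_c2c_60 (ne10_fft_cpx_float32_t *out, ne10_fft_cpx_float32_t *in)
    {
        /* Binds the float32 c2c function pointers to the _c or _neon backends. */
        if (ne10_init () != NE10_OK)
            return -1;

        /* 60 = 2^2 * 3 * 5 is not a power of two; the new generic kernels handle it. */
        ne10_fft_cfg_float32_t cfg = ne10_fft_alloc_c2c_float32 (60);
        if (cfg == NULL)
            return -1;

        ne10_fft_c2c_1d_float32 (out, in, cfg, 0);   /* forward */
        ne10_fft_c2c_1d_float32 (in, out, cfg, 1);   /* inverse */

        NE10_FREE (cfg);
        return 0;
    }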
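
For readers of the NEON kernels above, here is a scalar sketch of the per-lane arithmetic performed by NE10_FFT4_FUC_NEON_F32 / NE10_FFT4_FUC_INPLACE_NEON_F32 (forward, unscaled radix-4 butterfly). Each CPLX vector simply carries four such butterflies side by side, which is why the mixed-radix stages see a length of nfft/4 and only the final stage works on the true FFT length. The helper name fft4_ref is mine, not the library's:

    static void fft4_ref (ne10_fft_cpx_float32_t out[4], const ne10_fft_cpx_float32_t in[4])
    {
        ne10_fft_cpx_float32_t s0, s1, s2, s3;

        s0.r = in[0].r + in[2].r;  s0.i = in[0].i + in[2].i;   /* in0 + in2 */
        s1.r = in[0].r - in[2].r;  s1.i = in[0].i - in[2].i;   /* in0 - in2 */
        s2.r = in[1].r + in[3].r;  s2.i = in[1].i + in[3].i;   /* in1 + in3 */
        s3.r = in[1].r - in[3].r;  s3.i = in[1].i - in[3].i;   /* in1 - in3 */

        out[0].r = s0.r + s2.r;  out[0].i = s0.i + s2.i;       /* s0 + s2     */
        out[2].r = s0.r - s2.r;  out[2].i = s0.i - s2.i;       /* s0 - s2     */
        out[1].r = s1.r + s3.i;  out[1].i = s1.i - s3.r;       /* s1 - j * s3 */
        out[3].r = s1.r - s3.i;  out[3].i = s1.i + s3.r;       /* s1 + j * s3 */
    }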