NE10/DSP/RFFT: optimise RFFT for armv8
author Phil.Wang <phil.wang@arm.com>
Mon, 22 Sep 2014 04:26:46 +0000 (12:26 +0800)
committer Zhou (Joe) Yu <joe.yu@arm.com>
Thu, 9 Oct 2014 07:15:25 +0000 (08:15 +0100)
NEON intrinsics, LLVM 3.5, -O2
on Cortex-A57, Juno, Android
   size|   time in ms   |   boost   |
       |  NE10 |  pffft | pffft/NE10|
       |R2C|C2R|R2C|C2R*|  R2C|  C2R|
     32|145|185|279| 319|1.92x|1.72x|
     64|175|200|239| 279|1.36x|1.39x|
    128|166|185|237| 262|1.42x|1.41x|
    256|197|208|232| 256|1.17x|1.23x|
    512|208|216|254| 270|1.22x|1.25x|
   1024|241|244|260| 278|1.07x|1.14x|
   2048|258|263|332| 322|1.28x|1.22x|
   4096|303|304|388| 353|1.28x|1.16x|
   8192|339|334|424| 426|1.25x|1.27x|

NEON intrinsics, GCC 4.9, -O2
on Cortex-A57, Juno, Android
   size|    time in ms  |   boost   |
       |  NE10 |  pffft | pffft/NE10|
       |R2C|C2R|R2C|C2R*|  R2C|  C2R|
     32|174|181|328| 410|1.88x|2.26x|
     64|214|216|270| 338|1.26x|1.56x|
    128|210|197|259| 310|1.23x|1.57x|
    256|232|223|243| 283|1.04x|1.26x|
    512|250|222|263| 307|1.04x|1.38x|
   1024|274|251|272| 304|1.00x|1.20x|
   2048|288|277|314| 353|1.08x|1.27x|
   4096|333|303|349| 379|1.04x|1.25x|
   8192|370|342|424| 452|1.14x|1.31x|

* Ne10 supports scaling the output of the backward (C2R)
  RFFT, while pffft does not. To normalize the benchmark,
  a scale operation was appended to each pffft call (see
  the sketch below).
* pffft's C2R FFT costs 410ms at size==32 but 338ms at
  size==64. The benchmark loops more times for the smaller
  size, so this does not mean pffft is slower on short
  inputs.

NEON intrinsics, GCC 4.9, -O2
on Cortex-A53, Juno, Android
   size|    time in ms  |   boost   |
       |  NE10 |  pffft | pffft/NE10|
       |    R2C|     R2C|        R2C|
     32|    347|     607|      1.74x|
     64|    389|     489|      1.25x|
    128|    334|     484|      1.44x|
    256|    401|     456|      1.13x|
    512|    380|     502|      1.32x|
   1024|    460|     512|      1.11x|
   2048|    481|     593|      1.23x|
   4096|    605|     709|      1.17x|
   8192|    704|     891|      1.26x|

Change-Id: Ide0b974620ae8d06cfa862769004b2110abaaeff

12 files changed:
inc/NE10_types.h
modules/CMakeLists.txt
modules/dsp/NE10_fft.c [new file with mode: 0644]
modules/dsp/NE10_fft.h
modules/dsp/NE10_fft.neonintrinsic.h [new file with mode: 0644]
modules/dsp/NE10_fft_float32.c
modules/dsp/NE10_fft_float32.neonintrinsic.c
modules/dsp/NE10_fft_int16.c
modules/dsp/NE10_fft_int32.c
modules/dsp/NE10_rfft_float32.c [new file with mode: 0644]
modules/dsp/NE10_rfft_float32.neonintrinsic.c [new file with mode: 0644]
modules/dsp/test/test_suite_fft_float32.c

index e6f3f4e..bd8781f 100644 (file)
@@ -229,12 +229,24 @@ typedef ne10_fft_state_float32_t* ne10_fft_cfg_float32_t;
 
 typedef struct
 {
-    ne10_int32_t nfft;
+    ne10_fft_cpx_float32_t *buffer;
+#if defined(__arm__)
     ne10_int32_t ncfft;
     ne10_int32_t *factors;
     ne10_fft_cpx_float32_t *twiddles;
     ne10_fft_cpx_float32_t *super_twiddles;
-    ne10_fft_cpx_float32_t *buffer;
+#elif defined( __aarch64__)
+    ne10_int32_t nfft;
+    ne10_fft_cpx_float32_t *r_twiddles;
+    ne10_int32_t *r_factors;
+    ne10_fft_cpx_float32_t *r_twiddles_backward;
+    ne10_fft_cpx_float32_t *r_twiddles_neon;
+    ne10_fft_cpx_float32_t *r_twiddles_neon_backward;
+    ne10_int32_t *r_factors_neon;
+    ne10_fft_cpx_float32_t *r_super_twiddles_neon;
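+    // Separate scalar (r_*) and NEON (r_*_neon) twiddle/factor tables
+    // are kept for the forward and backward RFFT paths.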
+#else
+    #error("unsupported platform; currently supported are arm (32-bit) and aarch64")
+#endif
 } ne10_fft_r2c_state_float32_t;
 
 typedef ne10_fft_r2c_state_float32_t* ne10_fft_r2c_cfg_float32_t;
index 5609bed..a96c465 100644 (file)
@@ -173,7 +173,9 @@ if(NE10_ENABLE_DSP)
     # Add dsp C files.
     set(NE10_DSP_C_SRCS
         ${PROJECT_SOURCE_DIR}/common/NE10_mask_table.c
+        ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_float32.c
+        ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_rfft_float32.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_int32.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_int16.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fir.c
@@ -187,6 +189,7 @@ if(NE10_ENABLE_DSP)
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_float32.neon.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_int32.neon.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_int16.neon.c
+        #${PROJECT_SOURCE_DIR}/modules/dsp/NE10_rfft_float32.neonintrinsic.c
         #${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_float32.neonintrinsic.c
         #${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_int32.neonintrinsic.c
         #${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_int16.neonintrinsic.c
diff --git a/modules/dsp/NE10_fft.c b/modules/dsp/NE10_fft.c
new file mode 100644 (file)
index 0000000..e769e5a
--- /dev/null
@@ -0,0 +1,149 @@
+/*
+ *  Copyright 2014 ARM Limited
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *    * Neither the name of ARM Limited nor the
+ *      names of its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
+ *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
+ *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* license of Kiss FFT */
+/*
+Copyright (c) 2003-2010, Mark Borgerding
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+    * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+ * NE10 Library : dsp/NE10_fft.c
+ */
+
+#include "NE10_types.h"
+#include "NE10_macros.h"
+#include "NE10_fft.h"
+
+/* factors buffer:
+ * 0: stage number
+ * 1: stride for the first stage
+ * others: factors */
+ne10_int32_t ne10_factor (ne10_int32_t n, ne10_int32_t * facbuf)
+{
+    ne10_int32_t p;
+    ne10_int32_t i = 1;
+    ne10_int32_t stage_num = 0;
+    ne10_int32_t stride_max = n;
+
+    /* factor out powers of 4, 2, 5, 3, and any remaining factor */
+    do
+    {
+        if ( (n % 4) == 0)
+            p = 4;
+        else if ( (n % 2) == 0)
+            p = 2;
+        else if ( (n % 5) == 0)
+            p = 5;
+        else if ( (n % 3) == 0)
+            p = 3;
+        else // stop factoring
+            p = n;
+
+        n /= p;
+        facbuf[2 * i] = p;
+        facbuf[2 * i + 1] = n;
+        i++;
+        stage_num++;
+    }
+    while (n > 1);
+    facbuf[0] = stage_num;
+    facbuf[1] = stride_max / p;
+
+    if ( ((p % 2) == 0) && (stride_max >= 4) )
+    {
+        // last factor is 4 or 2
+        // and nfft >= 4
+        facbuf[2 * i] = NE10_FFT_ALG_24;
+        return NE10_OK;
+    }
+    else // not supported yet
+    {
+        facbuf[2 * i] = NE10_FFT_ALG_ANY;
+        return NE10_ERR;
+    }
+}
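+
+/* Worked example (derived from the code above): ne10_factor (16, facbuf)
+ * fills facbuf = { 2,               // two stages
+ *                  4,               // first-stage stride: 16 / 4
+ *                  4, 4,            // stage: radix 4, remaining length 4
+ *                  4, 1,            // stage: radix 4, remaining length 1
+ *                  NE10_FFT_ALG_24 }
+ * and returns NE10_OK. */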
+
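+/* Generates the twiddle tables, stage by stage, for the factor
+ * decomposition produced by ne10_factor. Returns a pointer just past
+ * the last twiddle written, so several tables can be laid out back to
+ * back in one buffer. */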
+ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_float32 (ne10_fft_cpx_float32_t * twiddles,
+        const ne10_int32_t * factors,
+        const ne10_int32_t nfft )
+{
+    ne10_int32_t i, j, k;
+    ne10_fft_cpx_float32_t *tw;
+    ne10_int32_t stage_count = factors[0];
+    ne10_int32_t fstride = factors[1];
+    ne10_int32_t mstride;
+    ne10_int32_t cur_radix; // current radix
+
+    const ne10_float64_t pi = NE10_PI;
+    ne10_float32_t phase;
+
+    // for first stage
+    i = stage_count;
+    cur_radix = factors[2 * i];
+    if (cur_radix%2) // current radix is not 4 or 2
+    {
+        for (k = 0; k < cur_radix; k++)
+        {
+            phase = -2 * pi * k / cur_radix;
+            twiddles[k].r = (ne10_float32_t) cos (phase);
+            twiddles[k].i = (ne10_float32_t) sin (phase);
+        }
+        twiddles += cur_radix;
+    }
+
+    // for the remaining stages
+    for (i = stage_count - 1; i > 0; i--)
+    {
+        cur_radix = factors[2 * i];
+        fstride /= cur_radix;
+        mstride = factors[2 * i + 1];
+        tw = twiddles;
+        for (j = 0; j < mstride; j++)
+        {
+            for (k = 1; k < cur_radix; k++ ) // k = 0 is skipped: its twiddle is 1
+            {
+                phase = -2 * pi * fstride * k * j / nfft;
+                tw[mstride * ( k - 1 )].r = (ne10_float32_t) cos (phase);
+                tw[mstride * ( k - 1 )].i = (ne10_float32_t) sin (phase);
+            } // cur_radix
+            tw ++;
+        } // mstride
+        twiddles += mstride * (cur_radix - 1);
+    } // stage_count
+
+    return twiddles;
+}
index 75d25c6..f514ff3 100644 (file)
 
 #include <NE10_types.h>
 
+#ifdef __cplusplus
+#include <algorithm>
+#endif
+
 #ifndef NE10_FFT_H
 #define NE10_FFT_H
 
@@ -43,13 +47,219 @@ extern "C" {
 // internal macro define
 ///////////////////////////
 #define NE10_FFT_BYTE_ALIGNMENT 8
+#define NE10_INLINE inline static
+
+/* algorithms used in FFT */
+#define NE10_FFT_ALG_24 0
+#define NE10_FFT_ALG_ANY 1
+// Comment this out to leave the output of the backward RFFT unscaled
+#define NE10_DSP_RFFT_SCALING
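+// When defined, the backward (C2R) RFFT normalizes its output
+// (see the *_C2R_*_SCALE kernel macros).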
+
+#ifdef __cplusplus
+  #define ne10_swap_ptr(X,Y) std::swap((X),(Y));
+#else
+  #define ne10_swap_ptr(X,Y) do { void *ptr = (X); (X) = (Y); (Y) = ptr; } while(0);
+#endif
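+// Typical use: swap input/output pointers between ping-pong stage buffers.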
+
+#define NE10_CPX_ADD(Z,A,B) do { Z.r = A.r + B.r; Z.i = A.i + B.i; } while (0);
+#define NE10_CPX_SUB(Z,A,B) do { Z.r = A.r - B.r; Z.i = A.i - B.i; } while (0);
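+/* Z = A * B (complex multiply) */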
+#define NE10_CPX_MUL_F32(Z,A,B) do {        \
+    ne10_float32_t ARBR = A.r * B.r;        \
+    ne10_float32_t AIBI = A.i * B.i;        \
+    ne10_float32_t ARBI = A.r * B.i;        \
+    ne10_float32_t AIBR = A.i * B.r;        \
+    Z.r = ARBR - AIBI;                      \
+    Z.i = AIBR + ARBI;                      \
+    } while (0);
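+/* Z = A * conj(B) (conjugate multiply) */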
+#define NE10_CPX_CONJ_MUL_F32(Z,A,B) do {   \
+    ne10_float32_t ARBR = A.r * B.r;        \
+    ne10_float32_t AIBI = A.i * B.i;        \
+    ne10_float32_t ARBI = A.r * B.i;        \
+    ne10_float32_t AIBR = A.i * B.r;        \
+    Z.r = ARBR + AIBI;                      \
+    Z.i = AIBR - ARBI;                      \
+    } while (0);
+#define NE10_CPX_MUL_TW_F32(Z,TW) do {      \
+    ne10_fft_cpx_float32_t tmp;             \
+    NE10_CPX_MUL_F32(tmp,Z,TW);             \
+    Z = tmp;                                \
+} while (0);
+#define NE10_CPX_MUL_INV_TW_F32(Z,TW) do {  \
+    ne10_fft_cpx_float32_t tmp;             \
+    NE10_CPX_CONJ_MUL_F32(tmp,Z,TW);        \
+    Z = tmp;                                \
+} while (0);
+
+// R2C FFT size==4
+// In[4]    R[0],R[1],R[2],R[3]
+// OUT[4]   R[0],R[1],I[1],R[2]
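+// ("RCR" packing: X[0].r, X[1].r, X[1].i, X[2].r; X[0] and X[2] of a
+//  length-4 real FFT are purely real)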
+#define NE10_FFT_R2C_4R_RCR(OUT,IN) do {    \
+    ne10_float32_t SCRATCH [2];             \
+    SCRATCH[0] = IN[0] + IN[2];             \
+    SCRATCH[1] = IN[1] + IN[3];             \
+    OUT[0] = SCRATCH[0] + SCRATCH[1];       \
+    OUT[1] = IN[0] - IN[2];                 \
+    OUT[2] = IN[3] - IN[1];                 \
+    OUT[3] = SCRATCH[0] - SCRATCH[1];       \
+} while(0);
+
+// C2R FFT size==4 - inverse of the R2C FFT
+// In[4]    R[0],R[1],I[1],R[2]
+// OUT[4]   R[0],R[1],R[2],R[3]
+#define NE10_FFT_C2R_RCR_4R(OUT,IN) do {    \
+    ne10_float32_t SCRATCH [4];             \
+    SCRATCH[0] =(IN[0] + IN[3]) ;    \
+    SCRATCH[1] =(IN[0] - IN[3]) ;    \
+    SCRATCH[2] = IN[1] + IN[1];              \
+    SCRATCH[3] = IN[2] + IN[2];              \
+    OUT[0] = SCRATCH[0] + SCRATCH[2];       \
+    OUT[1] = SCRATCH[1] - SCRATCH[3];       \
+    OUT[2] = SCRATCH[0] - SCRATCH[2];       \
+    OUT[3] = SCRATCH[1] + SCRATCH[3];       \
+} while(0);
+
+// R2C FFT size==4
+// In[4]    R[0],R[1],R[2],R[3]
+// OUT[4]   R[0],I[0],R[1],I[1]
+#define NE10_FFT_R2C_4R_CC(OUT,IN) do {    \
+    ne10_float32_t SCRATCH [2];            \
+    SCRATCH[0] = (IN[3] - IN[1]) * TW_81N;     \
+    SCRATCH[1] = (IN[3] + IN[1]) * TW_81N;     \
+    OUT[0] = IN[0] + SCRATCH[0];    \
+    OUT[2] = IN[0] - SCRATCH[0];    \
+    OUT[1] = SCRATCH[1] - IN[2];    \
+    OUT[3] = SCRATCH[1] + IN[2];    \
+} while(0);
+
+// C2R FFT size==4 - inverse of the R2C FFT
+// In[4]    R[0],I[0],R[1],I[1]
+// OUT[4]   R[0],R[1],R[2],R[3]
+#define NE10_FFT_C2R_CC_4R(OUT,IN) do {    \
+    ne10_float32_t SCRATCH [4];            \
+    OUT[0] = ( IN[0] + IN[2]) ;    \
+    OUT[2] = (-IN[1] + IN[3]) ;    \
+    OUT[0] = OUT[0] + OUT[0] ;  \
+    OUT[2] = OUT[2] + OUT[2] ;  \
+    SCRATCH[0] = (IN[0] - IN[2]) ;     \
+    SCRATCH[1] = (IN[1] + IN[3]) ;     \
+    SCRATCH[2] = (SCRATCH[0] + SCRATCH[1]) ;     \
+    SCRATCH[3] = (SCRATCH[0] - SCRATCH[1]) ;     \
+    OUT[3] = SCRATCH[2] / TW_81N;    \
+    OUT[1] = SCRATCH[3] / TW_81;    \
+} while(0);
 
+// R2C FFT size==4
+// In[4]    R[0],I[0],R[1],I[1]
+// OUT[4]   R[0],I[0],R[1],I[1]
+#define NE10_FFT_R2C_CC_CC(OUT,IN) do { \
+    ne10_fft_cpx_float32_t TMP[4];      \
+    ne10_float32_t TMP_SWAP;            \
+    TMP[0].r = IN[0].r + IN[2].r;       \
+    TMP[0].i = IN[0].i + IN[2].i;       \
+    TMP[1].r = IN[0].r - IN[2].r;       \
+    TMP[1].i = IN[0].i - IN[2].i;       \
+    TMP[2].r = IN[1].r + IN[3].r;       \
+    TMP[2].i = IN[1].i + IN[3].i;       \
+    TMP[3].r = IN[1].r - IN[3].r;       \
+    TMP[3].i = IN[1].i - IN[3].i;       \
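+    /* rotate TMP[3] by -j: (r,i) -> (i,-r) */  \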
+    TMP_SWAP = TMP[3].i;                \
+    TMP[3].i = - TMP[3].r;              \
+    TMP[3].r = TMP_SWAP;                \
+    OUT[0].r = TMP[0].r + TMP[2].r;     \
+    OUT[0].i = TMP[0].i + TMP[2].i;     \
+    OUT[2].r =   TMP[0].r - TMP[2].r;   \
+    OUT[2].i = -(TMP[0].i - TMP[2].i);  \
+    OUT[1].r = TMP[1].r + TMP[3].r;     \
+    OUT[1].i = TMP[1].i + TMP[3].i;     \
+    OUT[3].r =   TMP[1].r - TMP[3].r;   \
+    OUT[3].i = -(TMP[1].i - TMP[3].i);  \
+} while(0);
+
+
+// C2R FFT size==4 - inverse of the R2C FFT
+// In[4]    R[0],I[0],R[1],I[1]
+// OUT[4]   R[0],I[0],R[1],I[1]
+#define NE10_FFT_C2R_CC_CC(OUT,IN) do {             \
+    ne10_fft_cpx_float32_t SCRATCH[4];              \
+    SCRATCH[0].r = (IN[0].r + IN[1].r) ;      \
+    SCRATCH[2].r = (IN[0].r - IN[1].r) ;      \
+    SCRATCH[2].i = (IN[0].i + IN[1].i) ;      \
+    SCRATCH[0].i = (IN[0].i - IN[1].i) ;      \
+    SCRATCH[1].r = (IN[2].r + IN[3].r) ;      \
+    SCRATCH[3].i = (IN[2].r - IN[3].r) ;      \
+    SCRATCH[3].r = (IN[2].i + IN[3].i) * -1.0f;     \
+    SCRATCH[1].i = (IN[2].i - IN[3].i) ;     \
+    OUT[0].r = (SCRATCH[0].r + SCRATCH[1].r) ;\
+    OUT[2].r = (SCRATCH[0].r - SCRATCH[1].r) ;\
+    OUT[0].i = (SCRATCH[0].i + SCRATCH[1].i) ;\
+    OUT[2].i = (SCRATCH[0].i - SCRATCH[1].i) ;\
+    OUT[1].r = (SCRATCH[2].r + SCRATCH[3].r) ;\
+    OUT[3].r = (SCRATCH[2].r - SCRATCH[3].r) ;\
+    OUT[1].i = (SCRATCH[2].i + SCRATCH[3].i) ;\
+    OUT[3].i = (SCRATCH[2].i - SCRATCH[3].i) ;\
+} while(0);
+
+#ifdef PERFORMANCE_TEST
+  #undef NE10_VERBOSE
+#endif
+
+#ifdef NE10_VERBOSE
+  #define PRINT_STAGE_INFO do {   \
+      fprintf( stdout, "%s,%d\n\t:f,m,n=%d,%d,%d\n", __FUNCTION__, __LINE__, fstride, mstride, nfft );  \
+  } while(0);
+  #define PRINT_POINTERS_INFO(IN,OUT,BUF,TW) do {  \
+      fprintf( stdout, "%s,%d\t:"              "IN:%p\t" "OUT:%p\t" "BUF:%p\t" "TW:%p\n",    \
+                        __FUNCTION__, __LINE__, IN,       OUT,       BUF,       TW);  \
+  } while(0);
+  #define PRINT_BUTTERFLY_INFO ;
+  #define PRINT_HIT do {    \
+      fprintf( stderr, "HIT %s:%d\n", __FUNCTION__, __LINE__);  \
+  } while(0);
+  #define PRINT_VAR(X,FORM) do {   \
+      fprintf( stderr, #X "=" FORM "\n", X );                  \
+  } while(0);
+  #define RFORM "\t%+8.4e\t"
+  #define CFORM "(" RFORM "," RFORM ")\t"
+
+  #define NE10_PRINT_Q_VECTOR(Q_VECTOR) do {            \
+      fprintf(stderr,"inside %s\n", __FUNCTION__ );     \
+      fprintf(stderr, #Q_VECTOR "\n");                  \
+      fprintf(stderr, RFORM RFORM RFORM RFORM "\n", Q_VECTOR[0], Q_VECTOR[1], Q_VECTOR[2], Q_VECTOR[3] ); \
+  } while(0);
+
+  #define NE10_PRINT_Q2_VECTOR(Q2_VECTOR) do {                                                                    \
+      fprintf(stderr,"inside %s\n", __FUNCTION__ );                                                               \
+      fprintf(stderr, #Q2_VECTOR "\n");                                                                            \
+      fprintf(stderr,"REAL:" RFORM RFORM RFORM RFORM "\n", Q2_VECTOR[0].r, Q2_VECTOR[1].r, Q2_VECTOR[2].r, Q2_VECTOR[3].r ); \
+      fprintf(stderr,"IMAG:" RFORM RFORM RFORM RFORM "\n", Q2_VECTOR[0].i, Q2_VECTOR[1].i, Q2_VECTOR[2].i, Q2_VECTOR[3].i ); \
+  } while(0);
+#else
+  #define PRINT_STAGE_INFO ;
+  #define PRINT_BUTTERFLY_INFO ;
+  #define PRINT_HIT ;
+  #define PRINT_VAR(X,FORM) ;
+  #define PRINT_POINTERS_INFO(IN,OUT,BUF,TW) ;
+  #define NE10_PRINT_Q_VECTOR(Q_VECTOR) ;
+  #define NE10_PRINT_Q2_VECTOR(Q2_VECTOR) ;
+#endif
+
+///////////////////////////
+// common variables
+///////////////////////////
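+// TW_81 = cos(pi/4) = sqrt(2)/2, the twiddle used by the length-8 real FFT kernels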
+const static ne10_float32_t TW_81  =  0.70710678;
+const static ne10_float32_t TW_81N = -0.70710678;
 
 ///////////////////////////
 // function prototypes:
 ///////////////////////////
 
     /*common fft functions */
+    extern ne10_int32_t ne10_factor (ne10_int32_t n, ne10_int32_t * facbuf);
+
+    extern ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_float32 (ne10_fft_cpx_float32_t * twiddles,
+        const ne10_int32_t * factors,
+        const ne10_int32_t nfft );
 
     /*common functions for float fft */
     extern void ne10_mixed_radix_fft_forward_float32_neon (ne10_fft_cpx_float32_t * Fout,
diff --git a/modules/dsp/NE10_fft.neonintrinsic.h b/modules/dsp/NE10_fft.neonintrinsic.h
new file mode 100644 (file)
index 0000000..d27f137
--- /dev/null
@@ -0,0 +1,430 @@
+/*
+ *  Copyright 2014 ARM Limited
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *    * Neither the name of ARM Limited nor the
+ *      names of its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
+ *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
+ *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NE10 Library : dsp/NE10_fft.neonintrinsic.h
+ */
+
+#ifndef NE10_FFT_NEONINTRINSIC_H
+#define NE10_FFT_NEONINTRINSIC_H
+
+#include <arm_neon.h>
+
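+/* A float32x4x2_t holds four complex values in planar form:
+ * val[0] carries the real parts, val[1] the imaginary parts. */
+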
+#define NE10_CPX_ADD_NEON_F32(Z,A,B) do {           \
+    Z.val[0] = vaddq_f32( A.val[0] , B.val[0] );    \
+    Z.val[1] = vaddq_f32( A.val[1] , B.val[1] );    \
+} while (0);
+
+#define NE10_CPX_SUB_NEON_F32(Z,A,B) do {           \
+    Z.val[0] = vsubq_f32( A.val[0] , B.val[0] );    \
+    Z.val[1] = vsubq_f32( A.val[1] , B.val[1] );    \
+} while (0);
+
+#define NE10_CPX_MUL_NEON_F32(Z,A,B) do {           \
+    float32x4_t ARBR = vmulq_f32( A.val[0], B.val[0] ); \
+    float32x4_t AIBI = vmulq_f32( A.val[1], B.val[1] ); \
+    float32x4_t ARBI = vmulq_f32( A.val[0], B.val[1] ); \
+    float32x4_t AIBR = vmulq_f32( A.val[1], B.val[0] ); \
+    Z.val[0] = vsubq_f32(ARBR,AIBI);                \
+    Z.val[1] = vaddq_f32(AIBR,ARBI);                \
+} while (0);
+
+#define NE10_CPX_MUL_INV_NEON_F32(Z,A,B) do {           \
+    float32x4_t ARBR = vmulq_f32( A.val[0], B.val[0] ); \
+    float32x4_t AIBI = vmulq_f32( A.val[1], B.val[1] ); \
+    float32x4_t ARBI = vmulq_f32( A.val[0], B.val[1] ); \
+    float32x4_t AIBR = vmulq_f32( A.val[1], B.val[0] ); \
+    Z.val[0] = vaddq_f32(ARBR,AIBI);                \
+    Z.val[1] = vsubq_f32(AIBR,ARBI);                \
+} while (0);
+
+#define NE10_BUTTERFLY_NEON_F32(O1,O2,I1,I2) do {   \
+    NE10_CPX_ADD_NEON_F32(O1,I1,I2);                \
+    NE10_CPX_SUB_NEON_F32(O2,I1,I2);                \
+} while(0);
+
+#define NE10_DECLARE_2(TYPE,NAME)   TYPE NAME ## 0; \
+                                    TYPE NAME ## 1;
+
+#define NE10_DECLARE_3(TYPE,NAME)   NE10_DECLARE_2(TYPE,NAME);  \
+                                    TYPE NAME ## 2;
+
+#define NE10_DECLARE_4(TYPE,NAME)   NE10_DECLARE_3(TYPE,NAME);  \
+                                    TYPE NAME ## 3;
+
+#define NE10_DECLARE_8(TYPE,NAME)   NE10_DECLARE_4(TYPE,NAME);  \
+                                    TYPE NAME ## 4; \
+                                    TYPE NAME ## 5; \
+                                    TYPE NAME ## 6; \
+                                    TYPE NAME ## 7;
+
+#define NE10_REVERSE_FLOAT32X4(VECTOR4F) do {                                       \
+    VECTOR4F = vrev64q_f32(VECTOR4F);                                               \
+    VECTOR4F = vcombine_f32( vget_high_f32( VECTOR4F ), vget_low_f32( VECTOR4F ) ); \
+} while (0);
+
+#define NE10_REVERSE_OUT_FLOAT32X4(VECTOR4F_OUT,VECTOR4F) do {                      \
+    float32x4_t Q_TMP = vrev64q_f32(VECTOR4F);                                      \
+    VECTOR4F_OUT = vcombine_f32( vget_high_f32( Q_TMP ), vget_low_f32( Q_TMP ) );   \
+} while (0);
+
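+// Transpose a 4x4 tile of complex data held in four planar float32x4x2_t
+// registers (real and imaginary planes are transposed independently).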
+#define NE10_RADIX4X4C_TRANSPOSE_NEON(Q2_OUT,Q2_IN) do {                                                  \
+    NE10_DECLARE_4(float32x4x2_t,q2_tmp);                                                                 \
+    q2_tmp0 = vtrnq_f32 (Q2_IN ## 0 .val[0], Q2_IN ## 1 .val[0]);                                         \
+    q2_tmp1 = vtrnq_f32 (Q2_IN ## 0 .val[1], Q2_IN ## 1 .val[1]);                                         \
+    q2_tmp2 = vtrnq_f32 (Q2_IN ## 2 .val[0], Q2_IN ## 3 .val[0]);                                         \
+    q2_tmp3 = vtrnq_f32 (Q2_IN ## 2 .val[1], Q2_IN ## 3 .val[1]);                                         \
+    Q2_OUT ## 0 .val[0] = vcombine_f32 (vget_low_f32  (q2_tmp0.val[0]), vget_low_f32  (q2_tmp2.val[0]));  \
+    Q2_OUT ## 0 .val[1] = vcombine_f32 (vget_low_f32  (q2_tmp1.val[0]), vget_low_f32  (q2_tmp3.val[0]));  \
+    Q2_OUT ## 1 .val[0] = vcombine_f32 (vget_low_f32  (q2_tmp0.val[1]), vget_low_f32  (q2_tmp2.val[1]));  \
+    Q2_OUT ## 1 .val[1] = vcombine_f32 (vget_low_f32  (q2_tmp1.val[1]), vget_low_f32  (q2_tmp3.val[1]));  \
+    Q2_OUT ## 2 .val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[0]), vget_high_f32 (q2_tmp2.val[0]));  \
+    Q2_OUT ## 2 .val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[0]), vget_high_f32 (q2_tmp3.val[0]));  \
+    Q2_OUT ## 3 .val[0] = vcombine_f32 (vget_high_f32 (q2_tmp0.val[1]), vget_high_f32 (q2_tmp2.val[1]));  \
+    Q2_OUT ## 3 .val[1] = vcombine_f32 (vget_high_f32 (q2_tmp1.val[1]), vget_high_f32 (q2_tmp3.val[1]));  \
+} while(0);
+
+#define VDUPQ_N_F32(VAR) { VAR, VAR, VAR, VAR }
+
+#define CONST_TW_81   0.70710678
+#define CONST_TW_81N -0.70710678
+
+const static float32x4_t Q_TW_81    = VDUPQ_N_F32(CONST_TW_81 );
+const static float32x4_t Q_TW_81N   = VDUPQ_N_F32(CONST_TW_81N);
+
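+// 1/TW_81 = sqrt(2): the C2R kernels undo the forward twiddle by multiplying with these.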
+#define DIV_TW81     1.4142136f
+#define DIV_TW81N   -1.4142136f
+
+const static float32x4_t DIV_TW81_NEON  = VDUPQ_N_F32( 1.4142136f);
+const static float32x4_t DIV_TW81N_NEON = VDUPQ_N_F32(-1.4142136f);
+
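+// Radix-8-by-4-column R2C kernel, stage 1: add/sub butterflies between the
+// two halves, with the +/-sqrt(2)/2 twiddle applied to lanes 3 and 7.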
+#define NE10_RADIX8x4_R2C_NEON_KERNEL_S1(Q_OUT,Q_IN) do {   \
+        Q_OUT ## 0 = vaddq_f32 (Q_IN ## 0, Q_IN ## 4);      \
+        Q_OUT ## 1 = vsubq_f32 (Q_IN ## 0, Q_IN ## 4);      \
+        Q_OUT ## 2 = vaddq_f32 (Q_IN ## 1, Q_IN ## 5);      \
+        Q_OUT ## 3 = vsubq_f32 (Q_IN ## 1, Q_IN ## 5);      \
+        Q_OUT ## 4 = vaddq_f32 (Q_IN ## 2, Q_IN ## 6);      \
+        Q_OUT ## 5 = vsubq_f32 (Q_IN ## 2, Q_IN ## 6);      \
+        Q_OUT ## 6 = vaddq_f32 (Q_IN ## 3, Q_IN ## 7);      \
+        Q_OUT ## 7 = vsubq_f32 (Q_IN ## 3, Q_IN ## 7);      \
+        Q_OUT ## 3 = vmulq_f32 (Q_OUT ## 3, Q_TW_81 );      \
+        Q_OUT ## 7 = vmulq_f32 (Q_OUT ## 7, Q_TW_81N);      \
+} while(0);
+
+#define NE10_RADIX8x4_R2C_NEON_KERNEL_S2(Q_OUT,Q_IN) do {   \
+        NE10_DECLARE_4(float32x4_t,Q_S);                    \
+        Q_S0 =  vaddq_f32 (Q_IN ## 0, Q_IN ## 4);           \
+        Q_S1 =  vaddq_f32 (Q_IN ## 2, Q_IN ## 6);           \
+        Q_S2 =  vsubq_f32 (Q_IN ## 7, Q_IN ## 3);           \
+        Q_S3 =  vaddq_f32 (Q_IN ## 3, Q_IN ## 7);           \
+        Q_OUT ## 0 = vaddq_f32 (      Q_S0,      Q_S1 );    \
+        Q_OUT ## 1 = vaddq_f32 ( Q_IN ## 1,      Q_S3 );    \
+        Q_OUT ## 2 = vsubq_f32 (      Q_S2, Q_IN ## 5 );    \
+        Q_OUT ## 3 = vsubq_f32 ( Q_IN ## 0, Q_IN ## 4 );    \
+        Q_OUT ## 4 = vsubq_f32 ( Q_IN ## 6, Q_IN ## 2 );    \
+        Q_OUT ## 5 = vsubq_f32 ( Q_IN ## 1,      Q_S3 );    \
+        Q_OUT ## 6 = vaddq_f32 ( Q_IN ## 5,      Q_S2 );    \
+        Q_OUT ## 7 = vsubq_f32 (      Q_S0,      Q_S1 );    \
+} while(0);
+
+#define NE10_RADIX8x4_C2R_NEON_KERNEL_S1(Q_OUT,Q_IN) do {   \
+        NE10_DECLARE_8(float32x4_t,Q_S_IN);                 \
+        Q_S_IN0 = vaddq_f32(Q_IN ## 0, Q_IN ## 7);          \
+        Q_S_IN1 = vsubq_f32(Q_IN ## 0, Q_IN ## 7);          \
+        Q_S_IN2 = vaddq_f32(Q_IN ## 1, Q_IN ## 5);          \
+        Q_S_IN3 = vsubq_f32(Q_IN ## 1, Q_IN ## 5);          \
+        Q_S_IN4 = vaddq_f32(Q_IN ## 6, Q_IN ## 2);          \
+        Q_S_IN5 = vsubq_f32(Q_IN ## 6, Q_IN ## 2);          \
+        Q_S_IN6 = vaddq_f32(Q_IN ## 3, Q_IN ## 3);          \
+        Q_S_IN7 = vaddq_f32(Q_IN ## 4, Q_IN ## 4);          \
+        Q_OUT ## 0 = vaddq_f32(Q_S_IN0, Q_S_IN6);           \
+        Q_OUT ## 1 = vaddq_f32(Q_S_IN2, Q_S_IN2);           \
+        Q_OUT ## 2 = vsubq_f32(Q_S_IN1, Q_S_IN7);           \
+        Q_OUT ## 3 = vsubq_f32(Q_S_IN3, Q_S_IN4);           \
+        Q_OUT ## 4 = vsubq_f32(Q_S_IN0, Q_S_IN6);           \
+        Q_OUT ## 5 = vaddq_f32(Q_S_IN5, Q_S_IN5);           \
+        Q_OUT ## 6 = vaddq_f32(Q_S_IN1, Q_S_IN7);           \
+        Q_OUT ## 7 = vaddq_f32(Q_S_IN4, Q_S_IN3);           \
+} while (0);
+
+#define NE10_RADIX8x4_C2R_NEON_KERNEL_S2(Q_OUT,Q_IN) do {   \
+        Q_IN ## 3 = vmulq_f32(Q_IN ## 3,DIV_TW81_NEON);     \
+        Q_IN ## 7 = vmulq_f32(Q_IN ## 7,DIV_TW81N_NEON);    \
+        Q_OUT ## 0 = vaddq_f32(Q_IN ## 0, Q_IN ## 1);       \
+        Q_OUT ## 4 = vsubq_f32(Q_IN ## 0, Q_IN ## 1);       \
+        Q_OUT ## 1 = vaddq_f32(Q_IN ## 2, Q_IN ## 3);       \
+        Q_OUT ## 5 = vsubq_f32(Q_IN ## 2, Q_IN ## 3);       \
+        Q_OUT ## 2 = vaddq_f32(Q_IN ## 4, Q_IN ## 5);       \
+        Q_OUT ## 6 = vsubq_f32(Q_IN ## 4, Q_IN ## 5);       \
+        Q_OUT ## 3 = vaddq_f32(Q_IN ## 6, Q_IN ## 7);       \
+        Q_OUT ## 7 = vsubq_f32(Q_IN ## 6, Q_IN ## 7);       \
+} while(0);
+
+#define NE10_RADIX8x4_C2R_NEON_KERNEL_SCALE(Q_OUT)  do {    \
+    Q_OUT ## 0 = vmulq_f32( Q_OUT ## 0, EIGH_NEON);         \
+    Q_OUT ## 1 = vmulq_f32( Q_OUT ## 1, EIGH_NEON);         \
+    Q_OUT ## 2 = vmulq_f32( Q_OUT ## 2, EIGH_NEON);         \
+    Q_OUT ## 3 = vmulq_f32( Q_OUT ## 3, EIGH_NEON);         \
+    Q_OUT ## 4 = vmulq_f32( Q_OUT ## 4, EIGH_NEON);         \
+    Q_OUT ## 5 = vmulq_f32( Q_OUT ## 5, EIGH_NEON);         \
+    Q_OUT ## 6 = vmulq_f32( Q_OUT ## 6, EIGH_NEON);         \
+    Q_OUT ## 7 = vmulq_f32( Q_OUT ## 7, EIGH_NEON);         \
+} while(0);
+
+#define NE10_RADIX4x4_C2R_NEON_KERNEL_SCALE(Q_OUT)  do {    \
+    Q_OUT ## 0 = vmulq_f32( Q_OUT ## 0, QUAD_NEON);         \
+    Q_OUT ## 1 = vmulq_f32( Q_OUT ## 1, QUAD_NEON);         \
+    Q_OUT ## 2 = vmulq_f32( Q_OUT ## 2, QUAD_NEON);         \
+    Q_OUT ## 3 = vmulq_f32( Q_OUT ## 3, QUAD_NEON);         \
+} while(0);
+
+#define NE10_RADIX4x4_C2R_TW_NEON_KERNEL_SCALE(Q2_OUT)  do {            \
+    Q2_OUT ## 0 .val[0] = vmulq_f32( Q2_OUT ## 0 .val[0], QUAD_NEON);   \
+    Q2_OUT ## 1 .val[0] = vmulq_f32( Q2_OUT ## 1 .val[0], QUAD_NEON);   \
+    Q2_OUT ## 2 .val[0] = vmulq_f32( Q2_OUT ## 2 .val[0], QUAD_NEON);   \
+    Q2_OUT ## 3 .val[0] = vmulq_f32( Q2_OUT ## 3 .val[0], QUAD_NEON);   \
+    Q2_OUT ## 0 .val[1] = vmulq_f32( Q2_OUT ## 0 .val[1], QUAD_NEON);   \
+    Q2_OUT ## 1 .val[1] = vmulq_f32( Q2_OUT ## 1 .val[1], QUAD_NEON);   \
+    Q2_OUT ## 2 .val[1] = vmulq_f32( Q2_OUT ## 2 .val[1], QUAD_NEON);   \
+    Q2_OUT ## 3 .val[1] = vmulq_f32( Q2_OUT ## 3 .val[1], QUAD_NEON);   \
+} while(0);
+
+#define NE10_RADIX8x4_R2C_NEON_KERNEL(Q_OUT,Q_IN) do {  \
+    NE10_DECLARE_8(float32x4_t,Q_S_IN);                 \
+    NE10_RADIX8x4_R2C_NEON_KERNEL_S1(Q_S_IN,Q_IN);      \
+    NE10_RADIX8x4_R2C_NEON_KERNEL_S2(Q_OUT,Q_S_IN);     \
+} while(0);
+
+#define NE10_RADIX4x4_R2C_NEON_KERNEL(Q_OUT,Q_IN) do {  \
+    NE10_DECLARE_4(float32x4_t,Q_S_IN);                 \
+    Q_S_IN0 = vaddq_f32 (Q_IN ## 0, Q_IN ## 2);         \
+    Q_S_IN1 = vaddq_f32 (Q_IN ## 1, Q_IN ## 3);         \
+    Q_OUT ## 0 = vaddq_f32 (Q_S_IN0, Q_S_IN1);          \
+    Q_OUT ## 1 = vsubq_f32 (Q_IN##0, Q_IN##2);          \
+    Q_OUT ## 2 = vsubq_f32 (Q_IN##3, Q_IN##1);          \
+    Q_OUT ## 3 = vsubq_f32 (Q_S_IN0, Q_S_IN1);          \
+} while(0);
+
+#define NE10_RADIX4x4_C2R_NEON_KERNEL(Q_OUT,Q_IN) do {  \
+    NE10_DECLARE_4(float32x4_t,Q_S_IN);                 \
+    Q_S_IN0 = vaddq_f32 (Q_IN##0, Q_IN##3);             \
+    Q_S_IN1 = vsubq_f32 (Q_IN##0, Q_IN##3);             \
+    Q_S_IN2 = vaddq_f32 (Q_IN##1, Q_IN##1);             \
+    Q_S_IN3 = vaddq_f32 (Q_IN##2, Q_IN##2);             \
+    Q_OUT ## 0 = vaddq_f32 (Q_S_IN0, Q_S_IN2);          \
+    Q_OUT ## 1 = vsubq_f32 (Q_S_IN1, Q_S_IN3);          \
+    Q_OUT ## 2 = vsubq_f32 (Q_S_IN0, Q_S_IN2);          \
+    Q_OUT ## 3 = vaddq_f32 (Q_S_IN1, Q_S_IN3);          \
+} while(0);
+
+#define NE10_RADIX8x4_C2R_NEON_KERNEL(Q_OUT,Q_IN) do {          \
+    NE10_DECLARE_8(float32x4_t,Q_S_IN_C2R_KERNEL);              \
+    NE10_RADIX8x4_C2R_NEON_KERNEL_S1(Q_S_IN_C2R_KERNEL,Q_IN);   \
+    NE10_RADIX8x4_C2R_NEON_KERNEL_S2(Q_OUT,Q_S_IN_C2R_KERNEL);  \
+} while(0);
+
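+// Strided load: gathers eight rows of four floats spaced IN_STEP elements
+// apart, advancing PTR_IN as it goes.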
+#define NE10_RADIX8x4_R2C_NEON_LOAD(PTR_IN,Q_IN,IN_STEP) do { \
+    Q_IN ## 0 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );    \
+    PTR_IN += IN_STEP;  \
+    Q_IN ## 1 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );    \
+    PTR_IN += IN_STEP;  \
+    Q_IN ## 2 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );    \
+    PTR_IN += IN_STEP;  \
+    Q_IN ## 3 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );    \
+    PTR_IN += IN_STEP;  \
+    Q_IN ## 4 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );    \
+    PTR_IN += IN_STEP;  \
+    Q_IN ## 5 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );    \
+    PTR_IN += IN_STEP;  \
+    Q_IN ## 6 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );    \
+    PTR_IN += IN_STEP;  \
+    Q_IN ## 7 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );    \
+    PTR_IN += IN_STEP;  \
+} while(0);
+
+#define NE10_RADIX4x4_R2C_NEON_LOAD(PTR_IN,Q_IN,IN_STEP) do {\
+    Q_IN ## 0 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );  \
+    PTR_IN += IN_STEP;                                      \
+    Q_IN ## 1 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );  \
+    PTR_IN += IN_STEP;                                      \
+    Q_IN ## 2 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );  \
+    PTR_IN += IN_STEP;                                      \
+    Q_IN ## 3 = vld1q_f32( (ne10_float32_t*) ( PTR_IN ) );  \
+    PTR_IN += IN_STEP;                                      \
+} while(0);
+
+#define NE10_RADIX8x4_R2C_NEON_STORE(PTR_OUT,Q_OUT,OUT_STEP) do {           \
+     vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 0 * OUT_STEP ), Q_OUT ## 0);  \
+     vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 1 * OUT_STEP ), Q_OUT ## 1);  \
+     vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 2 * OUT_STEP ), Q_OUT ## 2);  \
+     vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 3 * OUT_STEP ), Q_OUT ## 3);  \
+     vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 4 * OUT_STEP ), Q_OUT ## 4);  \
+     vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 5 * OUT_STEP ), Q_OUT ## 5);  \
+     vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 6 * OUT_STEP ), Q_OUT ## 6);  \
+     vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 7 * OUT_STEP ), Q_OUT ## 7);  \
+} while(0);
+
+#define NE10_RADIX4x4_R2C_NEON_STORE(PTR_OUT,Q_OUT,OUT_STEP) do {           \
+     vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 0 * OUT_STEP ), Q_OUT ## 0);  \
+     vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 1 * OUT_STEP ), Q_OUT ## 1);  \
+     vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 2 * OUT_STEP ), Q_OUT ## 2);  \
+     vst1q_f32( (ne10_float32_t*) ( PTR_OUT + 3 * OUT_STEP ), Q_OUT ## 3);  \
+} while(0);
+
+#define NE10_RADIX4x4_R2C_TW_MUL_NEON(Q2_OUT,Q2_IN,Q2_TW) do {  \
+    Q2_OUT ## 0 = Q2_IN ## 0;                                   \
+    NE10_CPX_MUL_NEON_F32(Q2_OUT ## 1,Q2_IN ## 1,Q2_TW ## 0);   \
+    NE10_CPX_MUL_NEON_F32(Q2_OUT ## 2,Q2_IN ## 2,Q2_TW ## 1);   \
+    NE10_CPX_MUL_NEON_F32(Q2_OUT ## 3,Q2_IN ## 3,Q2_TW ## 2);   \
+} while(0);
+
+#define NE10_RADIX4x4_C2R_TW_MUL_NEON(Q2_OUT,Q2_IN,Q2_TW) do {      \
+    Q2_OUT ## 0 = Q2_IN ## 0;                                       \
+    NE10_CPX_MUL_INV_NEON_F32(Q2_OUT ## 1,Q2_IN ## 1,Q2_TW ## 0);   \
+    NE10_CPX_MUL_INV_NEON_F32(Q2_OUT ## 2,Q2_IN ## 2,Q2_TW ## 1);   \
+    NE10_CPX_MUL_INV_NEON_F32(Q2_OUT ## 3,Q2_IN ## 3,Q2_TW ## 2);   \
+} while(0);
+
+#define NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S1(Q2_OUT,Q2_IN) do {  \
+    NE10_CPX_ADD_NEON_F32(Q2_OUT ## 0,Q2_IN ## 0,Q2_IN ## 2);   \
+    NE10_CPX_SUB_NEON_F32(Q2_OUT ## 1,Q2_IN ## 0,Q2_IN ## 2);   \
+    NE10_CPX_ADD_NEON_F32(Q2_OUT ## 2,Q2_IN ## 1,Q2_IN ## 3);   \
+    NE10_CPX_SUB_NEON_F32(Q2_OUT ## 3,Q2_IN ## 1,Q2_IN ## 3);   \
+} while(0);
+
+#define NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S2(Q2_OUT,Q2_IN) do {  \
+    Q2_OUT ## 0 .val[0] = vaddq_f32(Q2_IN ## 0 .val[0] , Q2_IN ## 2 .val[0]); \
+    Q2_OUT ## 0 .val[1] = vaddq_f32(Q2_IN ## 0 .val[1] , Q2_IN ## 2 .val[1]); \
+    Q2_OUT ## 2 .val[0] = vsubq_f32(Q2_IN ## 0 .val[0] , Q2_IN ## 2 .val[0]); \
+    Q2_OUT ## 2 .val[1] = vsubq_f32(Q2_IN ## 2 .val[1] , Q2_IN ## 0 .val[1]); \
+    Q2_OUT ## 1 .val[0] = vaddq_f32(Q2_IN ## 1 .val[0] , Q2_IN ## 3 .val[1]); \
+    Q2_OUT ## 1 .val[1] = vsubq_f32(Q2_IN ## 1 .val[1] , Q2_IN ## 3 .val[0]); \
+    Q2_OUT ## 3 .val[0] = vsubq_f32(Q2_IN ## 1 .val[0] , Q2_IN ## 3 .val[1]); \
+    Q2_OUT ## 3 .val[1] = vaddq_f32(Q2_IN ## 3 .val[0] , Q2_IN ## 1 .val[1]); \
+    Q2_OUT ## 3 .val[1] = vnegq_f32(Q2_OUT ## 3 .val[1]); \
+} while(0);
+
+#define NE10_RADIX4x4_R2C_TW_NEON_KERNEL_LAST(Q_OUT,Q_IN) do {  \
+    float32x4_t Q_TMP;  \
+    Q_IN ## 1 = vmulq_f32(Q_IN ## 1, Q_TW_81);  \
+    Q_IN ## 3 = vmulq_f32(Q_IN ## 3, Q_TW_81);  \
+    Q_TMP = vsubq_f32(Q_IN ## 1, Q_IN ## 3);    \
+    Q_IN ## 3 = vaddq_f32(Q_IN ## 1, Q_IN ## 3);    \
+    Q_IN ## 1 = Q_TMP;                      \
+    Q_OUT ## 0 = vaddq_f32(Q_IN ## 0, Q_IN ## 1);   \
+    Q_OUT ## 1 = vaddq_f32(Q_IN ## 2, Q_IN ## 3);   \
+    Q_OUT ## 2 = vsubq_f32(Q_IN ## 0, Q_IN ## 1);   \
+    Q_OUT ## 3 = vsubq_f32(Q_IN ## 2, Q_IN ## 3);   \
+    Q_OUT ## 1 = vnegq_f32(Q_OUT ## 1);         \
+} while(0);
+
+#define NE10_RADIX4x4_C2R_TW_NEON_KERNEL_LAST(Q_OUT,Q_IN) do {  \
+    float32x4_t Q_TMP;  \
+    Q_IN ## 1 = vnegq_f32(Q_IN ## 1 );  \
+    Q_OUT ## 0 = vaddq_f32(Q_IN ## 0, Q_IN ## 2);   \
+    Q_OUT ## 1 = vsubq_f32(Q_IN ## 0, Q_IN ## 2);   \
+    Q_OUT ## 2 = vaddq_f32(Q_IN ## 1, Q_IN ## 3);   \
+    Q_OUT ## 3 = vsubq_f32(Q_IN ## 1, Q_IN ## 3);   \
+    Q_TMP = vaddq_f32(Q_OUT ## 1, Q_OUT ## 3);  \
+    Q_OUT ## 3 = vsubq_f32(Q_OUT ## 3, Q_OUT ## 1);  \
+    Q_OUT ## 1 = Q_TMP; \
+    Q_OUT ## 1 = vmulq_f32( Q_OUT ## 1, DIV_TW81_NEON); \
+    Q_OUT ## 3 = vmulq_f32( Q_OUT ## 3, DIV_TW81_NEON); \
+    Q_OUT ## 0 = vaddq_f32( Q_OUT ## 0, Q_OUT ## 0 );   \
+    Q_OUT ## 2 = vaddq_f32( Q_OUT ## 2, Q_OUT ## 2 );   \
+} while(0);
+
+#define NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S1(Q2_OUT,Q2_IN) do {  \
+    Q2_IN ## 3 .val[1] = vnegq_f32(Q2_IN ## 3 .val[1]); \
+    Q2_OUT ## 0 .val[0] = vaddq_f32(Q2_IN ## 0 .val[0] , Q2_IN ## 2 .val[0]); \
+    Q2_OUT ## 0 .val[1] = vsubq_f32(Q2_IN ## 0 .val[1] , Q2_IN ## 2 .val[1]); \
+    Q2_OUT ## 2 .val[0] = vsubq_f32(Q2_IN ## 0 .val[0] , Q2_IN ## 2 .val[0]); \
+    Q2_OUT ## 2 .val[1] = vaddq_f32(Q2_IN ## 2 .val[1] , Q2_IN ## 0 .val[1]); \
+    Q2_OUT ## 1 .val[0] = vaddq_f32(Q2_IN ## 1 .val[0] , Q2_IN ## 3 .val[0]); \
+    Q2_OUT ## 1 .val[1] = vaddq_f32(Q2_IN ## 1 .val[1] , Q2_IN ## 3 .val[1]); \
+    Q2_OUT ## 3 .val[0] = vsubq_f32(Q2_IN ## 3 .val[1] , Q2_IN ## 1 .val[1]); \
+    Q2_OUT ## 3 .val[1] = vsubq_f32(Q2_IN ## 1 .val[0] , Q2_IN ## 3 .val[0]); \
+} while(0);
+
+#define NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S2(Q2_OUT,Q2_IN) do {  \
+    NE10_CPX_ADD_NEON_F32(Q2_OUT ## 0,Q2_IN ## 0,Q2_IN ## 1);   \
+    NE10_CPX_SUB_NEON_F32(Q2_OUT ## 2,Q2_IN ## 0,Q2_IN ## 1);   \
+    NE10_CPX_ADD_NEON_F32(Q2_OUT ## 1,Q2_IN ## 2,Q2_IN ## 3);   \
+    NE10_CPX_SUB_NEON_F32(Q2_OUT ## 3,Q2_IN ## 2,Q2_IN ## 3);   \
+} while(0);
+
+#define NE10_RADIX4x4_R2C_TW_NEON_KERNEL(Q2_OUT,Q2_IN,Q2_TW) do {   \
+    NE10_RADIX4x4_R2C_TW_MUL_NEON(Q2_OUT,Q2_IN,Q2_TW);              \
+    NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S1(Q2_IN,Q2_OUT);              \
+    NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S2(Q2_OUT,Q2_IN);              \
+} while(0);
+
+#define NE10_RADIX4x4_C2R_TW_NEON_KERNEL(Q2_OUT,Q2_IN,Q2_TW) do {   \
+    NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S1(Q2_OUT,Q2_IN);              \
+    NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S2(Q2_IN,Q2_OUT);              \
+    NE10_RADIX4x4_C2R_TW_MUL_NEON(Q2_OUT,Q2_IN,Q2_TW);              \
+} while(0);
+
+#ifdef NE10_VERBOSE
+  #define NE10_PRINT_Qx8_VECTOR(Q_VECTOR) do {                                                                    \
+      fprintf(stderr,"inside %s\n", __FUNCTION__ );                                                               \
+      fprintf(stderr, #Q_VECTOR "\n");                                                                            \
+      fprintf(stderr,"0:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 0[0], Q_VECTOR ## 0[1], Q_VECTOR ## 0[2], Q_VECTOR ## 0[3] ); \
+      fprintf(stderr,"1:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 1[0], Q_VECTOR ## 1[1], Q_VECTOR ## 1[2], Q_VECTOR ## 1[3] ); \
+      fprintf(stderr,"2:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 2[0], Q_VECTOR ## 2[1], Q_VECTOR ## 2[2], Q_VECTOR ## 2[3] ); \
+      fprintf(stderr,"3:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 3[0], Q_VECTOR ## 3[1], Q_VECTOR ## 3[2], Q_VECTOR ## 3[3] ); \
+      fprintf(stderr,"4:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 4[0], Q_VECTOR ## 4[1], Q_VECTOR ## 4[2], Q_VECTOR ## 4[3] ); \
+      fprintf(stderr,"5:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 5[0], Q_VECTOR ## 5[1], Q_VECTOR ## 5[2], Q_VECTOR ## 5[3] ); \
+      fprintf(stderr,"6:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 6[0], Q_VECTOR ## 6[1], Q_VECTOR ## 6[2], Q_VECTOR ## 6[3] ); \
+      fprintf(stderr,"7:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 7[0], Q_VECTOR ## 7[1], Q_VECTOR ## 7[2], Q_VECTOR ## 7[3] ); \
+  } while(0);
+  #define NE10_PRINT_Qx4_VECTOR(Q_VECTOR) do {                                                                    \
+      fprintf(stderr,"inside %s\n", __FUNCTION__ );                                                               \
+      fprintf(stderr, #Q_VECTOR "\n");                                                                            \
+      fprintf(stderr,"0:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 0[0], Q_VECTOR ## 0[1], Q_VECTOR ## 0[2], Q_VECTOR ## 0[3] ); \
+      fprintf(stderr,"1:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 1[0], Q_VECTOR ## 1[1], Q_VECTOR ## 1[2], Q_VECTOR ## 1[3] ); \
+      fprintf(stderr,"2:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 2[0], Q_VECTOR ## 2[1], Q_VECTOR ## 2[2], Q_VECTOR ## 2[3] ); \
+      fprintf(stderr,"3:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 3[0], Q_VECTOR ## 3[1], Q_VECTOR ## 3[2], Q_VECTOR ## 3[3] ); \
+  } while(0);
+  #define NE10_PRINT_Q2x4_VECTOR(Q_VECTOR) do {                                                                   \
+      fprintf(stderr,"inside %s\n", __FUNCTION__ );                                                               \
+      fprintf(stderr, #Q_VECTOR "\n");                                                                            \
+      fprintf(stderr,"0R:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 0 .val[0][0], Q_VECTOR ## 0 .val[0][1], Q_VECTOR ## 0 .val[0][2], Q_VECTOR ## 0 .val[0][3] ); \
+      fprintf(stderr,"1R:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 1 .val[0][0], Q_VECTOR ## 1 .val[0][1], Q_VECTOR ## 1 .val[0][2], Q_VECTOR ## 1 .val[0][3] ); \
+      fprintf(stderr,"2R:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 2 .val[0][0], Q_VECTOR ## 2 .val[0][1], Q_VECTOR ## 2 .val[0][2], Q_VECTOR ## 2 .val[0][3] ); \
+      fprintf(stderr,"3R:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 3 .val[0][0], Q_VECTOR ## 3 .val[0][1], Q_VECTOR ## 3 .val[0][2], Q_VECTOR ## 3 .val[0][3] ); \
+      fprintf(stderr,"0I:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 0 .val[1][0], Q_VECTOR ## 0 .val[1][1], Q_VECTOR ## 0 .val[1][2], Q_VECTOR ## 0 .val[1][3] ); \
+      fprintf(stderr,"1I:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 1 .val[1][0], Q_VECTOR ## 1 .val[1][1], Q_VECTOR ## 1 .val[1][2], Q_VECTOR ## 1 .val[1][3] ); \
+      fprintf(stderr,"2I:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 2 .val[1][0], Q_VECTOR ## 2 .val[1][1], Q_VECTOR ## 2 .val[1][2], Q_VECTOR ## 2 .val[1][3] ); \
+      fprintf(stderr,"3I:%+8.2e,%+8.2e,%+8.2e,%+8.2e\n", Q_VECTOR ## 3 .val[1][0], Q_VECTOR ## 3 .val[1][1], Q_VECTOR ## 3 .val[1][2], Q_VECTOR ## 3 .val[1][3] ); \
+  } while(0);
+#else // NE10_VERBOSE not defined
+  #define NE10_PRINT_Qx8_VECTOR(Q_VECTOR) ;
+  #define NE10_PRINT_Qx4_VECTOR(Q_VECTOR) ;
+  #define NE10_PRINT_Q2x4_VECTOR(Q2_VECTOR) ;
+#endif // NE10_VERBOSE
+#endif // header
index 1500b28..fa0e95d 100644 (file)
@@ -850,40 +850,6 @@ static void ne10_mixed_radix_butterfly_inverse_float32_c (ne10_fft_cpx_float32_t
     } // last stage
 }
 
-/* factors buffer:
- * 0: stage number
- * 1: stride for the first stage
- * others: factors */
-static ne10_int32_t ne10_factor (ne10_int32_t n, ne10_int32_t * facbuf)
-{
-    ne10_int32_t p = 4;
-    ne10_int32_t i = 1;
-    ne10_int32_t stage_num = 0;
-    ne10_int32_t stride_max = n;
-
-    /* factor out powers of 4, powers of 2 */
-    do
-    {
-        if ( (n % p) == 2)
-            p = 2;
-        else if (n % p)
-        {
-            return NE10_ERR;
-        }
-
-        n /= p;
-        facbuf[2 * i] = p;
-        facbuf[2 * i + 1] = n;
-        i++;
-        stage_num++;
-    }
-    while (n > 1);
-    facbuf[0] = stage_num;
-    facbuf[1] = stride_max / p;
-    return NE10_OK;
-}
-
-
 static void ne10_fft_split_r2c_1d_float32 (ne10_fft_cpx_float32_t *dst,
         const ne10_fft_cpx_float32_t *src,
         ne10_fft_cpx_float32_t *twiddles,
@@ -1203,6 +1169,13 @@ void ne10_fft_c2c_1d_float32_c (ne10_fft_cpx_float32_t *fout,
  *
  */
 
+// Only for ARMv7-A / AArch32 platforms.
+// For AArch64, these functions are implemented in NE10_rfft_float32.c.
+#ifdef __arm__
+////////////////////////////////////////////////////
+// RFFT reference model for ARMv7-A and AArch32 neon
+////////////////////////////////////////////////////
+
 /**
  * @brief User-callable function to allocate all necessary storage space for the fft (r2c/c2r).
  * @param[in]   nfft             length of FFT
@@ -1335,3 +1308,4 @@ void ne10_fft_c2r_1d_float32_c (ne10_float32_t *fout,
 /**
  * @} end of R2C_FFT_IFFT group
  */
+#endif
index 4f06695..f1c8eb2 100644 (file)
@@ -1440,199 +1440,6 @@ void ne10_mixed_radix_fft_backward_float32_neon (ne10_fft_cpx_float32_t * Fout,
     } // last stage
 }
 
-static void ne10_fft_split_r2c_1d_float32_neon (ne10_fft_cpx_float32_t *dst,
-        const ne10_fft_cpx_float32_t *src,
-        ne10_fft_cpx_float32_t *twiddles,
-        ne10_int32_t ncfft)
-{
-    ne10_int32_t k;
-    ne10_int32_t count = ncfft / 2;
-    ne10_fft_cpx_float32_t fpnk, fpk, f1k, f2k, tw, tdc;
-    float32x4x2_t q2_fpk, q2_fpnk, q2_tw, q2_dst, q2_dst2;
-    float32x4_t q_fpnk_r, q_fpnk_i;
-    float32x4_t q_f1k_r, q_f1k_i, q_f2k_r, q_f2k_i;
-    float32x4_t q_tw_r, q_tw_i;
-    float32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_val;
-    float32x4_t q_dst_r, q_dst_i, q_dst2_r, q_dst2_i;
-    float32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;
-
-    tdc.r = src[0].r;
-    tdc.i = src[0].i;
-
-    dst[0].r = tdc.r + tdc.i;
-    dst[ncfft].r = tdc.r - tdc.i;
-    dst[ncfft].i = dst[0].i = 0;
-
-    if (count >= 4)
-    {
-        for (k = 1; k <= count ; k += 4)
-        {
-            p_src  = (float32_t*) (& (src[k]));
-            p_src2  = (float32_t*) (& (src[ncfft - k - 3]));
-            p_twiddles  = (float32_t*) (& (twiddles[k - 1]));
-            p_dst  = (float32_t*) (& (dst[k]));
-            p_dst2  = (float32_t*) (& (dst[ncfft - k - 3]));
-
-            q2_fpk  = vld2q_f32 (p_src);
-            q2_fpnk = vld2q_f32 (p_src2);
-            q2_tw = vld2q_f32 (p_twiddles);
-            q2_fpnk.val[0] = vrev64q_f32 (q2_fpnk.val[0]);
-            q2_fpnk.val[1] = vrev64q_f32 (q2_fpnk.val[1]);
-            q_fpnk_r = vcombine_f32 (vget_high_f32 (q2_fpnk.val[0]), vget_low_f32 (q2_fpnk.val[0]));
-            q_fpnk_i = vcombine_f32 (vget_high_f32 (q2_fpnk.val[1]), vget_low_f32 (q2_fpnk.val[1]));
-            q_fpnk_i = vnegq_f32 (q_fpnk_i);
-
-            q_f1k_r = vaddq_f32 (q2_fpk.val[0], q_fpnk_r);
-            q_f1k_i = vaddq_f32 (q2_fpk.val[1], q_fpnk_i);
-
-            q_f2k_r = vsubq_f32 (q2_fpk.val[0], q_fpnk_r);
-            q_f2k_i = vsubq_f32 (q2_fpk.val[1], q_fpnk_i);
-
-            q_tmp0 = vmulq_f32 (q_f2k_r, q2_tw.val[0]);
-            q_tmp1 = vmulq_f32 (q_f2k_i, q2_tw.val[1]);
-            q_tmp2 = vmulq_f32 (q_f2k_r, q2_tw.val[1]);
-            q_tmp3 = vmulq_f32 (q_f2k_i, q2_tw.val[0]);
-            q_tw_r = vsubq_f32 (q_tmp0, q_tmp1);
-            q_tw_i = vaddq_f32 (q_tmp2, q_tmp3);
-
-            q_val = vdupq_n_f32 (0.5f);
-            q_dst2_r = vsubq_f32 (q_f1k_r, q_tw_r);
-            q_dst2_i = vsubq_f32 (q_tw_i, q_f1k_i);
-            q_dst_r = vaddq_f32 (q_f1k_r, q_tw_r);
-            q_dst_i = vaddq_f32 (q_f1k_i, q_tw_i);
-            q_dst2_r = vmulq_f32 (q_dst2_r, q_val);
-            q_dst2_i = vmulq_f32 (q_dst2_i, q_val);
-            q2_dst.val[0] = vmulq_f32 (q_dst_r, q_val);
-            q2_dst.val[1] = vmulq_f32 (q_dst_i, q_val);
-            q_dst2_r = vrev64q_f32 (q_dst2_r);
-            q_dst2_i = vrev64q_f32 (q_dst2_i);
-            q2_dst2.val[0] = vcombine_f32 (vget_high_f32 (q_dst2_r), vget_low_f32 (q_dst2_r));
-            q2_dst2.val[1] = vcombine_f32 (vget_high_f32 (q_dst2_i), vget_low_f32 (q_dst2_i));
-            vst2q_f32 (p_dst, q2_dst);
-            vst2q_f32 (p_dst2, q2_dst2);
-
-        }
-    }
-    else
-    {
-        for (k = 1; k <= count ; k++)
-        {
-            fpk    = src[k];
-            fpnk.r =   src[ncfft - k].r;
-            fpnk.i = - src[ncfft - k].i;
-
-            f1k.r = fpk.r + fpnk.r;
-            f1k.i = fpk.i + fpnk.i;
-
-            f2k.r = fpk.r - fpnk.r;
-            f2k.i = fpk.i - fpnk.i;
-
-            tw.r = f2k.r * (twiddles[k - 1]).r - f2k.i * (twiddles[k - 1]).i;
-            tw.i = f2k.r * (twiddles[k - 1]).i + f2k.i * (twiddles[k - 1]).r;
-
-            dst[k].r = (f1k.r + tw.r) * 0.5f;
-            dst[k].i = (f1k.i + tw.i) * 0.5f;
-            dst[ncfft - k].r = (f1k.r - tw.r) * 0.5f;
-            dst[ncfft - k].i = (tw.i - f1k.i) * 0.5f;
-        }
-    }
-}
-
-static void ne10_fft_split_c2r_1d_float32_neon (ne10_fft_cpx_float32_t *dst,
-        const ne10_fft_cpx_float32_t *src,
-        ne10_fft_cpx_float32_t *twiddles,
-        ne10_int32_t ncfft)
-{
-
-    ne10_int32_t k;
-    ne10_int32_t count = ncfft / 2;
-    ne10_fft_cpx_float32_t fk, fnkc, fek, fok, tmp;
-    float32x4x2_t q2_fk, q2_fnkc, q2_tw, q2_dst, q2_dst2;
-    float32x4_t q_fnkc_r, q_fnkc_i;
-    float32x4_t q_fek_r, q_fek_i, q_fok_r, q_fok_i;
-    float32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_val;
-    float32x4_t q_dst2_r, q_dst2_i;
-    float32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;
-
-    dst[0].r = (src[0].r + src[ncfft].r) * 0.5f;
-    dst[0].i = (src[0].r - src[ncfft].r) * 0.5f;
-
-    if (count >= 4)
-    {
-        for (k = 1; k <= count ; k += 4)
-        {
-            p_src  = (float32_t*) (& (src[k]));
-            p_src2  = (float32_t*) (& (src[ncfft - k - 3]));
-            p_twiddles  = (float32_t*) (& (twiddles[k - 1]));
-            p_dst  = (float32_t*) (& (dst[k]));
-            p_dst2  = (float32_t*) (& (dst[ncfft - k - 3]));
-
-            q2_fk  = vld2q_f32 (p_src);
-            q2_fnkc = vld2q_f32 (p_src2);
-            q2_tw = vld2q_f32 (p_twiddles);
-            q2_fnkc.val[0] = vrev64q_f32 (q2_fnkc.val[0]);
-            q2_fnkc.val[1] = vrev64q_f32 (q2_fnkc.val[1]);
-            q_fnkc_r = vcombine_f32 (vget_high_f32 (q2_fnkc.val[0]), vget_low_f32 (q2_fnkc.val[0]));
-            q_fnkc_i = vcombine_f32 (vget_high_f32 (q2_fnkc.val[1]), vget_low_f32 (q2_fnkc.val[1]));
-            q_fnkc_i = vnegq_f32 (q_fnkc_i);
-
-            q_fek_r = vaddq_f32 (q2_fk.val[0], q_fnkc_r);
-            q_fek_i = vaddq_f32 (q2_fk.val[1], q_fnkc_i);
-
-            q_tmp0 = vsubq_f32 (q2_fk.val[0], q_fnkc_r);
-            q_tmp1 = vsubq_f32 (q2_fk.val[1], q_fnkc_i);
-
-            q_fok_r = vmulq_f32 (q_tmp0, q2_tw.val[0]);
-            q_fok_i = vmulq_f32 (q_tmp1, q2_tw.val[0]);
-            q_tmp2 = vmulq_f32 (q_tmp1, q2_tw.val[1]);
-            q_tmp3 = vmulq_f32 (q_tmp0, q2_tw.val[1]);
-            q_fok_r = vaddq_f32 (q_fok_r, q_tmp2);
-            q_fok_i = vsubq_f32 (q_fok_i, q_tmp3);
-
-            q_val = vdupq_n_f32 (0.5f);
-            q_dst2_r = vsubq_f32 (q_fek_r, q_fok_r);
-            q_dst2_i = vsubq_f32 (q_fok_i, q_fek_i);
-            q2_dst.val[0] = vaddq_f32 (q_fek_r, q_fok_r);
-            q2_dst.val[1] = vaddq_f32 (q_fek_i, q_fok_i);
-            q_dst2_r = vmulq_f32 (q_dst2_r, q_val);
-            q_dst2_i = vmulq_f32 (q_dst2_i, q_val);
-            q2_dst.val[0] = vmulq_f32 (q2_dst.val[0], q_val);
-            q2_dst.val[1] = vmulq_f32 (q2_dst.val[1], q_val);
-            q_dst2_r = vrev64q_f32 (q_dst2_r);
-            q_dst2_i = vrev64q_f32 (q_dst2_i);
-            q2_dst2.val[0] = vcombine_f32 (vget_high_f32 (q_dst2_r), vget_low_f32 (q_dst2_r));
-            q2_dst2.val[1] = vcombine_f32 (vget_high_f32 (q_dst2_i), vget_low_f32 (q_dst2_i));
-            vst2q_f32 (p_dst, q2_dst);
-            vst2q_f32 (p_dst2, q2_dst2);
-
-        }
-    }
-    else
-    {
-        for (k = 1; k <= count ; k++)
-        {
-            fk = src[k];
-            fnkc.r = src[ncfft - k].r;
-            fnkc.i = -src[ncfft - k].i;
-
-            fek.r = fk.r + fnkc.r;
-            fek.i = fk.i + fnkc.i;
-
-            tmp.r = fk.r - fnkc.r;
-            tmp.i = fk.i - fnkc.i;
-
-            fok.r = tmp.r * twiddles[k - 1].r + tmp.i * twiddles[k - 1].i;
-            fok.i = tmp.i * twiddles[k - 1].r - tmp.r * twiddles[k - 1].i;
-
-            dst[k].r = (fek.r + fok.r) * 0.5f;
-            dst[k].i = (fek.i + fok.i) * 0.5f;
-
-            dst[ncfft - k].r = (fek.r - fok.r) * 0.5f;
-            dst[ncfft - k].i = (fok.i - fek.i) * 0.5f;
-        }
-    }
-}
-
 /**
  * @addtogroup C2C_FFT_IFFT
  * @{
@@ -1695,66 +1502,3 @@ void ne10_fft_c2c_1d_float32_neon (ne10_fft_cpx_float32_t *fout,
 /**
  * @}
  */ //end of C2C_FFT_IFFT group
-
-/**
- * @addtogroup R2C_FFT_IFFT
- * @{
- */
-
-/**
- * @brief Mixed radix-2/4 FFT (real to complex) of float(32-bit) data.
- * @param[out]  *fout            point to the output buffer
- * @param[in]   *fin             point to the input buffer
- * @param[in]   cfg              point to the config struct
- * @return none.
- * The function implements a mixed radix-2/4 FFT (real to complex). The length of 2^N(N is 2, 3, 4, 5, 6 ....etc) is supported.
- * Otherwise, we alloc a temp buffer(the size is same as input buffer) for storing intermedia.
- * For the usage of this function, please check test/test_suite_fft_float32.c
- */
-void ne10_fft_r2c_1d_float32_neon (ne10_fft_cpx_float32_t *fout,
-                                   ne10_float32_t *fin,
-                                   ne10_fft_r2c_cfg_float32_t cfg)
-{
-    ne10_fft_cpx_float32_t * tmpbuf1 = cfg->buffer;
-    ne10_fft_cpx_float32_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
-    ne10_fft_state_float32_t c2c_state;
-
-    c2c_state.nfft = cfg->ncfft;
-    c2c_state.factors = cfg->factors;
-    c2c_state.twiddles = cfg->twiddles;
-    c2c_state.buffer = tmpbuf2;
-
-    ne10_fft_c2c_1d_float32_neon (tmpbuf1, (ne10_fft_cpx_float32_t*) fin, &c2c_state, 0);
-    ne10_fft_split_r2c_1d_float32_neon (fout, tmpbuf1,  cfg->super_twiddles, cfg->ncfft);
-}
-
-/**
- * @brief Mixed radix-2/4 IFFT (complex to real) of float(32-bit) data.
- * @param[out]  *fout            point to the output buffer
- * @param[in]   *fin             point to the input buffer
- * @param[in]   cfg              point to the config struct
- * @return none.
- * The function implements a mixed radix-2/4 FFT (complex to real). The length of 2^N(N is 2, 3, 4, 5, 6 ....etc) is supported.
- * Otherwise, we alloc a temp buffer(the size is same as input buffer) for storing intermedia.
- * For the usage of this function, please check test/test_suite_fft_float32.c
- */
-void ne10_fft_c2r_1d_float32_neon (ne10_float32_t *fout,
-                                   ne10_fft_cpx_float32_t *fin,
-                                   ne10_fft_r2c_cfg_float32_t cfg)
-{
-    ne10_fft_cpx_float32_t * tmpbuf1 = cfg->buffer;
-    ne10_fft_cpx_float32_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
-    ne10_fft_state_float32_t c2c_state;
-
-    c2c_state.nfft = cfg->ncfft;
-    c2c_state.factors = cfg->factors;
-    c2c_state.twiddles = cfg->twiddles;
-    c2c_state.buffer = tmpbuf2;
-
-    ne10_fft_split_c2r_1d_float32_neon (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft);
-    ne10_fft_c2c_1d_float32_neon ( (ne10_fft_cpx_float32_t*) fout, tmpbuf1, &c2c_state, 1);
-}
-
-/**
- * @} end of R2C_FFT_IFFT group
- */
index 5c3b025..c44eef9 100644 (file)
@@ -923,40 +923,6 @@ static void ne10_mixed_radix_butterfly_inverse_int16_c (ne10_fft_cpx_int16_t * F
     } // last stage
 }
 
-
-/* factors buffer:
- * 0: stage number
- * 1: stride for the first stage
- * others: factors */
-static ne10_int32_t ne10_factor (ne10_int32_t n, ne10_int32_t * facbuf)
-{
-    ne10_int32_t p = 4;
-    ne10_int32_t i = 1;
-    ne10_int32_t stage_num = 0;
-    ne10_int32_t stride_max = n;
-
-    /* factor out powers of 4, powers of 2 */
-    do
-    {
-        if ( (n % p) == 2)
-            p = 2;
-        else if (n % p)
-        {
-            return NE10_ERR;
-        }
-
-        n /= p;
-        facbuf[2 * i] = p;
-        facbuf[2 * i + 1] = n;
-        i++;
-        stage_num++;
-    }
-    while (n > 1);
-    facbuf[0] = stage_num;
-    facbuf[1] = stride_max / p;
-    return NE10_OK;
-}
-
 static void ne10_fft_split_r2c_1d_int16 (ne10_fft_cpx_int16_t *dst,
         const ne10_fft_cpx_int16_t *src,
         ne10_fft_cpx_int16_t *twiddles,
index 9e2d45a..f8bcc96 100644 (file)
@@ -923,41 +923,6 @@ static void ne10_mixed_radix_butterfly_inverse_int32_c (ne10_fft_cpx_int32_t * F
     } // last stage
 }
 
-
-/* factors buffer:
- * 0: stage number
- * 1: stride for the first stage
- * others: factors */
-static ne10_int32_t ne10_factor (ne10_int32_t n, ne10_int32_t * facbuf)
-{
-    ne10_int32_t p = 4;
-    ne10_int32_t i = 1;
-    ne10_int32_t stage_num = 0;
-    ne10_int32_t stride_max = n;
-
-    /* factor out powers of 4, powers of 2 */
-    do
-    {
-        if ( (n % p) == 2)
-            p = 2;
-        else if (n % p)
-        {
-            return NE10_ERR;
-        }
-
-        n /= p;
-        facbuf[2 * i] = p;
-        facbuf[2 * i + 1] = n;
-        i++;
-        stage_num++;
-    }
-    while (n > 1);
-    facbuf[0] = stage_num;
-    facbuf[1] = stride_max / p;
-    return NE10_OK;
-}
-
-
 static void ne10_fft_split_r2c_1d_int32 (ne10_fft_cpx_int32_t *dst,
         const ne10_fft_cpx_int32_t *src,
         ne10_fft_cpx_int32_t *twiddles,
diff --git a/modules/dsp/NE10_rfft_float32.c b/modules/dsp/NE10_rfft_float32.c
new file mode 100644 (file)
index 0000000..7408eab
--- /dev/null
@@ -0,0 +1,910 @@
+/*
+ *  Copyright 2014 ARM Limited
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *    * Neither the name of ARM Limited nor the
+ *      names of its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
+ *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
+ *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* license of Kiss FFT */
+/*
+Copyright (c) 2003-2010, Mark Borgerding
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+    * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+ * NE10 Library : dsp/NE10_rfft_float32.c
+ */
+
+#include "NE10_types.h"
+#include "NE10_macros.h"
+#include "NE10_fft.h"
+#include "NE10_dsp.h"
+
+#if defined(__aarch64__)
+
+extern void ne10_radix8_r2c_c (ne10_fft_cpx_float32_t *Fout,
+                                  const ne10_fft_cpx_float32_t *Fin,
+                                  const ne10_int32_t fstride,
+                                  const ne10_int32_t mstride,
+                                  const ne10_int32_t nfft)
+{
+    const ne10_int32_t in_step = nfft >> 3;
+          ne10_int32_t f_count;
+
+    ne10_float32_t scratch_in[8];
+    ne10_float32_t scratch   [4];
+
+    /* real pointers */
+    const ne10_float32_t* Fin_r  = (ne10_float32_t*) Fin;
+          ne10_float32_t* Fout_r = (ne10_float32_t*) Fout;
+    Fout_r ++; // always leave the first real empty
+
+    for (f_count = fstride; f_count; f_count --)
+    {
+        scratch_in[0] = Fin_r[in_step * 0] + Fin_r[in_step * (0 + 4)];
+        scratch_in[1] = Fin_r[in_step * 0] - Fin_r[in_step * (0 + 4)];
+        scratch_in[2] = Fin_r[in_step * 1] + Fin_r[in_step * (1 + 4)];
+        scratch_in[3] = Fin_r[in_step * 1] - Fin_r[in_step * (1 + 4)];
+        scratch_in[4] = Fin_r[in_step * 2] + Fin_r[in_step * (2 + 4)];
+        scratch_in[5] = Fin_r[in_step * 2] - Fin_r[in_step * (2 + 4)];
+        scratch_in[6] = Fin_r[in_step * 3] + Fin_r[in_step * (3 + 4)];
+        scratch_in[7] = Fin_r[in_step * 3] - Fin_r[in_step * (3 + 4)];
+
+        scratch_in[3] *= TW_81;
+        scratch_in[7] *= TW_81N;
+
+        // radix 2 butterfly
+        scratch[0] =   scratch_in[0] + scratch_in[4];
+        scratch[1] =   scratch_in[2] + scratch_in[6];
+        scratch[2] =   scratch_in[7] - scratch_in[3];
+        scratch[3] =   scratch_in[3] + scratch_in[7];
+
+        Fout_r[0] = scratch   [0] + scratch   [1];
+        Fout_r[7] = scratch   [0] - scratch   [1];
+
+        Fout_r[1] = scratch_in[1] + scratch   [3];
+        Fout_r[5] = scratch_in[1] - scratch   [3];
+
+        Fout_r[2] = scratch   [2] - scratch_in[5];
+        Fout_r[6] = scratch   [2] + scratch_in[5];
+
+        Fout_r[3] = scratch_in[0] - scratch_in[4];
+
+        Fout_r[4] = scratch_in[6] - scratch_in[2];
+
+        Fin_r  ++;
+        Fout_r += 8;
+    }
+}
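+
+/*
+ * For nfft == 8 and fstride == 1 the loop above packs the real spectrum as
+ *     { X0.r, X1.r, X1.i, X2.r, X2.i, X3.r, X3.i, X4.r }
+ * relative to the incremented pointer (Xk.r at offset 2k-1, Xk.i at 2k);
+ * the always-zero X0.i and X4.i are dropped, which is why the first real
+ * slot is left empty. This reading assumes TW_81 == +sqrt(2)/2 and
+ * TW_81N == -sqrt(2)/2. A minimal scalar reference for cross-checking one
+ * 8-point column:
+ *
+ *     static void naive_r2c_8 (float re[5], float im[5], const float x[8])
+ *     {
+ *         int k, n;
+ *         for (k = 0; k <= 4; k++)
+ *         {
+ *             re[k] = im[k] = 0.0f;
+ *             for (n = 0; n < 8; n++)
+ *             {
+ *                 float phase = -2.0f * 3.14159265f * k * n / 8.0f;
+ *                 re[k] += x[n] * cosf (phase); // needs <math.h>
+ *                 im[k] += x[n] * sinf (phase);
+ *             }
+ *         }
+ *     }
+ */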
+
+extern void ne10_radix8_c2r_c (ne10_fft_cpx_float32_t *Fout,
+                                  const ne10_fft_cpx_float32_t *Fin,
+                                  const ne10_int32_t fstride,
+                                  const ne10_int32_t mstride,
+                                  const ne10_int32_t nfft)
+{
+    const ne10_int32_t in_step = nfft >> 3;
+          ne10_int32_t f_count;
+
+    ne10_float32_t scratch_in[8];
+
+    const ne10_float32_t one_by_N = 1.0 / nfft;
+
+    /* real pointers */
+    const ne10_float32_t* Fin_r  = (ne10_float32_t*) Fin;
+          ne10_float32_t* Fout_r = (ne10_float32_t*) Fout;
+
+    for (f_count = fstride; f_count; f_count --)
+    {
+        scratch_in[0] =   Fin_r[0] + Fin_r[3] + Fin_r[3] + Fin_r[7];
+        scratch_in[1] =   Fin_r[1] + Fin_r[1] + Fin_r[5] + Fin_r[5];
+        scratch_in[2] =   Fin_r[0] - Fin_r[4] - Fin_r[4] - Fin_r[7];
+        scratch_in[3] =   Fin_r[1] - Fin_r[2] - Fin_r[5] - Fin_r[6];
+        scratch_in[4] =   Fin_r[0] - Fin_r[3] - Fin_r[3] + Fin_r[7];
+        scratch_in[5] = - Fin_r[2] - Fin_r[2] + Fin_r[6] + Fin_r[6];
+        scratch_in[6] =   Fin_r[0] + Fin_r[4] + Fin_r[4] - Fin_r[7];
+        scratch_in[7] =   Fin_r[1] + Fin_r[2] - Fin_r[5] + Fin_r[6];
+
+        scratch_in[3] /= TW_81;
+        scratch_in[7] /= TW_81N;
+
+        Fout_r[0 * in_step] = scratch_in[0] + scratch_in[1];
+        Fout_r[4 * in_step] = scratch_in[0] - scratch_in[1];
+        Fout_r[1 * in_step] = scratch_in[2] + scratch_in[3];
+        Fout_r[5 * in_step] = scratch_in[2] - scratch_in[3];
+        Fout_r[2 * in_step] = scratch_in[4] + scratch_in[5];
+        Fout_r[6 * in_step] = scratch_in[4] - scratch_in[5];
+        Fout_r[3 * in_step] = scratch_in[6] + scratch_in[7];
+        Fout_r[7 * in_step] = scratch_in[6] - scratch_in[7];
+
+#if defined(NE10_DSP_RFFT_SCALING)
+        Fout_r[0 * in_step] *= one_by_N;
+        Fout_r[4 * in_step] *= one_by_N;
+        Fout_r[1 * in_step] *= one_by_N;
+        Fout_r[5 * in_step] *= one_by_N;
+        Fout_r[2 * in_step] *= one_by_N;
+        Fout_r[6 * in_step] *= one_by_N;
+        Fout_r[3 * in_step] *= one_by_N;
+        Fout_r[7 * in_step] *= one_by_N;
+#endif
+
+        Fin_r  += 8;
+        Fout_r ++;
+    }
+}
+
+NE10_INLINE void ne10_radix4_r2c_c (ne10_fft_cpx_float32_t *Fout,
+                                  const ne10_fft_cpx_float32_t *Fin,
+                                  const ne10_int32_t fstride,
+                                  const ne10_int32_t mstride,
+                                  const ne10_int32_t nfft)
+{
+    const ne10_int32_t in_step = nfft >> 2;
+          ne10_int32_t f_count;
+
+    ne10_float32_t  scratch_in [4];
+    ne10_float32_t  scratch_out[4];
+
+    /* real pointers */
+    const ne10_float32_t *Fin_r  = (ne10_float32_t*) Fin;
+          ne10_float32_t *Fout_r = (ne10_float32_t*) Fout;
+    Fout_r ++; // always leave the first real empty
+
+    for (f_count = fstride; f_count; f_count --)
+    {
+        scratch_in[0] = Fin_r[0 * in_step];
+        scratch_in[1] = Fin_r[1 * in_step];
+        scratch_in[2] = Fin_r[2 * in_step];
+        scratch_in[3] = Fin_r[3 * in_step];
+
+        // NE10_PRINT_Q_VECTOR(scratch_in);
+
+        NE10_FFT_R2C_4R_RCR(scratch_out,scratch_in);
+
+        // NE10_PRINT_Q_VECTOR(scratch_out);
+
+        Fout_r[0] = scratch_out[0];
+        Fout_r[1] = scratch_out[1];
+        Fout_r[2] = scratch_out[2];
+        Fout_r[3] = scratch_out[3];
+
+        Fin_r  ++;
+        Fout_r += 4;
+    }
+}
+
+NE10_INLINE void ne10_radix4_c2r_c (ne10_fft_cpx_float32_t *Fout,
+                                  const ne10_fft_cpx_float32_t *Fin,
+                                  const ne10_int32_t fstride,
+                                  const ne10_int32_t mstride,
+                                  const ne10_int32_t nfft)
+{
+    ne10_int32_t f_count;
+    const ne10_int32_t in_step = nfft >> 2;
+    ne10_float32_t  scratch_in [4];
+    ne10_float32_t  scratch_out[4];
+
+    const ne10_float32_t one_by_N = 1.0 / nfft;
+
+    /* real pointers */
+    const ne10_float32_t *Fin_r  = (ne10_float32_t*) Fin;
+          ne10_float32_t *Fout_r = (ne10_float32_t*) Fout;
+
+    for (f_count = fstride; f_count; f_count --)
+    {
+        scratch_in[0] = Fin_r[0];
+        scratch_in[1] = Fin_r[1];
+        scratch_in[2] = Fin_r[2];
+        scratch_in[3] = Fin_r[3];
+
+        // NE10_PRINT_Q_VECTOR(scratch_in);
+
+        NE10_FFT_C2R_RCR_4R(scratch_out,scratch_in);
+
+        // NE10_PRINT_Q_VECTOR(scratch_out);
+
+#if defined(NE10_DSP_RFFT_SCALING)
+        scratch_out[0] *= one_by_N;
+        scratch_out[1] *= one_by_N;
+        scratch_out[2] *= one_by_N;
+        scratch_out[3] *= one_by_N;
+#endif
+
+        // store
+        Fout_r[0 * in_step] = scratch_out[0];
+        Fout_r[1 * in_step] = scratch_out[1];
+        Fout_r[2 * in_step] = scratch_out[2];
+        Fout_r[3 * in_step] = scratch_out[3];
+
+        Fin_r  += 4;
+        Fout_r ++;
+    }
+}
+
+NE10_INLINE void ne10_radix4_r2c_with_twiddles_first_butterfly_c (ne10_float32_t *Fout_r,
+        const ne10_float32_t *Fin_r,
+        const ne10_int32_t out_step,
+        const ne10_int32_t in_step,
+        const ne10_fft_cpx_float32_t *twiddles)
+{
+    ne10_float32_t scratch_out[4];
+    ne10_float32_t scratch_in [4];
+
+    // load
+    scratch_in[0] = Fin_r[0 * in_step];
+    scratch_in[1] = Fin_r[1 * in_step];
+    scratch_in[2] = Fin_r[2 * in_step];
+    scratch_in[3] = Fin_r[3 * in_step];
+
+    // NE10_PRINT_Q_VECTOR(scratch_in);
+
+    NE10_FFT_R2C_4R_RCR(scratch_out,scratch_in);
+
+    // NE10_PRINT_Q_VECTOR(scratch_out);
+
+    // store
+    Fout_r[                      0] = scratch_out[0];
+    Fout_r[    (out_step << 1) - 1] = scratch_out[1];
+    Fout_r[    (out_step << 1)    ] = scratch_out[2];
+    Fout_r[2 * (out_step << 1) - 1] = scratch_out[3];
+}
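+
+/*
+ * NE10_FFT_R2C_4R_RCR appears to compute a 4-point real DFT in packed
+ * real/complex/real form; for inputs { a, b, c, d }:
+ *     out[0] = a + b + c + d      // bin 0 (real)
+ *     out[1] = a - c              // bin 1, real part
+ *     out[2] = d - b              // bin 1, imag part
+ *     out[3] = a - b + c - d      // bin 2 (real)
+ * The quartet's bins 0, 1, 2 become combined bins 0, out_step and
+ * 2*out_step; with out_step == 4 the stores above land at packed offsets
+ * 0, 7, 8 and 15, matching the "Xk.r at 2k-1, Xk.i at 2k" layout noted
+ * earlier.
+ */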
+
+NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_butterfly_c (ne10_float32_t *Fout_r,
+        const ne10_float32_t *Fin_r,
+        const ne10_int32_t out_step,
+        const ne10_int32_t in_step,
+        const ne10_fft_cpx_float32_t *twiddles)
+{
+    ne10_float32_t scratch      [8];
+    ne10_float32_t scratch_in_r [4];
+    ne10_float32_t scratch_out_r[4];
+
+    // load
+    scratch_in_r[0] = Fin_r[0                ];
+    scratch_in_r[1] = Fin_r[1*(out_step<<1)-1];
+    scratch_in_r[2] = Fin_r[1*(out_step<<1)  ];
+    scratch_in_r[3] = Fin_r[2*(out_step<<1)-1];
+
+    // NE10_PRINT_Q_VECTOR(scratch_in_r);
+
+    // radix 4 butterfly without twiddles
+    scratch[0] = scratch_in_r[0] + scratch_in_r[3];
+    scratch[1] = scratch_in_r[0] - scratch_in_r[3];
+    scratch[2] = scratch_in_r[1] + scratch_in_r[1];
+    scratch[3] = scratch_in_r[2] + scratch_in_r[2];
+
+    scratch_out_r[0] = scratch[0] + scratch[2];
+    scratch_out_r[1] = scratch[1] - scratch[3];
+    scratch_out_r[2] = scratch[0] - scratch[2];
+    scratch_out_r[3] = scratch[1] + scratch[3];
+
+    // NE10_PRINT_Q_VECTOR(scratch_out_r);
+
+    // store
+    Fout_r[0 * in_step] = scratch_out_r[0];
+    Fout_r[1 * in_step] = scratch_out_r[1];
+    Fout_r[2 * in_step] = scratch_out_r[2];
+    Fout_r[3 * in_step] = scratch_out_r[3];
+
+}
+
+NE10_INLINE void ne10_radix4_r2c_with_twiddles_other_butterfly_c (ne10_float32_t *Fout_r,
+        const ne10_float32_t *Fin_r,
+        const ne10_int32_t out_step,
+        const ne10_int32_t in_step,
+        const ne10_fft_cpx_float32_t *twiddles)
+{
+    ne10_int32_t m_count;
+    ne10_float32_t *Fout_b = Fout_r + (((out_step<<1)-1)<<1) - 2; // reversed
+    ne10_fft_cpx_float32_t scratch_tw[3], scratch_in[4];
+
+    ne10_fft_cpx_float32_t scratch[4], scratch_out[4];
+
+    for (m_count = (out_step >> 1) - 1; m_count; m_count --)
+    {
+        scratch_tw  [0] = twiddles[0 * out_step];
+        scratch_tw  [1] = twiddles[1 * out_step];
+        scratch_tw  [2] = twiddles[2 * out_step];
+
+        scratch_in[0].r = Fin_r[0 * in_step    ];
+        scratch_in[0].i = Fin_r[0 * in_step + 1];
+        scratch_in[1].r = Fin_r[1 * in_step    ];
+        scratch_in[1].i = Fin_r[1 * in_step + 1];
+        scratch_in[2].r = Fin_r[2 * in_step    ];
+        scratch_in[2].i = Fin_r[2 * in_step + 1];
+        scratch_in[3].r = Fin_r[3 * in_step    ];
+        scratch_in[3].i = Fin_r[3 * in_step + 1];
+
+        // NE10_PRINT_Q2_VECTOR(scratch_in);
+
+        // radix 4 butterfly with twiddles
+        scratch[0].r = scratch_in[0].r;
+        scratch[0].i = scratch_in[0].i;
+        scratch[1].r = scratch_in[1].r * scratch_tw[0].r - scratch_in[1].i * scratch_tw[0].i;
+        scratch[1].i = scratch_in[1].i * scratch_tw[0].r + scratch_in[1].r * scratch_tw[0].i;
+
+        scratch[2].r = scratch_in[2].r * scratch_tw[1].r - scratch_in[2].i * scratch_tw[1].i;
+        scratch[2].i = scratch_in[2].i * scratch_tw[1].r + scratch_in[2].r * scratch_tw[1].i;
+
+        scratch[3].r = scratch_in[3].r * scratch_tw[2].r - scratch_in[3].i * scratch_tw[2].i;
+        scratch[3].i = scratch_in[3].i * scratch_tw[2].r + scratch_in[3].r * scratch_tw[2].i;
+
+        NE10_FFT_R2C_CC_CC(scratch_out,scratch);
+
+        // NE10_PRINT_Q2_VECTOR(scratch_in);
+
+        // result
+        Fout_r[                    0] = scratch_out[0].r;
+        Fout_r[                    1] = scratch_out[0].i;
+        Fout_r[  (out_step << 1)    ] = scratch_out[1].r;
+        Fout_r[  (out_step << 1) + 1] = scratch_out[1].i;
+        Fout_b[                    0] = scratch_out[2].r;
+        Fout_b[                    1] = scratch_out[2].i;
+        Fout_b[- (out_step << 1)    ] = scratch_out[3].r;
+        Fout_b[- (out_step << 1) + 1] = scratch_out[3].i;
+
+        // update pointers
+        Fin_r  += 2;
+        Fout_r += 2;
+        Fout_b -= 2;
+        twiddles ++;
+    }
+}
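+
+/*
+ * The "other" butterflies apply forward twiddles w = e^(-2*pi*i*k/N),
+ *     (a*w).r = a.r*w.r - a.i*w.i
+ *     (a*w).i = a.i*w.r + a.r*w.i,
+ * and exploit conjugate symmetry (X[N-k] == conj(X[k])): each iteration
+ * writes two bins forwards through Fout_r and their conjugate partners
+ * backwards through Fout_b (note the reversed pointer above). A sketch of
+ * the multiply, factored out purely for clarity:
+ *
+ *     static inline ne10_fft_cpx_float32_t
+ *     cpx_mul_fwd (ne10_fft_cpx_float32_t a, ne10_fft_cpx_float32_t w)
+ *     {
+ *         ne10_fft_cpx_float32_t p;
+ *         p.r = a.r * w.r - a.i * w.i;
+ *         p.i = a.i * w.r + a.r * w.i;
+ *         return p;
+ *     }
+ */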
+
+NE10_INLINE void ne10_radix4_c2r_with_twiddles_other_butterfly_c (ne10_float32_t *Fout_r,
+        const ne10_float32_t *Fin_r,
+        const ne10_int32_t out_step,
+        const ne10_int32_t in_step,
+        const ne10_fft_cpx_float32_t *twiddles)
+{
+    ne10_int32_t m_count;
+    const ne10_float32_t *Fin_b = Fin_r + (((out_step<<1)-1)<<1) - 2; // reversed
+    ne10_fft_cpx_float32_t scratch_tw [3],
+                           scratch    [8],
+                           scratch_in [4],
+                           scratch_out[4];
+
+    for (m_count = (out_step >> 1) - 1; m_count; m_count --)
+    {
+        scratch_tw[0] = twiddles[0 * out_step];
+        scratch_tw[1] = twiddles[1 * out_step];
+        scratch_tw[2] = twiddles[2 * out_step];
+
+        scratch_in[0].r = Fin_r[0];
+        scratch_in[0].i = Fin_r[1];
+
+        scratch_in[1].r = Fin_b[0];
+        scratch_in[1].i = Fin_b[1];
+
+        scratch_in[2].r = Fin_r[(out_step<<1) + 0];
+        scratch_in[2].i = Fin_r[(out_step<<1) + 1];
+
+        scratch_in[3].r = Fin_b[-(out_step<<1) + 0];
+        scratch_in[3].i = Fin_b[-(out_step<<1) + 1];
+
+        // NE10_PRINT_Q2_VECTOR(scratch_in);
+
+        // inverse of "result"
+        NE10_FFT_C2R_CC_CC(scratch,scratch_in);
+
+        // inverse of "mutltipy twiddles"
+        scratch_out[0] = scratch[0];
+
+        scratch_out[1].r = scratch[1].r * scratch_tw[0].r + scratch[1].i * scratch_tw[0].i;
+        scratch_out[1].i = scratch[1].i * scratch_tw[0].r - scratch[1].r * scratch_tw[0].i;
+
+        scratch_out[2].r = scratch[2].r * scratch_tw[1].r + scratch[2].i * scratch_tw[1].i;
+        scratch_out[2].i = scratch[2].i * scratch_tw[1].r - scratch[2].r * scratch_tw[1].i;
+
+        scratch_out[3].r = scratch[3].r * scratch_tw[2].r + scratch[3].i * scratch_tw[2].i;
+        scratch_out[3].i = scratch[3].i * scratch_tw[2].r - scratch[3].r * scratch_tw[2].i;
+
+        // NE10_PRINT_Q2_VECTOR(scratch_out);
+
+        // store
+        Fout_r[0 * in_step    ] = scratch_out[0].r;
+        Fout_r[0 * in_step + 1] = scratch_out[0].i;
+        Fout_r[1 * in_step    ] = scratch_out[1].r;
+        Fout_r[1 * in_step + 1] = scratch_out[1].i;
+        Fout_r[2 * in_step    ] = scratch_out[2].r;
+        Fout_r[2 * in_step + 1] = scratch_out[2].i;
+        Fout_r[3 * in_step    ] = scratch_out[3].r;
+        Fout_r[3 * in_step + 1] = scratch_out[3].i;
+
+        // update pointers
+        Fin_r  += 2;
+        Fout_r += 2;
+        Fin_b -= 2;
+        twiddles ++;
+    }
+}
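+
+/*
+ * The inverse butterfly multiplies by conjugate twiddles -- note the
+ * swapped +/- signs relative to the forward version above. Since |w| == 1,
+ * w * conj(w) == 1, so this exactly undoes the forward stage:
+ *     (a*conj(w)).r = a.r*w.r + a.i*w.i
+ *     (a*conj(w)).i = a.i*w.r - a.r*w.i
+ */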
+
+NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_butterfly_c (ne10_float32_t *Fout_r,
+        const ne10_float32_t *Fin_r,
+        const ne10_int32_t out_step,
+        const ne10_int32_t in_step,
+        const ne10_fft_cpx_float32_t *twiddles)
+{
+    ne10_float32_t scratch_in [4];
+    ne10_float32_t scratch_out[4];
+
+    scratch_in[0] = Fin_r[0 * in_step];
+    scratch_in[1] = Fin_r[1 * in_step];
+    scratch_in[2] = Fin_r[2 * in_step];
+    scratch_in[3] = Fin_r[3 * in_step];
+
+    // NE10_PRINT_Q_VECTOR(scratch_in);
+
+    NE10_FFT_R2C_4R_CC(scratch_out,scratch_in);
+
+    // NE10_PRINT_Q_VECTOR(scratch_out);
+
+    Fout_r[                   0] = scratch_out[0];
+    Fout_r[                   1] = scratch_out[1];
+    Fout_r[ (out_step << 1)    ] = scratch_out[2];
+    Fout_r[ (out_step << 1) + 1] = scratch_out[3];
+}
+
+NE10_INLINE void ne10_radix4_c2r_with_twiddles_last_butterfly_c (ne10_float32_t *Fout_r,
+        const ne10_float32_t *Fin_r,
+        const ne10_int32_t out_step,
+        const ne10_int32_t in_step,
+        const ne10_fft_cpx_float32_t *twiddles)
+{
+    // inverse operation of ne10_radix4_r2c_with_twiddles_last_butterfly_c
+    ne10_float32_t scratch_in [4];
+    ne10_float32_t scratch_out[4];
+
+    // load
+    scratch_in[0] = Fin_r[                   0];
+    scratch_in[1] = Fin_r[                   1];
+    scratch_in[2] = Fin_r[ (out_step << 1)    ];
+    scratch_in[3] = Fin_r[ (out_step << 1) + 1];
+
+    // NE10_PRINT_Q_VECTOR(scratch_in);
+
+    NE10_FFT_C2R_CC_4R(scratch_out,scratch_in);
+
+    // NE10_PRINT_Q_VECTOR(scratch_out);
+
+    // store
+    Fout_r[0 * in_step] = scratch_out[0];
+    Fout_r[1 * in_step] = scratch_out[1];
+    Fout_r[2 * in_step] = scratch_out[2];
+    Fout_r[3 * in_step] = scratch_out[3];
+}
+
+NE10_INLINE void ne10_radix4_r2c_with_twiddles_c (ne10_fft_cpx_float32_t *Fout,
+        const ne10_fft_cpx_float32_t *Fin,
+        const ne10_int32_t fstride,
+        const ne10_int32_t mstride,
+        const ne10_int32_t nfft,
+        const ne10_fft_cpx_float32_t *twiddles)
+{
+    ne10_int32_t f_count;
+    const ne10_int32_t in_step = nfft >> 2;
+    const ne10_int32_t out_step = mstride;
+
+    const ne10_float32_t *Fin_r  = (ne10_float32_t*) Fin;
+    ne10_float32_t *Fout_r = (ne10_float32_t*) Fout;
+    const ne10_fft_cpx_float32_t *tw;
+
+    Fout_r ++;
+    Fin_r ++;
+
+    for (f_count = fstride; f_count; f_count --)
+    {
+        tw = twiddles;
+
+        // first butterfly
+        ne10_radix4_r2c_with_twiddles_first_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
+
+        tw ++;
+        Fin_r ++;
+        Fout_r ++;
+
+        // other butterfly
+        ne10_radix4_r2c_with_twiddles_other_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
+
+        // update Fin_r, Fout_r, twiddles
+        tw     +=     ( (out_step >> 1) - 1);
+        Fin_r  += 2 * ( (out_step >> 1) - 1);
+        Fout_r += 2 * ( (out_step >> 1) - 1);
+
+        // last butterfly
+        ne10_radix4_r2c_with_twiddles_last_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
+        tw ++;
+        Fin_r ++;
+        Fout_r ++;
+
+        Fout_r += 3 * out_step;
+    } // f_count
+}
+
+NE10_INLINE void ne10_radix4_c2r_with_twiddles_c (ne10_fft_cpx_float32_t *Fout,
+        const ne10_fft_cpx_float32_t *Fin,
+        const ne10_int32_t fstride,
+        const ne10_int32_t mstride,
+        const ne10_int32_t nfft,
+        const ne10_fft_cpx_float32_t *twiddles)
+{
+    ne10_int32_t f_count;
+    const ne10_int32_t in_step = nfft >> 2;
+    const ne10_int32_t out_step = mstride;
+
+    const ne10_float32_t *Fin_r  = (ne10_float32_t*) Fin;
+          ne10_float32_t *Fout_r = (ne10_float32_t*) Fout;
+    const ne10_fft_cpx_float32_t *tw;
+
+    for (f_count = fstride; f_count; f_count --)
+    {
+        tw = twiddles;
+
+        // first butterfly
+        ne10_radix4_c2r_with_twiddles_first_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
+
+        tw ++;
+        Fin_r  ++;
+        Fout_r ++;
+
+        // other butterfly
+        ne10_radix4_c2r_with_twiddles_other_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
+
+        // update Fin_r, Fout_r, twiddles
+        tw     +=     ( (out_step >> 1) - 1);
+        Fin_r  += 2 * ( (out_step >> 1) - 1);
+        Fout_r += 2 * ( (out_step >> 1) - 1);
+
+        // last butterfly
+        ne10_radix4_c2r_with_twiddles_last_butterfly_c (Fout_r, Fin_r, out_step, in_step, tw);
+        tw ++;
+        Fin_r  ++;
+        Fout_r ++;
+
+        Fin_r += 3 * out_step;
+    } // f_count
+}
+
+NE10_INLINE void ne10_mixed_radix_r2c_butterfly_float32_c (
+    ne10_fft_cpx_float32_t * Fout,
+    const ne10_fft_cpx_float32_t * Fin,
+    const ne10_int32_t * factors,
+    const ne10_fft_cpx_float32_t * twiddles,
+    ne10_fft_cpx_float32_t * buffer)
+{
+    // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
+
+    ne10_int32_t fstride, mstride, nfft;
+    ne10_int32_t radix;
+    ne10_int32_t stage_count;
+
+    // init fstride, mstride, radix, nfft
+    stage_count = factors[0];
+    fstride     = factors[1];
+    mstride     = factors[ (stage_count << 1) - 1 ];
+    radix       = factors[  stage_count << 1      ];
+    nfft        = radix * fstride;
+
+    // PRINT_STAGE_INFO;
+
+    if (radix == 2)
+    {
+        // combine one radix-4 and one radix-2 into one radix-8
+        mstride <<= 2;
+        fstride >>= 2;
+        twiddles += 6; // (4-1) x 2
+        stage_count --;
+    }
+
+    if (stage_count % 2 == 0)
+    {
+        ne10_swap_ptr (buffer, Fout);
+    }
+
+    // the first stage
+    if (radix == 2)   // length of FFT is 2^n (n is odd)
+    {
+        // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
+        ne10_radix8_r2c_c (Fout, Fin, fstride, mstride, nfft);
+    }
+    else if (radix == 4)   // length of FFT is 2^n (n is even)
+    {
+        // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
+        ne10_radix4_r2c_c (Fout, Fin, fstride, mstride, nfft);
+    }
+    // end of first stage
+
+    // others
+    for (; fstride > 1;)
+    {
+        fstride >>= 2;
+        ne10_swap_ptr (buffer, Fout);
+
+        // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
+        ne10_radix4_r2c_with_twiddles_c (Fout, buffer, fstride, mstride, nfft, twiddles);
+        twiddles += 3 * mstride;
+        mstride <<= 2;
+
+    } // other stage
+}
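+
+/*
+ * Worked example of the factor bookkeeping above: for nfft == 32,
+ * ne10_factor produces
+ *     factors[] = { 3, 16,  4, 8,  4, 2,  2, 1 };
+ * i.e. stage_count = 3, fstride = 16, mstride = factors[5] = 2 and
+ * radix = factors[6] = 2. The radix == 2 branch then folds the trailing
+ * radix-4/radix-2 pair into a single radix-8 first stage, leaving
+ * mstride = 8, fstride = 4, stage_count = 2 and skipping the 6 twiddles
+ * ((4-1) x 2) of the folded stages.
+ */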
+
+NE10_INLINE void ne10_mixed_radix_c2r_butterfly_float32_c (
+    ne10_fft_cpx_float32_t * Fout,
+    const ne10_fft_cpx_float32_t * Fin,
+    const ne10_int32_t * factors,
+    const ne10_fft_cpx_float32_t * twiddles,
+    ne10_fft_cpx_float32_t * buffer)
+{
+    // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
+
+    ne10_int32_t fstride, mstride, nfft;
+    ne10_int32_t radix;
+    ne10_int32_t stage_count;
+
+    // init fstride, mstride, radix, nfft
+    stage_count = factors[0];
+    fstride     = factors[1];
+    mstride     = factors[ (stage_count << 1) - 1 ];
+    radix       = factors[  stage_count << 1      ];
+    nfft        = radix * fstride;
+
+    // fstride, mstride for the last stage
+    fstride = 1;
+    mstride = nfft >> 2;
+    // PRINT_STAGE_INFO;
+
+    if (radix == 2)
+    {
+        // combine one radix-4 and one radix-2 into one radix-8
+        stage_count --;
+    }
+
+    if (stage_count % 2 == 1)
+    {
+        ne10_swap_ptr (buffer, Fout);
+    }
+
+    // last stage -- inverted
+    if (stage_count > 1)
+    {
+        twiddles -= 3 * mstride;
+        // PRINT_STAGE_INFO;
+        // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
+        ne10_radix4_c2r_with_twiddles_c (buffer, Fin, fstride, mstride, nfft, twiddles);
+        fstride <<= 2;
+        mstride >>= 2;
+        stage_count --;
+    }
+
+    // middle stages -- inverted (the first stage is handled below)
+    for (; stage_count > 1;)
+    {
+        twiddles -= 3 * mstride;
+        // PRINT_STAGE_INFO;
+        // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
+        ne10_radix4_c2r_with_twiddles_c (Fout, buffer, fstride, mstride, nfft, twiddles);
+        fstride <<= 2;
+        mstride >>= 2;
+        stage_count --;
+        ne10_swap_ptr (buffer, Fout);
+    } // other stage
+
+    // first stage -- inverted
+    if (radix == 2)   // length of FFT is 2^n (n is odd)
+    {
+        mstride >>= 1;
+
+        // PRINT_STAGE_INFO;
+        // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
+        ne10_radix8_c2r_c (Fout, buffer, fstride, mstride, nfft);
+    }
+    else if (radix == 4)   // length of FFT is 2^n (n is even)
+    {
+        // PRINT_STAGE_INFO;
+        // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
+        ne10_radix4_c2r_c (Fout, buffer, fstride, mstride, nfft);
+    }
+}
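+
+/*
+ * Normalization note: the 1/N multiplies in the c2r kernels are guarded by
+ * NE10_DSP_RFFT_SCALING. Without it the transform pair is unnormalized,
+ *     c2r (r2c (x)) == nfft * x,
+ * with it, the last stage applies one_by_N = 1.0 / nfft and c2r is a true
+ * inverse of r2c.
+ */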
+
+/**
+ * @brief User-callable function to allocate all necessary storage space for the FFT (r2c/c2r).
+ * @param[in]   nfft             length of the FFT
+ * @return      st               points to the FFT config memory. This memory is allocated with malloc.
+ * The function allocates all necessary storage space for the FFT. It also factors the FFT length and generates the twiddle coefficients.
+ */
+ne10_fft_r2c_cfg_float32_t ne10_fft_alloc_r2c_float32 (ne10_int32_t nfft)
+{
+    ne10_fft_r2c_cfg_float32_t st = NULL;
+    ne10_int32_t ncfft = nfft >> 1;
+    ne10_int32_t result;
+
+    ne10_uint32_t memneeded =   sizeof (ne10_fft_r2c_state_float32_t)
+                              + sizeof (ne10_fft_cpx_float32_t) * nfft              /* buffer*/
+                              + sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2)       /* r_factors */
+                              + sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2)       /* r_factors_neon */
+                              + sizeof (ne10_fft_cpx_float32_t) * nfft              /* r_twiddles */
+                              + sizeof (ne10_fft_cpx_float32_t) * nfft/4            /* r_twiddles_neon */
+                              + sizeof (ne10_fft_cpx_float32_t) * (12 + nfft/32*12) /* r_super_twiddles_neon */
+                              + NE10_FFT_BYTE_ALIGNMENT;     /* 64-bit alignment*/
+
+    st = (ne10_fft_r2c_cfg_float32_t) NE10_MALLOC (memneeded);
+
+    if (!st)
+    {
+        return st;
+    }
+
+    ne10_int32_t i,j;
+    ne10_fft_cpx_float32_t *tw;
+    const ne10_float32_t pi = NE10_PI;
+    ne10_float32_t phase1;
+
+    st->nfft = nfft;
+
+    uintptr_t address = (uintptr_t) st + sizeof (ne10_fft_r2c_state_float32_t);
+    NE10_BYTE_ALIGNMENT (address, NE10_FFT_BYTE_ALIGNMENT);
+
+    st->buffer = (ne10_fft_cpx_float32_t*) address;
+    st->r_twiddles = st->buffer + nfft;
+    st->r_factors = (ne10_int32_t*) (st->r_twiddles + nfft);
+    st->r_twiddles_neon = (ne10_fft_cpx_float32_t*) (st->r_factors + (NE10_MAXFACTORS * 2));
+    st->r_factors_neon = (ne10_int32_t*) (st->r_twiddles_neon + nfft/4);
+    st->r_super_twiddles_neon = (ne10_fft_cpx_float32_t*) (st->r_factors_neon + (NE10_MAXFACTORS * 2));
+
+    if (nfft<16)
+    {
+        return st;
+    }
+
+    // factors and twiddles for rfft C
+    ne10_factor (nfft, st->r_factors);
+
+    // backward twiddles pointers
+    st->r_twiddles_backward = ne10_fft_generate_twiddles_float32 (st->r_twiddles, st->r_factors, nfft);
+
+    // factors and twiddles for rfft neon
+    result = ne10_factor (nfft/4, st->r_factors_neon);
+    if (result == NE10_ERR)
+    {
+        return st;
+    }
+
+    st->r_twiddles_neon_backward = ne10_fft_generate_twiddles_float32 (st->r_twiddles_neon, st->r_factors_neon, nfft/4);
+
+    // nfft/4 x 4
+    tw = st->r_super_twiddles_neon;
+    for (i = 1; i < 4; i ++)
+    {
+        for (j = 0; j < 4; j++)
+        {
+            phase1 = - 2 * pi * ( (ne10_float32_t) (i * j) / nfft);
+            tw[4*i-4+j].r = (ne10_float32_t) cos (phase1);
+            tw[4*i-4+j].i = (ne10_float32_t) sin (phase1);
+        }
+    }
+
+    ne10_int32_t k,s;
+    // [nfft/32] x [3] x [4]
+    //     k        s     j
+    for (k=1; k<nfft/32; k++)
+    {
+        // transposed
+        for (s = 1; s < 4; s++)
+        {
+            for (j = 0; j < 4; j++)
+            {
+                phase1 = - 2 * pi * ( (ne10_float32_t) ((k*4+j) * s) / nfft);
+                tw[12*k+j+4*(s-1)].r = (ne10_float32_t) cos (phase1);
+                tw[12*k+j+4*(s-1)].i = (ne10_float32_t) sin (phase1);
+            }
+        }
+    }
+    return st;
+}
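+
+/*
+ * The whole config is carved out of one malloc'd block, so releasing it is
+ * a single call (a minimal sketch; error handling elided):
+ *
+ *     ne10_fft_r2c_cfg_float32_t cfg = ne10_fft_alloc_r2c_float32 (1024);
+ *     if (cfg != NULL)
+ *     {
+ *         // ... forward/backward transforms of length 1024 ...
+ *         NE10_FREE (cfg);
+ *     }
+ */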
+
+/**
+ * @brief Mixed radix-2/4 FFT (real to complex) of float(32-bit) data.
+ * @param[out]  *fout            points to the output buffer
+ * @param[in]   *fin             points to the input buffer
+ * @param[in]   cfg              points to the config struct
+ * @return none.
+ * The function implements a mixed radix-2/4 FFT (real to complex). Lengths of 2^N (N = 3, 4, 5, 6, ...) are supported.
+ * A temporary buffer (the same size as the input buffer) from the config struct is used to store intermediate results.
+ * For the usage of this function, please check test/test_suite_fft_float32.c
+ */
+void ne10_fft_r2c_1d_float32_c (ne10_fft_cpx_float32_t *fout,
+                                ne10_float32_t *fin,
+                                ne10_fft_r2c_cfg_float32_t cfg)
+{
+    ne10_fft_cpx_float32_t * tmpbuf = cfg->buffer;
+
+    switch(cfg->nfft)
+    {
+        case 8:
+            ne10_radix8_r2c_c( (ne10_fft_cpx_float32_t*) fout, ( ne10_fft_cpx_float32_t*) fin,1,1,8);
+            break;
+        default:
+            ne10_mixed_radix_r2c_butterfly_float32_c (
+                    fout,
+                    (ne10_fft_cpx_float32_t*) fin,
+                    cfg->r_factors,
+                    cfg->r_twiddles,
+                    tmpbuf);
+            break;
+    }
+
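+    // the butterflies write the packed spectrum from offset 1, leaving DC in
+    // fout[0].i; move it into fout[0].r and zero the imaginary parts of both
+    // endpoints, which are always zero for real input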
+    fout[0].r = fout[0].i;
+    fout[0].i = 0.0f;
+    fout[(cfg->nfft) >> 1].i = 0.0f;
+}
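+
+/*
+ * A minimal usage sketch (nfft must be a power of two; the output needs
+ * nfft/2 + 1 complex bins):
+ *
+ *     ne10_float32_t in[16] = { ... };
+ *     ne10_fft_cpx_float32_t out[16 / 2 + 1];
+ *     ne10_fft_r2c_cfg_float32_t cfg = ne10_fft_alloc_r2c_float32 (16);
+ *     ne10_fft_r2c_1d_float32_c (out, in, cfg);
+ *     // out[0].i == 0 and out[8].i == 0 by construction
+ */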
+
+/**
+ * @brief Mixed radix-2/4 IFFT (complex to real) of float(32-bit) data.
+ * @param[out]  *fout            points to the output buffer
+ * @param[in]   *fin             points to the input buffer
+ * @param[in]   cfg              points to the config struct
+ * @return none.
+ * The function implements a mixed radix-2/4 IFFT (complex to real). Lengths of 2^N (N = 3, 4, 5, 6, ...) are supported.
+ * A temporary buffer (the same size as the input buffer) from the config struct is used to store intermediate results.
+ * For the usage of this function, please check test/test_suite_fft_float32.c
+ */
+void ne10_fft_c2r_1d_float32_c (ne10_float32_t *fout,
+                                ne10_fft_cpx_float32_t *fin,
+                                ne10_fft_r2c_cfg_float32_t cfg)
+{
+    ne10_fft_cpx_float32_t * tmpbuf = cfg->buffer;
+
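+    // the inverse butterflies expect the packed layout whose first real slot
+    // is empty, so stash DC in fin[0].i for the duration of the call (it is
+    // restored below)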
+    fin[0].i = fin[0].r;
+    fin[0].r = 0.0f;
+    switch(cfg->nfft)
+    {
+        case 8:
+            ne10_radix8_c2r_c( (ne10_fft_cpx_float32_t*) fout, ( ne10_fft_cpx_float32_t*) &fin[0].i,1,1,8);
+            break;
+        default:
+            ne10_mixed_radix_c2r_butterfly_float32_c (
+                    (ne10_fft_cpx_float32_t*)fout,
+                    (ne10_fft_cpx_float32_t*)&fin[0].i, // first real is moved to first image
+                    cfg->r_factors,
+                    cfg->r_twiddles_backward,
+                    tmpbuf);
+            break;
+    }
+    fin[0].r = fin[0].i;
+    fin[0].i = 0.0f;
+}
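+
+/*
+ * Round-trip sketch, continuing the usage example above:
+ *
+ *     ne10_float32_t back[16];
+ *     ne10_fft_c2r_1d_float32_c (back, out, cfg);
+ *     // with NE10_DSP_RFFT_SCALING defined, back[i] ~= in[i];
+ *     // otherwise every element comes back scaled by nfft
+ */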
+
+/**
+ * @} end of R2C_FFT_IFFT group
+ */
+
+#endif // __aarch64__
diff --git a/modules/dsp/NE10_rfft_float32.neonintrinsic.c b/modules/dsp/NE10_rfft_float32.neonintrinsic.c
new file mode 100644 (file)
index 0000000..4c6744a
--- /dev/null
@@ -0,0 +1,1883 @@
+/*
+ *  Copyright 2014 ARM Limited
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *    * Neither the name of ARM Limited nor the
+ *      names of its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
+ *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
+ *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* license of Kiss FFT */
+/*
+Copyright (c) 2003-2010, Mark Borgerding
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+    * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+ * NE10 Library : dsp/NE10_rfft_float32.neonintrinsic.c
+ */
+
+#include <arm_neon.h>
+
+#include "NE10_types.h"
+#include "NE10_macros.h"
+#include "NE10_fft.h"
+#include "NE10_dsp.h"
+#include "NE10_fft.neonintrinsic.h"
+
+NE10_INLINE void ne10_radix8x4_r2c_neon (ne10_fft_cpx_float32_t *Fout,
+                                  const ne10_fft_cpx_float32_t *Fin,
+                                  const ne10_int32_t fstride,
+                                  const ne10_int32_t mstride,
+                                  const ne10_int32_t nfft)
+{
+    ne10_int32_t f_count;
+
+    NE10_DECLARE_8(float32x4_t,q_in);
+    NE10_DECLARE_8(float32x4_t,q_out);
+
+    const float32x4_t *Fin_neon  = (float32x4_t*) Fin;  // 8 x fstride
+          float32x4_t *Fout_neon = (float32x4_t*) Fout; // fstride x 8
+
+    for (f_count = fstride; f_count > 0; f_count --)
+    {
+        // from Fin_neon load 8 float32x4_t into q_in0 ~ q_in7, by step = fstride
+        NE10_RADIX8x4_R2C_NEON_LOAD(Fin_neon,q_in,fstride);
+
+        // print q_in0 ~ q_in7
+        // NE10_PRINT_Qx8_VECTOR(q_in);
+
+        // do r2c fft, size = 8
+        NE10_RADIX8x4_R2C_NEON_KERNEL(q_out,q_in);
+
+        // print q_out0 ~ q_out7
+        // NE10_PRINT_Qx8_VECTOR(q_out);
+
+        // store q_out0 ~ q_out7 to Fout_neon, by step = 1
+        NE10_RADIX8x4_R2C_NEON_STORE(Fout_neon,q_out,1);
+
+        Fin_neon = Fin_neon - fstride * 8 + 1;
+        Fout_neon += 8; // next column
+    }
+}
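+
+/*
+ * Each float32x4_t lane carries one of four independent sub-FFT columns,
+ * so the scalar radix-8 kernel maps one-to-one onto vector ops. The LOAD
+ * macro presumably gathers eight strided vectors along these lines:
+ *
+ *     q_in0 = Fin_neon[0 * fstride];
+ *     q_in1 = Fin_neon[1 * fstride];
+ *     // ... up to ...
+ *     q_in7 = Fin_neon[7 * fstride];
+ */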
+
+NE10_INLINE void ne10_radix8x4_c2r_neon (ne10_fft_cpx_float32_t *Fout,
+                                  const ne10_fft_cpx_float32_t *Fin,
+                                  const ne10_int32_t fstride,
+                                  const ne10_int32_t mstride,
+                                  const ne10_int32_t nfft)
+{
+    ne10_int32_t f_count;
+
+    NE10_DECLARE_8(float32x4_t,q_in);
+    NE10_DECLARE_8(float32x4_t,q_out);
+
+    const ne10_float32_t one_by_N = 0.25 / nfft;
+    const float32x4_t one_by_N_neon = vdupq_n_f32(one_by_N);
+
+    const float32x4_t *Fin_neon  = (float32x4_t*) Fin;
+          float32x4_t *Fout_neon = (float32x4_t*) Fout;
+
+    for (f_count = fstride; f_count > 0; f_count --)
+    {
+        // from Fin_neon load 8 float32x4_t into q_in0 ~ q_in7, by step = 1
+        NE10_RADIX8x4_R2C_NEON_LOAD(Fin_neon,q_in,1);
+
+        // NE10_PRINT_Qx8_VECTOR(q_in);
+
+        NE10_RADIX8x4_C2R_NEON_KERNEL(q_out,q_in);
+
+        // NE10_PRINT_Qx8_VECTOR(q_out);
+
+#ifdef NE10_DSP_RFFT_SCALING
+        q_out0 = vmulq_f32(q_out0,one_by_N_neon);
+        q_out1 = vmulq_f32(q_out1,one_by_N_neon);
+        q_out2 = vmulq_f32(q_out2,one_by_N_neon);
+        q_out3 = vmulq_f32(q_out3,one_by_N_neon);
+        q_out4 = vmulq_f32(q_out4,one_by_N_neon);
+        q_out5 = vmulq_f32(q_out5,one_by_N_neon);
+        q_out6 = vmulq_f32(q_out6,one_by_N_neon);
+        q_out7 = vmulq_f32(q_out7,one_by_N_neon);
+#endif
+
+        // store
+        NE10_RADIX8x4_R2C_NEON_STORE(Fout_neon,q_out,fstride);
+
+        Fout_neon ++;
+    }
+}
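+
+/*
+ * The scale is 0.25 / nfft rather than 1.0 / nfft because the NEON pipeline
+ * runs on the quarter-length factor chain (r_factors_neon is built for
+ * nfft/4 in the alloc routine), so the nfft this kernel sees appears to be
+ * a quarter of the real transform length N, and 0.25 / nfft == 1 / N.
+ */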
+
+NE10_INLINE void ne10_radix4x4_r2c_neon (ne10_fft_cpx_float32_t *Fout,
+                                  const ne10_fft_cpx_float32_t *Fin,
+                                  const ne10_int32_t fstride,
+                                  const ne10_int32_t mstride,
+                                  const ne10_int32_t nfft)
+{
+    ne10_int32_t f_count;
+
+    const float32x4_t *Fin_neon  = (float32x4_t*) Fin;
+          float32x4_t *Fout_neon = (float32x4_t*) Fout;
+
+    for (f_count = 0; f_count < fstride; f_count ++)
+    {
+        NE10_DECLARE_4(float32x4_t,q_in);
+        NE10_DECLARE_4(float32x4_t,q_out);
+
+        // load
+        NE10_RADIX4x4_R2C_NEON_LOAD(Fin_neon,q_in,fstride);
+
+        NE10_RADIX4x4_R2C_NEON_KERNEL(q_out,q_in)
+
+        // store
+        NE10_RADIX4x4_R2C_NEON_STORE(Fout_neon,q_out,1);
+
+        Fin_neon = Fin_neon - 4*fstride + 1;
+        Fout_neon += 4;
+    }
+}
+
+NE10_INLINE void ne10_radix4x4_c2r_neon (ne10_fft_cpx_float32_t *Fout,
+                                  const ne10_fft_cpx_float32_t *Fin,
+                                  const ne10_int32_t fstride,
+                                  const ne10_int32_t mstride,
+                                  const ne10_int32_t nfft)
+{
+    ne10_int32_t f_count;
+
+    const float32x4_t *Fin_neon  = (float32x4_t*) Fin;
+          float32x4_t *Fout_neon = (float32x4_t*) Fout;
+
+    const ne10_float32_t one_by_N = 0.25 / nfft;
+    const float32x4_t one_by_N_neon = vdupq_n_f32(one_by_N);
+
+    for (f_count = 0; f_count < fstride; f_count ++)
+    {
+        NE10_DECLARE_4(float32x4_t,q_in);
+        NE10_DECLARE_4(float32x4_t,q_out);
+
+        // load
+        NE10_RADIX4x4_R2C_NEON_LOAD(Fin_neon,q_in,1);
+
+        // NE10_PRINT_Qx4_VECTOR(q_in);
+
+        NE10_RADIX4x4_C2R_NEON_KERNEL(q_out,q_in)
+
+        // NE10_PRINT_Qx4_VECTOR(q_out);
+
+#ifdef NE10_DSP_RFFT_SCALING
+        q_out0 = vmulq_f32(q_out0,one_by_N_neon);
+        q_out1 = vmulq_f32(q_out1,one_by_N_neon);
+        q_out2 = vmulq_f32(q_out2,one_by_N_neon);
+        q_out3 = vmulq_f32(q_out3,one_by_N_neon);
+#endif
+
+        // store
+        NE10_RADIX4x4_R2C_NEON_STORE(Fout_neon,q_out,fstride);
+
+        Fout_neon ++;
+    }
+}
+
+NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_first_butterfly_neon (float32x4_t *Fout_neon,
+                                                            const float32x4_t *Fin_neon,
+                                                            const ne10_int32_t out_step,
+                                                            const ne10_int32_t in_step,
+                                                            const ne10_fft_cpx_float32_t *twiddles)
+{
+    NE10_DECLARE_4(float32x4_t,q_in);
+    NE10_DECLARE_4(float32x4_t,q_out);
+
+    // load
+    NE10_RADIX4x4_R2C_NEON_LOAD(Fin_neon,q_in,in_step);
+
+    NE10_RADIX4x4_R2C_NEON_KERNEL(q_out,q_in);
+
+    // store
+    vst1q_f32( (ne10_float32_t*) (Fout_neon                          ), q_out0);
+    vst1q_f32( (ne10_float32_t*) (Fout_neon +     (out_step << 1) - 1), q_out1);
+    vst1q_f32( (ne10_float32_t*) (Fout_neon +     (out_step << 1)    ), q_out2);
+    vst1q_f32( (ne10_float32_t*) (Fout_neon + 2 * (out_step << 1) - 1), q_out3);
+}
+
+NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_first_butterfly_neon (float32x4_t *Fout_neon,
+                                                            const float32x4_t *Fin_neon,
+                                                            const ne10_int32_t out_step,
+                                                            const ne10_int32_t in_step,
+                                                            const ne10_fft_cpx_float32_t *twiddles)
+{
+    NE10_DECLARE_4(float32x4_t,q_in);
+    NE10_DECLARE_4(float32x4_t,q_out);
+
+    // load
+    q_in0 = vld1q_f32( (ne10_float32_t*) (Fin_neon                          ) );
+    q_in1 = vld1q_f32( (ne10_float32_t*) (Fin_neon +     (out_step << 1) - 1) );
+    q_in2 = vld1q_f32( (ne10_float32_t*) (Fin_neon +     (out_step << 1)    ) );
+    q_in3 = vld1q_f32( (ne10_float32_t*) (Fin_neon + 2 * (out_step << 1) - 1) );
+
+    // NE10_PRINT_Qx4_VECTOR(q_in);
+
+    NE10_RADIX4x4_C2R_NEON_KERNEL(q_out,q_in);
+
+    // NE10_PRINT_Qx4_VECTOR(q_out);
+
+    // store
+    NE10_RADIX4x4_R2C_NEON_STORE(Fout_neon,q_out,in_step);
+}
+
+NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon (float32x4_t *Fout_neon,
+                                                                const float32x4_t *Fin_neon,
+                                                                const ne10_int32_t out_step,
+                                                                const ne10_int32_t in_step,
+                                                                const ne10_fft_cpx_float32_t *twiddles)
+{
+    ne10_int32_t m_count;
+    ne10_int32_t loop_count = (out_step>>1) -1;
+    float32x4_t *Fout_b = Fout_neon + (((out_step<<1)-1)<<1) - 2; // reversed
+
+    NE10_DECLARE_3(float32x4x2_t,q2_tw);
+    NE10_DECLARE_4(float32x4x2_t,q2_in);
+    NE10_DECLARE_4(float32x4x2_t,q2_out);
+
+    for (m_count = loop_count; m_count > 0; m_count -- )
+    {
+        // load
+        q2_in0.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 0*in_step    ) );
+        q2_in0.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 0*in_step + 1) );
+
+        q2_in1.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 1*in_step    ) );
+        q2_in1.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 1*in_step + 1) );
+
+        q2_in2.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 2*in_step    ) );
+        q2_in2.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 2*in_step + 1) );
+
+        q2_in3.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 3*in_step    ) );
+        q2_in3.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 3*in_step + 1) );
+
+        q2_tw0.val[0] = vdupq_n_f32(twiddles[0].r);
+        q2_tw0.val[1] = vdupq_n_f32(twiddles[0].i);
+
+        q2_tw1.val[0] = vdupq_n_f32(twiddles[out_step].r);
+        q2_tw1.val[1] = vdupq_n_f32(twiddles[out_step].i);
+
+        q2_tw2.val[0] = vdupq_n_f32(twiddles[out_step*2].r);
+        q2_tw2.val[1] = vdupq_n_f32(twiddles[out_step*2].i);
+
+        // R2C TW KERNEL
+        NE10_RADIX4x4_R2C_TW_NEON_KERNEL(q2_out,q2_in,q2_tw);
+
+        // store
+        vst1q_f32( (ne10_float32_t*) ( Fout_neon                      ), q2_out0.val[0] );
+        vst1q_f32( (ne10_float32_t*) ( Fout_neon                   + 1), q2_out0.val[1] );
+
+        vst1q_f32( (ne10_float32_t*) ( Fout_neon + (out_step << 1)    ), q2_out1.val[0] );
+        vst1q_f32( (ne10_float32_t*) ( Fout_neon + (out_step << 1) + 1), q2_out1.val[1] );
+
+        vst1q_f32( (ne10_float32_t*) ( Fout_b                         ), q2_out2.val[0] );
+        vst1q_f32( (ne10_float32_t*) ( Fout_b                      + 1), q2_out2.val[1] );
+
+        vst1q_f32( (ne10_float32_t*) ( Fout_b    - (out_step << 1)    ), q2_out3.val[0] );
+        vst1q_f32( (ne10_float32_t*) ( Fout_b    - (out_step << 1) + 1), q2_out3.val[1] );
+
+        // update pointers
+        Fin_neon  += 2;
+        Fout_neon += 2;
+        Fout_b    -= 2;
+        twiddles ++;
+    }
+}
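+
+/*
+ * A sketch of the complex multiply hidden in the R2C TW kernel, four
+ * complex lanes at a time (illustrative; the library keeps it inside the
+ * macro):
+ *
+ *     float32x4x2_t p;
+ *     // p.r = a.r*w.r - a.i*w.i
+ *     p.val[0] = vmlsq_f32 (vmulq_f32 (q2_in1.val[0], q2_tw0.val[0]),
+ *                           q2_in1.val[1], q2_tw0.val[1]);
+ *     // p.i = a.i*w.r + a.r*w.i
+ *     p.val[1] = vmlaq_f32 (vmulq_f32 (q2_in1.val[1], q2_tw0.val[0]),
+ *                           q2_in1.val[0], q2_tw0.val[1]);
+ */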
+
+NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_other_butterfly_neon (float32x4_t *Fout_neon,
+                                                                const float32x4_t *Fin_neon,
+                                                                const ne10_int32_t out_step,
+                                                                const ne10_int32_t in_step,
+                                                                const ne10_fft_cpx_float32_t *twiddles)
+{
+    ne10_int32_t m_count;
+    ne10_int32_t loop_count = (out_step>>1) -1;
+    const float32x4_t *Fin_b = Fin_neon + (((out_step<<1)-1)<<1) - 2; // reversed
+
+    NE10_DECLARE_3(float32x4x2_t,q2_tw);
+    NE10_DECLARE_4(float32x4x2_t,q2_in);
+    NE10_DECLARE_4(float32x4x2_t,q2_out);
+
+    for (m_count = loop_count; m_count > 0; m_count -- )
+    {
+        // load
+        q2_in0.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_neon                      ) );
+        q2_in0.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_neon                   + 1) );
+
+        q2_in1.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_neon + (out_step << 1)    ) );
+        q2_in1.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_neon + (out_step << 1) + 1) );
+
+        q2_in2.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_b                         ) );
+        q2_in2.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_b                      + 1) );
+
+        q2_in3.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_b    - (out_step << 1)    ) );
+        q2_in3.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_b    - (out_step << 1) + 1) );
+
+        q2_tw0.val[0] = vdupq_n_f32(twiddles[0].r);
+        q2_tw0.val[1] = vdupq_n_f32(twiddles[0].i);
+
+        q2_tw1.val[0] = vdupq_n_f32(twiddles[out_step].r);
+        q2_tw1.val[1] = vdupq_n_f32(twiddles[out_step].i);
+
+        q2_tw2.val[0] = vdupq_n_f32(twiddles[out_step*2].r);
+        q2_tw2.val[1] = vdupq_n_f32(twiddles[out_step*2].i);
+
+        // NE10_PRINT_Q2x4_VECTOR(q2_in);
+
+        // R2C TW KERNEL
+        NE10_RADIX4x4_C2R_TW_NEON_KERNEL(q2_out,q2_in,q2_tw);
+
+        // NE10_PRINT_Q2x4_VECTOR(q2_out);
+
+        // store
+        vst1q_f32( (ne10_float32_t*) (Fout_neon + 0*in_step    ), q2_out0.val[0] );
+        vst1q_f32( (ne10_float32_t*) (Fout_neon + 0*in_step + 1), q2_out0.val[1] );
+
+        vst1q_f32( (ne10_float32_t*) (Fout_neon + 1*in_step    ), q2_out1.val[0] );
+        vst1q_f32( (ne10_float32_t*) (Fout_neon + 1*in_step + 1), q2_out1.val[1] );
+
+        vst1q_f32( (ne10_float32_t*) (Fout_neon + 2*in_step    ), q2_out2.val[0] );
+        vst1q_f32( (ne10_float32_t*) (Fout_neon + 2*in_step + 1), q2_out2.val[1] );
+
+        vst1q_f32( (ne10_float32_t*) (Fout_neon + 3*in_step    ), q2_out3.val[0] );
+        vst1q_f32( (ne10_float32_t*) (Fout_neon + 3*in_step + 1), q2_out3.val[1] );
+
+        // update pointers
+        Fin_neon  += 2;
+        Fout_neon += 2;
+        Fin_b    -= 2;
+        twiddles ++;
+    }
+}
+
+NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_last_butterfly_neon (float32x4_t *Fout_neon,
+                                                            const float32x4_t *Fin_neon,
+                                                            const ne10_int32_t out_step,
+                                                            const ne10_int32_t in_step,
+                                                            const ne10_fft_cpx_float32_t *twiddles)
+{
+    NE10_DECLARE_4(float32x4_t,q_in);
+    NE10_DECLARE_4(float32x4_t,q_out);
+
+    // load
+    NE10_RADIX4x4_R2C_NEON_LOAD(Fin_neon,q_in,in_step);
+
+    NE10_RADIX4x4_R2C_TW_NEON_KERNEL_LAST(q_out,q_in);
+
+    // store
+    vst1q_f32( (ne10_float32_t*) (Fout_neon    ), q_out0);
+    vst1q_f32( (ne10_float32_t*) (Fout_neon + 1), q_out1);
+    vst1q_f32( (ne10_float32_t*) (Fout_neon + (out_step << 1)    ), q_out2);
+    vst1q_f32( (ne10_float32_t*) (Fout_neon + (out_step << 1) + 1), q_out3);
+}
+
+NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_last_butterfly_neon (float32x4_t *Fout_neon,
+                                                            const float32x4_t *Fin_neon,
+                                                            const ne10_int32_t out_step,
+                                                            const ne10_int32_t in_step,
+                                                            const ne10_fft_cpx_float32_t *twiddles)
+{
+    NE10_DECLARE_4(float32x4_t,q_in);
+    NE10_DECLARE_4(float32x4_t,q_out);
+
+    // load
+    q_in0 = vld1q_f32( (ne10_float32_t*) (Fin_neon    ) );
+    q_in1 = vld1q_f32( (ne10_float32_t*) (Fin_neon + 1) );
+    q_in2 = vld1q_f32( (ne10_float32_t*) (Fin_neon + (out_step << 1)    ) );
+    q_in3 = vld1q_f32( (ne10_float32_t*) (Fin_neon + (out_step << 1) + 1) );
+
+    // NE10_PRINT_Qx4_VECTOR(q_in);
+
+    NE10_RADIX4x4_C2R_TW_NEON_KERNEL_LAST(q_out,q_in);
+
+    // NE10_PRINT_Qx4_VECTOR(q_out);
+
+    // store
+    NE10_RADIX4x4_R2C_NEON_STORE(Fout_neon,q_out,in_step);
+}
+
+NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_neon (ne10_fft_cpx_float32_t *Fout,
+                                                        const ne10_fft_cpx_float32_t *Fin,
+                                                        const ne10_int32_t fstride,
+                                                        const ne10_int32_t mstride,
+                                                        const ne10_int32_t nfft,
+                                                        const ne10_fft_cpx_float32_t *twiddles)
+{
+    ne10_int32_t f_count;
+    const ne10_int32_t in_step = nfft >> 2;
+    const ne10_int32_t out_step = mstride;
+
+    const float32x4_t *Fin_neon  = (float32x4_t*) Fin;
+          float32x4_t *Fout_neon = (float32x4_t*) Fout;
+    const ne10_fft_cpx_float32_t *tw;
+
+    for (f_count = fstride; f_count; f_count --)
+    {
+        tw = twiddles;
+
+        // first butterfly
+        ne10_radix4x4_r2c_with_twiddles_first_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, tw);
+
+        tw ++;
+        Fin_neon ++;
+        Fout_neon ++;
+
+        // other butterfly
+        ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, tw);
+
+        // update Fin_neon, Fout_neon, twiddles
+        tw        +=     ( (out_step >> 1) - 1);
+        Fin_neon  += 2 * ( (out_step >> 1) - 1);
+        Fout_neon += 2 * ( (out_step >> 1) - 1);
+
+        // last butterfly
+        ne10_radix4x4_r2c_with_twiddles_last_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, tw);
+        Fin_neon ++;
+        tw++;
+        Fout_neon ++;
+
+        Fout_neon = Fout_neon + 3 * out_step;
+    } // f_count
+}
+
+NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_neon (ne10_fft_cpx_float32_t *Fout,
+                                                        const ne10_fft_cpx_float32_t *Fin,
+                                                        const ne10_int32_t fstride,
+                                                        const ne10_int32_t mstride,
+                                                        const ne10_int32_t nfft,
+                                                        const ne10_fft_cpx_float32_t *twiddles)
+{
+    ne10_int32_t f_count;
+    const ne10_int32_t in_step = nfft >> 2;
+    const ne10_int32_t out_step = mstride;
+
+    const float32x4_t *Fin_neon  = (float32x4_t*) Fin;
+          float32x4_t *Fout_neon = (float32x4_t*) Fout;
+    const ne10_fft_cpx_float32_t *tw;
+
+    for (f_count = fstride; f_count; f_count --)
+    {
+        tw = twiddles;
+
+        // first butterfly
+        ne10_radix4x4_c2r_with_twiddles_first_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, tw);
+
+        tw ++;
+        Fin_neon ++;
+        Fout_neon ++;
+
+        // other butterfly
+        ne10_radix4x4_c2r_with_twiddles_other_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, tw);
+
+        // update Fin_neon, Fout_neon, twiddles
+        tw        +=     ( (out_step >> 1) - 1);
+        Fin_neon  += 2 * ( (out_step >> 1) - 1);
+        Fout_neon += 2 * ( (out_step >> 1) - 1);
+
+        // last butterfly
+        ne10_radix4x4_c2r_with_twiddles_last_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, tw);
+        Fin_neon ++;
+        tw++;
+        Fout_neon ++;
+
+        Fin_neon = Fin_neon + 3 * out_step;
+    } // f_count
+}
+
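+// Layout of the factors[] buffer, as consumed below: factors[0] holds the
+// stage count and factors[1] the fstride of the first stage; the radix of
+// the first stage is read from factors[2 * stage_count], with its mstride
+// at factors[2 * stage_count - 1].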
+NE10_INLINE void ne10_mixed_radix_r2c_butterfly_float32_neon (ne10_fft_cpx_float32_t * Fout,
+                                                        const ne10_fft_cpx_float32_t * Fin,
+                                                        const ne10_int32_t * factors,
+                                                        const ne10_fft_cpx_float32_t * twiddles,
+                                                        ne10_fft_cpx_float32_t * buffer)
+{
+    ne10_int32_t fstride, mstride, nfft;
+    ne10_int32_t radix;
+    ne10_int32_t stage_count;
+
+    // PRINT_STAGE_INFO;
+
+    // init fstride, mstride, radix, nfft
+    stage_count = factors[0];
+    fstride     = factors[1];
+    mstride     = factors[ (stage_count << 1) - 1 ];
+    radix       = factors[  stage_count << 1 ];
+    nfft        = radix * fstride; // not the real nfft
+
+    // PRINT_STAGE_INFO;
+
+    if (radix == 2)
+    {
+        // combine one radix-4 and one radix-2 into one radix-8
+        mstride <<= 2;
+        fstride >>= 2;
+        twiddles += 6;
+        stage_count --;
+    }
+
+    if (stage_count % 2 == 1) // one more stage (the last) is performed outside this function
+    {
+        ne10_swap_ptr (buffer, Fout);
+    }
+
+    // the first stage
+    if (radix == 2)   // length of FFT is 2^n (n is odd)
+    {
+        ne10_radix8x4_r2c_neon (Fout, Fin, fstride, mstride, nfft);
+    }
+    else if (radix == 4)   // length of FFT is 2^n (n is even)
+    {
+        ne10_radix4x4_r2c_neon (Fout, Fin, fstride, mstride, nfft);
+    }
+    // end of first stage
+
+    // others
+    for (; fstride > 1;)
+    {
+        fstride >>= 2;
+        ne10_swap_ptr (buffer, Fout);
+
+        ne10_radix4x4_r2c_with_twiddles_neon (Fout, buffer, fstride, mstride, nfft, twiddles);
+        twiddles += 3 * mstride;
+        mstride <<= 2;
+    } // other stage
+}
+
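+// The C2R butterfly below replays the R2C stages in reverse order: fstride
+// grows while mstride shrinks, and the twiddle pointer is decremented before
+// each stage (twiddles -= 3 * mstride), mirroring the increments of the
+// forward pass.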
+NE10_INLINE void ne10_mixed_radix_c2r_butterfly_float32_neon (ne10_fft_cpx_float32_t * Fout,
+                                                        const ne10_fft_cpx_float32_t * Fin,
+                                                        const ne10_int32_t * factors,
+                                                        const ne10_fft_cpx_float32_t * twiddles,
+                                                        ne10_fft_cpx_float32_t * buffer)
+{
+    ne10_int32_t fstride, mstride, nfft;
+    ne10_int32_t radix;
+    ne10_int32_t stage_count;
+
+    // PRINT_STAGE_INFO;
+
+    // init fstride, mstride, radix, nfft
+    stage_count = factors[0];
+    fstride     = factors[1];
+
+    mstride     = factors[ (stage_count << 1) - 1 ];
+    radix       = factors[  stage_count << 1 ];
+    nfft        = radix * fstride; // not the real nfft
+
+    // fstride, mstride for the last stage
+    fstride = 1;
+    mstride = nfft >> 2;
+
+    if (radix == 2)
+    {
+        // combine one radix-4 and one radix-2 into one radix-8
+        stage_count --;
+    }
+
+    if (stage_count % 2 == 0)
+    {
+        ne10_swap_ptr(Fout,buffer);
+    }
+
+    // all stages except the first
+    for (; stage_count > 1;)
+    {
+        twiddles -= 3 * mstride;
+
+        // PRINT_STAGE_INFO;
+        // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
+        ne10_radix4x4_c2r_with_twiddles_neon (Fout, buffer, fstride, mstride, nfft, twiddles);
+
+        fstride <<= 2;
+        mstride >>= 2;
+        stage_count --;
+        ne10_swap_ptr (buffer, Fout);
+    }
+
+    // first stage -- inverted
+    if (radix == 2)   // length of FFT is 2^n (n is odd)
+    {
+        mstride >>= 1;
+
+        // PRINT_STAGE_INFO;
+        // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
+        ne10_radix8x4_c2r_neon (Fout, buffer, fstride, mstride, nfft);
+    }
+    else if (radix == 4)   // length of FFT is 2^n (n is even)
+    {
+        // PRINT_STAGE_INFO;
+        // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
+        ne10_radix4x4_c2r_neon (Fout, buffer, fstride, mstride, nfft);
+    }
+}
+
+NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage_first_butterfly (ne10_fft_cpx_float32_t *dst,
+                                            const ne10_fft_cpx_float32_t *src,
+                                            const ne10_fft_cpx_float32_t *twiddles,
+                                            const ne10_int32_t nfft)
+{
+    // b0
+    {
+        ne10_float32_t q_4r_out[4];
+        const ne10_float32_t *p_src_r = (const ne10_float32_t*) src;
+
+        NE10_FFT_R2C_4R_RCR(q_4r_out,p_src_r);
+
+        dst[0].r = q_4r_out[0];
+        dst[0].i = q_4r_out[3];
+        dst += (nfft>>2);
+        dst[0].r = q_4r_out[1];
+        dst[0].i = q_4r_out[2];
+        dst -= (nfft>>2);
+    }
+
+    // b2
+    {
+        const ne10_float32_t *p_src_r = (const ne10_float32_t*) (src);
+        p_src_r  += nfft;
+        p_src_r  -= 4;
+
+        ne10_float32_t q_4r_out[4];
+
+        NE10_FFT_R2C_4R_CC(q_4r_out,p_src_r);
+
+        dst += (nfft>>3);
+        dst[0].r = q_4r_out[0];
+        dst[0].i = q_4r_out[1];
+        dst += (nfft>>2);
+        dst[0].r = q_4r_out[2];
+        dst[0].i = q_4r_out[3];
+        dst -= (nfft>>3);
+        dst -= (nfft>>2);
+    }
+
+    // b1
+    ne10_fft_cpx_float32_t cc_out[4];
+    ne10_fft_cpx_float32_t cc_in [4];
+    const ne10_float32_t *p_src_r = (const ne10_float32_t*) src;
+    p_src_r += 4;
+
+    cc_out[0].r = *(p_src_r ++);
+    cc_out[1].r = *(p_src_r ++);
+    cc_out[2].r = *(p_src_r ++);
+    cc_out[3].r = *(p_src_r ++);
+
+    cc_out[0].i = *(p_src_r ++);
+    cc_out[1].i = *(p_src_r ++);
+    cc_out[2].i = *(p_src_r ++);
+    cc_out[3].i = *(p_src_r ++);
+
+    // NE10_PRINT_Q2_VECTOR(cc_out);
+
+    // twiddles[0] = ( 1.0, 0.0);
+    // NE10_CPX_MUL_F32(cc_in[0],cc_out[0],twiddles[0]);
+    cc_in[0] = cc_out[0];
+    twiddles ++;
+
+    NE10_CPX_MUL_F32(cc_in[1],cc_out[1],twiddles[0]);
+    twiddles ++;
+
+    NE10_CPX_MUL_F32(cc_in[2],cc_out[2],twiddles[0]);
+    twiddles ++;
+
+    NE10_CPX_MUL_F32(cc_in[3],cc_out[3],twiddles[0]);
+
+    // NE10_PRINT_Q2_VECTOR(cc_in);
+
+    NE10_FFT_R2C_CC_CC(cc_out,cc_in);
+
+    // NE10_PRINT_Q2_VECTOR(cc_out);
+
+    dst[1] = cc_out[0];
+    dst += (nfft>>2);
+    dst[ 1] = cc_out[1];
+    dst[-1] = cc_out[3];
+    dst += (nfft>>2);
+    dst[-1] = cc_out[2];
+}
+
+NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage_first_butterfly (ne10_fft_cpx_float32_t *dst,
+                                            const ne10_fft_cpx_float32_t *src,
+                                            const ne10_fft_cpx_float32_t *twiddles,
+                                            const ne10_int32_t nfft)
+{
+    // b0
+    {
+        ne10_float32_t q_4r_in[4];
+        ne10_float32_t *p_dst_r = (ne10_float32_t*) dst;
+
+        q_4r_in[0] = src[0].r;
+        q_4r_in[3] = src[0].i;
+        src += (nfft>>2);
+        q_4r_in[1] = src[0].r;
+        q_4r_in[2] = src[0].i;
+        src -= (nfft>>2);
+
+        NE10_FFT_C2R_RCR_4R(p_dst_r,q_4r_in);
+    }
+
+    // b2
+    {
+        // float32x4_t q_in;
+        ne10_float32_t *p_dst_r = (ne10_float32_t*) (dst);
+        p_dst_r  += nfft;
+        p_dst_r  -= 4;
+
+        ne10_float32_t q_4r_in[4];
+        src += (nfft>>3);
+        q_4r_in[0] = src[0].r;
+        q_4r_in[1] = src[0].i;
+        src += (nfft>>2);
+        q_4r_in[2] = src[0].r;
+        q_4r_in[3] = src[0].i;
+        src -= (nfft>>3);
+        src -= (nfft>>2);
+
+        NE10_FFT_C2R_CC_4R(p_dst_r,q_4r_in);
+    }
+
+    // b1
+    ne10_fft_cpx_float32_t cc_out[4];
+    ne10_fft_cpx_float32_t cc_in [4];
+    ne10_float32_t *p_dst_r = (ne10_float32_t*) dst;
+    p_dst_r += 4;
+
+    // load
+    cc_out[0] = src[1];
+    src += (nfft>>2);
+    cc_out[2] = src[ 1];
+    cc_out[3] = src[-1];
+    src += (nfft>>2);
+    cc_out[1] = src[-1];
+
+    // NE10_PRINT_Q2_VECTOR(cc_out);
+
+    NE10_FFT_C2R_CC_CC(cc_in,cc_out);
+
+    // NE10_PRINT_Q2_VECTOR(cc_in);
+
+    // twiddles[0] = ( 1.0, 0.0);
+    // NE10_CPX_MUL_F32(cc_in[0],cc_out[0],twiddles[0]);
+    cc_out[0] = cc_in[0];
+    twiddles ++;
+
+    NE10_CPX_CONJ_MUL_F32(cc_out[1],cc_in[1],twiddles[0]);
+    twiddles ++;
+
+    NE10_CPX_CONJ_MUL_F32(cc_out[2],cc_in[2],twiddles[0]);
+    twiddles ++;
+
+    NE10_CPX_CONJ_MUL_F32(cc_out[3],cc_in[3],twiddles[0]);
+
+    // NE10_PRINT_Q2_VECTOR(cc_out);
+
+    *(p_dst_r ++) = cc_out[0].r;
+    *(p_dst_r ++) = cc_out[1].r;
+    *(p_dst_r ++) = cc_out[2].r;
+    *(p_dst_r ++) = cc_out[3].r;
+
+    *(p_dst_r ++) = cc_out[0].i;
+    *(p_dst_r ++) = cc_out[1].i;
+    *(p_dst_r ++) = cc_out[2].i;
+    *(p_dst_r ++) = cc_out[3].i;
+}
+
+NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage_second_butterfly (ne10_fft_cpx_float32_t *dst,
+                                            const ne10_fft_cpx_float32_t *src,
+                                            const ne10_fft_cpx_float32_t *twiddles,
+                                            const ne10_int32_t nfft)
+{
+    // assert ( nfft % 4 == 0 );
+    const ne10_float32_t *fin_r  = (const ne10_float32_t*) src + 12;
+          ne10_float32_t *fout_r =       (ne10_float32_t*) dst;
+    const ne10_float32_t *tw     = (const ne10_float32_t*) twiddles + 8;
+
+    ne10_float32_t q_in0[4],    q_out0[4],
+                   q_in1[4],    q_out1[4],
+                   q_in2[4],    q_out2[4],
+                   q_in3[4],    q_out3[4];
+
+    ne10_float32_t q2_tw0[2][4],
+                   q2_tw1[2][4];
+
+    /*  INPUT & OUTPUT
+     *  0R  1R  2R  3R      Q0
+     *  0I  1I  2I  3I      Q1
+     *  4R  5R  6R  7R      Q2
+     *  4I  5I  6I  7I      Q3
+     */
+
+    q_in0[0] = *(fin_r++);
+    q_in0[1] = *(fin_r++);
+    q_in0[2] = *(fin_r++);
+    q_in0[3] = *(fin_r++);
+    q_in1[0] = *(fin_r++);
+    q_in1[1] = *(fin_r++);
+    q_in1[2] = *(fin_r++);
+    q_in1[3] = *(fin_r++);
+    q_in2[0] = *(fin_r++);
+    q_in2[1] = *(fin_r++);
+    q_in2[2] = *(fin_r++);
+    q_in2[3] = *(fin_r++);
+    q_in3[0] = *(fin_r++);
+    q_in3[1] = *(fin_r++);
+    q_in3[2] = *(fin_r++);
+    q_in3[3] = *(fin_r++);
+
+    // NE10_PRINT_Q_VECTOR(q_in0);
+    // NE10_PRINT_Q_VECTOR(q_in1);
+    // NE10_PRINT_Q_VECTOR(q_in2);
+    // NE10_PRINT_Q_VECTOR(q_in3);
+
+    q2_tw0[0][0] = tw[0];
+    q2_tw0[0][1] = tw[2];
+    q2_tw0[0][2] = tw[4];
+    q2_tw0[0][3] = tw[6];
+    q2_tw0[1][0] = tw[1];
+    q2_tw0[1][1] = tw[3];
+    q2_tw0[1][2] = tw[5];
+    q2_tw0[1][3] = tw[7];
+
+    q2_tw1[0][0] = tw[0+8];
+    q2_tw1[0][1] = tw[2+8];
+    q2_tw1[0][2] = tw[4+8];
+    q2_tw1[0][3] = tw[6+8];
+    q2_tw1[1][0] = tw[1+8];
+    q2_tw1[1][1] = tw[3+8];
+    q2_tw1[1][2] = tw[5+8];
+    q2_tw1[1][3] = tw[7+8];
+
+    // TW: in->out
+    q_out0[0] = q_in0[0];
+    q_out1[0] = q_in1[0];
+    q_out2[0] = q_in2[0];
+    q_out3[0] = q_in3[0];
+
+    //----------------------------------------------------------//
+    // first 2 lines
+    //   R          R             R           I             I
+    q_out0[1] = q_in0[1] * q2_tw0[0][1] - q_in1[1] * q2_tw0[1][1];
+    //   I          R             I           I             R
+    q_out1[1] = q_in0[1] * q2_tw0[1][1] + q_in1[1] * q2_tw0[0][1];
+
+    //   R          R             R           I             I
+    q_out0[2] = q_in0[2] * q2_tw0[0][2] - q_in1[2] * q2_tw0[1][2];
+    //   I          R             I           I             R
+    q_out1[2] = q_in0[2] * q2_tw0[1][2] + q_in1[2] * q2_tw0[0][2];
+
+    //   R          R             R           I             I
+    q_out0[3] = q_in0[3] * q2_tw0[0][3] - q_in1[3] * q2_tw0[1][3];
+    //   I          R             I           I             R
+    q_out1[3] = q_in0[3] * q2_tw0[1][3] + q_in1[3] * q2_tw0[0][3];
+
+    //---------------------------------------------------------//
+    // second 2 lines
+    //   R          R             R           I             I
+    q_out2[1] = q_in2[1] * q2_tw1[0][1] - q_in3[1] * q2_tw1[1][1];
+    //   I          R             I           I             R
+    q_out3[1] = q_in2[1] * q2_tw1[1][1] + q_in3[1] * q2_tw1[0][1];
+
+    //   R          R             R           I             I
+    q_out2[2] = q_in2[2] * q2_tw1[0][2] - q_in3[2] * q2_tw1[1][2];
+    //   I          R             I           I             R
+    q_out3[2] = q_in2[2] * q2_tw1[1][2] + q_in3[2] * q2_tw1[0][2];
+
+    //   R          R             R           I             I
+    q_out2[3] = q_in2[3] * q2_tw1[0][3] - q_in3[3] * q2_tw1[1][3];
+    //   I          R             I           I             R
+    q_out3[3] = q_in2[3] * q2_tw1[1][3] + q_in3[3] * q2_tw1[0][3];
+
+    // NE10_PRINT_Q_VECTOR(q_out0);
+    // NE10_PRINT_Q_VECTOR(q_out1);
+    // NE10_PRINT_Q_VECTOR(q_out2);
+    // NE10_PRINT_Q_VECTOR(q_out3);
+
+    // BUTTERFLY - radix 4x2
+    // STAGE
+    // q_out -> q_in
+    //  R i         R j         R k
+    q_in0[0] = q_out0[0] + q_out0[2];
+    q_in1[0] = q_out1[0] + q_out1[2];
+
+    q_in0[1] = q_out0[0] - q_out0[2];
+    q_in1[1] = q_out1[0] - q_out1[2];
+
+    //  R i         R j         R k
+    q_in0[2] = q_out0[1] + q_out0[3];
+    q_in1[2] = q_out1[1] + q_out1[3];
+
+    q_in0[3] = q_out0[1] - q_out0[3];
+    q_in1[3] = q_out1[1] - q_out1[3];
+
+    //  R i         R j         R k
+    q_in2[0] = q_out2[0] + q_out2[2];
+    q_in3[0] = q_out3[0] + q_out3[2];
+
+    q_in2[1] = q_out2[0] - q_out2[2];
+    q_in3[1] = q_out3[0] - q_out3[2];
+
+    //  R i         R j         R k
+    q_in2[2] = q_out2[1] + q_out2[3];
+    q_in3[2] = q_out3[1] + q_out3[3];
+
+    q_in2[3] = q_out2[1] - q_out2[3];
+    q_in3[3] = q_out3[1] - q_out3[3];
+
+    // NE10_PRINT_Q_VECTOR(q_in0);
+    // NE10_PRINT_Q_VECTOR(q_in1);
+    // NE10_PRINT_Q_VECTOR(q_in2);
+    // NE10_PRINT_Q_VECTOR(q_in3);
+
+    // STAGE
+    // q_in -> q_out
+    // and transpose
+    //   R i          R j        R k
+    q_out0[0] =   q_in0[0] + q_in0[2];
+    q_out0[1] =   q_in1[0] + q_in1[2];
+
+    q_out2[2] =   q_in0[0] - q_in0[2];
+    q_out2[3] = - q_in1[0] + q_in1[2];// CONJ
+
+    //   R i          R j        R k
+    q_out3[2] =   q_in0[1] - q_in1[3];
+    q_out3[3] = - q_in1[1] - q_in0[3];// CONJ
+
+    q_out1[0] =   q_in0[1] + q_in1[3];
+    q_out1[1] =   q_in1[1] - q_in0[3];
+
+    //   R i          R j        R k
+    q_out0[2] =   q_in2[0] + q_in2[2];
+    q_out0[3] =   q_in3[0] + q_in3[2];
+
+    q_out2[0] =   q_in2[0] - q_in2[2];
+    q_out2[1] = - q_in3[0] + q_in3[2];// CONJ
+
+    //   R i          R j        R k
+    q_out3[0] =   q_in2[1] - q_in3[3];
+    q_out3[1] = - q_in3[1] - q_in2[3]; // CONJ
+
+    q_out1[2] =   q_in2[1] + q_in3[3];
+    q_out1[3] =   q_in3[1] - q_in2[3];
+
+    // NE10_PRINT_Q_VECTOR(q_out0);
+    // NE10_PRINT_Q_VECTOR(q_out1);
+    // NE10_PRINT_Q_VECTOR(q_out2);
+    // NE10_PRINT_Q_VECTOR(q_out3);
+
+    // STORE
+    fout_r += 4;
+    fout_r[0] = q_out0[0];
+    fout_r[1] = q_out0[1];
+    fout_r[2] = q_out0[2];
+    fout_r[3] = q_out0[3];
+
+    fout_r += (nfft>>1);
+    fout_r[0] = q_out1[0];
+    fout_r[1] = q_out1[1];
+    fout_r[2] = q_out1[2];
+    fout_r[3] = q_out1[3];
+
+    fout_r -= 10;
+    fout_r[0] = q_out3[0];
+    fout_r[1] = q_out3[1];
+    fout_r[2] = q_out3[2];
+    fout_r[3] = q_out3[3];
+
+    fout_r += (nfft>>1);
+    fout_r[0] = q_out2[0];
+    fout_r[1] = q_out2[1];
+    fout_r[2] = q_out2[2];
+    fout_r[3] = q_out2[3];
+}
+
+NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage_second_butterfly (ne10_fft_cpx_float32_t *dst,
+                                            const ne10_fft_cpx_float32_t *src,
+                                            const ne10_fft_cpx_float32_t *twiddles,
+                                            const ne10_int32_t nfft)
+{
+    const ne10_float32_t *fin_r  = (const ne10_float32_t*) src;
+          ne10_float32_t *fout_r =       (ne10_float32_t*) dst + 12;
+    const ne10_float32_t *tw     = (const ne10_float32_t*) twiddles + 8;
+
+    ne10_float32_t q_in0[4],    q_out0[4],
+                   q_in1[4],    q_out1[4],
+                   q_in2[4],    q_out2[4],
+                   q_in3[4],    q_out3[4];
+
+    ne10_float32_t q2_tw0[2][4],
+                   q2_tw1[2][4];
+
+    /*  INPUT & OUTPUT
+     *  0R  1R  2R  3R      Q0
+     *  0I  1I  2I  3I      Q1
+     *  4R  5R  6R  7R      Q2
+     *  4I  5I  6I  7I      Q3
+     */
+
+    // load
+    fin_r += 4;
+    q_in0[0] = fin_r[0];
+    q_in0[1] = fin_r[1];
+    q_in0[2] = fin_r[2];
+    q_in0[3] = fin_r[3];
+
+    fin_r += (nfft>>1);
+    q_in1[0] = fin_r[0];
+    q_in1[1] = fin_r[1];
+    q_in1[2] = fin_r[2];
+    q_in1[3] = fin_r[3];
+
+    fin_r -= 10;
+    q_in3[0] = fin_r[0];
+    q_in3[1] = fin_r[1];
+    q_in3[2] = fin_r[2];
+    q_in3[3] = fin_r[3];
+
+    fin_r += (nfft>>1);
+    q_in2[0] = fin_r[0];
+    q_in2[1] = fin_r[1];
+    q_in2[2] = fin_r[2];
+    q_in2[3] = fin_r[3];
+
+    // NE10_PRINT_Q_VECTOR(q_in0);
+    // NE10_PRINT_Q_VECTOR(q_in1);
+    // NE10_PRINT_Q_VECTOR(q_in2);
+    // NE10_PRINT_Q_VECTOR(q_in3);
+
+    // OUTPUT
+    // INPUT
+#define NE10_INV_BUTTERFLY_TMP(I1,I2,J1,J2,K1,K2,S1,S2) do {    \
+    q_out ## I1 [I2] = ( q_in ## K1 [K2] + q_in ## S1 [S2] ); \
+    q_out ## J1 [J2] = ( q_in ## K1 [K2] - q_in ## S1 [S2] ); \
+} while(0);
+
+    // STAGE
+    // q_in -> q_out
+    // and transpose
+    NE10_INV_BUTTERFLY_TMP( 0,0, 0,2,
+                            0,0, 2,2);
+
+    NE10_INV_BUTTERFLY_TMP( 1,2, 1,0,
+                            0,1, 2,3);
+
+    NE10_INV_BUTTERFLY_TMP( 0,1, 1,3,
+                            1,0, 3,2);
+
+    q_in3[3] *= - 1.0f;
+    NE10_INV_BUTTERFLY_TMP( 1,1, 0,3,
+                            3,3, 1,1);
+
+    NE10_INV_BUTTERFLY_TMP( 2,0, 2,2,
+                            0,2, 2,0);
+
+    NE10_INV_BUTTERFLY_TMP( 3,2, 3,0,
+                            0,3, 2,1);
+
+    NE10_INV_BUTTERFLY_TMP( 2,1, 3,3,
+                            1,2, 3,0);
+
+    q_in3[1] *= - 1.0f;
+    NE10_INV_BUTTERFLY_TMP( 3,1, 2,3,
+                            3,1, 1,3);
+#undef NE10_INV_BUTTERFLY_TMP
+
+    // NE10_PRINT_Q_VECTOR(q_out0);
+    // NE10_PRINT_Q_VECTOR(q_out1);
+    // NE10_PRINT_Q_VECTOR(q_out2);
+    // NE10_PRINT_Q_VECTOR(q_out3);
+
+    // BUTTERFLY - radix 4x2
+    // STAGE
+    // q_out -> q_in
+
+    // OUTPUT
+    // INPUT
+#define NE10_INV_BUTTERFLY_TMP(I1,I2,J1,J2,K1,K2,S1,S2) do {    \
+    q_in ## I1 [I2] = ( q_out ## K1 [K2] + q_out ## S1 [S2] ); \
+    q_in ## J1 [J2] = ( q_out ## K1 [K2] - q_out ## S1 [S2] ); \
+} while(0);
+
+    NE10_INV_BUTTERFLY_TMP(0,0, 0,2,
+                           0,0, 0,1);
+
+    NE10_INV_BUTTERFLY_TMP(1,0, 1,2,
+                           1,0, 1,1);
+
+    NE10_INV_BUTTERFLY_TMP(0,1, 0,3,
+                           0,2, 0,3);
+
+    NE10_INV_BUTTERFLY_TMP(1,1, 1,3,
+                           1,2, 1,3);
+
+    NE10_INV_BUTTERFLY_TMP(2,0, 2,2,
+                           2,0, 2,1);
+
+    NE10_INV_BUTTERFLY_TMP(3,0, 3,2,
+                           3,0, 3,1);
+
+
+    NE10_INV_BUTTERFLY_TMP(2,1, 2,3,
+                           2,2, 2,3);
+
+    NE10_INV_BUTTERFLY_TMP(3,1, 3,3,
+                           3,2, 3,3);
+
+    // NE10_PRINT_Q_VECTOR(q_in0);
+    // NE10_PRINT_Q_VECTOR(q_in1);
+    // NE10_PRINT_Q_VECTOR(q_in2);
+    // NE10_PRINT_Q_VECTOR(q_in3);
+#undef NE10_INV_BUTTERFLY_TMP
+
+    // load tw
+    q2_tw0[0][0] = tw[0];
+    q2_tw0[0][1] = tw[2];
+    q2_tw0[0][2] = tw[4];
+    q2_tw0[0][3] = tw[6];
+    q2_tw0[1][0] = tw[1];
+    q2_tw0[1][1] = tw[3];
+    q2_tw0[1][2] = tw[5];
+    q2_tw0[1][3] = tw[7];
+
+    q2_tw1[0][0] = tw[0+8];
+    q2_tw1[0][1] = tw[2+8];
+    q2_tw1[0][2] = tw[4+8];
+    q2_tw1[0][3] = tw[6+8];
+    q2_tw1[1][0] = tw[1+8];
+    q2_tw1[1][1] = tw[3+8];
+    q2_tw1[1][2] = tw[5+8];
+    q2_tw1[1][3] = tw[7+8];
+
+    // TW: in->out
+    q_out0[0] = q_in0[0];
+    q_out1[0] = q_in1[0];
+    q_out2[0] = q_in2[0];
+    q_out3[0] = q_in3[0];
+
+    //----------------------------------------------------------//
+    // first 2 lines
+    //   R          R             R           I             I
+    q_out0[1] = q_in0[1] * q2_tw0[0][1] + q_in1[1] * q2_tw0[1][1];
+    //   I          R             I           I             R
+    q_out1[1] = q_in0[1] * q2_tw0[1][1] - q_in1[1] * q2_tw0[0][1];
+
+    //   R          R             R           I             I
+    q_out0[2] = q_in0[2] * q2_tw0[0][2] + q_in1[2] * q2_tw0[1][2];
+    //   I          R             I           I             R
+    q_out1[2] = q_in0[2] * q2_tw0[1][2] - q_in1[2] * q2_tw0[0][2];
+
+    //   R          R             R           I             I
+    q_out0[3] = q_in0[3] * q2_tw0[0][3] + q_in1[3] * q2_tw0[1][3];
+    //   I          R             I           I             R
+    q_out1[3] = q_in0[3] * q2_tw0[1][3] - q_in1[3] * q2_tw0[0][3];
+
+    //----------------------------------------------------------//
+    // second 2 lines
+    //   R          R             R           I             I
+    q_out2[1] = q_in2[1] * q2_tw1[0][1] + q_in3[1] * q2_tw1[1][1];
+    //   I          R             I           I             R
+    q_out3[1] = q_in2[1] * q2_tw1[1][1] - q_in3[1] * q2_tw1[0][1];
+
+    //   R          R             R           I             I
+    q_out2[2] = q_in2[2] * q2_tw1[0][2] + q_in3[2] * q2_tw1[1][2];
+    //   I          R             I           I             R
+    q_out3[2] = q_in2[2] * q2_tw1[1][2] - q_in3[2] * q2_tw1[0][2];
+
+    //   R          R             R           I             I
+    q_out2[3] = q_in2[3] * q2_tw1[0][3] + q_in3[3] * q2_tw1[1][3];
+    //   I          R             I           I             R
+    q_out3[3] = q_in2[3] * q2_tw1[1][3] - q_in3[3] * q2_tw1[0][3];
+
+    // STORE
+    *(fout_r++) =   q_out0[0];
+    *(fout_r++) =   q_out0[1];
+    *(fout_r++) =   q_out0[2];
+    *(fout_r++) =   q_out0[3];
+    *(fout_r++) =   q_out1[0];
+    *(fout_r++) = - q_out1[1];
+    *(fout_r++) = - q_out1[2];
+    *(fout_r++) = - q_out1[3];
+    *(fout_r++) =   q_out2[0];
+    *(fout_r++) =   q_out2[1];
+    *(fout_r++) =   q_out2[2];
+    *(fout_r++) =   q_out2[3];
+    *(fout_r++) =   q_out3[0];
+    *(fout_r++) = - q_out3[1];
+    *(fout_r++) = - q_out3[2];
+    *(fout_r++) = - q_out3[3];
+}
+
+NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage_other_butterfly (ne10_fft_cpx_float32_t *dst,
+                                            const ne10_fft_cpx_float32_t *src,
+                                            const ne10_fft_cpx_float32_t *twiddles,
+                                            const ne10_int32_t nfft)
+{
+    const ne10_float32_t *fin_r  = ((const ne10_float32_t*) src ) + 12 + 16 ;
+          ne10_float32_t *fout_r =        (ne10_float32_t*) dst + 8;
+          ne10_float32_t *fout_b =        (ne10_float32_t*) dst - 14;
+    const ne10_float32_t *tw     = ((const ne10_float32_t*) twiddles) + 8  + 16;
+    ne10_int32_t loop_count = ((nfft>>2)-8)>>3;
+
+    for ( ; loop_count>0; loop_count -- )
+    {
+        NE10_DECLARE_4(float32x4x2_t,q2_in);    // 8Q
+        NE10_DECLARE_3(float32x4x2_t,q2_tw);    // 6Q
+        NE10_DECLARE_4(float32x4x2_t,q2_out);   // 8Q
+
+        /*  INPUT
+         *  0R  1R  2R  3R      Q0
+         *  0I  1I  2I  3I      Q1
+         *  4R  5R  6R  7R      Q2
+         *  4I  5I  6I  7I      Q3
+         *  8R  9R  aR  bR      Q4
+         *  8I  9I  aI  bI      Q5
+         *  cR  dR  eR  fR      Q6
+         *  cI  dI  eI  fI      Q7
+         */
+
+        q2_out0.val[0] = vld1q_f32(fin_r);
+        fin_r += 4;
+        q2_out0.val[1] = vld1q_f32(fin_r);
+        fin_r += 4;
+        q2_out1.val[0] = vld1q_f32(fin_r);
+        fin_r += 4;
+        q2_out1.val[1] = vld1q_f32(fin_r);
+        fin_r += 4;
+        q2_out2.val[0] = vld1q_f32(fin_r);
+        fin_r += 4;
+        q2_out2.val[1] = vld1q_f32(fin_r);
+        fin_r += 4;
+        q2_out3.val[0] = vld1q_f32(fin_r);
+        fin_r += 4;
+        q2_out3.val[1] = vld1q_f32(fin_r);
+        fin_r += 4;
+
+        q2_tw0 = vld2q_f32(tw);
+        tw += 8;
+        q2_tw1 = vld2q_f32(tw);
+        tw += 8;
+        q2_tw2 = vld2q_f32(tw);
+        tw += 8;
+
+        // transpose
+        // q2_out -> q2_in
+        /*
+         *      val[0]
+         *  0R  4R  8R  cR      Q0
+         *  1R  5R  9R  dR      Q2
+         *  2R  6R  aR  eR      Q4
+         *  3R  7R  bR  fR      Q6
+         *
+         *      val[1]
+         *  0I  4I  8I  cI      Q1
+         *  1I  5I  9I  dI      Q3
+         *  2I  6I  aI  eI      Q5
+         *  3I  7I  bI  fI      Q7
+         */
+
+        NE10_RADIX4X4C_TRANSPOSE_NEON (q2_in,q2_out);
+
+        // tw
+        // q2_in -> q2_out
+        q2_out0 = q2_in0;
+        NE10_CPX_MUL_NEON_F32(q2_out1,q2_in1,q2_tw0);
+        NE10_CPX_MUL_NEON_F32(q2_out2,q2_in2,q2_tw1);
+        NE10_CPX_MUL_NEON_F32(q2_out3,q2_in3,q2_tw2);
+
+        // butterfly
+        // out -> in
+        q2_in0.val[0] = vaddq_f32 (q2_out0.val[0], q2_out2.val[0]);
+        q2_in0.val[1] = vaddq_f32 (q2_out0.val[1], q2_out2.val[1]);
+        q2_in1.val[0] = vsubq_f32 (q2_out0.val[0], q2_out2.val[0]);
+        q2_in1.val[1] = vsubq_f32 (q2_out0.val[1], q2_out2.val[1]);
+        q2_in2.val[0] = vaddq_f32 (q2_out1.val[0], q2_out3.val[0]);
+        q2_in2.val[1] = vaddq_f32 (q2_out1.val[1], q2_out3.val[1]);
+        q2_in3.val[0] = vsubq_f32 (q2_out1.val[0], q2_out3.val[0]);
+        q2_in3.val[1] = vsubq_f32 (q2_out1.val[1], q2_out3.val[1]);
+
+        // in -> out
+        q2_out2.val[0] = vsubq_f32 (q2_in0.val[0], q2_in2.val[0]);
+        q2_out2.val[1] = vsubq_f32 (q2_in0.val[1], q2_in2.val[1]);
+        q2_out0.val[0] = vaddq_f32 (q2_in0.val[0], q2_in2.val[0]);
+        q2_out0.val[1] = vaddq_f32 (q2_in0.val[1], q2_in2.val[1]);
+
+        q2_out1.val[0] = vaddq_f32 (q2_in1.val[0], q2_in3.val[1]);
+        q2_out1.val[1] = vsubq_f32 (q2_in1.val[1], q2_in3.val[0]);
+        q2_out3.val[0] = vsubq_f32 (q2_in1.val[0], q2_in3.val[1]);
+        q2_out3.val[1] = vaddq_f32 (q2_in1.val[1], q2_in3.val[0]);
+
+        // reverse -- CONJ
+        NE10_REVERSE_FLOAT32X4( q2_out3.val[0] );
+        NE10_REVERSE_FLOAT32X4( q2_out3.val[1] );
+        NE10_REVERSE_FLOAT32X4( q2_out2.val[0] );
+        NE10_REVERSE_FLOAT32X4( q2_out2.val[1] );
+
+        q2_out2.val[1] = vnegq_f32( q2_out2.val[1] );
+        q2_out3.val[1] = vnegq_f32( q2_out3.val[1] );
+
+        // store
+        vst2q_f32(fout_r            , q2_out0 );
+        vst2q_f32(fout_r + (nfft>>1), q2_out1 );
+        fout_r += 8;
+
+        vst2q_f32(fout_b + (nfft>>1), q2_out3 );
+        vst2q_f32(fout_b + nfft     , q2_out2 );
+        fout_b -= 8;
+    }
+}
+
+NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage_other_butterfly (ne10_fft_cpx_float32_t *dst,
+                                            const ne10_fft_cpx_float32_t *src,
+                                            const ne10_fft_cpx_float32_t *twiddles,
+                                            const ne10_int32_t nfft)
+{
+          ne10_float32_t *fout_r  =       ((ne10_float32_t*) dst ) + 12 + 16 ;
+    const ne10_float32_t *fin_r =    (const ne10_float32_t*) src + 8;
+    const ne10_float32_t *fin_b =    (const ne10_float32_t*) src - 14;
+    const ne10_float32_t *tw     = ((const ne10_float32_t*) twiddles) + 8  + 16;
+    ne10_int32_t loop_count = ((nfft>>2)-8)>>3;
+
+    for ( ; loop_count>0; loop_count -- )
+    {
+        NE10_DECLARE_4(float32x4x2_t,q2_in);    // 8Q
+        NE10_DECLARE_3(float32x4x2_t,q2_tw);    // 6Q
+        NE10_DECLARE_4(float32x4x2_t,q2_out);   // 8Q
+
+        /*  INPUT
+         *  0R  1R  2R  3R      Q0
+         *  0I  1I  2I  3I      Q1
+         *  4R  5R  6R  7R      Q2
+         *  4I  5I  6I  7I      Q3
+         *  8R  9R  aR  bR      Q4
+         *  8I  9I  aI  bI      Q5
+         *  cR  dR  eR  fR      Q6
+         *  cI  dI  eI  fI      Q7
+         */
+
+        q2_in0 = vld2q_f32(fin_r            );
+        q2_in1 = vld2q_f32(fin_r + (nfft>>1));
+        fin_r += 8;
+
+        q2_in3 = vld2q_f32(fin_b + (nfft>>1));
+        q2_in2 = vld2q_f32(fin_b + nfft     );
+        fin_b -= 8;
+
+        q2_tw0 = vld2q_f32(tw);
+        tw += 8;
+        q2_tw1 = vld2q_f32(tw);
+        tw += 8;
+        q2_tw2 = vld2q_f32(tw);
+        tw += 8;
+
+        // reverse -- CONJ
+        NE10_REVERSE_FLOAT32X4( q2_in3.val[0] );
+        NE10_REVERSE_FLOAT32X4( q2_in3.val[1] );
+        NE10_REVERSE_FLOAT32X4( q2_in2.val[0] );
+        NE10_REVERSE_FLOAT32X4( q2_in2.val[1] );
+
+        q2_in2.val[1] = vnegq_f32( q2_in2.val[1] );
+        q2_in3.val[1] = vnegq_f32( q2_in3.val[1] );
+
+        // in -> out
+        q2_out0.val[0] = vaddq_f32 (q2_in0.val[0], q2_in2.val[0]);
+        q2_out2.val[0] = vsubq_f32 (q2_in0.val[0], q2_in2.val[0]);
+
+        q2_out0.val[1] = vaddq_f32 (q2_in0.val[1], q2_in2.val[1]);
+        q2_out2.val[1] = vsubq_f32 (q2_in0.val[1], q2_in2.val[1]);
+
+        q2_out1.val[0] = vaddq_f32 (q2_in1.val[0], q2_in3.val[0]);
+        q2_out3.val[1] = vsubq_f32 (q2_in1.val[0], q2_in3.val[0]);
+
+        q2_out1.val[1] = vaddq_f32 (q2_in3.val[1], q2_in1.val[1]);
+        q2_out3.val[0] = vsubq_f32 (q2_in3.val[1], q2_in1.val[1]);
+
+        // out -> in
+        q2_in0.val[0] = vaddq_f32 (q2_out0.val[0], q2_out1.val[0]);
+        q2_in2.val[0] = vsubq_f32 (q2_out0.val[0], q2_out1.val[0]);
+
+        q2_in0.val[1] = vaddq_f32 (q2_out0.val[1], q2_out1.val[1]);
+        q2_in2.val[1] = vsubq_f32 (q2_out0.val[1], q2_out1.val[1]);
+
+        q2_in1.val[0] = vaddq_f32 (q2_out2.val[0], q2_out3.val[0]);
+        q2_in3.val[0] = vsubq_f32 (q2_out2.val[0], q2_out3.val[0]);
+
+        q2_in1.val[1] = vaddq_f32 (q2_out2.val[1], q2_out3.val[1]);
+        q2_in3.val[1] = vsubq_f32 (q2_out2.val[1], q2_out3.val[1]);
+
+        // tw
+        // q2_in -> q2_out
+        q2_out0 = q2_in0;
+        NE10_CPX_MUL_INV_NEON_F32(q2_out1,q2_in1,q2_tw0);
+        NE10_CPX_MUL_INV_NEON_F32(q2_out2,q2_in2,q2_tw1);
+        NE10_CPX_MUL_INV_NEON_F32(q2_out3,q2_in3,q2_tw2);
+
+        // transpose
+        // q2_out -> q2_in
+        NE10_RADIX4X4C_TRANSPOSE_NEON (q2_in,q2_out);
+
+        // store
+        vst1q_f32(fout_r, q2_in0.val[0]);
+        fout_r += 4;
+        vst1q_f32(fout_r, q2_in0.val[1]);
+        fout_r += 4;
+        vst1q_f32(fout_r, q2_in1.val[0]);
+        fout_r += 4;
+        vst1q_f32(fout_r, q2_in1.val[1]);
+        fout_r += 4;
+        vst1q_f32(fout_r, q2_in2.val[0]);
+        fout_r += 4;
+        vst1q_f32(fout_r, q2_in2.val[1]);
+        fout_r += 4;
+        vst1q_f32(fout_r, q2_in3.val[0]);
+        fout_r += 4;
+        vst1q_f32(fout_r, q2_in3.val[1]);
+        fout_r += 4;
+    }
+}
+
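+// The last R2C stage (and its mirror, the first C2R stage below) is split
+// into three parts: a scalar butterfly for the first few outputs, a second
+// scalar butterfly for the next block of eight, and a NEON loop for the
+// rest. nfft == 16 is fully handled by the first part, nfft == 32 by the
+// first two.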
+NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage( ne10_fft_cpx_float32_t *dst,
+                                            const ne10_fft_cpx_float32_t *src,
+                                            const ne10_fft_cpx_float32_t *twiddles,
+                                            const ne10_int32_t nfft)
+{
+    ne10_radix4_r2c_with_twiddles_last_stage_first_butterfly(dst,src,twiddles,nfft);
+
+    if (nfft==16)
+    {
+        return;
+    }
+
+    ne10_radix4_r2c_with_twiddles_last_stage_second_butterfly(dst,src,twiddles,nfft);
+
+    if (nfft==32)
+    {
+        return;
+    }
+
+    ne10_radix4_r2c_with_twiddles_last_stage_other_butterfly(dst,src,twiddles,nfft);
+}
+
+NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage( ne10_fft_cpx_float32_t *dst,
+                                            const ne10_fft_cpx_float32_t *src,
+                                            const ne10_fft_cpx_float32_t *twiddles,
+                                            const ne10_int32_t nfft)
+{
+    ne10_radix4_c2r_with_twiddles_first_stage_first_butterfly(dst,src,twiddles,nfft);
+
+    if (nfft==16)
+    {
+        return;
+    }
+
+    ne10_radix4_c2r_with_twiddles_first_stage_second_butterfly(dst,src,twiddles,nfft);
+
+    if (nfft==32)
+    {
+        return;
+    }
+
+    ne10_radix4_c2r_with_twiddles_first_stage_other_butterfly(dst,src,twiddles,nfft);
+}
+
+/**
+ * @addtogroup R2C_FFT_IFFT
+ * @{
+ */
+
+/**
+ * @brief Mixed radix-2/4 FFT (real to complex) of float (32-bit) data.
+ * @param[out]  *fout            pointer to the output buffer
+ * @param[in]   *fin             pointer to the input buffer
+ * @param[in]   cfg              pointer to the config struct
+ * @return none.
+ * The function implements a mixed radix-2/4 FFT (real to complex). Lengths of 2^N (N = 3, 4, 5, 6, ...) are supported.
+ * A temporary buffer inside the config struct (the same size as the input buffer) is used to store intermediate results.
+ * For the usage of this function, please check test/test_suite_fft_float32.c
+ */
+void ne10_fft_r2c_1d_float32_neon (ne10_fft_cpx_float32_t *fout,
+                                   ne10_float32_t *fin,
+                                   ne10_fft_r2c_cfg_float32_t cfg)
+{
+    typedef         ne10_float32_t REAL;
+    typedef ne10_fft_cpx_float32_t CPLX;
+
+    ne10_fft_cpx_float32_t * tmpbuf = cfg->buffer;
+    ne10_float32_t *fout_r = (ne10_float32_t*) fout;
+
+    switch (cfg->nfft)
+    {
+        case 8:
+            ne10_radix8_r2c_c ( (CPLX*) fout_r, (const CPLX*) fin, 1, 1, 8);
+            fout[0].r = fout[0].i;
+            break;
+        default:
+            ne10_mixed_radix_r2c_butterfly_float32_neon (fout, (CPLX*) fin, cfg->r_factors_neon, cfg->r_twiddles_neon, tmpbuf);
+            ne10_radix4_r2c_with_twiddles_last_stage(fout, tmpbuf, cfg->r_super_twiddles_neon, cfg->nfft);
+            fout[cfg->nfft / 2].r = fout[0].i;
+            break;
+    }
+    fout[0].i = fout[cfg->nfft / 2].i = 0.0f;
+}
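+
+/*
+ * A minimal usage sketch (illustrative only, not compiled here), assuming the
+ * standard Ne10 plan helpers ne10_fft_alloc_r2c_float32() and
+ * ne10_fft_destroy_r2c_float32() create and release the cfg:
+ *
+ *     ne10_fft_r2c_cfg_float32_t cfg = ne10_fft_alloc_r2c_float32 (nfft);
+ *     ne10_fft_r2c_1d_float32_neon (fout, fin, cfg); // fout holds nfft/2 + 1 bins
+ *     ne10_fft_destroy_r2c_float32 (cfg);
+ */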
+
+/**
+ * @brief Mixed radix-2/4 IFFT (complex to real) of float (32-bit) data.
+ * @param[out]  *fout            pointer to the output buffer
+ * @param[in]   *fin             pointer to the input buffer
+ * @param[in]   cfg              pointer to the config struct
+ * @return none.
+ * The function implements a mixed radix-2/4 IFFT (complex to real). Lengths of 2^N (N = 3, 4, 5, 6, ...) are supported.
+ * A temporary buffer inside the config struct (the same size as the input buffer) is used to store intermediate results.
+ * For the usage of this function, please check test/test_suite_fft_float32.c
+ */
+void ne10_fft_c2r_1d_float32_neon (ne10_float32_t *fout,
+                                   ne10_fft_cpx_float32_t *fin,
+                                   ne10_fft_r2c_cfg_float32_t cfg)
+{
+    typedef         ne10_float32_t REAL;
+    typedef ne10_fft_cpx_float32_t CPLX;
+
+    ne10_fft_cpx_float32_t * tmpbuf = cfg->buffer;
+    ne10_fft_cpx_float32_t * fout_c;
+    ne10_int32_t stage_count;
+    ne10_int32_t radix;
+
+    switch (cfg->nfft)
+    {
+        case 8:
+            fin[0].i = fin[0].r;
+            fin[0].r = 0.0f;
+            ne10_radix8_c2r_c  ( (CPLX*) fout, (const CPLX*) &fin[0].i, 1, 1, 8);
+            fin[0].r = fin[0].i;
+            break;
+        default:
+            stage_count = cfg->r_factors_neon[0];
+            radix       = cfg->r_factors_neon[  stage_count << 1 ];
+            if (radix==2)
+            {
+                stage_count --;
+            }
+            fin[0].i = fin[cfg->nfft>>1].r;
+            fout_c = (stage_count % 2==1) ? tmpbuf : (CPLX*)fout;
+            ne10_radix4_c2r_with_twiddles_first_stage( (CPLX*) fout_c, fin, cfg->r_super_twiddles_neon, cfg->nfft);
+            ne10_mixed_radix_c2r_butterfly_float32_neon ( (CPLX*) fout, (CPLX*) NULL, cfg->r_factors_neon, cfg->r_twiddles_neon_backward, tmpbuf);
+            break;
+    }
+    fin[0].i = 0.0f;
+}
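+
+/*
+ * Illustrative inverse-direction sketch, under the same assumptions as the
+ * forward example above:
+ *
+ *     ne10_fft_c2r_1d_float32_neon (fout, fin, cfg); // fin holds nfft/2 + 1 bins
+ *
+ * Note that fin is used as scratch (fin[0].i is overwritten above), so its
+ * contents are not preserved across the call.
+ */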
+
+/**
+ * @} end of R2C_FFT_IFFT group
+ */
+
+/**
+ * Unit tests for the RFFT NEON macros: each C2R kernel should be the inverse
+ * of its R2C counterpart, up to a known scale factor. These are compiled only
+ * when NDEBUG is not defined.
+ */
+
+#ifndef NDEBUG
+
+#include <stdlib.h>
+#include <math.h>
+
+int is_float32_close(const ne10_float32_t a, const ne10_float32_t b, const ne10_float32_t ratio)
+{
+    // fabsf is used here: plain abs() would resolve to the integer version
+    // from <stdlib.h> in C builds and truncate its operand.
+    ne10_float32_t err = fabsf (a - b * ratio);
+    return fabsf (err / a) < 0.01f;
+}
+
+int is_float32x4_close( const float32x4_t q_a, const float32x4_t q_b, const ne10_float32_t ratio )
+{
+    int i;
+    for ( i = 0; i < 4; i ++ )
+    {
+        if ( ! is_float32_close( q_a[i], q_b[i], ratio ) )
+        {
+            return 0;
+        }
+    }
+    return 1;
+}
+
+int is_float32x4x2_close( const float32x4x2_t q2_a, const float32x4x2_t q2_b, const ne10_float32_t ratio )
+{
+    if ( ! is_float32x4_close( q2_a.val[0], q2_b.val[0], ratio ) )
+    {
+        return 0;
+    }
+    if ( ! is_float32x4_close( q2_a.val[1], q2_b.val[1], ratio ) )
+    {
+        return 0;
+    }
+    return 1;
+}
+
+#define NE10_DEBUG_Qx8_IN                       \
+    const float32x4_t q_in0 = { 1, 2, 3, 4};    \
+    const float32x4_t q_in1 = { 5, 6, 7, 8};    \
+    const float32x4_t q_in2 = { 9,10,11,12};    \
+    const float32x4_t q_in3 = {13,14,15,16};    \
+    const float32x4_t q_in4 = {17,18,19,20};    \
+    const float32x4_t q_in5 = {21,22,23,24};    \
+    const float32x4_t q_in6 = {25,26,27,28};    \
+    const float32x4_t q_in7 = {29,30,31,32};
+
+#define NE10_DEBUG_Qx4_IN               \
+    float32x4_t q_in0 = { 1, 2, 3, 4},  \
+                q_in1 = { 5, 6, 7, 8},  \
+                q_in2 = { 9,10,11,12},  \
+                q_in3 = {13,14,15,16};
+
+#define NE10_DEBUG_Q2x4_IN                              \
+    float32x4x2_t q2_in0 = { 1, 2, 3, 4,17,18,19,20},   \
+                  q2_in1 = { 5, 6, 7, 8,21,22,23,24},   \
+                  q2_in2 = { 9,10,11,12,25,26,27,28},   \
+                  q2_in3 = {13,14,15,16,29,30,31,32};
+
+#define NE10_DEBUG_RANDOM_TWIDDLES_MATRIX do {          \
+    ne10_int32_t i;                                     \
+    for ( i = 0; i < 4; i ++ )                          \
+    {                                                   \
+        q2_tw0.val[0][i] = ( rand() % 5000 ) / (ne10_float32_t)(5000);  \
+        q2_tw1.val[0][i] = ( rand() % 5000 ) / (ne10_float32_t)(5000);  \
+        q2_tw2.val[0][i] = ( rand() % 5000 ) / (ne10_float32_t)(5000);  \
+        q2_tw3.val[0][i] = ( rand() % 5000 ) / (ne10_float32_t)(5000);  \
+    }                                                                   \
+    for ( i = 0; i < 4; i ++ )                                          \
+    {                                                                   \
+        q2_tw0.val[1][i] = sqrt( 1.0f - pow( q2_tw0.val[0][i], 2 ) );   \
+        q2_tw1.val[1][i] = sqrt( 1.0f - pow( q2_tw1.val[0][i], 2 ) );   \
+        q2_tw2.val[1][i] = sqrt( 1.0f - pow( q2_tw2.val[0][i], 2 ) );   \
+        q2_tw3.val[1][i] = sqrt( 1.0f - pow( q2_tw3.val[0][i], 2 ) );   \
+    }                                                                   \
+} while(0);
+
+#define NE10_DEBUG_Qx4_COMPARE                                  \
+    if( ! is_float32x4_close(q_in0,q_tmp0, ratio) ) return 0;   \
+    if( ! is_float32x4_close(q_in1,q_tmp1, ratio) ) return 0;   \
+    if( ! is_float32x4_close(q_in2,q_tmp2, ratio) ) return 0;   \
+    if( ! is_float32x4_close(q_in3,q_tmp3, ratio) ) return 0;
+
+#define NE10_DEBUG_Qx8_COMPARE                                  \
+    NE10_DEBUG_Qx4_COMPARE;                                     \
+    if( ! is_float32x4_close(q_in4,q_tmp4, ratio) ) return 0;   \
+    if( ! is_float32x4_close(q_in5,q_tmp5, ratio) ) return 0;   \
+    if( ! is_float32x4_close(q_in6,q_tmp6, ratio) ) return 0;   \
+    if( ! is_float32x4_close(q_in7,q_tmp7, ratio) ) return 0;
+
+#define NE10_DEBUG_Q2x4_COMPARE                                     \
+    if( ! is_float32x4x2_close(q2_in0,q2_tmp0, ratio) )   return 0; \
+    if( ! is_float32x4x2_close(q2_in1,q2_tmp1, ratio) )   return 0; \
+    if( ! is_float32x4x2_close(q2_in2,q2_tmp2, ratio) )   return 0; \
+    if( ! is_float32x4x2_close(q2_in3,q2_tmp3, ratio) )   return 0;
+
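+// The 'ratio' argument to the comparisons compensates for the kernels being
+// unnormalized: a forward kernel followed by its inverse scales the data by
+// the butterfly width, so q_in matches the round-tripped q_tmp only after
+// q_tmp is scaled by ratio (e.g. 0.125 for the full radix-8 round trip,
+// 0.25 for radix-4, 1.0 for the twiddle multiplication alone).
+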
+// Check Point:
+// NE10_RADIX8x4_C2R_NEON_KERNEL_S2 is the inverse of NE10_RADIX8x4_R2C_NEON_KERNEL_S1
+int is_ne10_radix8x4_r2c_neon_kernel_s1_conformed()
+{
+    NE10_DEBUG_Qx8_IN;
+    NE10_DECLARE_8(float32x4_t,q_tmp);
+    NE10_DECLARE_8(float32x4_t,q_out);
+
+    ne10_float32_t ratio = 0.5;
+
+    NE10_RADIX8x4_R2C_NEON_KERNEL_S1(q_out,q_in);
+    NE10_RADIX8x4_C2R_NEON_KERNEL_S2(q_tmp,q_out);
+
+    NE10_DEBUG_Qx8_COMPARE;
+
+    return 1;
+}
+
+// Check Point:
+// NE10_RADIX8x4_C2R_NEON_KERNEL_S1 is the inverse of NE10_RADIX8x4_R2C_NEON_KERNEL_S2
+int is_ne10_radix8x4_r2c_neon_kernel_s2_conformed()
+{
+    NE10_DEBUG_Qx8_IN;
+    NE10_DECLARE_8(float32x4_t,q_tmp);
+    NE10_DECLARE_8(float32x4_t,q_out);
+
+    ne10_float32_t ratio = 0.25;
+
+    NE10_RADIX8x4_R2C_NEON_KERNEL_S2(q_out,q_in);
+    NE10_RADIX8x4_C2R_NEON_KERNEL_S1(q_tmp,q_out);
+
+    NE10_DEBUG_Qx8_COMPARE;
+
+    return 1;
+}
+
+// Check Point:
+// NE10_RADIX8x4_C2R_NEON_KERNEL is the inverse of NE10_RADIX8x4_R2C_NEON_KERNEL
+int is_ne10_radix8x4_r2c_neon_kernel_conformed()
+{
+    NE10_DEBUG_Qx8_IN;
+    NE10_DECLARE_8(float32x4_t,q_tmp);
+    NE10_DECLARE_8(float32x4_t,q_out);
+
+    ne10_float32_t ratio = 0.125;
+
+    NE10_RADIX8x4_R2C_NEON_KERNEL(q_out,q_in);
+    NE10_RADIX8x4_C2R_NEON_KERNEL(q_tmp,q_out);
+
+    NE10_DEBUG_Qx8_COMPARE;
+
+    return 1;
+}
+
+// Check Point:
+// NE10_RADIX4x4_C2R_NEON_KERNEL is the inverse of NE10_RADIX4x4_R2C_NEON_KERNEL
+int is_ne10_radix4x4_r2c_neon_kernel_conformed()
+{
+    NE10_DEBUG_Qx4_IN;
+    NE10_DECLARE_4(float32x4_t,q_out);
+    NE10_DECLARE_4(float32x4_t,q_tmp);
+
+    ne10_float32_t ratio = 0.25;
+
+    NE10_RADIX4x4_R2C_NEON_KERNEL(q_out,q_in);
+    NE10_RADIX4x4_C2R_NEON_KERNEL(q_tmp,q_out);
+
+    NE10_DEBUG_Qx4_COMPARE;
+    return 1;
+}
+
+// Check Point:
+// NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S2 is the inverse of NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S1
+int is_ne10_radix4x4_r2c_tw_neon_kernel_s1_conformed()
+{
+    NE10_DEBUG_Q2x4_IN;
+    NE10_DECLARE_4(float32x4x2_t,q2_out);
+    NE10_DECLARE_4(float32x4x2_t,q2_tmp);
+
+    ne10_float32_t ratio = 0.5;
+
+    NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S1(q2_out,q2_in);
+    NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S2(q2_tmp,q2_out);
+
+    NE10_DEBUG_Q2x4_COMPARE;
+    return 1;
+}
+
+// Check Point:
+// NE10_RADIX4x4_C2R_TW_MUL_NEON is the inverse of NE10_RADIX4x4_R2C_TW_MUL_NEON
+int is_ne10_radix4x4_r2c_tw_mul_neon_conformed()
+{
+    NE10_DEBUG_Q2x4_IN;
+    NE10_DECLARE_4(float32x4x2_t,q2_out);
+    NE10_DECLARE_4(float32x4x2_t,q2_tmp);
+    NE10_DECLARE_4(float32x4x2_t,q2_tw);
+
+    NE10_DEBUG_RANDOM_TWIDDLES_MATRIX;
+
+    ne10_float32_t ratio = 1.0;
+
+    NE10_RADIX4x4_R2C_TW_MUL_NEON(q2_out,q2_in,q2_tw);
+    NE10_RADIX4x4_C2R_TW_MUL_NEON(q2_tmp,q2_out,q2_tw);
+
+    NE10_DEBUG_Q2x4_COMPARE;
+
+    return 1;
+}
+
+// Check Point:
+// NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S1 is the inverse of NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S2
+int is_ne10_radix4x4_r2c_tw_neon_kernel_s2_conformed()
+{
+    NE10_DEBUG_Q2x4_IN;
+    NE10_DECLARE_4(float32x4x2_t,q2_out);
+    NE10_DECLARE_4(float32x4x2_t,q2_tmp);
+
+    ne10_float32_t ratio = 0.5;
+
+    NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S2(q2_out,q2_in);
+    NE10_RADIX4x4_C2R_TW_NEON_KERNEL_S1(q2_tmp,q2_out);
+
+    NE10_DEBUG_Q2x4_COMPARE;
+
+    return 1;
+}
+
+// Check Point:
+// NE10_RADIX4x4_C2R_TW_NEON_KERNEL is the inverse of NE10_RADIX4x4_R2C_TW_NEON_KERNEL
+int is_ne10_radix4x4_r2c_tw_neon_kernel_conformed()
+{
+    NE10_DEBUG_Q2x4_IN;
+    NE10_DECLARE_4(float32x4x2_t,q2_out);
+    NE10_DECLARE_4(float32x4x2_t,q2_tmp);
+    NE10_DECLARE_4(float32x4x2_t,q2_tw);
+    NE10_DECLARE_4(float32x4x2_t,q2_s);
+
+    NE10_DEBUG_RANDOM_TWIDDLES_MATRIX;
+
+    ne10_float32_t ratio = 0.25;
+
+    q2_s0 = q2_in0;
+    q2_s1 = q2_in1;
+    q2_s2 = q2_in2;
+    q2_s3 = q2_in3;
+
+    NE10_RADIX4x4_R2C_TW_NEON_KERNEL(q2_out,q2_s,q2_tw);
+    NE10_RADIX4x4_C2R_TW_NEON_KERNEL(q2_tmp,q2_out,q2_tw);
+
+    NE10_DEBUG_Q2x4_COMPARE;
+
+    return 1;
+}
+
+// Check Point:
+// NE10_RADIX4x4_C2R_TW_NEON_KERNEL_LAST is the inverse of NE10_RADIX4x4_R2C_TW_NEON_KERNEL_LAST
+int is_ne10_radix4x4_r2c_tw_neon_kernel_last_conformed()
+{
+    NE10_DEBUG_Qx4_IN;
+    NE10_DECLARE_4(float32x4_t,q_out);
+    NE10_DECLARE_4(float32x4_t,q_tmp);
+    NE10_DECLARE_4(float32x4_t,q_s);
+
+    ne10_float32_t ratio = 0.25;
+
+    q_s0 = q_in0;
+    q_s1 = q_in1;
+    q_s2 = q_in2;
+    q_s3 = q_in3;
+
+    NE10_RADIX4x4_R2C_TW_NEON_KERNEL_LAST(q_out,q_s);
+    NE10_RADIX4x4_C2R_TW_NEON_KERNEL_LAST(q_tmp,q_out);
+
+    NE10_DEBUG_Qx4_COMPARE;
+
+    return 1;
+}
+#endif // NDEBUG
index 8861583..5e57793 100644 (file)
@@ -88,22 +88,6 @@ void test_fft_c2c_1d_float32_conformance()
 
     fprintf (stdout, "----------%30s start\n", __FUNCTION__);
 
-    /* init input memory */
-    guarded_in_c = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2 + ARRAY_GUARD_LEN * 2) * sizeof (ne10_float32_t));
-    guarded_in_neon = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2 + ARRAY_GUARD_LEN * 2) * sizeof (ne10_float32_t));
-    in_c = guarded_in_c + ARRAY_GUARD_LEN;
-    in_neon = guarded_in_neon + ARRAY_GUARD_LEN;
-
-    /* init dst memory */
-    guarded_out_c = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2 + ARRAY_GUARD_LEN * 2) * sizeof (ne10_float32_t));
-    guarded_out_neon = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2 + ARRAY_GUARD_LEN * 2) * sizeof (ne10_float32_t));
-    out_c = guarded_out_c + ARRAY_GUARD_LEN;
-    out_neon = guarded_out_neon + ARRAY_GUARD_LEN;
-
-    for (i = 0; i < TEST_LENGTH_SAMPLES * 2; i++)
-    {
-        testInput_f32[i] = (ne10_float32_t) (drand48() * 32768.0f - 16384.0f);
-    }
     for (fftSize = MIN_LENGTH_SAMPLES_CPX; fftSize <= TEST_LENGTH_SAMPLES; fftSize *= 2)
     {
         fprintf (stdout, "FFT size %d\n", fftSize);
@@ -150,11 +134,6 @@ void test_fft_c2c_1d_float32_conformance()
 
         NE10_FREE (cfg);
     }
-
-    NE10_FREE (guarded_in_c);
-    NE10_FREE (guarded_in_neon);
-    NE10_FREE (guarded_out_c);
-    NE10_FREE (guarded_out_neon);
 }
 
 void test_fft_c2c_1d_float32_performance()
@@ -168,22 +147,6 @@ void test_fft_c2c_1d_float32_performance()
     fprintf (stdout, "----------%30s start\n", __FUNCTION__);
     fprintf (stdout, "%25s%20s%20s%20s%20s\n", "FFT Length", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
 
-    /* init input memory */
-    guarded_in_c = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2 + ARRAY_GUARD_LEN * 2) * sizeof (ne10_float32_t));
-    guarded_in_neon = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2 + ARRAY_GUARD_LEN * 2) * sizeof (ne10_float32_t));
-    in_c = guarded_in_c + ARRAY_GUARD_LEN;
-    in_neon = guarded_in_neon + ARRAY_GUARD_LEN;
-
-    /* init dst memory */
-    guarded_out_c = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2 + ARRAY_GUARD_LEN * 2) * sizeof (ne10_float32_t));
-    guarded_out_neon = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2 + ARRAY_GUARD_LEN * 2) * sizeof (ne10_float32_t));
-    out_c = guarded_out_c + ARRAY_GUARD_LEN;
-    out_neon = guarded_out_neon + ARRAY_GUARD_LEN;
-
-    for (i = 0; i < TEST_LENGTH_SAMPLES * 2; i++)
-    {
-        testInput_f32[i] = (ne10_float32_t) drand48() * 2 ;
-    }
     for (fftSize = MIN_LENGTH_SAMPLES_CPX; fftSize <= TEST_LENGTH_SAMPLES; fftSize *= 2)
     {
         fprintf (stdout, "FFT size %d\n", fftSize);
@@ -247,11 +210,6 @@ void test_fft_c2c_1d_float32_performance()
 
         NE10_FREE (cfg);
     }
-
-    NE10_FREE (guarded_in_c);
-    NE10_FREE (guarded_in_neon);
-    NE10_FREE (guarded_out_c);
-    NE10_FREE (guarded_out_neon);
 }
 
 void test_fft_r2c_1d_float32_conformance()
@@ -263,22 +221,6 @@ void test_fft_r2c_1d_float32_conformance()
 
     fprintf (stdout, "----------%30s start\n", __FUNCTION__);
 
-    /* init input memory */
-    guarded_in_c = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2 + ARRAY_GUARD_LEN * 2) * sizeof (ne10_float32_t));
-    guarded_in_neon = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2 + ARRAY_GUARD_LEN * 2) * sizeof (ne10_float32_t));
-    in_c = guarded_in_c + ARRAY_GUARD_LEN;
-    in_neon = guarded_in_neon + ARRAY_GUARD_LEN;
-
-    /* init dst memory */
-    guarded_out_c = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2 + ARRAY_GUARD_LEN * 2) * sizeof (ne10_float32_t));
-    guarded_out_neon = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2 + ARRAY_GUARD_LEN * 2) * sizeof (ne10_float32_t));
-    out_c = guarded_out_c + ARRAY_GUARD_LEN;
-    out_neon = guarded_out_neon + ARRAY_GUARD_LEN;
-
-    for (i = 0; i < TEST_LENGTH_SAMPLES * 2; i++)
-    {
-        testInput_f32[i] = (ne10_float32_t) (drand48() * 32768.0f - 16384.0f);
-    }
     for (fftSize = MIN_LENGTH_SAMPLES_REAL; fftSize <= TEST_LENGTH_SAMPLES; fftSize *= 2)
     {
         fprintf (stdout, "FFT size %d\n", fftSize);
@@ -335,11 +277,6 @@ void test_fft_r2c_1d_float32_conformance()
 
         NE10_FREE (cfg);
     }
-
-    NE10_FREE (guarded_in_c);
-    NE10_FREE (guarded_in_neon);
-    NE10_FREE (guarded_out_c);
-    NE10_FREE (guarded_out_neon);
 }
 
 void test_fft_r2c_1d_float32_performance()
@@ -353,22 +290,6 @@ void test_fft_r2c_1d_float32_performance()
     fprintf (stdout, "----------%30s start\n", __FUNCTION__);
     fprintf (stdout, "%25s%20s%20s%20s%20s\n", "FFT Length", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
 
-    /* init input memory */
-    guarded_in_c = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2 + ARRAY_GUARD_LEN * 2) * sizeof (ne10_float32_t));
-    guarded_in_neon = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2 + ARRAY_GUARD_LEN * 2) * sizeof (ne10_float32_t));
-    in_c = guarded_in_c + ARRAY_GUARD_LEN;
-    in_neon = guarded_in_neon + ARRAY_GUARD_LEN;
-
-    /* init dst memory */
-    guarded_out_c = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2 + ARRAY_GUARD_LEN * 2) * sizeof (ne10_float32_t));
-    guarded_out_neon = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2 + ARRAY_GUARD_LEN * 2) * sizeof (ne10_float32_t));
-    out_c = guarded_out_c + ARRAY_GUARD_LEN;
-    out_neon = guarded_out_neon + ARRAY_GUARD_LEN;
-
-    for (i = 0; i < TEST_LENGTH_SAMPLES * 2; i++)
-    {
-        testInput_f32[i] = (ne10_float32_t) (drand48() * 32768.0f - 16384.0f);
-    }
     for (fftSize = MIN_LENGTH_SAMPLES_REAL; fftSize <= TEST_LENGTH_SAMPLES; fftSize *= 2)
     {
         fprintf (stdout, "FFT size %d\n", fftSize);
@@ -442,16 +363,37 @@ void test_fft_r2c_1d_float32_performance()
 
         NE10_FREE (cfg);
     }
-
-    NE10_FREE (guarded_in_c);
-    NE10_FREE (guarded_in_neon);
-    NE10_FREE (guarded_out_c);
-    NE10_FREE (guarded_out_neon);
 }
 
 static void my_test_setup (void)
 {
     ne10_log_buffer_ptr = ne10_log_buffer;
+    ne10_int32_t i;
+
+    /* init input memory */
+    guarded_in_c = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2 + ARRAY_GUARD_LEN * 2) * sizeof (ne10_float32_t));
+    guarded_in_neon = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2 + ARRAY_GUARD_LEN * 2) * sizeof (ne10_float32_t));
+    in_c = guarded_in_c + ARRAY_GUARD_LEN;
+    in_neon = guarded_in_neon + ARRAY_GUARD_LEN;
+
+    /* init dst memory */
+    guarded_out_c = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2 + ARRAY_GUARD_LEN * 2) * sizeof (ne10_float32_t));
+    guarded_out_neon = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2 + ARRAY_GUARD_LEN * 2) * sizeof (ne10_float32_t));
+    out_c = guarded_out_c + ARRAY_GUARD_LEN;
+    out_neon = guarded_out_neon + ARRAY_GUARD_LEN;
+
+    for (i = 0; i < TEST_LENGTH_SAMPLES * 2; i++)
+    {
+        testInput_f32[i] = (ne10_float32_t) (drand48() * 32768.0f - 16384.0f);
+    }
+}
+
+static void my_test_teardown (void)
+{
+    NE10_FREE (guarded_in_c);
+    NE10_FREE (guarded_in_neon);
+    NE10_FREE (guarded_out_c);
+    NE10_FREE (guarded_out_neon);
 }
 
 void test_fft_c2c_1d_float32()
@@ -468,6 +410,17 @@ void test_fft_c2c_1d_float32()
 void test_fft_r2c_1d_float32()
 {
 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
+#if defined (__aarch64__)
+    // test for macro
+    assert_true ( is_ne10_radix8x4_r2c_neon_kernel_s1_conformed() );
+    assert_true ( is_ne10_radix8x4_r2c_neon_kernel_s2_conformed() );
+    assert_true ( is_ne10_radix4x4_r2c_neon_kernel_conformed() );
+    assert_true ( is_ne10_radix4x4_r2c_tw_neon_kernel_s1_conformed() );
+    assert_true ( is_ne10_radix4x4_r2c_tw_neon_kernel_s2_conformed() );
+    assert_true ( is_ne10_radix4x4_r2c_tw_neon_kernel_conformed() );
+    assert_true ( is_ne10_radix4x4_r2c_tw_mul_neon_conformed() );
+    assert_true ( is_ne10_radix4x4_r2c_tw_neon_kernel_last_conformed() );
+#endif
     test_fft_r2c_1d_float32_conformance();
 #endif
 
@@ -484,6 +437,8 @@ void test_fixture_fft_c2c_1d_float32 (void)
 
     run_test (test_fft_c2c_1d_float32);       // run tests
 
+    fixture_teardown(my_test_teardown);
+
     test_fixture_end();                 // ends a fixture
 }
 
@@ -495,5 +450,7 @@ void test_fixture_fft_r2c_1d_float32 (void)
 
     run_test (test_fft_r2c_1d_float32);       // run tests
 
+    fixture_teardown(my_test_teardown);
+
     test_fixture_end();                 // ends a fixture
 }