optimize float complex FFT v1.0.1
author Yang Zhang <yang.zhang@arm.com>
Fri, 30 May 2014 11:36:23 +0000 (19:36 +0800)
committer Yang Zhang <yang.zhang@arm.com>
Wed, 4 Jun 2014 06:09:13 +0000 (14:09 +0800)
1. To optimize the FFT, the algorithm is changed: bit reversal is removed and radix-8 is added.
2. In testing, the optimized FFT shows the best performance, so the old implementations are removed.

The performance result is as follows:

toolchain: gcc 4.8 at -O2
The omx FFT's execution time is the baseline. The lower the ratio, the better the performance.

panda board A9:
|     |16    |32    |64    |128   |256   |512   |1024  |2048  |4096  |
|Ne10 |84.27%|89.57%|85.63%|85.79%|87.89%|87.91%|83.51%|97.08%|92.68%|
|omx  |100%  |100%  |100%  |100%  |100%  |100%  |100%  |100%  |100%  |

nexus10 A15:
|     |16    |32    |64    |128   |256   |512   |1024  |2048  |4096  |
|Ne10 |84.88%|98.43%|89.46%|101.0%|99.24%|103.2%|93.80%|105.1%|97.44%|
|omx  |100%  |100%  |100%  |100%  |100%  |100%  |100%  |100%  |100%  |

Change-Id: I363ee1602f08532e566d3a5a4f3d7a99972a1283

20 files changed:
inc/NE10_dsp.h
modules/CMakeLists.txt
modules/dsp/NE10_cfft.c [deleted file]
modules/dsp/NE10_cfft.neon.s [deleted file]
modules/dsp/NE10_cfft_init.c [deleted file]
modules/dsp/NE10_fft.h
modules/dsp/NE10_fft_float32.c
modules/dsp/NE10_fft_float32.neon.c
modules/dsp/NE10_fft_float32.neon.s
modules/dsp/NE10_init_dsp.c
modules/dsp/NE10_rfft.c [deleted file]
modules/dsp/NE10_rfft.neon.c [deleted file]
modules/dsp/NE10_rfft_init.c [deleted file]
modules/dsp/test/test_main.c
modules/dsp/test/test_suite_cfft.c [deleted file]
modules/dsp/test/test_suite_fft_float32.c
modules/dsp/test/test_suite_fft_int16.c
modules/dsp/test/test_suite_fft_int32.c
modules/dsp/test/test_suite_rfft.c [deleted file]
test/CMakeLists.txt

index dbb2fa0..9360438 100644 (file)
@@ -46,22 +46,6 @@ extern "C" {
     /* fft functions*/
 
     /* function pointers*/
-    extern void (*ne10_radix4_butterfly_float) (ne10_float32_t *pDst,
-            ne10_float32_t *pSrc,
-            ne10_uint16_t N,
-            ne10_float32_t *pCoef);
-
-    extern void (*ne10_radix4_butterfly_inverse_float) (ne10_float32_t *pDst,
-            ne10_float32_t *pSrc,
-            ne10_uint16_t N,
-            ne10_float32_t *pCoef,
-            ne10_float32_t onebyN);
-
-    extern void (*ne10_rfft_float) (const ne10_rfft_instance_f32_t * S,
-                                    ne10_float32_t * pSrc,
-                                    ne10_float32_t * pDst,
-                                    ne10_float32_t * pTemp);
-
     extern void (*ne10_fft_c2c_1d_float32) (ne10_fft_cpx_float32_t *fout,
                                             ne10_fft_cpx_float32_t *fin,
                                             ne10_fft_cpx_float32_t *twiddles,
@@ -150,22 +134,6 @@ extern "C" {
     extern ne10_fft_r2c_cfg_int16_t ne10_fft_alloc_r2c_int16 (ne10_int32_t nfft);
 
     /* C version*/
-    extern void ne10_radix4_butterfly_float_c (ne10_float32_t *pDst,
-            ne10_float32_t *pSrc,
-            ne10_uint16_t N,
-            ne10_float32_t *pCoef);
-
-    extern void ne10_radix4_butterfly_inverse_float_c (ne10_float32_t *pDst,
-            ne10_float32_t *pSrc,
-            ne10_uint16_t N,
-            ne10_float32_t *pCoef,
-            ne10_float32_t onebyN);
-
-    extern void ne10_rfft_float_c (const ne10_rfft_instance_f32_t * S,
-                                   ne10_float32_t * pSrc,
-                                   ne10_float32_t * pDst,
-                                   ne10_float32_t * pTemp);
-
     extern void ne10_fft_c2c_1d_float32_c (ne10_fft_cpx_float32_t *fout,
                                            ne10_fft_cpx_float32_t *fin,
                                            ne10_fft_cpx_float32_t *twiddles,
@@ -237,25 +205,6 @@ extern "C" {
 
 
     /* NEON version*/
-    /**
-     * @addtogroup CFFT_CIFFT
-     * @{
-     */
-    extern void ne10_radix4_butterfly_float_neon (ne10_float32_t *pDst,
-            ne10_float32_t *pSrc,
-            ne10_uint16_t N,
-            ne10_float32_t *pCoef)
-    asm ("ne10_radix4_butterfly_float_neon");
-
-    extern void ne10_radix4_butterfly_inverse_float_neon (ne10_float32_t *pDst,
-            ne10_float32_t *pSrc,
-            ne10_uint16_t N,
-            ne10_float32_t *pCoef,
-            ne10_float32_t onebyN)
-    asm ("ne10_radix4_butterfly_inverse_float_neon");
-    /** @} */ //end of CFFT_CIFFT group
-
-
     extern void ne10_rfft_float_neon (const ne10_rfft_instance_f32_t * S,
                                       ne10_float32_t * pSrc,
                                       ne10_float32_t * pDst,
index 972ac56..aecc7de 100644 (file)
@@ -173,13 +173,9 @@ if(NE10_ENABLE_DSP)
     # Add dsp C files.
     set(NE10_DSP_C_SRCS
         ${PROJECT_SOURCE_DIR}/common/NE10_mask_table.c
-        ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_cfft.c
-        ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_cfft_init.c
-        ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_rfft.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_float32.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_int32.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_int16.c
-        ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_rfft_init.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fir.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fir_init.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_iir.c
@@ -188,7 +184,6 @@ if(NE10_ENABLE_DSP)
 
     # Add dsp intrinsic NEON files.
     set(NE10_DSP_INTRINSIC_SRCS
-        ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_rfft.neon.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_float32.neon.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_int32.neon.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_int16.neon.c
@@ -201,7 +196,6 @@ if(NE10_ENABLE_DSP)
 
     # Add dsp NEON files.
     set(NE10_DSP_NEON_SRCS
-        ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_cfft.neon.s
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_float32.neon.s
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_int32.neon.s
         ${PROJECT_SOURCE_DIR}/modules/dsp/NE10_fft_int16.neon.s
diff --git a/modules/dsp/NE10_cfft.c b/modules/dsp/NE10_cfft.c
deleted file mode 100644 (file)
index 50aecf4..0000000
+++ /dev/null
@@ -1,718 +0,0 @@
-/*
- *  Copyright 2012-14 ARM Limited
- *  All rights reserved.
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions are met:
- *    * Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *    * Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *    * Neither the name of ARM Limited nor the
- *      names of its contributors may be used to endorse or promote products
- *      derived from this software without specific prior written permission.
- *
- *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
- *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
- *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * NE10 Library : dsp/NE10_cfft.c
- */
-
-#include "NE10_types.h"
-/**
- * @ingroup groupDSPs
- */
-
-/**
- * @defgroup CFFT_CIFFT Complex FFT
- *
- * \par
- * Complex Fast Fourier Transform(CFFT) and Complex Inverse Fast Fourier Transform(CIFFT) is an efficient algorithm to compute Discrete Fourier Transform(DFT) and Inverse Discrete Fourier Transform(IDFT).
- * Computational complexity of CFFT reduces drastically when compared to DFT.
- * \par
- * This set of functions implements CFFT/CIFFT
- * for floating-point data types.  The functions operate on out-of-place buffer which use different buffer for input and output.
- * Complex input is stored in input buffer in an interleaved fashion.
- *
- * \par
- * The functions operate on blocks of input and output data and each call to the function processes
- * <code>2*fftLen</code> samples through the transform.  <code>pSrc</code>  points to input arrays containing <code>2*fftLen</code> values.
- * \par
- * The <code>pDst</code> points to the array of output buffer of size <code>2*fftLen</code> and inputs and outputs are stored in an interleaved fashion as shown below.
- * <pre> {real[0], imag[0], real[1], imag[1],..} </pre>
- *
- * \par Lengths supported by the transform:
- * \par
- * Internally, the functions utilize a radix-4 decimation in frequency(DIF) algorithm
- * and the size of the FFT supported are of the lengths [16, 64, 256, 1024].
- *
- *
- * \par Algorithm:
- *
- * <b>Complex Fast Fourier Transform:</b>
- * \par
- * Input real and imaginary data:
- * <pre>
- * x(n) = xa + j * ya
- * x(n+N/4 ) = xb + j * yb
- * x(n+N/2 ) = xc + j * yc
- * x(n+3N 4) = xd + j * yd
- * </pre>
- * where N is length of FFT
- * \par
- * Output real and imaginary data:
- * <pre>
- * X(4r) = xa'+ j * ya'
- * X(4r+1) = xb'+ j * yb'
- * X(4r+2) = xc'+ j * yc'
- * X(4r+3) = xd'+ j * yd'
- * </pre>
- * \par
- * Twiddle factors for radix-4 FFT:
- * <pre>
- * Wn = co1 + j * (- si1)
- * W2n = co2 + j * (- si2)
- * W3n = co3 + j * (- si3)
- * </pre>
- *
- * \par
- * \image html CFFT.gif "Radix-4 Decimation-in Frequency Complex Fast Fourier Transform"
- *
- * \par
- * Output from Radix-4 CFFT Results in Digit reversal order. Interchange middle two branches of every butterfly results in Bit reversed output.
- * \par
- * <b> Butterfly CFFT equations:</b>
- * <pre>
- * xa' = xa + xb + xc + xd
- * ya' = ya + yb + yc + yd
- * xc' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
- * yc' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
- * xb' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
- * yb' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
- * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
- * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
- * </pre>
- *
- *
- * <b>Complex Inverse Fast Fourier Transform:</b>
- * \par
- * CIFFT uses same twiddle factor table as CFFT with modifications in the design equation as shown below.
- *
- * \par
- * <b> Modified Butterfly CIFFT equations:</b>
- * <pre>
- * xa' = xa + xb + xc + xd
- * ya' = ya + yb + yc + yd
- * xc' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
- * yc' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
- * xb' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
- * yb' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
- * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
- * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
- * </pre>
- *
- * \par Instance Structure
- * A separate instance structure must be defined for each Instance but the twiddle factors and bit reversal tables can be reused.
- * There are separate instance structure declarations for each of the 3 supported data types.
- *
- * \par Initialization Functions
- * There is also an associated initialization function for each data type.
- * The initialization function performs the following operations:
- * - Sets the values of the internal structure fields.
- * - Initializes twiddle factor table and bit reversal table pointers
- * \par
- * Use of the initialization function is optional.
- * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
- * To place an instance structure into a const data section, the instance structure must be manually initialized.
- * Manually initialize the instance structure as follows:
- * <pre>
- *ne10_cfft_radix4_instance_f32_t = {fft_len, ifft_flag, bit_reverse_flag, p_twiddle, p_bit_rev_table, twid_coef_modifier, bit_rev_factor, one_by_fft_len};
- * </pre>
- * \par
- * where <code>fftLen</code> length of CFFT/CIFFT; <code>ifft_flag</code> Flag for selection of CFFT or CIFFT(Set ifft_flag to calculate CIFFT otherwise calculates CFFT);
- * <code>bit_reverse_flag</code> Flag for selection of output order(Set bitReverseFlag to output in normal order otherwise output in bit reversed order);
- * <code>p_twiddle</code>points to array of twiddle coefficients; <code>pBitRevTable</code> points to the array of bit reversal table.
- * <code>p_bit_rev_table</code> modifier for bit reversal table which supports all FFT lengths with same table.
- * <code>twid_coef_modifier</code> modifier for twiddle factor table which supports all FFT lengths with same table;
- * <code>one_by_fft_len</code> value of 1/fftLen to calculate CIFFT;
- *
- */
-
-
-/**
- * @addtogroup CFFT_CIFFT
- * @{
- */
-
-/**
- * @brief Core radix-4 FFT of floating-point data.
- * @param[out]  *pDst            point to the output buffer (out-of-place)
- * @param[in]  *pSrc             point to the input buffer (out-of-place: the pSrc is used for intermedia buffer, so the input buffer is destroyed)
- * @param[in]  N                 length of FFT
- * @param[in]  *pCoef            point to the twiddle factors
- * @return none.
- * The function implements a Radix-4 Complex FFT
- * Can support FFT lengths of 16, 64, 256, 1024
- */
-
-void ne10_radix4_butterfly_float_c(
-                     ne10_float32_t *pDst,
-                     ne10_float32_t *pSrc,
-                     ne10_uint16_t N,
-                     ne10_float32_t *pCoef)
-{
-    ne10_int32_t     set,grp;
-    ne10_int32_t     setCount,grpCount,grpStep,twidStep;
-    ne10_float32_t       *pTw2,*pTw3,*pTw4,*pT1,*pTmp;
-    ne10_float32_t       *pOut1,*pOut2,*pOut3,*pOut4;
-    ne10_int32_t     SubFFTSize=4,SubFFTNum=N/4;            /*Intial Number of Groups and Group Size*/
-    ne10_float32_t       InpRe1,InpIm1,InpRe2,InpIm2,InpRe3,InpIm3,InpRe4,InpIm4;
-    ne10_float32_t       OutRe1,OutIm1,OutRe2,OutIm2,OutRe3,OutIm3,OutRe4,OutIm4;
-    ne10_float32_t       ReTmp1,ImTmp1,ReTmp2,ImTmp2,ReTmp3,ImTmp3,ReTmp4,ImTmp4;
-    ne10_float32_t       TwRe2,TwIm2,TwRe3,TwIm3,TwRe4,TwIm4;
-    ne10_float32_t       ReTmpT2,ImTmpT2,ReTmpT3,ImTmpT3,ReTmpT4,ImTmpT4;
-    ne10_int32_t     InpStep=(N>>2),OutStep=1;
-
-    /*First stage*/
-    grpCount = SubFFTNum;
-    pT1 = pSrc;
-    pOut1 = pDst;
-
-    for(grp = 0; grp < grpCount; grp++)
-    {
-        InpRe1 = pT1[0];
-        InpIm1 = pT1[1];
-        InpRe2 = pT1[InpStep<<1];
-        InpIm2 = pT1[(InpStep<<1)+1];
-        InpRe3 = pT1[(InpStep<<2)];
-        InpIm3 = pT1[(InpStep<<2)+1];
-        InpRe4 = pT1[3*(InpStep<<1)];
-        InpIm4 = pT1[3*(InpStep<<1)+1];
-
-        //Inp1 + Inp3
-        ReTmp1 = InpRe1 + InpRe3;
-        ImTmp1 = InpIm1 + InpIm3;
-        //Inp1 - Inp3
-        ReTmp2 = InpRe1 - InpRe3;
-        ImTmp2 = InpIm1 - InpIm3;
-
-        //Inp2 + Inp4
-        ReTmp3 = InpRe2 + InpRe4;
-        ImTmp3 = InpIm2 + InpIm4;
-
-        //Inp2 - Inp4
-        ReTmp4 = InpRe2 - InpRe4;
-        ImTmp4 = InpIm2 - InpIm4;
-
-        /*Radix-4 Butterfly calculation*/
-        /*Third Result*/
-        OutRe3 = ReTmp1 - ReTmp3;
-        OutIm3 = ImTmp1 - ImTmp3;
-        /*First Result*/
-        OutRe1 = ReTmp1 + ReTmp3;
-        OutIm1 = ImTmp1 + ImTmp3;
-        /*Second result*/
-        OutRe2 = ReTmp2 + ImTmp4;
-        OutIm2 = ImTmp2 - ReTmp4;
-        /*Fourth Result*/
-        OutRe4 = ReTmp2 - ImTmp4;
-        OutIm4 = ImTmp2 + ReTmp4;
-
-        *pOut1++ = OutRe1;
-        *pOut1++ = OutIm1;
-        *pOut1++ = OutRe2;
-        *pOut1++ = OutIm2;
-        *pOut1++ = OutRe3;
-        *pOut1++ = OutIm3;
-        *pOut1++ = OutRe4;
-        *pOut1++ = OutIm4;
-        pT1+=2;
-
-    }
-    /*Remaining FFT Stages: Second Stage to Last Stage*/
-    /* Update the Grp count and size for the next stage */
-    SubFFTSize = SubFFTSize<<2;
-    SubFFTNum  = SubFFTNum>>2;
-    twidStep = 0;
-    /*Swap Input and Output*/
-    pTmp = pDst;
-    pDst = pSrc;
-    pSrc= pTmp;
-
-    while(SubFFTNum > 0)
-    {
-        grpCount = SubFFTNum;                          /*Number of Blocks*/
-        setCount = SubFFTSize>>2;                      /* setCount is number of Butterflies */
-        grpStep  = 0;
-        OutStep     = (OutStep<<2);
-        pT1 = pSrc;
-        for(grp = 0; grp < grpCount; grp++)
-        {
-            pOut1 = pDst + (grpStep<<1);
-            pOut2 = pOut1 + (OutStep<<1);
-            pOut3 = pOut2 + (OutStep<<1);
-            pOut4 = pOut3 + (OutStep<<1);
-
-            pTw2 = pCoef + twidStep;
-            pTw3 = pTw2  + (setCount<<1);
-            pTw4 = pTw3  + (setCount<<1);
-            for(set = 0; set < setCount; set++)
-            {
-                InpRe1 = pT1[0];
-                InpIm1 = pT1[1];
-                InpRe2 = pT1[InpStep<<1];
-                InpIm2 = pT1[(InpStep<<1)+1];
-                InpRe3 = pT1[(InpStep<<2)];
-                InpIm3 = pT1[(InpStep<<2)+1];
-                InpRe4 = pT1[3*(InpStep<<1)];
-                InpIm4 = pT1[3*(InpStep<<1)+1];
-
-                /*Load Twiddles*/
-                TwRe2 = *pTw2++;
-                TwIm2 = *pTw2++;
-                TwRe3 = *pTw3++;
-                TwIm3 = *pTw3++;
-                TwRe4 = *pTw4++;
-                TwIm4 = *pTw4++;
-
-
-                /*Butterfly calculation*/
-                //CPLX_MUL (pTmpT2, pTw2, Inp2);
-                ReTmpT2 = InpRe2*TwRe2 + InpIm2*TwIm2;
-                ImTmpT2 = InpIm2*TwRe2 - InpRe2*TwIm2;
-
-                //CPLX_MUL (pTmpT3, pTw3, Inp3);
-                ReTmpT3 = InpRe3*TwRe3 + InpIm3*TwIm3;
-                ImTmpT3 = InpIm3*TwRe3 - InpRe3*TwIm3;
-
-
-                //CPLX_MUL (pTmpT4, pTw4, Inp4);
-                ReTmpT4 = InpRe4*TwRe4 + InpIm4*TwIm4;
-                ImTmpT4 = InpIm4*TwRe4 - InpRe4*TwIm4;
-                //CPLX_ADD (pTmp1, Inp1, pTmpT3);
-                ReTmp1 = InpRe1 + ReTmpT3;
-                ImTmp1 = InpIm1 + ImTmpT3;
-
-                //CPLX_SUB (pTmp2, pT1, pTmpT3);
-                ReTmp2 = InpRe1 - ReTmpT3;
-                ImTmp2 = InpIm1 - ImTmpT3;
-
-                //CPLX_ADD (pTmp3, pTmpT2, pTmpT4);
-                ReTmp3 = ReTmpT2 + ReTmpT4;
-                ImTmp3 = ImTmpT2 + ImTmpT4;
-
-                //CPLX_SUB (pTmp4, pTmpT2, pTmpT4);
-                ReTmp4 = ReTmpT2 - ReTmpT4;
-                ImTmp4 = ImTmpT2 - ImTmpT4;
-
-
-
-                /*Third Result*/
-                //CPLX_SUB (pT3, pTmp1, pTmp3);
-                OutRe3 = ReTmp1 - ReTmp3;
-                OutIm3 = ImTmp1 - ImTmp3;
-                /*First Result*/
-                //CPLX_ADD (pT1, pTmp1, pTmp3);
-
-                OutRe1 = ReTmp1 + ReTmp3;
-                OutIm1 = ImTmp1 + ImTmp3;
-                /*Second result*/
-                //CPLX_ADD_SUB_X (pT2, pTmp2, pTmp4);
-                OutRe2 = ReTmp2 + ImTmp4;
-                OutIm2 = ImTmp2 - ReTmp4;
-                /*Fourth Result*/
-                //CPLX_SUB_ADD_X (pT4, pTmp2, pTmp4);
-                OutRe4 = ReTmp2 - ImTmp4;
-                OutIm4 = ImTmp2 + ReTmp4;
-                /*Store the Result*/
-                *pOut1++ = OutRe1;
-                *pOut1++ = OutIm1;
-                *pOut2++ = OutRe2;
-                *pOut2++ = OutIm2;
-                *pOut3++ = OutRe3;
-                *pOut3++ = OutIm3;
-                *pOut4++ = OutRe4;
-                *pOut4++ = OutIm4;
-
-                pT1+=2;
-            }
-            grpStep = grpStep + SubFFTSize;
-        }
-        /* Update the Grp count and size for the next stage */
-        twidStep+= (3*(setCount<<1));
-        SubFFTSize = SubFFTSize<<2;
-        SubFFTNum  = SubFFTNum>>2;
-        /*Swap Input and Output*/
-        pTmp = pDst;
-        pDst = pSrc;
-        pSrc= pTmp;
-    }
-
-    /* if the N is even power of 4, copy the output to dst buffer */
-    SubFFTNum = 0;
-    set = N;
-    while (set > 1)
-    {
-        set = set>>2;
-        SubFFTNum++;
-    }
-
-    if((SubFFTNum&1) == 0)
-    {
-        pT1 = pSrc;
-        pOut1 = pDst;
-        for(grpCount = 0; grpCount < N; grpCount++)
-        {
-            *pOut1++ = *pT1++;
-            *pOut1++ = *pT1++;
-        };
-    }
-}
-
-
-/**
- * @brief Core radix-4 IFFT of floating-point data.
- * @param[out]  *pDst            point to the output buffer (out-of-place)
- * @param[in]  *pSrc             point to the input buffer (out-of-place: the pSrc is used for intermedia buffer, so the input buffer is destroyed)
- * @param[in]  N                 length of FFT
- * @param[in]  *pCoef            point to the twiddle factors
- * @return none.
- * The function implements a Radix-4 Complex IFFT
- */
-
-void ne10_radix4_butterfly_inverse_float_c(
-                     ne10_float32_t *pDst,
-                     ne10_float32_t *pSrc,
-                     ne10_uint16_t N,
-                     ne10_float32_t *pCoef,
-                     ne10_float32_t onebyN)
-{
-    ne10_int32_t     set,grp;
-    ne10_int32_t     setCount,grpCount,grpStep,twidStep;
-    ne10_float32_t       *pTw2,*pTw3,*pTw4,*pT1,*pTmp;
-    ne10_float32_t       *pOut1,*pOut2,*pOut3,*pOut4;
-    ne10_int32_t     SubFFTSize=4,SubFFTNum=N/4;            /*Intial Number of Groups and Group Size*/
-    ne10_float32_t       InpRe1,InpIm1,InpRe2,InpIm2,InpRe3,InpIm3,InpRe4,InpIm4;
-    ne10_float32_t       OutRe1,OutIm1,OutRe2,OutIm2,OutRe3,OutIm3,OutRe4,OutIm4;
-    ne10_float32_t       ReTmp1,ImTmp1,ReTmp2,ImTmp2,ReTmp3,ImTmp3,ReTmp4,ImTmp4;
-    ne10_float32_t       TwRe2,TwIm2,TwRe3,TwIm3,TwRe4,TwIm4;
-    ne10_float32_t       ReTmpT2,ImTmpT2,ReTmpT3,ImTmpT3,ReTmpT4,ImTmpT4;
-    ne10_int32_t     InpStep=(N>>2),OutStep=1;
-
-    /*First stage*/
-    grpCount = SubFFTNum;
-    pT1 = pSrc;
-    pOut1 = pDst;
-
-    for(grp = 0; grp < grpCount; grp++)
-    {
-        InpRe1 = pT1[0];
-        InpIm1 = pT1[1];
-        InpRe2 = pT1[InpStep<<1];
-        InpIm2 = pT1[(InpStep<<1)+1];
-        InpRe3 = pT1[(InpStep<<2)];
-        InpIm3 = pT1[(InpStep<<2)+1];
-        InpRe4 = pT1[3*(InpStep<<1)];
-        InpIm4 = pT1[3*(InpStep<<1)+1];
-
-        //Inp1 + Inp3
-        ReTmp1 = InpRe1 + InpRe3;
-        ImTmp1 = InpIm1 + InpIm3;
-        //Inp1 - Inp3
-        ReTmp2 = InpRe1 - InpRe3;
-        ImTmp2 = InpIm1 - InpIm3;
-
-        //Inp2 + Inp4
-        ReTmp3 = InpRe2 + InpRe4;
-        ImTmp3 = InpIm2 + InpIm4;
-
-        //Inp2 - Inp4
-        ReTmp4 = InpRe2 - InpRe4;
-        ImTmp4 = InpIm2 - InpIm4;
-
-        /*Radix-4 Butterfly calculation*/
-        /*Third Result*/
-        OutRe3 = ReTmp1 - ReTmp3;
-        OutIm3 = ImTmp1 - ImTmp3;
-        /*First Result*/
-        OutRe1 = ReTmp1 + ReTmp3;
-        OutIm1 = ImTmp1 + ImTmp3;
-        /*Second result*/
-        OutRe2 = ReTmp2 - ImTmp4;
-        OutIm2 = ImTmp2 + ReTmp4;
-        /*Fourth Result*/
-        OutRe4 = ReTmp2 + ImTmp4;
-        OutIm4 = ImTmp2 - ReTmp4;
-
-        *pOut1++ = OutRe1;
-        *pOut1++ = OutIm1;
-        *pOut1++ = OutRe2;
-        *pOut1++ = OutIm2;
-        *pOut1++ = OutRe3;
-        *pOut1++ = OutIm3;
-        *pOut1++ = OutRe4;
-        *pOut1++ = OutIm4;
-        pT1+=2;
-
-    }
-    /*Intermediate FFT Stages: Second Stage to Last but one Stage*/
-    /* Update the Grp count and size for the next stage */
-    SubFFTSize = SubFFTSize<<2;
-    SubFFTNum  = SubFFTNum>>2;
-    twidStep = 0;
-    /*Swap Input and Output*/
-    pTmp = pDst;
-    pDst = pSrc;
-    pSrc= pTmp;
-
-    while(SubFFTNum > 1)
-    {
-        grpCount = SubFFTNum;                          /*Number of Blocks*/
-        setCount = SubFFTSize>>2;                      /* setCount is number of Butterflies */
-        grpStep  = 0;
-        OutStep     = (OutStep<<2);
-        pT1 = pSrc;
-        for(grp = 0; grp < grpCount; grp++)
-        {
-            pOut1 = pDst + (grpStep<<1);
-            pOut2 = pOut1 + (OutStep<<1);
-            pOut3 = pOut2 + (OutStep<<1);
-            pOut4 = pOut3 + (OutStep<<1);
-
-            pTw2 = pCoef + twidStep;
-            pTw3 = pTw2  + (setCount<<1);
-            pTw4 = pTw3  + (setCount<<1);
-            for(set = 0; set < setCount; set++)
-            {
-                InpRe1 = pT1[0];
-                InpIm1 = pT1[1];
-                InpRe2 = pT1[InpStep<<1];
-                InpIm2 = pT1[(InpStep<<1)+1];
-                InpRe3 = pT1[(InpStep<<2)];
-                InpIm3 = pT1[(InpStep<<2)+1];
-                InpRe4 = pT1[3*(InpStep<<1)];
-                InpIm4 = pT1[3*(InpStep<<1)+1];
-
-                /*Load Twiddles*/
-                TwRe2 = *pTw2++;
-                TwIm2 = *pTw2++;
-                TwRe3 = *pTw3++;
-                TwIm3 = *pTw3++;
-                TwRe4 = *pTw4++;
-                TwIm4 = *pTw4++;
-
-
-                /*Butterfly calculation*/
-                //CPLX_MUL (pTmpT2, pTw2, Inp2);
-                ReTmpT2 = InpRe2*TwRe2 - InpIm2*TwIm2;
-                ImTmpT2 = InpIm2*TwRe2 + InpRe2*TwIm2;
-
-                //CPLX_MUL (pTmpT3, pTw3, Inp3);
-                ReTmpT3 = InpRe3*TwRe3 - InpIm3*TwIm3;
-                ImTmpT3 = InpIm3*TwRe3 + InpRe3*TwIm3;
-
-                //CPLX_MUL (pTmpT4, pTw4, Inp4);
-                ReTmpT4 = InpRe4*TwRe4 - InpIm4*TwIm4;
-                ImTmpT4 = InpIm4*TwRe4 + InpRe4*TwIm4;
-
-                //CPLX_ADD (pTmp1, Inp1, pTmpT3);
-                ReTmp1 = InpRe1 + ReTmpT3;
-                ImTmp1 = InpIm1 + ImTmpT3;
-
-                //CPLX_SUB (pTmp2, pT1, pTmpT3);
-                ReTmp2 = InpRe1 - ReTmpT3;
-                ImTmp2 = InpIm1 - ImTmpT3;
-
-                //CPLX_ADD (pTmp3, pTmpT2, pTmpT4);
-                ReTmp3 = ReTmpT2 + ReTmpT4;
-                ImTmp3 = ImTmpT2 + ImTmpT4;
-
-                //CPLX_SUB (pTmp4, pTmpT2, pTmpT4);
-                ReTmp4 = ReTmpT2 - ReTmpT4;
-                ImTmp4 = ImTmpT2 - ImTmpT4;
-
-
-                /*Third Result*/
-                //CPLX_SUB (pT3, pTmp1, pTmp3);
-                OutRe3 = ReTmp1 - ReTmp3;
-                OutIm3 = ImTmp1 - ImTmp3;
-
-                /*First Result*/
-                //CPLX_ADD (pT1, pTmp1, pTmp3);
-                OutRe1 = ReTmp1 + ReTmp3;
-                OutIm1 = ImTmp1 + ImTmp3;
-
-                /*Second result*/
-                //CPLX_SUB_ADD_X (pT2, pTmp2, pTmp4);
-                OutRe2 = ReTmp2 - ImTmp4;
-                OutIm2 = ImTmp2 + ReTmp4;
-
-                /*Fourth Result*/
-                //CPLX_ADD_SUB_X (pT4, pTmp2, pTmp4);
-                OutRe4 = ReTmp2 + ImTmp4;
-                OutIm4 = ImTmp2 - ReTmp4;
-
-                /*Store the Result*/
-                *pOut1++ = OutRe1;
-                *pOut1++ = OutIm1;
-                *pOut2++ = OutRe2;
-                *pOut2++ = OutIm2;
-                *pOut3++ = OutRe3;
-                *pOut3++ = OutIm3;
-                *pOut4++ = OutRe4;
-                *pOut4++ = OutIm4;
-
-                pT1+=2;
-            }
-            grpStep = grpStep + SubFFTSize;
-        }
-        /* Update the Grp count and size for the next stage */
-        twidStep+= (3*(setCount<<1));
-        SubFFTSize = SubFFTSize<<2;
-        SubFFTNum  = SubFFTNum>>2;
-        /*Swap Input and Output*/
-        pTmp = pDst;
-        pDst = pSrc;
-        pSrc= pTmp;
-    }
-
-    /* last stage */
-    setCount = SubFFTSize>>2;                      /* setCount is number of Butterflies */
-    grpStep  = 0;
-    OutStep     = (OutStep<<2);
-    pT1 = pSrc;
-
-    pOut1 = pDst + (grpStep<<1);
-    pOut2 = pOut1 + (OutStep<<1);
-    pOut3 = pOut2 + (OutStep<<1);
-    pOut4 = pOut3 + (OutStep<<1);
-
-    pTw2 = pCoef + twidStep;
-    pTw3 = pTw2  + (setCount<<1);
-    pTw4 = pTw3  + (setCount<<1);
-
-    for(set = 0; set < setCount; set++)
-    {
-        InpRe1 = pT1[0];
-        InpIm1 = pT1[1];
-        InpRe2 = pT1[InpStep<<1];
-        InpIm2 = pT1[(InpStep<<1)+1];
-        InpRe3 = pT1[(InpStep<<2)];
-        InpIm3 = pT1[(InpStep<<2)+1];
-        InpRe4 = pT1[3*(InpStep<<1)];
-        InpIm4 = pT1[3*(InpStep<<1)+1];
-
-        /*Load Twiddles*/
-        TwRe2 = *pTw2++;
-        TwIm2 = *pTw2++;
-        TwRe3 = *pTw3++;
-        TwIm3 = *pTw3++;
-        TwRe4 = *pTw4++;
-        TwIm4 = *pTw4++;
-
-
-        /*Butterfly calculation*/
-        //CPLX_MUL (pTmpT2, pTw2, Inp2);
-        ReTmpT2 = InpRe2*TwRe2 - InpIm2*TwIm2;
-        ImTmpT2 = InpIm2*TwRe2 + InpRe2*TwIm2;
-
-        //CPLX_MUL (pTmpT3, pTw3, Inp3);
-        ReTmpT3 = InpRe3*TwRe3 - InpIm3*TwIm3;
-        ImTmpT3 = InpIm3*TwRe3 + InpRe3*TwIm3;
-
-        //CPLX_MUL (pTmpT4, pTw4, Inp4);
-        ReTmpT4 = InpRe4*TwRe4 - InpIm4*TwIm4;
-        ImTmpT4 = InpIm4*TwRe4 + InpRe4*TwIm4;
-
-        //CPLX_ADD (pTmp1, Inp1, pTmpT3);
-        ReTmp1 = InpRe1 + ReTmpT3;
-        ImTmp1 = InpIm1 + ImTmpT3;
-
-        //CPLX_SUB (pTmp2, pT1, pTmpT3);
-        ReTmp2 = InpRe1 - ReTmpT3;
-        ImTmp2 = InpIm1 - ImTmpT3;
-
-        //CPLX_ADD (pTmp3, pTmpT2, pTmpT4);
-        ReTmp3 = ReTmpT2 + ReTmpT4;
-        ImTmp3 = ImTmpT2 + ImTmpT4;
-
-        //CPLX_SUB (pTmp4, pTmpT2, pTmpT4);
-        ReTmp4 = ReTmpT2 - ReTmpT4;
-        ImTmp4 = ImTmpT2 - ImTmpT4;
-
-
-        /*Third Result*/
-        //CPLX_SUB (pT3, pTmp1, pTmp3);
-        OutRe3 = ReTmp1 - ReTmp3;
-        OutIm3 = ImTmp1 - ImTmp3;
-
-        /*First Result*/
-        //CPLX_ADD (pT1, pTmp1, pTmp3);
-        OutRe1 = ReTmp1 + ReTmp3;
-        OutIm1 = ImTmp1 + ImTmp3;
-
-        /*Second result*/
-        //CPLX_SUB_ADD_X (pT2, pTmp2, pTmp4);
-        OutRe2 = ReTmp2 - ImTmp4;
-        OutIm2 = ImTmp2 + ReTmp4;
-
-        /*Fourth Result*/
-        //CPLX_ADD_SUB_X (pT4, pTmp2, pTmp4);
-        OutRe4 = ReTmp2 + ImTmp4;
-        OutIm4 = ImTmp2 - ReTmp4;
-
-        /*Store the Result*/
-        *pOut1++ = OutRe1 * onebyN;
-        *pOut1++ = OutIm1 * onebyN;
-        *pOut2++ = OutRe2 * onebyN;
-        *pOut2++ = OutIm2 * onebyN;
-        *pOut3++ = OutRe3 * onebyN;
-        *pOut3++ = OutIm3 * onebyN;
-        *pOut4++ = OutRe4 * onebyN;
-        *pOut4++ = OutIm4 * onebyN;
-
-        pT1+=2;
-    }
-
-    /*Swap Input and Output*/
-    pTmp = pDst;
-    pDst = pSrc;
-    pSrc= pTmp;
-
-
-    /* if the N is even power of 4, copy the output to dst buffer */
-    SubFFTNum = 0;
-    set = N;
-    while (set > 1)
-    {
-        set = set>>2;
-        SubFFTNum++;
-    }
-
-    if((SubFFTNum&1) == 0)
-    {
-        pT1 = pSrc;
-        pOut1 = pDst;
-        for(grpCount = 0; grpCount < N; grpCount++)
-        {
-            *pOut1++ = *pT1++;
-            *pOut1++ = *pT1++;
-        };
-    }
-}
-
-
-/**
- * @} end of CFFT_CIFFT group
- */
diff --git a/modules/dsp/NE10_cfft.neon.s b/modules/dsp/NE10_cfft.neon.s
deleted file mode 100644 (file)
index b542107..0000000
+++ /dev/null
@@ -1,738 +0,0 @@
-@
-@  Copyright 2012-14 ARM Limited
-@  All rights reserved.
-@
-@  Redistribution and use in source and binary forms, with or without
-@  modification, are permitted provided that the following conditions are met:
-@    * Redistributions of source code must retain the above copyright
-@      notice, this list of conditions and the following disclaimer.
-@    * Redistributions in binary form must reproduce the above copyright
-@      notice, this list of conditions and the following disclaimer in the
-@      documentation and/or other materials provided with the distribution.
-@    * Neither the name of ARM Limited nor the
-@      names of its contributors may be used to endorse or promote products
-@      derived from this software without specific prior written permission.
-@
-@  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
-@  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-@  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-@  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
-@  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-@  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-@  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-@  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-@  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-@  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-@
-
-@/*
-@ * NE10 Library : dsp/NE10_cfft.neon.s
-@ */
-
-@/*
-@ * Note:
-@ * 1. Currently, this is for the soft VFP EABI only; the hard VFPv3 ABI is not supported yet
-@ * 2. The assembly code uses registers D0-D31, so VFPv3-D32 is required. On VFPv3-D16 it will fail
-@ */
-
-        .text
-        .syntax   unified
-
-        @/*
-        @ * @brief  Core radix-4 FFT of floating-point data.  Do not call this function directly.
-        @ * @param[out]  *pDst            points to the output buffer
-        @ * @param[in]  *pSrc             points to the input buffer
-        @ * @param[in]  N                 length of FFT
-        @ * @param[in]  *pCoef            points to the twiddle factors
-        @ * @return none.
-        @ * The function implements a Radix-4 Complex FFT
-        @ */
-
-        .align   4
-        .global   ne10_radix4_butterfly_float_neon
-        .thumb
-        .thumb_func
-
-ne10_radix4_butterfly_float_neon:
-
-        PUSH    {r4-r12,lr}    @push r12: to keep stack 8 bytes aligned
-        VPUSH   {d8-d15}
-
-        qInp1   .qn Q0.F32
-        qInp2   .qn Q1.F32
-        qInp3   .qn Q2.F32
-        qInp4   .qn Q3.F32
-        qInp5   .qn Q4.F32
-        qInp6   .qn Q5.F32
-        qInp7   .qn Q6.F32
-        qInp8   .qn Q7.F32
-
-        qTwd2Re .qn Q8.F32
-        qTwd2Im .qn Q9.F32
-        qTwd3Re .qn Q10.F32
-        qTwd3Im .qn Q11.F32
-        qTwd4Re .qn Q12.F32
-        qTwd4Im .qn Q13.F32
-
-        qReTmpT2 .qn Q14.F32
-        qImTmpT2 .qn Q15.F32
-
-        qReTmpT3 .qn Q2.F32
-        qImTmpT3 .qn Q3.F32
-
-        qReTmpT4 .qn Q4.F32
-        qImTmpT4 .qn Q5.F32
-
-        qRe1     .qn Q8.F32
-        qIm1     .qn Q9.F32
-        qRe2     .qn Q10.F32
-        qIm2     .qn Q11.F32
-        qRe3     .qn Q12.F32
-        qIm3     .qn Q13.F32
-        qRe4     .qn Q14.F32
-        qIm4     .qn Q15.F32
-
-        pDst        .req  R0
-        pSrc        .req  R1
-        fftSize     .req  R2
-        pCoef       .req  R3
-
-
-        SubFFTSize  .req  R4
-        SubFFTNum   .req  R5
-        grpCount    .req  R6
-        twidStep    .req  R8
-        setCount    .req  R9
-        grpStep     .req  R10
-
-        pT1         .req  R7
-        pOut1       .req  R11
-        pTw2        .req  R12
-        TwdStep     .req  R14
-        pTmp        .req  R7
-
-        LSR     SubFFTNum,fftSize,#2
-        MOV     SubFFTSize,#4
-        MOV     pT1,pSrc
-        LSR     grpCount,SubFFTNum,#2
-        MOV     pOut1,pDst
-        LSL     fftSize,#1
-
-fftGrpLoop:
-        VLD2        {qInp1,qInp2},[pT1],fftSize  @/*Load Input Values*/
-        VLD2        {qInp3,qInp4},[pT1],fftSize
-        VLD2        {qInp5,qInp6},[pT1],fftSize
-        VLD2        {qInp7,qInp8},[pT1],fftSize
-
-        @/*pSrc[0] + pSrc[2]*/
-        VADD    qRe1,qInp1,qInp5
-        VADD    qIm1,qInp2,qInp6
-        @/*pSrc[0] - pSrc[2]*/
-        VSUB    qRe2,qInp1,qInp5
-        VSUB    qIm2,qInp2,qInp6
-        @/*pSrc[1] + pSrc[3]*/
-        VADD    qRe3,qInp3,qInp7
-        VADD    qIm3,qInp4,qInp8
-        @/*pSrc[1] - pSrc[3]*/
-        VSUB    qRe4,qInp3,qInp7
-        VSUB    qIm4,qInp4,qInp8
-
-        @/*Radix-4 Butterfly calculation*/
-        @/*Third Result*/
-        VSUB    qInp5,qRe1,qRe3
-        VSUB    qInp6,qIm1,qIm3
-        @/*First Result*/
-        VADD    qInp1,qRe1,qRe3
-        VADD    qInp2,qIm1,qIm3
-        @/*Second result*/
-        VADD    qInp3,qRe2,qIm4
-        VSUB    qInp4,qIm2,qRe4
-        @/*Fourth Result*/
-        VSUB    qInp7,qRe2,qIm4
-        VADD    qInp8,qIm2,qRe4
-
-        @/*Get Result in correct order for storing*/
-        @/*4Re2,4Re0,3Re2,3Re0 2Re2,2Re0,1Re2,1Re0*/
-        VZIP    qInp1,qInp5
-        @/*4Re3,4Re1,3Re3,3Re1 2Re3,2Re1,1Re3,1Re1*/
-        VZIP    qInp3,qInp7
-
-        @/*4Im2,4Im0,3Im2,3Im0 2Im2,2Im0,1Im2,1Im0*/
-        VZIP    qInp2,qInp6
-        @/*4Im3,4Im1,3Im3,3Im1 2Im3,2Im1,1Im3,1Im1*/
-        VZIP    qInp4,qInp8
-
-        SUB     pT1,pT1,fftSize, LSL #2
-
-        VST4.F32    {d0,d2,d4,d6},[pOut1]!
-        VST4.F32    {d1,d3,d5,d7},[pOut1]!
-        SUBS        grpCount,#1
-        ADD         pT1,pT1,#32
-        VST4.F32    {d8,d10,d12,d14},[pOut1]!
-        VST4.F32    {d9,d11,d13,d15},[pOut1]!
-
-        BGT     fftGrpLoop
-
-        @/* Swap Input and Output*/
-        MOV     pTmp,pDst
-        MOV     pDst,pSrc
-        MOV     pSrc,pTmp
-
-        @/*Remaining FFT Stages: Second Stage to Last Stage*/
-        @/* Update the Grp count and size for the next stage */
-        LSR     SubFFTNum,#2
-        LSL     SubFFTSize,#2
-
-fftStageLoop:
-        MOV     grpCount,SubFFTNum
-        MOV     grpStep,#0
-        ADD     pT1,pSrc,fftSize
-        LSL     TwdStep,SubFFTSize,#1
-
-fftGrpLoop1:
-        LSR     setCount,SubFFTSize,#2
-        ADD     pOut1,pDst,grpStep,LSL #3
-        MOV     pTw2,pCoef
-
-        LSL     SubFFTSize,#1
-
-fftSetLoop:
-        VLD2    {qTwd2Re,qTwd2Im},[pTw2],TwdStep
-        VLD2    {qInp3,qInp4},[pT1],fftSize
-        @/*CPLX_MUL (pTmpT2, pTw2, pT2);*/
-        VMUL    qReTmpT2,qTwd2Re,qInp3
-        VMUL    qImTmpT2,qTwd2Re,qInp4
-        VLD2    {qTwd3Re,qTwd3Im},[pTw2],TwdStep
-        VLD2    {qInp5,qInp6},[pT1],fftSize
-        VMLA    qReTmpT2,qTwd2Im,qInp4
-        VMLS    qImTmpT2,qTwd2Im,qInp3
-
-
-        @/*CPLX_MUL (pTmpT3, pTw3, pT3);*/
-        VMUL    qReTmpT3,qTwd3Re,qInp5
-        VMUL    qImTmpT3,qTwd3Re,qInp6
-        VLD2    {qTwd4Re,qTwd4Im},[pTw2]
-        VLD2    {qInp7,qInp8},[pT1],fftSize
-        VMLA    qReTmpT3,qTwd3Im,qInp6
-        VMLS    qImTmpT3,qTwd3Im,qInp5
-
-        SUB     pT1,pT1,fftSize, LSL #2
-
-
-        @/*CPLX_MUL (pTmpT4, pTw4, pT4);*/
-        VMUL    qReTmpT4,qTwd4Re,qInp7
-        VMUL    qImTmpT4,qTwd4Re,qInp8
-        VLD2    {qInp1,qInp2},[pT1],fftSize
-        VMLA    qReTmpT4,qTwd4Im,qInp8
-        VMLS    qImTmpT4,qTwd4Im,qInp7
-
-
-        @/*CPLX_ADD (pTmp1, pT1, pTmpT3);*/
-        VADD    qRe1,qInp1,qReTmpT3
-        VADD    qIm1,qInp2,qImTmpT3
-        @/*CPLX_SUB (pTmp2, pT1, pTmpT3);*/
-        VSUB    qRe2,qInp1,qReTmpT3
-        VSUB    qIm2,qInp2,qImTmpT3
-        @/*CPLX_ADD (pTmp3, pTmpT2, pTmpT4);*/
-        VADD    qRe3,qReTmpT2,qReTmpT4
-        VADD    qIm3,qImTmpT2,qImTmpT4
-        @/*CPLX_SUB (pTmp4, pTmpT2, pTmpT4);*/
-        VSUB    qRe4,qReTmpT2,qReTmpT4
-        VSUB    qIm4,qImTmpT2,qImTmpT4
-
-        @/*CPLX_ADD (pT1, pTmp1, pTmp3);*/
-        VADD    qInp1,qRe1,qRe3
-        VADD    qInp2,qIm1,qIm3
-
-        @/*CPLX_ADD_SUB_X (pT2, pTmp2, pTmp4);*/
-        VADD    qInp3,qRe2,qIm4
-        VSUB    qInp4,qIm2,qRe4
-
-        @/*CPLX_SUB (pT3, pTmp1, pTmp3);*/
-        VSUB    qInp5,qRe1,qRe3
-        VSUB    qInp6,qIm1,qIm3
-        @/*CPLX_SUB_ADD_X (pT4, pTmp2, pTmp4);*/
-        VSUB    qInp7,qRe2,qIm4
-        VADD    qInp8,qIm2,qRe4
-
-        SUBS    setCount,#4
-        @/* Store the Result*/
-
-        VST2    {qInp1,qInp2},[pOut1],SubFFTSize
-        VST2    {qInp3,qInp4},[pOut1],SubFFTSize
-
-        VST2    {qInp5,qInp6},[pOut1],SubFFTSize
-        VST2    {qInp7,qInp8},[pOut1],SubFFTSize
-
-        SUB     pTw2,pTw2,TwdStep, LSL #1
-        SUB     pOut1,pOut1,SubFFTSize, LSL #2
-
-        ADD     pT1,pT1,#32
-        ADD     pTw2,pTw2,#32
-        ADD     pOut1,pOut1,#32
-
-        BGT     fftSetLoop
-        LSR     SubFFTSize,#1
-        SUBS    grpCount,grpCount,#1
-        ADD     grpStep,grpStep,SubFFTSize
-
-        BGT     fftGrpLoop1
-        @/* Update the Grp count and size for the next stage */
-        ADD     twidStep,SubFFTSize,SubFFTSize, LSL #1
-        LSRS    SubFFTNum,SubFFTNum,#2
-
-        @/* Swap Input and Output*/
-        MOV     pTmp,pDst
-        MOV     pDst,pSrc
-        MOV     pSrc,pTmp
-
-        ADD     pCoef,pCoef,twidStep,LSL #1
-
-        LSL     SubFFTSize,SubFFTSize,#2
-
-        BGT     fftStageLoop
-
-        @/* if N is an even power of 4, copy the output to the dst buffer */
-        ASR     fftSize,fftSize,#1
-        CLZ     SubFFTNum,fftSize
-        MOV     setCount, #32
-        SUB     SubFFTNum, setCount, SubFFTNum
-        ASR     SubFFTNum,SubFFTNum,#1
-        ANDS    SubFFTNum, SubFFTNum, #1
-
-        BNE     fftEnd
-
-        ASR     grpCount, fftSize, #4
-fftCopyLoop:
-        VLD1.F32    {d0,d1,d2,d3},[pSrc]!
-        VLD1.F32    {d4,d5,d6,d7},[pSrc]!
-        VLD1.F32    {d8,d9,d10,d11},[pSrc]!
-        VLD1.F32    {d12,d13,d14,d15},[pSrc]!
-
-        SUBS        grpCount,#1
-        VST1.F32    {d0,d1,d2,d3},[pDst]!
-        VST1.F32    {d4,d5,d6,d7},[pDst]!
-        VST1.F32    {d8,d9,d10,d11},[pDst]!
-        VST1.F32    {d12,d13,d14,d15},[pDst]!
-
-        BGT         fftCopyLoop
-
-fftEnd:
-        @/* Return From Function*/
-        VPOP    {d8-d15}
-        POP     {r4-r12,pc}
-
-        @/*
-        @ * @brief  Core radix-4 IFFT of floating-point data.  Do not call this function directly.
-        @ * @param[out]  *pDst            points to the output buffer
-        @ * @param[in]  *pSrc             points to the input buffer
-        @ * @param[in]  N                 length of FFT
-        @ * @param[in]  *pCoef            points to the twiddle factors
-        @ * @param[in]  onebyN            reciprocal of FFT length
-        @ * @return none.
-        @ * The function implements a Radix-4 Complex IFFT
-        @ */
-
-        .align   4
-        .global   ne10_radix4_butterfly_inverse_float_neon
-        .thumb
-        .thumb_func
-
-ne10_radix4_butterfly_inverse_float_neon:
-
-        PUSH    {r4-r12,lr}    @push r12: to keep stack 8 bytes aligned
-        VPUSH   {d8-d15}
-#if defined (NE10_ENABLE_HF)
-        VPUSH   {s0,s1}
-#endif
-        qInp1   .qn Q0.F32
-        qInp2   .qn Q1.F32
-        qInp3   .qn Q2.F32
-        qInp4   .qn Q3.F32
-        qInp5   .qn Q4.F32
-        qInp6   .qn Q5.F32
-        qInp7   .qn Q6.F32
-        qInp8   .qn Q7.F32
-
-        qTwd2Re .qn Q8.F32
-        qTwd2Im .qn Q9.F32
-        qTwd3Re .qn Q10.F32
-        qTwd3Im .qn Q11.F32
-        qTwd4Re .qn Q12.F32
-        qTwd4Im .qn Q13.F32
-
-        qReTmpT2 .qn Q14.F32
-        qImTmpT2 .qn Q15.F32
-
-        qReTmpT3 .qn Q2.F32
-        qImTmpT3 .qn Q3.F32
-
-        qReTmpT4 .qn Q4.F32
-        qImTmpT4 .qn Q5.F32
-
-        qRe1     .qn Q8.F32
-        qIm1     .qn Q9.F32
-        qRe2     .qn Q10.F32
-        qIm2     .qn Q11.F32
-        qRe3     .qn Q12.F32
-        qIm3     .qn Q13.F32
-        qRe4     .qn Q14.F32
-        qIm4     .qn Q15.F32
-
-        pDst        .req  R0
-        pSrc        .req  R1
-        fftSize     .req  R2
-        pCoef       .req  R3
-
-
-        SubFFTSize  .req  R4
-        SubFFTNum   .req  R5
-        grpCount    .req  R6
-        twidStep    .req  R8
-        setCount    .req  R9
-        grpStep     .req  R10
-
-        pT1         .req  R7
-        pOut1       .req  R11
-        pTw2        .req  R12
-        TwdStep     .req  R14
-        pTmp        .req  R7
-
-        LSR     SubFFTNum,fftSize,#2
-        MOV     SubFFTSize,#4
-        MOV     pT1,pSrc
-        LSR     grpCount,SubFFTNum,#2
-        MOV     pOut1,pDst
-        LSL     fftSize,#1
-
-ifftGrpLoop:
-        VLD2        {qInp1,qInp2},[pT1],fftSize  @/*Load Input Values*/
-        VLD2        {qInp3,qInp4},[pT1],fftSize
-        VLD2        {qInp5,qInp6},[pT1],fftSize
-        VLD2        {qInp7,qInp8},[pT1],fftSize
-
-        @/*pSrc[0] + pSrc[2]*/
-        VADD    qRe1,qInp1,qInp5
-        VADD    qIm1,qInp2,qInp6
-        @/*pSrc[0] - pSrc[2]*/
-        VSUB    qRe2,qInp1,qInp5
-        VSUB    qIm2,qInp2,qInp6
-        @/*pSrc[1] + pSrc[3]*/
-        VADD    qRe3,qInp3,qInp7
-        VADD    qIm3,qInp4,qInp8
-        @/*pSrc[1] - pSrc[3]*/
-        VSUB    qRe4,qInp3,qInp7
-        VSUB    qIm4,qInp4,qInp8
-
-        @/*Radix-4 Butterfly calculation*/
-        @/*Third Result*/
-        VSUB    qInp5,qRe1,qRe3
-        VSUB    qInp6,qIm1,qIm3
-        @/*First Result*/
-        VADD    qInp1,qRe1,qRe3
-        VADD    qInp2,qIm1,qIm3
-        @/*Second result*/
-        VSUB    qInp3,qRe2,qIm4
-        VADD    qInp4,qIm2,qRe4
-        @/*Fourth Result*/
-        VADD    qInp7,qRe2,qIm4
-        VSUB    qInp8,qIm2,qRe4
-
-        @/*Get Result in correct order for storing*/
-        @/*4Re2,4Re0,3Re2,3Re0 2Re2,2Re0,1Re2,1Re0*/
-        VZIP    qInp1,qInp5
-        @/*4Re3,4Re1,3Re3,3Re1 2Re3,2Re1,1Re3,1Re1*/
-        VZIP    qInp3,qInp7
-
-        @/*4Im2,4Im0,3Im2,3Im0 2Im2,2Im0,1Im2,1Im0*/
-        VZIP    qInp2,qInp6
-        @/*4Im3,4Im1,3Im3,3Im1 2Im3,2Im1,1Im3,1Im1*/
-        VZIP    qInp4,qInp8
-
-
-        SUB     pT1,pT1,fftSize, LSL #2
-
-
-        VST4.F32    {d0,d2,d4,d6},[pOut1]!
-        VST4.F32    {d1,d3,d5,d7},[pOut1]!
-        SUBS        grpCount,#1
-        ADD         pT1,pT1,#32
-        VST4.F32    {d8,d10,d12,d14},[pOut1]!
-        VST4.F32    {d9,d11,d13,d15},[pOut1]!
-
-
-        BGT     ifftGrpLoop
-
-        @/* Swap Input and Output*/
-        MOV     pTmp,pDst
-        MOV     pDst,pSrc
-        MOV     pSrc,pTmp
-
-        @/*Intermediate FFT Stages: Second Stage to Last but one Stage*/
-        @/* Update the Grp count and size for the next stage */
-
-        LSR     SubFFTNum,#2
-        LSL     SubFFTSize,#2
-        SUBS    pTmp, SubFFTNum, #1
-        BEQ     ifftLastStageLoop
-
-ifftStageLoop:
-        MOV     grpCount,SubFFTNum
-        MOV     grpStep,#0
-        ADD     pT1,pSrc,fftSize
-        LSL     TwdStep,SubFFTSize,#1
-
-ifftGrpLoop1:
-        LSR     setCount,SubFFTSize,#2
-        ADD     pOut1,pDst,grpStep,LSL #3
-        MOV     pTw2,pCoef
-
-        LSL     SubFFTSize,#1
-
-ifftSetLoop:
-        VLD2    {qTwd2Re,qTwd2Im},[pTw2],TwdStep
-        VLD2    {qInp3,qInp4},[pT1],fftSize
-        @/*CPLX_MUL (pTmpT2, pTw2, pT2);*/
-        VMUL    qReTmpT2,qTwd2Re,qInp3
-        VMUL    qImTmpT2,qTwd2Re,qInp4
-        VLD2    {qTwd3Re,qTwd3Im},[pTw2],TwdStep
-        VLD2    {qInp5,qInp6},[pT1],fftSize
-        VMLS    qReTmpT2,qTwd2Im,qInp4
-        VMLA    qImTmpT2,qTwd2Im,qInp3
-
-
-        @/*CPLX_MUL (pTmpT3, pTw3, pT3);*/
-        VMUL    qReTmpT3,qTwd3Re,qInp5
-        VMUL    qImTmpT3,qTwd3Re,qInp6
-        VLD2    {qTwd4Re,qTwd4Im},[pTw2]
-        VLD2    {qInp7,qInp8},[pT1],fftSize
-        VMLS    qReTmpT3,qTwd3Im,qInp6
-        VMLA    qImTmpT3,qTwd3Im,qInp5
-
-        SUB     pT1,pT1,fftSize, LSL #2
-
-
-        @/*CPLX_MUL (pTmpT4, pTw4, pT4);*/
-        VMUL    qReTmpT4,qTwd4Re,qInp7
-        VMUL    qImTmpT4,qTwd4Re,qInp8
-        VLD2    {qInp1,qInp2},[pT1],fftSize
-        VMLS    qReTmpT4,qTwd4Im,qInp8
-        VMLA    qImTmpT4,qTwd4Im,qInp7
-
-
-        @/*CPLX_ADD (pTmp1, pT1, pTmpT3);*/
-        VADD    qRe1,qInp1,qReTmpT3
-        VADD    qIm1,qInp2,qImTmpT3
-        @/*CPLX_SUB (pTmp2, pT1, pTmpT3);*/
-        VSUB    qRe2,qInp1,qReTmpT3
-        VSUB    qIm2,qInp2,qImTmpT3
-        @/*CPLX_ADD (pTmp3, pTmpT2, pTmpT4);*/
-        VADD    qRe3,qReTmpT2,qReTmpT4
-        VADD    qIm3,qImTmpT2,qImTmpT4
-        @/*CPLX_SUB (pTmp4, pTmpT2, pTmpT4);*/
-        VSUB    qRe4,qReTmpT2,qReTmpT4
-        VSUB    qIm4,qImTmpT2,qImTmpT4
-
-        @/*CPLX_ADD (pT1, pTmp1, pTmp3);*/
-        VADD    qInp1,qRe1,qRe3
-        VADD    qInp2,qIm1,qIm3
-
-        @/*CPLX_SUB_ADD_X (pT2, pTmp2, pTmp4);*/
-        VSUB    qInp3,qRe2,qIm4
-        VADD    qInp4,qIm2,qRe4
-
-        @/*CPLX_SUB (pT3, pTmp1, pTmp3);*/
-        VSUB    qInp5,qRe1,qRe3
-        VSUB    qInp6,qIm1,qIm3
-        @/*CPLX_ADD_SUB_X (pT4, pTmp2, pTmp4);*/
-        VADD    qInp7,qRe2,qIm4
-        VSUB    qInp8,qIm2,qRe4
-
-        SUBS    setCount,#4
-        @/* Store the Result*/
-
-        VST2    {qInp1,qInp2},[pOut1],SubFFTSize
-        VST2    {qInp3,qInp4},[pOut1],SubFFTSize
-
-        VST2    {qInp5,qInp6},[pOut1],SubFFTSize
-        VST2    {qInp7,qInp8},[pOut1],SubFFTSize
-
-        SUB     pTw2,pTw2,TwdStep, LSL #1
-        SUB     pOut1,pOut1,SubFFTSize, LSL #2
-
-        ADD     pT1,pT1,#32
-        ADD     pTw2,pTw2,#32
-        ADD     pOut1,pOut1,#32
-
-
-        BGT     ifftSetLoop
-        LSR     SubFFTSize,#1
-        SUBS    grpCount,grpCount,#1
-        ADD     grpStep,grpStep,SubFFTSize
-
-        BGT     ifftGrpLoop1
-        @/* Update the Grp count and size for the next stage */
-        ADD     twidStep,SubFFTSize,SubFFTSize, LSL #1
-        LSR     SubFFTNum,SubFFTNum,#2
-        SUBS    pTmp, SubFFTNum, #1
-
-        @/* Swap Input and Output*/
-        MOV     pTmp,pDst
-        MOV     pDst,pSrc
-        MOV     pSrc,pTmp
-
-        ADD     pCoef,pCoef,twidStep,LSL #1
-
-        LSL     SubFFTSize,SubFFTSize,#2
-
-        BGT     ifftStageLoop
-
-        @/* last stage */
-ifftLastStageLoop:
-        MOV     grpStep,#0
-        ADD     pT1,pSrc,fftSize
-        LSL     TwdStep,SubFFTSize,#1
-
-@ifftLastStageGrpLoop1:
-        LSR     setCount,SubFFTSize,#2
-        ADD     pOut1,pDst,grpStep,LSL #3
-        MOV     pTw2,pCoef
-
-        LSL     SubFFTSize,#1
-
-ifftLastStageSetLoop:
-        VLD2    {qTwd2Re,qTwd2Im},[pTw2],TwdStep
-        VLD2    {qInp3,qInp4},[pT1],fftSize
-        @/*CPLX_MUL (pTmpT2, pTw2, pT2);*/
-        VMUL    qReTmpT2,qTwd2Re,qInp3
-        VMUL    qImTmpT2,qTwd2Re,qInp4
-        VLD2    {qTwd3Re,qTwd3Im},[pTw2],TwdStep
-        VLD2    {qInp5,qInp6},[pT1],fftSize
-        VMLS    qReTmpT2,qTwd2Im,qInp4
-        VMLA    qImTmpT2,qTwd2Im,qInp3
-
-
-        @/*CPLX_MUL (pTmpT3, pTw3, pT3);*/
-        VMUL    qReTmpT3,qTwd3Re,qInp5
-        VMUL    qImTmpT3,qTwd3Re,qInp6
-        VLD2    {qTwd4Re,qTwd4Im},[pTw2]
-        VLD2    {qInp7,qInp8},[pT1],fftSize
-        VMLS    qReTmpT3,qTwd3Im,qInp6
-        VMLA    qImTmpT3,qTwd3Im,qInp5
-
-        SUB     pT1,pT1,fftSize, LSL #2
-
-
-        @/*CPLX_MUL (pTmpT4, pTw4, pT4);*/
-        VMUL    qReTmpT4,qTwd4Re,qInp7
-        VMUL    qImTmpT4,qTwd4Re,qInp8
-        VLD2    {qInp1,qInp2},[pT1],fftSize
-        VMLS    qReTmpT4,qTwd4Im,qInp8
-        VMLA    qImTmpT4,qTwd4Im,qInp7
-
-
-        @/*CPLX_ADD (pTmp1, pT1, pTmpT3);*/
-        VADD    qRe1,qInp1,qReTmpT3
-        VADD    qIm1,qInp2,qImTmpT3
-        @/*CPLX_SUB (pTmp2, pT1, pTmpT3);*/
-        VSUB    qRe2,qInp1,qReTmpT3
-        VSUB    qIm2,qInp2,qImTmpT3
-        @/*CPLX_ADD (pTmp3, pTmpT2, pTmpT4);*/
-        VADD    qRe3,qReTmpT2,qReTmpT4
-        VADD    qIm3,qImTmpT2,qImTmpT4
-        @/*CPLX_SUB (pTmp4, pTmpT2, pTmpT4);*/
-        VSUB    qRe4,qReTmpT2,qReTmpT4
-        VSUB    qIm4,qImTmpT2,qImTmpT4
-
-        @/*CPLX_ADD (pT1, pTmp1, pTmp3);*/
-        VADD    qInp1,qRe1,qRe3
-        VADD    qInp2,qIm1,qIm3
-
-        @/*CPLX_SUB_ADD_X (pT2, pTmp2, pTmp4);*/
-        VSUB    qInp3,qRe2,qIm4
-        VADD    qInp4,qIm2,qRe4
-
-        @/*CPLX_SUB (pT3, pTmp1, pTmp3);*/
-        VSUB    qInp5,qRe1,qRe3
-        VSUB    qInp6,qIm1,qIm3
-        @/*CPLX_ADD_SUB_X (pT4, pTmp2, pTmp4);*/
-        VADD    qInp7,qRe2,qIm4
-        VSUB    qInp8,qIm2,qRe4
-
-        @/* multiply onebyN */
-#if defined (NE10_ENABLE_HF)
-        LDR           grpCount,[sp,#0]          @revert the original value
-#else
-        LDR           grpCount,[sp,#104]          @revert the original value
-#endif
-        VDUP.f32      q8,grpCount
-
-        VMUL    qInp1,qInp1,qRe1
-        VMUL    qInp2,qInp2,qRe1
-        VMUL    qInp3,qInp3,qRe1
-        VMUL    qInp4,qInp4,qRe1
-        VMUL    qInp5,qInp5,qRe1
-        VMUL    qInp6,qInp6,qRe1
-        VMUL    qInp7,qInp7,qRe1
-        VMUL    qInp8,qInp8,qRe1
-
-        SUBS    setCount,#4
-        @/* Store the Result*/
-
-        VST2    {qInp1,qInp2},[pOut1],SubFFTSize
-        VST2    {qInp3,qInp4},[pOut1],SubFFTSize
-
-        VST2    {qInp5,qInp6},[pOut1],SubFFTSize
-        VST2    {qInp7,qInp8},[pOut1],SubFFTSize
-
-        SUB     pTw2,pTw2,TwdStep, LSL #1
-        SUB     pOut1,pOut1,SubFFTSize, LSL #2
-
-        ADD     pT1,pT1,#32
-        ADD     pTw2,pTw2,#32
-        ADD     pOut1,pOut1,#32
-
-        BGT     ifftLastStageSetLoop
-
-        @/* Swap Input and Output*/
-        MOV     pTmp,pDst
-        MOV     pDst,pSrc
-        MOV     pSrc,pTmp
-
-        @/* if N is an even power of 4, copy the output to the dst buffer */
-        ASR     fftSize,fftSize,#1
-        CLZ     SubFFTNum,fftSize
-        MOV     setCount, #32
-        SUB     SubFFTNum, setCount, SubFFTNum
-        ASR     SubFFTNum,SubFFTNum,#1
-        ANDS    SubFFTNum, SubFFTNum, #1
-
-        BNE     ifftEnd
-
-        ASR     grpCount, fftSize, #4
-
-ifftCopyLoop:
-        VLD1.F32    {d0,d1,d2,d3},[pSrc]!
-        VLD1.F32    {d4,d5,d6,d7},[pSrc]!
-        VLD1.F32    {d8,d9,d10,d11},[pSrc]!
-        VLD1.F32    {d12,d13,d14,d15},[pSrc]!
-
-        SUBS       grpCount,#1
-        VST1.F32    {d0,d1,d2,d3},[pDst]!
-        VST1.F32    {d4,d5,d6,d7},[pDst]!
-        VST1.F32    {d8,d9,d10,d11},[pDst]!
-        VST1.F32    {d12,d13,d14,d15},[pDst]!
-
-        BGT        ifftCopyLoop
-
-ifftEnd:
-        @/* Return From Function*/
-#if defined (NE10_ENABLE_HF)
-        VPOP    {s0,s1}
-#endif
-        VPOP    {d8-d15}
-        POP     {r4-r12,pc}
-
-
-        .end
-
diff --git a/modules/dsp/NE10_cfft_init.c b/modules/dsp/NE10_cfft_init.c
deleted file mode 100644 (file)
index 350d791..0000000
+++ /dev/null
@@ -1,387 +0,0 @@
-/*
- *  Copyright 2012-14 ARM Limited
- *  All rights reserved.
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions are met:
- *    * Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *    * Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *    * Neither the name of ARM Limited nor the
- *      names of its contributors may be used to endorse or promote products
- *      derived from this software without specific prior written permission.
- *
- *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
- *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
- *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "NE10_types.h"
-
-/*
-* @brief  Twiddle factors Table
-*/
-/** Pseudo code for Twiddle factor Tables Generation:
-
-for i=1 to N
-    cfft_twiddle_coef(2*i) = cos((i-1) * 2*PI/(float)N)
-    cfft_twiddle_coef(2*i + 1) = sin((i-1) * 2*PI/(float)N)
-end
-
-where N = 1024    and PI = 3.14159265358979
-
-N is the maximum FFT size supported, and
-Cos and Sin values are stored in interleaved fashion
-*/
-
-/*Twiddles below are generated for each FFT-DIT stage separately*/
-
-static ne10_float32_t cfft_twiddle_coef[2040]={
-1.000000,0.000000,0.923880,0.382683,0.707107,0.707107,0.382683,0.923880,
-1.000000,0.000000,0.707107,0.707107,0.000000,1.000000,-0.707107,0.707107,
-1.000000,0.000000,0.382683,0.923880,-0.707107,0.707107,-0.923880,-0.382683,
-
-1.000000,0.000000,0.995185,0.098017,0.980785,0.195090,0.956940,0.290285,0.923880
-,0.382683,0.881921,0.471397,0.831470,0.555570,0.773010,0.634393,0.707107,
-0.707107,0.634393,0.773010,0.555570,0.831470,0.471397,0.881921,0.382683,0.923880,
-0.290285,0.956940,0.195090,0.980785,0.098017,0.995185,
-
-1.000000,0.000000,0.980785,0.195090,0.923880,0.382683,0.831470,0.555570,0.707107
-,0.707107,0.555570,0.831470,0.382683,0.923880,0.195090,0.980785,0.000000,
-1.000000,-0.195090,0.980785,-0.382683,0.923880,-0.555570,0.831470,-0.707107,0.707107,
--0.831470,0.555570,-0.923880,0.382683,-0.980785,0.195090,
-
-1.000000,0.000000,0.956940,0.290285,0.831470,0.555570,0.634393,0.773010,0.382683
-,0.923880,0.098017,0.995185,-0.195090,0.980785,-0.471397,0.881921,-0.707107,
-0.707107,-0.881921,0.471397,-0.980785,0.195090,-0.995185,-0.098017,-0.923880,
--0.382683,-0.773010,-0.634393,-0.555570,-0.831470,-0.290285,-0.956940,
-
-1.000000,0.000000,0.999699,0.024541,0.998795,0.049068,0.997290,0.073565,0.995185
-,0.098017,0.992480,0.122411,0.989177,0.146730,0.985278,0.170962,0.980785,
-0.195090,0.975702,0.219101,0.970031,0.242980,0.963776,0.266713,0.956940,0.290285,
-0.949528,0.313682,0.941544,0.336890,0.932993,0.359895,0.923880,0.382683,0.914210,
-0.405241,0.903989,0.427555,0.893224,0.449611,0.881921,0.471397,0.870087,0.492898,
-0.857729,0.514103,0.844854,0.534998,0.831470,0.555570,0.817585,0.575808,0.803208,
-0.595699,0.788346,0.615232,0.773010,0.634393,0.757209,0.653173,0.740951,0.671559,
-0.724247,0.689541,0.707107,0.707107,0.689541,0.724247,0.671559,0.740951,0.653173,
-0.757209,0.634393,0.773010,0.615232,0.788346,0.595699,0.803208,0.575808,0.817585,
-0.555570,0.831470,0.534998,0.844854,0.514103,0.857729,0.492898,0.870087,0.471397
-,0.881921,0.449611,0.893224,0.427555,0.903989,0.405241,0.914210,0.382683,
-0.923880,0.359895,0.932993,0.336890,0.941544,0.313682,0.949528,0.290285,0.956940,
-0.266713,0.963776,0.242980,0.970031,0.219101,0.975702,0.195090,0.980785,0.170962,
-0.985278,0.146730,0.989177,0.122411,0.992480,0.098017,0.995185,0.073565,0.997290,
-0.049068,0.998795,0.024541,0.999699,
-
-1.000000,0.000000,0.998795,0.049068,0.995185,0.098017,0.989177,0.146730,0.980785
-,0.195090,0.970031,0.242980,0.956940,0.290285,0.941544,0.336890,0.923880,
-0.382683,0.903989,0.427555,0.881921,0.471397,0.857729,0.514103,0.831470,0.555570,
-0.803208,0.595699,0.773010,0.634393,0.740951,0.671559,0.707107,0.707107,0.671559,
-0.740951,0.634393,0.773010,0.595699,0.803208,0.555570,0.831470,0.514103,0.857729,
-0.471397,0.881921,0.427555,0.903989,0.382683,0.923880,0.336890,0.941544,0.290285,
-0.956940,0.242980,0.970031,0.195090,0.980785,0.146730,0.989177,0.098017,0.995185,
-0.049068,0.998795,0.000000,1.000000,-0.049068,0.998795,-0.098017,0.995185,
--0.146730,0.989177,-0.195090,0.980785,-0.242980,0.970031,-0.290285,0.956940,-0.336890,
-0.941544,-0.382683,0.923880,-0.427555,0.903989,-0.471397,0.881921,-0.514103,
-0.857729,-0.555570,0.831470,-0.595699,0.803208,-0.634393,0.773010,-0.671559,0.740951,
--0.707107,0.707107,-0.740951,0.671559,-0.773010,0.634393,-0.803208,0.595699,
--0.831470,0.555570,-0.857729,0.514103,-0.881921,0.471397,-0.903989,0.427555,
--0.923880,0.382683,-0.941544,0.336890,-0.956940,0.290285,-0.970031,0.242980,-0.980785,
-0.195090,-0.989177,0.146730,-0.995185,0.098017,-0.998795,0.049068,
-
-1.000000,0.000000,0.997290,0.073565,0.989177,0.146730,0.975702,0.219101,0.956940
-,0.290285,0.932993,0.359895,0.903989,0.427555,0.870087,0.492898,0.831470,
-0.555570,0.788346,0.615232,0.740951,0.671559,0.689541,0.724247,0.634393,0.773010,
-0.575808,0.817585,0.514103,0.857729,0.449611,0.893224,0.382683,0.923880,0.313682,
-0.949528,0.242980,0.970031,0.170962,0.985278,0.098017,0.995185,0.024541,0.999699,
--0.049068,0.998795,-0.122411,0.992480,-0.195090,0.980785,-0.266713,0.963776,
--0.336890,0.941544,-0.405241,0.914210,-0.471397,0.881921,-0.534998,0.844854,-0.595699,
-0.803208,-0.653173,0.757209,-0.707107,0.707107,-0.757209,0.653173,-0.803208,
-0.595699,-0.844854,0.534998,-0.881921,0.471397,-0.914210,0.405241,-0.941544,0.336890,
--0.963776,0.266713,-0.980785,0.195090,-0.992480,0.122411,-0.998795,0.049068,
--0.999699,-0.024541,-0.995185,-0.098017,-0.985278,-0.170962,-0.970031,-0.242980,
--0.949528,-0.313682,-0.923880,-0.382683,-0.893224,-0.449611,-0.857729,-0.514103,
--0.817585,-0.575808,-0.773010,-0.634393,-0.724247,-0.689541,-0.671559,-0.740951,
--0.615232,-0.788346,-0.555570,-0.831470,-0.492898,-0.870087,-0.427555,-0.903989,
--0.359895,-0.932993,-0.290285,-0.956940,-0.219101,-0.975702,-0.146730,-0.989177,
--0.073565,-0.997290,
-
-
-1.000000,0.000000,0.999981,0.006136,0.999925,0.012272,0.999831,0.018407,0.999699
-,0.024541,0.999529,0.030675,0.999322,0.036807,0.999078,0.042938,0.998795,
-0.049068,0.998476,0.055195,0.998118,0.061321,0.997723,0.067444,0.997290,0.073565,
-0.996820,0.079682,0.996313,0.085797,0.995767,0.091909,0.995185,0.098017,0.994565,
-0.104122,0.993907,0.110222,0.993212,0.116319,0.992480,0.122411,0.991710,0.128498,
-0.990903,0.134581,0.990058,0.140658,0.989177,0.146730,0.988258,0.152797,0.987301,
-0.158858,0.986308,0.164913,0.985278,0.170962,0.984210,0.177004,0.983105,0.183040,
-0.981964,0.189069,0.980785,0.195090,0.979570,0.201105,0.978317,0.207111,0.977028,
-0.213110,0.975702,0.219101,0.974339,0.225084,0.972940,0.231058,0.971504,0.237024,
-0.970031,0.242980,0.968522,0.248928,0.966976,0.254866,0.965394,0.260794,0.963776
-,0.266713,0.962121,0.272621,0.960431,0.278520,0.958703,0.284408,0.956940,
-0.290285,0.955141,0.296151,0.953306,0.302006,0.951435,0.307850,0.949528,0.313682,
-0.947586,0.319502,0.945607,0.325310,0.943593,0.331106,0.941544,0.336890,0.939459,
-0.342661,0.937339,0.348419,0.935184,0.354164,0.932993,0.359895,0.930767,0.365613,
-0.928506,0.371317,0.926210,0.377007,0.923880,0.382683,0.921514,0.388345,0.919114,
-0.393992,0.916679,0.399624,0.914210,0.405241,0.911706,0.410843,0.909168,0.416430,
-0.906596,0.422000,0.903989,0.427555,0.901349,0.433094,0.898674,0.438616,0.895966,
-0.444122,0.893224,0.449611,0.890449,0.455084,0.887640,0.460539,0.884797,0.465977,
-0.881921,0.471397,0.879012,0.476799,0.876070,0.482184,0.873095,0.487550,0.870087
-,0.492898,0.867046,0.498228,0.863973,0.503538,0.860867,0.508830,0.857729,
-0.514103,0.854558,0.519356,0.851355,0.524590,0.848120,0.529804,0.844854,0.534998,
-0.841555,0.540171,0.838225,0.545325,0.834863,0.550458,0.831470,0.555570,0.828045,
-0.560662,0.824589,0.565732,0.821102,0.570781,0.817585,0.575808,0.814036,0.580814,
-0.810457,0.585798,0.806848,0.590760,0.803208,0.595699,0.799537,0.600616,0.795837,
-0.605511,0.792107,0.610383,0.788346,0.615232,0.784557,0.620057,0.780737,0.624860,
-0.776888,0.629638,0.773010,0.634393,0.769103,0.639124,0.765167,0.643832,0.761202,
-0.648514,0.757209,0.653173,0.753187,0.657807,0.749136,0.662416,0.745058,0.667000,
-0.740951,0.671559,0.736817,0.676093,0.732654,0.680601,0.728464,0.685084,0.724247
-,0.689541,0.720003,0.693971,0.715731,0.698376,0.711432,0.702755,0.707107,
-0.707107,0.702755,0.711432,0.698376,0.715731,0.693971,0.720003,0.689541,0.724247,
-0.685084,0.728464,0.680601,0.732654,0.676093,0.736817,0.671559,0.740951,0.667000,
-0.745058,0.662416,0.749136,0.657807,0.753187,0.653173,0.757209,0.648514,0.761202,
-0.643832,0.765167,0.639124,0.769103,0.634393,0.773010,0.629638,0.776888,0.624860,
-0.780737,0.620057,0.784557,0.615232,0.788346,0.610383,0.792107,0.605511,0.795837,
-0.600616,0.799537,0.595699,0.803208,0.590760,0.806848,0.585798,0.810457,0.580814,
-0.814036,0.575808,0.817585,0.570781,0.821102,0.565732,0.824589,0.560662,0.828045,
-0.555570,0.831470,0.550458,0.834863,0.545325,0.838225,0.540171,0.841555,0.534998
-,0.844854,0.529804,0.848120,0.524590,0.851355,0.519356,0.854558,0.514103,
-0.857729,0.508830,0.860867,0.503538,0.863973,0.498228,0.867046,0.492898,0.870087,
-0.487550,0.873095,0.482184,0.876070,0.476799,0.879012,0.471397,0.881921,0.465977,
-0.884797,0.460539,0.887640,0.455084,0.890449,0.449611,0.893224,0.444122,0.895966,
-0.438616,0.898674,0.433094,0.901349,0.427555,0.903989,0.422000,0.906596,0.416430,
-0.909168,0.410843,0.911706,0.405241,0.914210,0.399624,0.916679,0.393992,0.919114,
-0.388345,0.921514,0.382683,0.923880,0.377007,0.926210,0.371317,0.928506,0.365613,
-0.930767,0.359895,0.932993,0.354164,0.935184,0.348419,0.937339,0.342661,0.939459,
-0.336890,0.941544,0.331106,0.943593,0.325310,0.945607,0.319502,0.947586,0.313682
-,0.949528,0.307850,0.951435,0.302006,0.953306,0.296151,0.955141,0.290285,
-0.956940,0.284408,0.958703,0.278520,0.960431,0.272621,0.962121,0.266713,0.963776,
-0.260794,0.965394,0.254866,0.966976,0.248928,0.968522,0.242980,0.970031,0.237024,
-0.971504,0.231058,0.972940,0.225084,0.974339,0.219101,0.975702,0.213110,0.977028,
-0.207111,0.978317,0.201105,0.979570,0.195090,0.980785,0.189069,0.981964,0.183040,
-0.983105,0.177004,0.984210,0.170962,0.985278,0.164913,0.986308,0.158858,0.987301,
-0.152797,0.988258,0.146730,0.989177,0.140658,0.990058,0.134581,0.990903,0.128498,
-0.991710,0.122411,0.992480,0.116319,0.993212,0.110222,0.993907,0.104122,0.994565,
-0.098017,0.995185,0.091909,0.995767,0.085797,0.996313,0.079682,0.996820,0.073565
-,0.997290,0.067444,0.997723,0.061321,0.998118,0.055195,0.998476,0.049068,
-0.998795,0.042938,0.999078,0.036807,0.999322,0.030675,0.999529,0.024541,0.999699,
-0.018407,0.999831,0.012272,0.999925,0.006136,0.999981,
-
-
-1.000000,0.000000,0.999925,0.012272,0.999699,0.024541,0.999322,0.036807,0.998795
-,0.049068,0.998118,0.061321,0.997290,0.073565,0.996313,0.085797,0.995185,
-0.098017,0.993907,0.110222,0.992480,0.122411,0.990903,0.134581,0.989177,0.146730,
-0.987301,0.158858,0.985278,0.170962,0.983105,0.183040,0.980785,0.195090,0.978317,
-0.207111,0.975702,0.219101,0.972940,0.231058,0.970031,0.242980,0.966976,0.254866,
-0.963776,0.266713,0.960431,0.278520,0.956940,0.290285,0.953306,0.302006,0.949528,
-0.313682,0.945607,0.325310,0.941544,0.336890,0.937339,0.348419,0.932993,0.359895,
-0.928506,0.371317,0.923880,0.382683,0.919114,0.393992,0.914210,0.405241,0.909168,
-0.416430,0.903989,0.427555,0.898674,0.438616,0.893224,0.449611,0.887640,0.460539,
-0.881921,0.471397,0.876070,0.482184,0.870087,0.492898,0.863973,0.503538,0.857729
-,0.514103,0.851355,0.524590,0.844854,0.534998,0.838225,0.545325,0.831470,
-0.555570,0.824589,0.565732,0.817585,0.575808,0.810457,0.585798,0.803208,0.595699,
-0.795837,0.605511,0.788346,0.615232,0.780737,0.624860,0.773010,0.634393,0.765167,
-0.643832,0.757209,0.653173,0.749136,0.662416,0.740951,0.671559,0.732654,0.680601,
-0.724247,0.689541,0.715731,0.698376,0.707107,0.707107,0.698376,0.715731,0.689541,
-0.724247,0.680601,0.732654,0.671559,0.740951,0.662416,0.749136,0.653173,0.757209,
-0.643832,0.765167,0.634393,0.773010,0.624860,0.780737,0.615232,0.788346,0.605511,
-0.795837,0.595699,0.803208,0.585798,0.810457,0.575808,0.817585,0.565732,0.824589,
-0.555570,0.831470,0.545325,0.838225,0.534998,0.844854,0.524590,0.851355,0.514103
-,0.857729,0.503538,0.863973,0.492898,0.870087,0.482184,0.876070,0.471397,
-0.881921,0.460539,0.887640,0.449611,0.893224,0.438616,0.898674,0.427555,0.903989,
-0.416430,0.909168,0.405241,0.914210,0.393992,0.919114,0.382683,0.923880,0.371317,
-0.928506,0.359895,0.932993,0.348419,0.937339,0.336890,0.941544,0.325310,0.945607,
-0.313682,0.949528,0.302006,0.953306,0.290285,0.956940,0.278520,0.960431,0.266713,
-0.963776,0.254866,0.966976,0.242980,0.970031,0.231058,0.972940,0.219101,0.975702,
-0.207111,0.978317,0.195090,0.980785,0.183040,0.983105,0.170962,0.985278,0.158858,
-0.987301,0.146730,0.989177,0.134581,0.990903,0.122411,0.992480,0.110222,0.993907,
-0.098017,0.995185,0.085797,0.996313,0.073565,0.997290,0.061321,0.998118,0.049068
-,0.998795,0.036807,0.999322,0.024541,0.999699,0.012272,0.999925,0.000000,
-1.000000,-0.012272,0.999925,-0.024541,0.999699,-0.036807,0.999322,-0.049068,0.998795,
--0.061321,0.998118,-0.073565,0.997290,-0.085797,0.996313,-0.098017,0.995185,
--0.110222,0.993907,-0.122411,0.992480,-0.134581,0.990903,-0.146730,0.989177,-0.158858,
-0.987301,-0.170962,0.985278,-0.183040,0.983105,-0.195090,0.980785,-0.207111,
-0.978317,-0.219101,0.975702,-0.231058,0.972940,-0.242980,0.970031,-0.254866,
-0.966976,-0.266713,0.963776,-0.278520,0.960431,-0.290285,0.956940,-0.302006,0.953306,
--0.313682,0.949528,-0.325310,0.945607,-0.336890,0.941544,-0.348419,0.937339,
--0.359895,0.932993,-0.371317,0.928506,-0.382683,0.923880,-0.393992,0.919114,-0.405241,
-0.914210,-0.416430,0.909168,-0.427555,0.903989,-0.438616,0.898674,-0.449611,
-0.893224,-0.460539,0.887640,-0.471397,0.881921,-0.482184,0.876070,-0.492898,0.870087,
--0.503538,0.863973,-0.514103,0.857729,-0.524590,0.851355,-0.534998,0.844854,
--0.545325,0.838225,-0.555570,0.831470,-0.565732,0.824589,-0.575808,0.817585,
--0.585798,0.810457,-0.595699,0.803208,-0.605511,0.795837,-0.615232,0.788346,-0.624860,
-0.780737,-0.634393,0.773010,-0.643832,0.765167,-0.653173,0.757209,-0.662416,
-0.749136,-0.671559,0.740951,-0.680601,0.732654,-0.689541,0.724247,-0.698376,0.715731,
--0.707107,0.707107,-0.715731,0.698376,-0.724247,0.689541,-0.732654,0.680601,
--0.740951,0.671559,-0.749136,0.662416,-0.757209,0.653173,-0.765167,0.643832,
--0.773010,0.634393,-0.780737,0.624860,-0.788346,0.615232,-0.795837,0.605511,-0.803208,
-0.595699,-0.810457,0.585798,-0.817585,0.575808,-0.824589,0.565732,-0.831470,
-0.555570,-0.838225,0.545325,-0.844854,0.534998,-0.851355,0.524590,-0.857729,0.514103,
--0.863973,0.503538,-0.870087,0.492898,-0.876070,0.482184,-0.881921,0.471397,
--0.887640,0.460539,-0.893224,0.449611,-0.898674,0.438616,-0.903989,0.427555,-0.909168,
-0.416430,-0.914210,0.405241,-0.919114,0.393992,-0.923880,0.382683,-0.928506,
-0.371317,-0.932993,0.359895,-0.937339,0.348419,-0.941544,0.336890,-0.945607,
-0.325310,-0.949528,0.313682,-0.953306,0.302006,-0.956940,0.290285,-0.960431,0.278520,
--0.963776,0.266713,-0.966976,0.254866,-0.970031,0.242980,-0.972940,0.231058,
--0.975702,0.219101,-0.978317,0.207111,-0.980785,0.195090,-0.983105,0.183040,-0.985278,
-0.170962,-0.987301,0.158858,-0.989177,0.146730,-0.990903,0.134581,-0.992480,
-0.122411,-0.993907,0.110222,-0.995185,0.098017,-0.996313,0.085797,-0.997290,0.073565,
--0.998118,0.061321,-0.998795,0.049068,-0.999322,0.036807,-0.999699,0.024541,
--0.999925,0.012272,
-
-
-
-1.000000,0.000000,0.999831,0.018407,0.999322,0.036807,0.998476,0.055195,0.997290
-,0.073565,0.995767,0.091909,0.993907,0.110222,0.991710,0.128498,0.989177,
-0.146730,0.986308,0.164913,0.983105,0.183040,0.979570,0.201105,0.975702,0.219101,
-0.971504,0.237024,0.966976,0.254866,0.962121,0.272621,0.956940,0.290285,0.951435,
-0.307850,0.945607,0.325310,0.939459,0.342661,0.932993,0.359895,0.926210,0.377007,
-0.919114,0.393992,0.911706,0.410843,0.903989,0.427555,0.895966,0.444122,0.887640,
-0.460539,0.879012,0.476799,0.870087,0.492898,0.860867,0.508830,0.851355,0.524590,
-0.841555,0.540171,0.831470,0.555570,0.821102,0.570781,0.810457,0.585798,0.799537,
-0.600616,0.788346,0.615232,0.776888,0.629638,0.765167,0.643832,0.753187,0.657807,
-0.740951,0.671559,0.728464,0.685084,0.715731,0.698376,0.702755,0.711432,0.689541
-,0.724247,0.676093,0.736817,0.662416,0.749136,0.648514,0.761202,0.634393,
-0.773010,0.620057,0.784557,0.605511,0.795837,0.590760,0.806848,0.575808,0.817585,
-0.560662,0.828045,0.545325,0.838225,0.529804,0.848120,0.514103,0.857729,0.498228,
-0.867046,0.482184,0.876070,0.465977,0.884797,0.449611,0.893224,0.433094,0.901349,
-0.416430,0.909168,0.399624,0.916679,0.382683,0.923880,0.365613,0.930767,0.348419,
-0.937339,0.331106,0.943593,0.313682,0.949528,0.296151,0.955141,0.278520,0.960431,
-0.260794,0.965394,0.242980,0.970031,0.225084,0.974339,0.207111,0.978317,0.189069,
-0.981964,0.170962,0.985278,0.152797,0.988258,0.134581,0.990903,0.116319,0.993212,
-0.098017,0.995185,0.079682,0.996820,0.061321,0.998118,0.042938,0.999078,0.024541
-,0.999699,0.006136,0.999981,-0.012272,0.999925,-0.030675,0.999529,-0.049068,
-0.998795,-0.067444,0.997723,-0.085797,0.996313,-0.104122,0.994565,-0.122411,
-0.992480,-0.140658,0.990058,-0.158858,0.987301,-0.177004,0.984210,-0.195090,0.980785,
--0.213110,0.977028,-0.231058,0.972940,-0.248928,0.968522,-0.266713,0.963776,
--0.284408,0.958703,-0.302006,0.953306,-0.319502,0.947586,-0.336890,0.941544,-0.354164,
-0.935184,-0.371317,0.928506,-0.388345,0.921514,-0.405241,0.914210,-0.422000,
-0.906596,-0.438616,0.898674,-0.455084,0.890449,-0.471397,0.881921,-0.487550,0.873095,
--0.503538,0.863973,-0.519356,0.854558,-0.534998,0.844854,-0.550458,0.834863,
--0.565732,0.824589,-0.580814,0.814036,-0.595699,0.803208,-0.610383,0.792107,
--0.624860,0.780737,-0.639124,0.769103,-0.653173,0.757209,-0.667000,0.745058,-0.680601,
-0.732654,-0.693971,0.720003,-0.707107,0.707107,-0.720003,0.693971,-0.732654,
-0.680601,-0.745058,0.667000,-0.757209,0.653173,-0.769103,0.639124,-0.780737,0.624860,
--0.792107,0.610383,-0.803208,0.595699,-0.814036,0.580814,-0.824589,0.565732,
--0.834863,0.550458,-0.844854,0.534998,-0.854558,0.519356,-0.863973,0.503538,-0.873095
-,0.487550,-0.881921,0.471397,-0.890449,0.455084,-0.898674,0.438616,-0.906596,
-0.422000,-0.914210,0.405241,-0.921514,0.388345,-0.928506,0.371317,-0.935184,
-0.354164,-0.941544,0.336890,-0.947586,0.319502,-0.953306,0.302006,-0.958703,0.284408,
--0.963776,0.266713,-0.968522,0.248928,-0.972940,0.231058,-0.977028,0.213110,
--0.980785,0.195090,-0.984210,0.177004,-0.987301,0.158858,-0.990058,0.140658,-0.992480,
-0.122411,-0.994565,0.104122,-0.996313,0.085797,-0.997723,0.067444,-0.998795,
-0.049068,-0.999529,0.030675,-0.999925,0.012272,-0.999981,-0.006136,-0.999699,
--0.024541,-0.999078,-0.042938,-0.998118,-0.061321,-0.996820,-0.079682,-0.995185,
--0.098017,-0.993212,-0.116319,-0.990903,-0.134581,-0.988258,-0.152797,-0.985278,
--0.170962,-0.981964,-0.189069,-0.978317,-0.207111,-0.974339,-0.225084,-0.970031,
--0.242980,-0.965394,-0.260794,-0.960431,-0.278520,-0.955141,-0.296151,-0.949528,
--0.313682,-0.943593,-0.331106,-0.937339,-0.348419,-0.930767,-0.365613,-0.923880,
--0.382683,-0.916679,-0.399624,-0.909168,-0.416430,-0.901349,-0.433094,-0.893224,
--0.449611,-0.884797,-0.465977,-0.876070,-0.482184,-0.867046,-0.498228,-0.857729,
--0.514103,-0.848120,-0.529804,-0.838225,-0.545325,-0.828045,-0.560662,-0.817585,
--0.575808,-0.806848,-0.590760,-0.795837,-0.605511,-0.784557,-0.620057,-0.773010,
--0.634393,-0.761202,-0.648514,-0.749136,-0.662416,-0.736817,-0.676093,-0.724247,
--0.689541,-0.711432,-0.702755,-0.698376,-0.715731,-0.685084,-0.728464,-0.671559,
--0.740951,-0.657807,-0.753187,-0.643832,-0.765167,-0.629638,-0.776888,-0.615232,
--0.788346,-0.600616,-0.799537,-0.585798,-0.810457,-0.570781,-0.821102,-0.555570,
--0.831470,-0.540171,-0.841555,-0.524590,-0.851355,-0.508830,-0.860867,-0.492898,
--0.870087,-0.476799,-0.879012,-0.460539,-0.887640,-0.444122,-0.895966,-0.427555,
--0.903989,-0.410843,-0.911706,-0.393992,-0.919114,-0.377007,-0.926210,-0.359895,
--0.932993,-0.342661,-0.939459,-0.325310,-0.945607,-0.307850,-0.951435,-0.290285,
--0.956940,-0.272621,-0.962121,-0.254866,-0.966976,-0.237024,-0.971504,-0.219101,
--0.975702,-0.201105,-0.979570,-0.183040,-0.983105,-0.164913,-0.986308,-0.146730,
--0.989177,-0.128498,-0.991710,-0.110222,-0.993907,-0.091909,-0.995767,-0.073565,
--0.997290,-0.055195,-0.998476,-0.036807,-0.999322,-0.018407,-0.999831
-
-};
-
-/*
-* @brief  Initialization function for the floating point CFFT/CIFFT function.
-*
-* @param[in,out] *S points to an instance of the floating point CFFT/CIFFT function structure.
-* @param[in] fftLen  length of the CFFT/CIFFT; the supported values are 16, 64, 256 and 1024.
-* @param[in] ifftFlag  flag for the selection of CFFT or CIFFT; it is copied unchanged into S->ifft_flag.
-* @return The function returns NE10_OK if initialization was successful or NE10_ERR if
-* <code>fftLen</code> is not a supported value.
-* Note: fft_len, twid_coef_modifier and ifft_flag of *S are written even when NE10_ERR is returned.
-* The function initialises the twiddle coefficient pointer, the bit reversal factor and the 1/fftLen value.
-*/
-
-ne10_result_t ne10_cfft_radix4_init_float(
-  ne10_cfft_radix4_instance_f32_t * S,
-  ne10_uint16_t fftLen,
-  ne10_uint8_t ifftFlag)
-{
-  ne10_uint32_t i,j;
-  /*  Default to success; only the default case below changes it. NOTE(review): i and j above are never used */
-  ne10_result_t status = NE10_OK;
-
-  /*  Store the FFT length; this happens before the length is validated */
-  S->fft_len = fftLen;
-
-  /*  The twiddle coef modifier is 1 for every length */
-  S->twid_coef_modifier = 1u;
-
-  /*  Store the flag for selection of CFFT or CIFFT */
-  S->ifft_flag = ifftFlag;
-
-  /*  Length-dependent fields: every supported length uses the same table; bit_rev_factor is 1024 / fft_len */
-  switch (S->fft_len)
-  {
-
-  case 1024u:
-    /*  Initializations of structure parameters for 1024 point FFT */
-
-    /*  Initialise the twiddle coefficient pointer */
-    S->p_twiddle = (ne10_float32_t *) cfft_twiddle_coef;
-
-    /*  Initialise the bit reversal factor */
-    S->bit_rev_factor = 1u;
-    /*  Initialise the 1/N value: 1/1024 */
-    S->one_by_fft_len = 0.0009765625f;
-    break;
-
-
-  case 256u:
-    /*  Initializations of structure parameters for 256 point FFT: bit reversal factor 4, 1/N = 1/256 */
-
-    /*  Initialise the twiddle coefficient pointer */
-    S->p_twiddle = (ne10_float32_t *) cfft_twiddle_coef;
-    S->bit_rev_factor = 4u;
-    S->one_by_fft_len = 0.00390625f;
-    break;
-
-  case 64u:
-    /*  Initializations of structure parameters for 64 point FFT: bit reversal factor 16, 1/N = 1/64 */
-    /*  Initialise the twiddle coefficient pointer */
-    S->p_twiddle = (ne10_float32_t *) cfft_twiddle_coef;
-    S->bit_rev_factor = 16u;
-    S->one_by_fft_len = 0.015625f;
-    break;
-
-  case 16u:
-    /*  Initializations of structure parameters for 16 point FFT: bit reversal factor 64, 1/N = 1/16 */
-
-    /*  Initialise the twiddle coefficient pointer */
-    S->p_twiddle = (ne10_float32_t *) cfft_twiddle_coef;
-
-    S->bit_rev_factor = 64u;
-    S->one_by_fft_len = 0.0625f;
-    break;
-
-
-  default:
-    /*  Report an argument error if fftLen is not a supported value */
-    status = NE10_ERR;
-    break;
-  }
-  return status;
-}
-
index 65136a5..519179c 100644 (file)
@@ -46,10 +46,6 @@ extern "C" {
     /*common fft functions */
 
     /*common functions for float fft */
-    extern void ne10_data_bitreversal_float32 (ne10_fft_cpx_float32_t * Fout,
-            const ne10_fft_cpx_float32_t * f,
-            ne10_int32_t fstride,
-            ne10_int32_t * factors);
     extern void ne10_fft_split_r2c_1d_float32 (ne10_fft_cpx_float32_t *dst,
             const ne10_fft_cpx_float32_t *src,
             ne10_fft_cpx_float32_t *twiddles,
@@ -58,41 +54,17 @@ extern "C" {
             const ne10_fft_cpx_float32_t *src,
             ne10_fft_cpx_float32_t *twiddles,
             ne10_int32_t ncfft);
-    extern void ne10_radix4_butterfly_forward_float32_neon (ne10_fft_cpx_float32_t * Fout,
+    extern void ne10_mixed_radix_fft_forward_float32_neon (ne10_fft_cpx_float32_t * Fout,
+            ne10_fft_cpx_float32_t * Fin,
             ne10_int32_t * factors,
             ne10_fft_cpx_float32_t * twiddles)
-    asm ("ne10_radix4_butterfly_forward_float32_neon");
-    extern void ne10_radix4_butterfly_backward_float32_neon (ne10_fft_cpx_float32_t * Fout,
-            ne10_int32_t * factors,
-            ne10_fft_cpx_float32_t * twiddles)
-    asm ("ne10_radix4_butterfly_backward_float32_neon");
+    asm ("ne10_mixed_radix_fft_forward_float32_neon");
 
-    extern void ne10_radix2_butterfly_forward_float32_neon (ne10_fft_cpx_float32_t * Fout,
+    extern void ne10_mixed_radix_fft_backward_float32_neon (ne10_fft_cpx_float32_t * Fout,
+            ne10_fft_cpx_float32_t * Fin,
             ne10_int32_t * factors,
             ne10_fft_cpx_float32_t * twiddles)
-    asm ("ne10_radix2_butterfly_forward_float32_neon");
-    extern void ne10_radix2_butterfly_backward_float32_neon (ne10_fft_cpx_float32_t * Fout,
-            ne10_int32_t * factors,
-            ne10_fft_cpx_float32_t * twiddles)
-    asm ("ne10_radix2_butterfly_backward_float32_neon");
-
-    extern void ne10_mixed_radix_butterfly_length_even_power2_float32_neon (ne10_fft_cpx_float32_t * Fout,
-            ne10_int32_t * factors,
-            ne10_fft_cpx_float32_t * twiddles)
-    asm ("ne10_mixed_radix_butterfly_length_even_power2_float32_neon");
-    extern void ne10_mixed_radix_butterfly_length_odd_power2_float32_neon (ne10_fft_cpx_float32_t * Fout,
-            ne10_int32_t * factors,
-            ne10_fft_cpx_float32_t * twiddles)
-    asm ("ne10_mixed_radix_butterfly_length_odd_power2_float32_neon");
-
-    extern void ne10_mixed_radix_butterfly_inverse_length_even_power2_float32_neon (ne10_fft_cpx_float32_t * Fout,
-            ne10_int32_t * factors,
-            ne10_fft_cpx_float32_t * twiddles)
-    asm ("ne10_mixed_radix_butterfly_inverse_length_even_power2_float32_neon");
-    extern void ne10_mixed_radix_butterfly_inverse_length_odd_power2_float32_neon (ne10_fft_cpx_float32_t * Fout,
-            ne10_int32_t * factors,
-            ne10_fft_cpx_float32_t * twiddles)
-    asm ("ne10_mixed_radix_butterfly_inverse_length_odd_power2_float32_neon");
+    asm ("ne10_mixed_radix_fft_backward_float32_neon");
 
     /* common functions for fixed point fft */
     /* bit reversal for int 16 */
@@ -188,6 +160,7 @@ extern "C" {
             ne10_fft_cpx_int32_t * twiddles)
     asm ("ne10_radix2_butterfly_backward_int32_scaled_neon");
 
+
 #ifdef __cplusplus
 }
 #endif
index 03e7be2..37383d6 100644 (file)
@@ -49,229 +49,796 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 #include "NE10_fft.h"
 
 static void ne10_mixed_radix_butterfly_float32_c (ne10_fft_cpx_float32_t * Fout,
+        ne10_fft_cpx_float32_t   * Fin,
         ne10_int32_t * factors,
         ne10_fft_cpx_float32_t * twiddles)
 {
-    ne10_int32_t i, j, mstride;
+    ne10_int32_t fstride, mstride, N;
+    ne10_int32_t fstride1;
+    ne10_int32_t f_count, m_count;
     ne10_int32_t stage_count;
-    ne10_int32_t fstride;
 
-    ne10_fft_cpx_float32_t tmp;
-    ne10_fft_cpx_float32_t scratch[6];
-    ne10_fft_cpx_float32_t *tw, *tw1, *tw2, *tw3;
-    ne10_fft_cpx_float32_t * F;
+    ne10_fft_cpx_float32_t   scratch_in[8];
+    ne10_fft_cpx_float32_t   scratch_out[8];
+    ne10_fft_cpx_float32_t   scratch[16];
+    ne10_fft_cpx_float32_t   scratch_tw[6];
 
+    ne10_fft_cpx_float32_t   *Fin1, *Fin2, *Fout1, *Fout2;
+    ne10_fft_cpx_float32_t   *Fout_ls = Fout;
+    ne10_fft_cpx_float32_t   *Ftmp;
+    ne10_fft_cpx_float32_t   *tw, *tw1, *tw2;
+    const ne10_float32_t TW_81 = 0.70710678;
+    const ne10_float32_t TW_81N = -0.70710678;
 
-    // the first stage
+    // init fstride, mstride, N
     stage_count = factors[0];
     fstride = factors[1];
-    if (factors[2 * stage_count] == 2) // length of FFT is 2^n (n is odd)
+    mstride = factors[ (stage_count << 1) - 1 ];
+    N = factors[ stage_count << 1 ]; // radix
+
+    // the first stage
+    Fin1 = Fin;
+    Fout1 = Fout;
+    if (N == 2)   // length of FFT is 2^n (n is odd)
     {
-        //fstride is nfft>>1
-        for (i = 0; i < fstride; i++)
+        // radix 8
+        N = fstride >> 1; // 1/4 of length of FFT
+        tw = twiddles;
+        fstride1 = fstride >> 2;
+
+        Fin1 = Fin;
+        for (f_count = 0; f_count < fstride1; f_count ++)
         {
-            tmp.r = Fout[2 * i + 1].r;
-            tmp.i = Fout[2 * i + 1].i;
-            Fout[2 * i + 1].r = Fout[2 * i].r - tmp.r;
-            Fout[2 * i + 1].i = Fout[2 * i].i - tmp.i;
-            Fout[2 * i].r = Fout[2 * i].r + tmp.r;
-            Fout[2 * i].i = Fout[2 * i].i + tmp.i;
-        }
+            Fout1 = & Fout[ f_count * 8 ];
+            // load
+            scratch_tw[0] = tw[0];
+            scratch_tw[2] = tw[2];
+            scratch_tw[4] = tw[4];
+            scratch_tw[1] = tw[1];
+            scratch_tw[3] = tw[3];
+            scratch_tw[5] = tw[5];
+
+            scratch_in[0].r = Fin1[0].r + Fin1[0 + fstride].r;
+            scratch_in[0].i = Fin1[0].i + Fin1[0 + fstride].i;
+            scratch_in[1].r = Fin1[0].r - Fin1[0 + fstride].r;
+            scratch_in[1].i = Fin1[0].i - Fin1[0 + fstride].i;
+            scratch_in[2].r = Fin1[fstride1].r + Fin1[fstride1 + fstride].r;
+            scratch_in[2].i = Fin1[fstride1].i + Fin1[fstride1 + fstride].i;
+            scratch_in[3].r = Fin1[fstride1].r - Fin1[fstride1 + fstride].r;
+            scratch_in[3].i = Fin1[fstride1].i - Fin1[fstride1 + fstride].i;
+            scratch_in[4].r = Fin1[fstride1 * 2].r + Fin1[fstride1 * 2 + fstride].r;
+            scratch_in[4].i = Fin1[fstride1 * 2].i + Fin1[fstride1 * 2 + fstride].i;
+            scratch_in[5].r = Fin1[fstride1 * 2].r - Fin1[fstride1 * 2 + fstride].r;
+            scratch_in[5].i = Fin1[fstride1 * 2].i - Fin1[fstride1 * 2 + fstride].i;
+            scratch_in[6].r = Fin1[fstride1 * 3].r + Fin1[fstride1 * 3 + fstride].r;
+            scratch_in[6].i = Fin1[fstride1 * 3].i + Fin1[fstride1 * 3 + fstride].i;
+            scratch_in[7].r = Fin1[fstride1 * 3].r - Fin1[fstride1 * 3 + fstride].r;
+            scratch_in[7].i = Fin1[fstride1 * 3].i - Fin1[fstride1 * 3 + fstride].i;
+
+            // radix 4 butterfly without twiddles
+            scratch[0] = scratch_in[0];
+            scratch[1] = scratch_in[1];
+
+            scratch[2] = scratch_in[2];
+            scratch[3].r = (scratch_in[3].r + scratch_in[3].i) * TW_81;
+            scratch[3].i = (scratch_in[3].i - scratch_in[3].r) * TW_81;
+
+            scratch[4] = scratch_in[4];
+            scratch[5].r = scratch_in[5].i;
+            scratch[5].i = -scratch_in[5].r;
+
+            scratch[6].r = scratch_in[6].r;
+            scratch[6].i = scratch_in[6].i;
+            scratch[7].r = (scratch_in[7].r - scratch_in[7].i) * TW_81N;
+            scratch[7].i = (scratch_in[7].i + scratch_in[7].r) * TW_81N;
+
+            // radix 2 butterfly
+            scratch[8].r = scratch[0].r + scratch[4].r;
+            scratch[8].i = scratch[0].i + scratch[4].i;
+            scratch[9].r = scratch[1].r + scratch[5].r;
+            scratch[9].i = scratch[1].i + scratch[5].i;
+
+            scratch[10].r = scratch[0].r - scratch[4].r;
+            scratch[10].i = scratch[0].i - scratch[4].i;
+            scratch[11].r = scratch[1].r - scratch[5].r;
+            scratch[11].i = scratch[1].i - scratch[5].i;
+
+            // radix 2 butterfly
+            scratch[12].r = scratch[2].r + scratch[6].r;
+            scratch[12].i = scratch[2].i + scratch[6].i;
+            scratch[13].r = scratch[3].r + scratch[7].r;
+            scratch[13].i = scratch[3].i + scratch[7].i;
+
+            scratch[14].r = scratch[2].r - scratch[6].r;
+            scratch[14].i = scratch[2].i - scratch[6].i;
+            scratch[15].r = scratch[3].r - scratch[7].r;
+            scratch[15].i = scratch[3].i - scratch[7].i;
+
+            // third result
+            scratch_out[4].r = scratch[8].r - scratch[12].r;
+            scratch_out[4].i = scratch[8].i - scratch[12].i;
+            scratch_out[5].r = scratch[9].r - scratch[13].r;
+            scratch_out[5].i = scratch[9].i - scratch[13].i;
+
+            // first result
+            scratch_out[0].r = scratch[8].r + scratch[12].r;
+            scratch_out[0].i = scratch[8].i + scratch[12].i;
+            scratch_out[1].r = scratch[9].r + scratch[13].r;
+            scratch_out[1].i = scratch[9].i + scratch[13].i;
+
+            // second result
+            scratch_out[2].r = scratch[10].r + scratch[14].i;
+            scratch_out[2].i = scratch[10].i - scratch[14].r;
+            scratch_out[3].r = scratch[11].r + scratch[15].i;
+            scratch_out[3].i = scratch[11].i - scratch[15].r;
+
+            // forth result
+            scratch_out[6].r = scratch[10].r - scratch[14].i;
+            scratch_out[6].i = scratch[10].i + scratch[14].r;
+            scratch_out[7].r = scratch[11].r - scratch[15].i;
+            scratch_out[7].i = scratch[11].i + scratch[15].r;
+
+            // store
+            Fout1[0] = scratch_out[0];
+            Fout1[1] = scratch_out[1];
+            Fout1[2] = scratch_out[2];
+            Fout1[3] = scratch_out[3];
+            Fout1[4] = scratch_out[4];
+            Fout1[5] = scratch_out[5];
+            Fout1[6] = scratch_out[6];
+            Fout1[7] = scratch_out[7];
+
+            Fin1 += 1;
+        } // f_count
+        tw += 6;
+        mstride <<= 2;
+        fstride >>= 4;
+        stage_count -= 2;
+
+        // swap
+        Ftmp = Fin;
+        Fin = Fout;
+        Fout = Ftmp;
     }
-    else if (factors[2 * stage_count] == 4) // length of FFT is 2^n (n is even)
+    else if (N == 4)   // length of FFT is 2^n (n is even)
     {
         //fstride is nfft>>2
-        for (i = 0; i < fstride; i++)
+        for (f_count = fstride; f_count ; f_count --)
         {
-            scratch[2].r = Fout[4 * i].r - Fout[4 * i + 2].r;
-            scratch[2].i = Fout[4 * i].i - Fout[4 * i + 2].i;
-
-            Fout[4 * i].r += Fout[4 * i + 2].r;
-            Fout[4 * i].i += Fout[4 * i + 2].i;
-
-            scratch[0].r = Fout[4 * i + 1].r + Fout[4 * i + 3].r;
-            scratch[0].i = Fout[4 * i + 1].i + Fout[4 * i + 3].i;
-
-            scratch[1].r = Fout[4 * i + 1].r - Fout[4 * i + 3].r;
-            scratch[1].i = Fout[4 * i + 1].i - Fout[4 * i + 3].i;
-            Fout[4 * i + 2].r = Fout[4 * i].r - scratch[0].r;
-            Fout[4 * i + 2].i = Fout[4 * i].i - scratch[0].i;
-
-            Fout[4 * i].r += scratch[0].r;
-            Fout[4 * i].i += scratch[0].i;
-
-            Fout[4 * i + 1].r = scratch[2].r + scratch[1].i;
-            Fout[4 * i + 1].i = scratch[2].i - scratch[1].r;
-            Fout[4 * i + 3].r = scratch[2].r - scratch[1].i;
-            Fout[4 * i + 3].i = scratch[2].i + scratch[1].r;
-        }
+            // load
+            scratch_in[0] = *Fin1;
+            Fin2 = Fin1 + fstride;
+            scratch_in[1] = *Fin2;
+            Fin2 = Fin2 + fstride;
+            scratch_in[2] = *Fin2;
+            Fin2 = Fin2 + fstride;
+            scratch_in[3] = *Fin2;
+
+            // radix 4 butterfly without twiddles
+
+            // radix 2 butterfly
+            scratch[0].r = scratch_in[0].r + scratch_in[2].r;
+            scratch[0].i = scratch_in[0].i + scratch_in[2].i;
+
+            scratch[1].r = scratch_in[0].r - scratch_in[2].r;
+            scratch[1].i = scratch_in[0].i - scratch_in[2].i;
+
+            // radix 2 butterfly
+            scratch[2].r = scratch_in[1].r + scratch_in[3].r;
+            scratch[2].i = scratch_in[1].i + scratch_in[3].i;
+
+            scratch[3].r = scratch_in[1].r - scratch_in[3].r;
+            scratch[3].i = scratch_in[1].i - scratch_in[3].i;
+
+            // third result
+            scratch_out[2].r = scratch[0].r - scratch[2].r;
+            scratch_out[2].i = scratch[0].i - scratch[2].i;
+
+            // first result
+            scratch_out[0].r = scratch[0].r + scratch[2].r;
+            scratch_out[0].i = scratch[0].i + scratch[2].i;
+
+            // second result
+            scratch_out[1].r = scratch[1].r + scratch[3].i;
+            scratch_out[1].i = scratch[1].i - scratch[3].r;
+
+            // forth result
+            scratch_out[3].r = scratch[1].r - scratch[3].i;
+            scratch_out[3].i = scratch[1].i + scratch[3].r;
+
+            // store
+            * Fout1 ++ = scratch_out[0];
+            * Fout1 ++ = scratch_out[1];
+            * Fout1 ++ = scratch_out[2];
+            * Fout1 ++ = scratch_out[3];
+
+            Fin1++;
+        } // f_count
+
+        N = fstride; // 1/4 of length of FFT
+
+        // swap
+        Ftmp = Fin;
+        Fin = Fout;
+        Fout = Ftmp;
+
+        // update address for other stages
+        stage_count--;
+        tw = twiddles;
+        fstride >>= 2;
+        // end of first stage
     }
-    stage_count--;
 
-    // other stages
-    mstride = factors[2 * stage_count + 1];
-    tw = twiddles;
-    for (; stage_count > 0; stage_count--)
+
+    // others but the last one
+    for (; stage_count > 1 ; stage_count--)
     {
-        fstride = fstride >> 2;
-        for (i = 0; i < fstride; i++)
+        Fin1 = Fin;
+        for (f_count = 0; f_count < fstride; f_count ++)
         {
-            F = &Fout[i * mstride * 4];
+            Fout1 = & Fout[ f_count * mstride << 2 ];
             tw1 = tw;
-            tw2 = tw + mstride;
-            tw3 = tw + mstride * 2;
-            for (j = 0; j < mstride; j++)
+            for (m_count = mstride; m_count ; m_count --)
             {
-                scratch[0].r = F[mstride].r * tw1->r - F[mstride].i * tw1->i;
-                scratch[0].i = F[mstride].r * tw1->i + F[mstride].i * tw1->r;
-                scratch[1].r = F[mstride * 2].r * tw2->r - F[mstride * 2].i * tw2->i;
-                scratch[1].i = F[mstride * 2].r * tw2->i + F[mstride * 2].i * tw2->r;
-                scratch[2].r = F[mstride * 3].r * tw3->r - F[mstride * 3].i * tw3->i;
-                scratch[2].i = F[mstride * 3].r * tw3->i + F[mstride * 3].i * tw3->r;
-
-                scratch[5].r = F->r - scratch[1].r;
-                scratch[5].i = F->i - scratch[1].i;
-                F->r += scratch[1].r;
-                F->i += scratch[1].i;
-
-                scratch[3].r = scratch[0].r + scratch[2].r;
-                scratch[3].i = scratch[0].i + scratch[2].i;
-                scratch[4].r = scratch[0].r - scratch[2].r;
-                scratch[4].i = scratch[0].i - scratch[2].i;
-
-                F[mstride * 2].r = F->r - scratch[3].r;
-                F[mstride * 2].i = F->i - scratch[3].i;
-                F->r += scratch[3].r;
-                F->i += scratch[3].i;
-
-                F[mstride].r = scratch[5].r + scratch[4].i;
-                F[mstride].i = scratch[5].i - scratch[4].r;
-                F[mstride * 3].r = scratch[5].r - scratch[4].i;
-                F[mstride * 3].i = scratch[5].i + scratch[4].r;
+                // load
+                scratch_tw[0] = *tw1;
+                tw2 = tw1 + mstride;
+                scratch_tw[1] = *tw2;
+                tw2 += mstride;
+                scratch_tw[2] = *tw2;
+                scratch_in[0] = * Fin1;
+                Fin2 = Fin1 + N;
+                scratch_in[1] = * Fin2;
+                Fin2 += N;
+                scratch_in[2] = * Fin2;
+                Fin2 += N;
+                scratch_in[3] = * Fin2;
+
+                // radix 4 butterfly with twiddles
+
+                scratch[0] = scratch_in[0];
+                scratch[1].r = scratch_in[1].r * scratch_tw[0].r - scratch_in[1].i * scratch_tw[0].i;
+                scratch[1].i = scratch_in[1].i * scratch_tw[0].r + scratch_in[1].r * scratch_tw[0].i;
+
+                scratch[2].r = scratch_in[2].r * scratch_tw[1].r - scratch_in[2].i * scratch_tw[1].i;
+                scratch[2].i = scratch_in[2].i * scratch_tw[1].r + scratch_in[2].r * scratch_tw[1].i;
+
+                scratch[3].r = scratch_in[3].r * scratch_tw[2].r - scratch_in[3].i * scratch_tw[2].i;
+                scratch[3].i = scratch_in[3].i * scratch_tw[2].r + scratch_in[3].r * scratch_tw[2].i;
+
+                // radix 2 butterfly
+                scratch[4].r = scratch[0].r + scratch[2].r;
+                scratch[4].i = scratch[0].i + scratch[2].i;
+
+                scratch[5].r = scratch[0].r - scratch[2].r;
+                scratch[5].i = scratch[0].i - scratch[2].i;
+
+                // radix 2 butterfly
+                scratch[6].r = scratch[1].r + scratch[3].r;
+                scratch[6].i = scratch[1].i + scratch[3].i;
+
+                scratch[7].r = scratch[1].r - scratch[3].r;
+                scratch[7].i = scratch[1].i - scratch[3].i;
+
+                // third result
+                scratch_out[2].r = scratch[4].r - scratch[6].r;
+                scratch_out[2].i = scratch[4].i - scratch[6].i;
+
+                // first result
+                scratch_out[0].r = scratch[4].r + scratch[6].r;
+                scratch_out[0].i = scratch[4].i + scratch[6].i;
+
+                // second result
+                scratch_out[1].r = scratch[5].r + scratch[7].i;
+                scratch_out[1].i = scratch[5].i - scratch[7].r;
+
+                // fourth result
+                scratch_out[3].r = scratch[5].r - scratch[7].i;
+                scratch_out[3].i = scratch[5].i + scratch[7].r;
+
+                // store
+                *Fout1 = scratch_out[0];
+                Fout2 = Fout1 + mstride;
+                *Fout2 = scratch_out[1];
+                Fout2 += mstride;
+                *Fout2 = scratch_out[2];
+                Fout2 += mstride;
+                *Fout2 = scratch_out[3];
 
                 tw1++;
-                tw2++;
-                tw3++;
-                F++;
-            }
-        }
+                Fin1 ++;
+                Fout1 ++;
+            } // m_count
+        } // f_count
         tw += mstride * 3;
         mstride <<= 2;
-    }
+        // swap
+        Ftmp = Fin;
+        Fin = Fout;
+        Fout = Ftmp;
+        fstride >>= 2;
+    } // stage_count
+
+    // the last one
+    if (stage_count)
+    {
+        Fin1 = Fin;
+        // if stage count is even, output to the input array
+        Fout1 = Fout_ls;
 
+        for (f_count = 0; f_count < fstride; f_count ++)
+        {
+            tw1 = tw;
+            for (m_count = mstride; m_count ; m_count --)
+            {
+                // load
+                scratch_tw[0] = *tw1;
+                tw2 = tw1 + mstride;
+                scratch_tw[1] = *tw2;
+                tw2 += mstride;
+                scratch_tw[2] = *tw2;
+                scratch_in[0] = * Fin1;
+                Fin2 = Fin1 + N;
+                scratch_in[1] = * Fin2;
+                Fin2 += N;
+                scratch_in[2] = * Fin2;
+                Fin2 += N;
+                scratch_in[3] = * Fin2;
+
+                // radix 4 butterfly with twiddles
+
+                scratch[0] = scratch_in[0];
+                scratch[1].r = scratch_in[1].r * scratch_tw[0].r - scratch_in[1].i * scratch_tw[0].i;
+                scratch[1].i = scratch_in[1].i * scratch_tw[0].r + scratch_in[1].r * scratch_tw[0].i;
+
+                scratch[2].r = scratch_in[2].r * scratch_tw[1].r - scratch_in[2].i * scratch_tw[1].i;
+                scratch[2].i = scratch_in[2].i * scratch_tw[1].r + scratch_in[2].r * scratch_tw[1].i;
+
+                scratch[3].r = scratch_in[3].r * scratch_tw[2].r - scratch_in[3].i * scratch_tw[2].i;
+                scratch[3].i = scratch_in[3].i * scratch_tw[2].r + scratch_in[3].r * scratch_tw[2].i;
+
+                // radix 2 butterfly
+                scratch[4].r = scratch[0].r + scratch[2].r;
+                scratch[4].i = scratch[0].i + scratch[2].i;
+
+                scratch[5].r = scratch[0].r - scratch[2].r;
+                scratch[5].i = scratch[0].i - scratch[2].i;
+
+                // radix 2 butterfly
+                scratch[6].r = scratch[1].r + scratch[3].r;
+                scratch[6].i = scratch[1].i + scratch[3].i;
+
+                scratch[7].r = scratch[1].r - scratch[3].r;
+                scratch[7].i = scratch[1].i - scratch[3].i;
+
+                // third result
+                scratch_out[2].r = scratch[4].r - scratch[6].r;
+                scratch_out[2].i = scratch[4].i - scratch[6].i;
+
+                // first result
+                scratch_out[0].r = scratch[4].r + scratch[6].r;
+                scratch_out[0].i = scratch[4].i + scratch[6].i;
+
+                // second result
+                scratch_out[1].r = scratch[5].r + scratch[7].i;
+                scratch_out[1].i = scratch[5].i - scratch[7].r;
+
+                // fourth result
+                scratch_out[3].r = scratch[5].r - scratch[7].i;
+                scratch_out[3].i = scratch[5].i + scratch[7].r;
+
+                // store
+                *Fout1 = scratch_out[0];
+                Fout2 = Fout1 + N;
+                *Fout2 = scratch_out[1];
+                Fout2 += N;
+                *Fout2 = scratch_out[2];
+                Fout2 += N;
+                *Fout2 = scratch_out[3];
+
+                tw1 ++;
+                Fin1 ++;
+                Fout1 ++;
+            } // m_count
+        } // f_count
+    } // last stage
 }
 
 static void ne10_mixed_radix_butterfly_inverse_float32_c (ne10_fft_cpx_float32_t * Fout,
+        ne10_fft_cpx_float32_t   * Fin, // input samples; NOTE: after the first Fin/Fout swap the intermediate stages store into this buffer
         ne10_int32_t * factors,
         ne10_fft_cpx_float32_t * twiddles)
-
 {
-    ne10_int32_t i, j, mstride;
+    ne10_int32_t fstride, mstride, N; // N is first read as the first-stage radix, then reused as the stride between the four inputs of a butterfly
+    ne10_int32_t fstride1; // number of butterflies in the radix-8 first stage (fstride >> 2)
+    ne10_int32_t f_count, m_count;
     ne10_int32_t stage_count;
-    ne10_int32_t fstride;
 
-    ne10_fft_cpx_float32_t tmp;
-    ne10_fft_cpx_float32_t scratch[6];
-    ne10_fft_cpx_float32_t *tw, *tw1, *tw2, *tw3;
-    ne10_fft_cpx_float32_t * F;
+    ne10_fft_cpx_float32_t   scratch_in[8];
+    ne10_fft_cpx_float32_t   scratch_out[8];
+    ne10_fft_cpx_float32_t   scratch[16];
+    ne10_fft_cpx_float32_t   scratch_tw[6]; // local copies of the twiddles for the current butterfly
 
+    ne10_fft_cpx_float32_t   *Fin1, *Fin2, *Fout1, *Fout2;
+    ne10_fft_cpx_float32_t   *Fout_ls = Fout; // caller's output buffer, kept because Fin/Fout are swapped after each stage; the last stage stores here
+    ne10_fft_cpx_float32_t   *Ftmp;
+    ne10_fft_cpx_float32_t   *tw, *tw1, *tw2;
+    const ne10_float32_t TW_81 = 0.70710678; // approximately sqrt(2)/2
+    const ne10_float32_t TW_81N = -0.70710678; // approximately -sqrt(2)/2
 
-    // the first stage
+    // read the stage count, the first-stage fstride, the initial mstride and the first-stage radix from the factors buffer
     stage_count = factors[0];
     fstride = factors[1];
-    if (factors[2 * stage_count] == 2) // length of FFT is 2^n (n is odd)
+    mstride = factors[ (stage_count << 1) - 1 ];
+    N = factors[ stage_count << 1 ]; // radix of the first stage
+
+    // the first stage
+    Fin1 = Fin;
+    Fout1 = Fout;
+    if (N == 2)   // length of FFT is 2^n (n is odd)
     {
-        //fstride is nfft>>1;
-        for (i = 0; i < fstride; i++)
+        // radix 8: this single stage consumes two entries of the factor list (stage_count -= 2 below)
+        N = fstride >> 1; // 1/4 of length of FFT
+        tw = twiddles;
+        fstride1 = fstride >> 2;
+
+        Fin1 = Fin;
+        for (f_count = 0; f_count < fstride1; f_count ++)
         {
-            tmp.r = Fout[2 * i + 1].r;
-            tmp.i = Fout[2 * i + 1].i;
-            Fout[2 * i + 1].r = Fout[2 * i].r - tmp.r;
-            Fout[2 * i + 1].i = Fout[2 * i].i - tmp.i;
-            Fout[2 * i].r = Fout[2 * i].r + tmp.r;
-            Fout[2 * i].i = Fout[2 * i].i + tmp.i;
-        }
+            Fout1 = & Fout[ f_count * 8 ];
+            // load twiddles (NOTE(review): scratch_tw is not read again inside this radix-8 stage)
+            scratch_tw[0] = tw[0];
+            scratch_tw[2] = tw[2];
+            scratch_tw[4] = tw[4];
+            scratch_tw[1] = tw[1];
+            scratch_tw[3] = tw[3];
+            scratch_tw[5] = tw[5];
+
+            scratch_in[0].r = Fin1[0].r + Fin1[0 + fstride].r; // radix 2 butterflies: each input is paired with the element fstride positions further on
+            scratch_in[0].i = Fin1[0].i + Fin1[0 + fstride].i;
+            scratch_in[1].r = Fin1[0].r - Fin1[0 + fstride].r;
+            scratch_in[1].i = Fin1[0].i - Fin1[0 + fstride].i;
+            scratch_in[2].r = Fin1[fstride1].r + Fin1[fstride1 + fstride].r;
+            scratch_in[2].i = Fin1[fstride1].i + Fin1[fstride1 + fstride].i;
+            scratch_in[3].r = Fin1[fstride1].r - Fin1[fstride1 + fstride].r;
+            scratch_in[3].i = Fin1[fstride1].i - Fin1[fstride1 + fstride].i;
+            scratch_in[4].r = Fin1[fstride1 * 2].r + Fin1[fstride1 * 2 + fstride].r;
+            scratch_in[4].i = Fin1[fstride1 * 2].i + Fin1[fstride1 * 2 + fstride].i;
+            scratch_in[5].r = Fin1[fstride1 * 2].r - Fin1[fstride1 * 2 + fstride].r;
+            scratch_in[5].i = Fin1[fstride1 * 2].i - Fin1[fstride1 * 2 + fstride].i;
+            scratch_in[6].r = Fin1[fstride1 * 3].r + Fin1[fstride1 * 3 + fstride].r;
+            scratch_in[6].i = Fin1[fstride1 * 3].i + Fin1[fstride1 * 3 + fstride].i;
+            scratch_in[7].r = Fin1[fstride1 * 3].r - Fin1[fstride1 * 3 + fstride].r;
+            scratch_in[7].i = Fin1[fstride1 * 3].i - Fin1[fstride1 * 3 + fstride].i;
+
+            // radix 4 butterfly with twiddles: the difference terms (odd indices) are multiplied by 1, (1+j)*TW_81, j and (-1+j)*TW_81
+
+            scratch[0] = scratch_in[0];
+            scratch[1] = scratch_in[1];
+
+            scratch[2] = scratch_in[2];
+            scratch[3].r = (scratch_in[3].r - scratch_in[3].i) * TW_81;
+            scratch[3].i = (scratch_in[3].i + scratch_in[3].r) * TW_81;
+
+            scratch[4] = scratch_in[4];
+            scratch[5].r = -scratch_in[5].i;
+            scratch[5].i = scratch_in[5].r;
+
+            scratch[6].r = scratch_in[6].r;
+            scratch[6].i = scratch_in[6].i;
+            scratch[7].r = (scratch_in[7].r + scratch_in[7].i) * TW_81N;
+            scratch[7].i = (scratch_in[7].i - scratch_in[7].r) * TW_81N;
+
+            // radix 2 butterfly
+            scratch[8].r = scratch[0].r + scratch[4].r;
+            scratch[8].i = scratch[0].i + scratch[4].i;
+            scratch[9].r = scratch[1].r + scratch[5].r;
+            scratch[9].i = scratch[1].i + scratch[5].i;
+
+            scratch[10].r = scratch[0].r - scratch[4].r;
+            scratch[10].i = scratch[0].i - scratch[4].i;
+            scratch[11].r = scratch[1].r - scratch[5].r;
+            scratch[11].i = scratch[1].i - scratch[5].i;
+
+            // radix 2 butterfly
+            scratch[12].r = scratch[2].r + scratch[6].r;
+            scratch[12].i = scratch[2].i + scratch[6].i;
+            scratch[13].r = scratch[3].r + scratch[7].r;
+            scratch[13].i = scratch[3].i + scratch[7].i;
+
+            scratch[14].r = scratch[2].r - scratch[6].r;
+            scratch[14].i = scratch[2].i - scratch[6].i;
+            scratch[15].r = scratch[3].r - scratch[7].r;
+            scratch[15].i = scratch[3].i - scratch[7].i;
+
+            // third result
+            scratch_out[4].r = scratch[8].r - scratch[12].r;
+            scratch_out[4].i = scratch[8].i - scratch[12].i;
+            scratch_out[5].r = scratch[9].r - scratch[13].r;
+            scratch_out[5].i = scratch[9].i - scratch[13].i;
+
+            // first result
+            scratch_out[0].r = scratch[8].r + scratch[12].r;
+            scratch_out[0].i = scratch[8].i + scratch[12].i;
+            scratch_out[1].r = scratch[9].r + scratch[13].r;
+            scratch_out[1].i = scratch[9].i + scratch[13].i;
+
+            // second result
+            scratch_out[2].r = scratch[10].r - scratch[14].i;
+            scratch_out[2].i = scratch[10].i + scratch[14].r;
+            scratch_out[3].r = scratch[11].r - scratch[15].i;
+            scratch_out[3].i = scratch[11].i + scratch[15].r;
+
+            // fourth result
+            scratch_out[6].r = scratch[10].r + scratch[14].i;
+            scratch_out[6].i = scratch[10].i - scratch[14].r;
+            scratch_out[7].r = scratch[11].r + scratch[15].i;
+            scratch_out[7].i = scratch[11].i - scratch[15].r;
+
+            // store the eight results contiguously
+            Fout1[0] = scratch_out[0];
+            Fout1[1] = scratch_out[1];
+            Fout1[2] = scratch_out[2];
+            Fout1[3] = scratch_out[3];
+            Fout1[4] = scratch_out[4];
+            Fout1[5] = scratch_out[5];
+            Fout1[6] = scratch_out[6];
+            Fout1[7] = scratch_out[7];
+
+            Fin1 += 1;
+        } // f_count
+        tw += 6;
+        mstride <<= 2;
+        fstride >>= 4;
+        stage_count -= 2;
+
+        // swap: the buffer just written becomes the input of the next stage
+        Ftmp = Fin;
+        Fin = Fout;
+        Fout = Ftmp;
     }
-    else if (factors[2 * stage_count] == 4) // length of FFT is 2^n (n is even)
+    else if (N == 4)   // length of FFT is 2^n (n is even)
     {
         //fstride is nfft>>2
-        for (i = 0; i < fstride; i++)
+        for (f_count = fstride; f_count ; f_count --)
         {
-            scratch[2].r = Fout[4 * i].r - Fout[4 * i + 2].r;
-            scratch[2].i = Fout[4 * i].i - Fout[4 * i + 2].i;
-
-            Fout[4 * i].r += Fout[4 * i + 2].r;
-            Fout[4 * i].i += Fout[4 * i + 2].i;
-
-            scratch[0].r = Fout[4 * i + 1].r + Fout[4 * i + 3].r;
-            scratch[0].i = Fout[4 * i + 1].i + Fout[4 * i + 3].i;
-
-            scratch[1].r = Fout[4 * i + 1].r - Fout[4 * i + 3].r;
-            scratch[1].i = Fout[4 * i + 1].i - Fout[4 * i + 3].i;
-            Fout[4 * i + 2].r = Fout[4 * i].r - scratch[0].r;
-            Fout[4 * i + 2].i = Fout[4 * i].i - scratch[0].i;
-
-            Fout[4 * i].r += scratch[0].r;
-            Fout[4 * i].i += scratch[0].i;
-
-            Fout[4 * i + 1].r = scratch[2].r - scratch[1].i;
-            Fout[4 * i + 1].i = scratch[2].i + scratch[1].r;
-            Fout[4 * i + 3].r = scratch[2].r + scratch[1].i;
-            Fout[4 * i + 3].i = scratch[2].i - scratch[1].r;
-        }
+            // load four inputs spaced fstride elements apart
+            scratch_in[0] = *Fin1;
+            Fin2 = Fin1 + fstride;
+            scratch_in[1] = *Fin2;
+            Fin2 = Fin2 + fstride;
+            scratch_in[2] = *Fin2;
+            Fin2 = Fin2 + fstride;
+            scratch_in[3] = *Fin2;
+
+            // radix 4 butterfly without twiddles
+
+            // radix 2 butterfly
+            scratch[0].r = scratch_in[0].r + scratch_in[2].r;
+            scratch[0].i = scratch_in[0].i + scratch_in[2].i;
+
+            scratch[1].r = scratch_in[0].r - scratch_in[2].r;
+            scratch[1].i = scratch_in[0].i - scratch_in[2].i;
+
+            // radix 2 butterfly
+            scratch[2].r = scratch_in[1].r + scratch_in[3].r;
+            scratch[2].i = scratch_in[1].i + scratch_in[3].i;
+
+            scratch[3].r = scratch_in[1].r - scratch_in[3].r;
+            scratch[3].i = scratch_in[1].i - scratch_in[3].i;
+
+            // third result
+            scratch_out[2].r = scratch[0].r - scratch[2].r;
+            scratch_out[2].i = scratch[0].i - scratch[2].i;
+
+            // first result
+            scratch_out[0].r = scratch[0].r + scratch[2].r;
+            scratch_out[0].i = scratch[0].i + scratch[2].i;
+
+            // second result
+            scratch_out[1].r = scratch[1].r - scratch[3].i;
+            scratch_out[1].i = scratch[1].i + scratch[3].r;
+
+            // fourth result
+            scratch_out[3].r = scratch[1].r + scratch[3].i;
+            scratch_out[3].i = scratch[1].i - scratch[3].r;
+
+            // store the four results contiguously
+            * Fout1 ++ = scratch_out[0];
+            * Fout1 ++ = scratch_out[1];
+            * Fout1 ++ = scratch_out[2];
+            * Fout1 ++ = scratch_out[3];
+
+            Fin1++;
+        } // f_count
+
+        N = fstride; // 1/4 of length of FFT
+
+        // swap: the buffer just written becomes the input of the next stage
+        Ftmp = Fin;
+        Fin = Fout;
+        Fout = Ftmp;
+
+        // update address for other stages
+        stage_count--;
+        tw = twiddles;
+        fstride >>= 2;
+        // end of first stage
     }
-    stage_count--;
 
-    // other stages
-    mstride = factors[2 * stage_count + 1];
-    tw = twiddles;
-    for (; stage_count > 0; stage_count--)
+
+    // all remaining stages except the last one
+    for (; stage_count > 1 ; stage_count--)
     {
-        fstride = fstride >> 2;
-        for (i = 0; i < fstride; i++)
+        Fin1 = Fin;
+        for (f_count = 0; f_count < fstride; f_count ++)
         {
-            F = &Fout[i * mstride * 4];
+            Fout1 = & Fout[ f_count * mstride << 2 ]; // evaluates as (f_count * mstride) << 2: each outer iteration fills a group of 4 * mstride outputs
             tw1 = tw;
-            tw2 = tw + mstride;
-            tw3 = tw + mstride * 2;
-            for (j = 0; j < mstride; j++)
+            for (m_count = mstride; m_count ; m_count --)
             {
-                scratch[0].r = F[mstride].r * tw1->r + F[mstride].i * tw1->i;
-                scratch[0].i = F[mstride].i * tw1->r - F[mstride].r * tw1->i;
-                scratch[1].r = F[mstride * 2].r * tw2->r + F[mstride * 2].i * tw2->i;
-                scratch[1].i = F[mstride * 2].i * tw2->r - F[mstride * 2].r * tw2->i;
-                scratch[2].r = F[mstride * 3].r * tw3->r + F[mstride * 3].i * tw3->i;
-                scratch[2].i = F[mstride * 3].i * tw3->r - F[mstride * 3].r * tw3->i;
-
-                scratch[5].r = F->r - scratch[1].r;
-                scratch[5].i = F->i - scratch[1].i;
-                F->r += scratch[1].r;
-                F->i += scratch[1].i;
-
-                scratch[3].r = scratch[0].r + scratch[2].r;
-                scratch[3].i = scratch[0].i + scratch[2].i;
-                scratch[4].r = scratch[0].r - scratch[2].r;
-                scratch[4].i = scratch[0].i - scratch[2].i;
-
-                F[mstride * 2].r = F->r - scratch[3].r;
-                F[mstride * 2].i = F->i - scratch[3].i;
-                F->r += scratch[3].r;
-                F->i += scratch[3].i;
-
-                F[mstride].r = scratch[5].r - scratch[4].i;
-                F[mstride].i = scratch[5].i + scratch[4].r;
-                F[mstride * 3].r = scratch[5].r + scratch[4].i;
-                F[mstride * 3].i = scratch[5].i - scratch[4].r;
+                // load three twiddles (mstride apart) and four inputs (N apart)
+                scratch_tw[0] = *tw1;
+                tw2 = tw1 + mstride;
+                scratch_tw[1] = *tw2;
+                tw2 += mstride;
+                scratch_tw[2] = *tw2;
+                scratch_in[0] = * Fin1;
+                Fin2 = Fin1 + N;
+                scratch_in[1] = * Fin2;
+                Fin2 += N;
+                scratch_in[2] = * Fin2;
+                Fin2 += N;
+                scratch_in[3] = * Fin2;
+
+                // radix 4 butterfly with twiddles (inputs 1..3 are multiplied by the complex conjugate of the twiddle)
+
+                scratch[0] = scratch_in[0];
+                scratch[1].r = scratch_in[1].r * scratch_tw[0].r + scratch_in[1].i * scratch_tw[0].i;
+                scratch[1].i = scratch_in[1].i * scratch_tw[0].r - scratch_in[1].r * scratch_tw[0].i;
+
+                scratch[2].r = scratch_in[2].r * scratch_tw[1].r + scratch_in[2].i * scratch_tw[1].i;
+                scratch[2].i = scratch_in[2].i * scratch_tw[1].r - scratch_in[2].r * scratch_tw[1].i;
+
+                scratch[3].r = scratch_in[3].r * scratch_tw[2].r + scratch_in[3].i * scratch_tw[2].i;
+                scratch[3].i = scratch_in[3].i * scratch_tw[2].r - scratch_in[3].r * scratch_tw[2].i;
+
+                // radix 2 butterfly
+                scratch[4].r = scratch[0].r + scratch[2].r;
+                scratch[4].i = scratch[0].i + scratch[2].i;
+
+                scratch[5].r = scratch[0].r - scratch[2].r;
+                scratch[5].i = scratch[0].i - scratch[2].i;
+
+                // radix 2 butterfly
+                scratch[6].r = scratch[1].r + scratch[3].r;
+                scratch[6].i = scratch[1].i + scratch[3].i;
+
+                scratch[7].r = scratch[1].r - scratch[3].r;
+                scratch[7].i = scratch[1].i - scratch[3].i;
+
+                // third result
+                scratch_out[2].r = scratch[4].r - scratch[6].r;
+                scratch_out[2].i = scratch[4].i - scratch[6].i;
+
+                // first result
+                scratch_out[0].r = scratch[4].r + scratch[6].r;
+                scratch_out[0].i = scratch[4].i + scratch[6].i;
+
+                // second result
+                scratch_out[1].r = scratch[5].r - scratch[7].i;
+                scratch_out[1].i = scratch[5].i + scratch[7].r;
+
+                // fourth result
+                scratch_out[3].r = scratch[5].r + scratch[7].i;
+                scratch_out[3].i = scratch[5].i - scratch[7].r;
+
+                // store the four results mstride elements apart
+                *Fout1 = scratch_out[0];
+                Fout2 = Fout1 + mstride;
+                *Fout2 = scratch_out[1];
+                Fout2 += mstride;
+                *Fout2 = scratch_out[2];
+                Fout2 += mstride;
+                *Fout2 = scratch_out[3];
 
                 tw1++;
-                tw2++;
-                tw3++;
-                F++;
-            }
-        }
+                Fin1 ++;
+                Fout1 ++;
+            } // m_count
+        } // f_count
         tw += mstride * 3;
         mstride <<= 2;
-    }
+        // swap: the buffer just written becomes the input of the next stage
+        Ftmp = Fin;
+        Fin = Fout;
+        Fout = Ftmp;
+        fstride >>= 2;
+    } // stage_count
+
+    // the last one
+    if (stage_count)
+    {
+        Fin1 = Fin;
+        // always store to the caller's output buffer; after an odd number of swaps Fin is that same buffer and this stage runs in place
+        Fout1 = Fout_ls;
+
+        for (f_count = 0; f_count < fstride; f_count ++)
+        {
+            tw1 = tw;
+            for (m_count = mstride; m_count ; m_count --)
+            {
+                // load three twiddles (mstride apart) and four inputs (N apart)
+                scratch_tw[0] = *tw1;
+                tw2 = tw1 + mstride;
+                scratch_tw[1] = *tw2;
+                tw2 += mstride;
+                scratch_tw[2] = *tw2;
+                scratch_in[0] = * Fin1;
+                Fin2 = Fin1 + N;
+                scratch_in[1] = * Fin2;
+                Fin2 += N;
+                scratch_in[2] = * Fin2;
+                Fin2 += N;
+                scratch_in[3] = * Fin2;
+
+                // radix 4 butterfly with twiddles (inputs 1..3 are multiplied by the complex conjugate of the twiddle)
+
+                scratch[0] = scratch_in[0];
+                scratch[1].r = scratch_in[1].r * scratch_tw[0].r + scratch_in[1].i * scratch_tw[0].i;
+                scratch[1].i = scratch_in[1].i * scratch_tw[0].r - scratch_in[1].r * scratch_tw[0].i;
+
+                scratch[2].r = scratch_in[2].r * scratch_tw[1].r + scratch_in[2].i * scratch_tw[1].i;
+                scratch[2].i = scratch_in[2].i * scratch_tw[1].r - scratch_in[2].r * scratch_tw[1].i;
+
+                scratch[3].r = scratch_in[3].r * scratch_tw[2].r + scratch_in[3].i * scratch_tw[2].i;
+                scratch[3].i = scratch_in[3].i * scratch_tw[2].r - scratch_in[3].r * scratch_tw[2].i;
+
+                // radix 2 butterfly
+                scratch[4].r = scratch[0].r + scratch[2].r;
+                scratch[4].i = scratch[0].i + scratch[2].i;
+
+                scratch[5].r = scratch[0].r - scratch[2].r;
+                scratch[5].i = scratch[0].i - scratch[2].i;
+
+                // radix 2 butterfly
+                scratch[6].r = scratch[1].r + scratch[3].r;
+                scratch[6].i = scratch[1].i + scratch[3].i;
+
+                scratch[7].r = scratch[1].r - scratch[3].r;
+                scratch[7].i = scratch[1].i - scratch[3].i;
+
+                // third result
+                scratch_out[2].r = scratch[4].r - scratch[6].r;
+                scratch_out[2].i = scratch[4].i - scratch[6].i;
+
+                // first result
+                scratch_out[0].r = scratch[4].r + scratch[6].r;
+                scratch_out[0].i = scratch[4].i + scratch[6].i;
+
+                // second result
+                scratch_out[1].r = scratch[5].r - scratch[7].i;
+                scratch_out[1].i = scratch[5].i + scratch[7].r;
+
+                // fourth result
+                scratch_out[3].r = scratch[5].r + scratch[7].i;
+                scratch_out[3].i = scratch[5].i - scratch[7].r;
+
+                // store with stride N, i.e. at the same offsets the inputs were loaded from
+                *Fout1 = scratch_out[0];
+                Fout2 = Fout1 + N;
+                *Fout2 = scratch_out[1];
+                Fout2 += N;
+                *Fout2 = scratch_out[2];
+                Fout2 += N;
+                *Fout2 = scratch_out[3];
+
+                tw1 ++;
+                Fin1 ++;
+                Fout1 ++;
+            } // m_count
+        } // f_count
+    } // last stage
 }
 
 /* factors buffer:
@@ -307,34 +874,6 @@ static ne10_int32_t ne10_factor (ne10_int32_t n, ne10_int32_t * facbuf)
     return NE10_OK;
 }
 
-void ne10_data_bitreversal_float32 (ne10_fft_cpx_float32_t * Fout,
-                                    const ne10_fft_cpx_float32_t * f,
-                                    ne10_int32_t fstride,
-                                    ne10_int32_t * factors)
-{
-    const ne10_int32_t p = *factors++; /* the radix  */
-    const ne10_int32_t m = *factors++; /* stage's fft length/p */
-    const ne10_fft_cpx_float32_t * Fout_end = Fout + p * m;
-    if (m == 1)
-    {
-        do
-        {
-            *Fout = *f;
-            f += fstride;
-        }
-        while (++Fout != Fout_end);
-    }
-    else
-    {
-        do
-        {
-            ne10_data_bitreversal_float32 (Fout, f, fstride * p, factors);
-            f += fstride;
-        }
-        while ( (Fout += m) != Fout_end);
-    }
-
-}
 
 void ne10_fft_split_r2c_1d_float32 (ne10_fft_cpx_float32_t *dst,
                                     const ne10_fft_cpx_float32_t *src,
@@ -560,14 +1099,13 @@ void ne10_fft_c2c_1d_float32_c (ne10_fft_cpx_float32_t *fout,
                                 ne10_int32_t nfft,
                                 ne10_int32_t inverse_fft)
 {
-    // copy the data from input to output and bit reversal
-    ne10_data_bitreversal_float32 (fout, fin, 1, &factors[2]);
 
     if (inverse_fft)
-        ne10_mixed_radix_butterfly_inverse_float32_c (fout, factors, twiddles);
+        ne10_mixed_radix_butterfly_inverse_float32_c (fout, fin, factors, twiddles);
     else
-        ne10_mixed_radix_butterfly_float32_c (fout, factors, twiddles);
+        ne10_mixed_radix_butterfly_float32_c (fout, fin, factors, twiddles);
 }
+
 /**
  * @}
  */ //end of C2C_FFT_IFFT group
index 38b69d0..5d2234d 100644 (file)
@@ -136,10 +136,10 @@ static inline void ne10_fft8_forward_float32 (ne10_fft_cpx_float32_t * Fout,
     Fout[6].r = t0_r - t3_i;
     Fout[6].i = t0_i + t3_r;
 
-    t4_r =  (s3_r + s3_i) * TW_81;
-    t4_i = -(s3_r - s3_i) * TW_81;
-    t5_r =  (s7_r - s7_i) * TW_81;
-    t5_i =  (s7_r + s7_i) * TW_81;
+    t4_r = (s3_r + s3_i) * TW_81;
+    t4_i = - (s3_r - s3_i) * TW_81;
+    t5_r = (s7_r - s7_i) * TW_81;
+    t5_i = (s7_r + s7_i) * TW_81;
 
     t0_r = s1_r - s5_i;
     t0_i = s1_i + s5_r;
@@ -205,7 +205,7 @@ static inline void ne10_fft8_backward_float32 (ne10_fft_cpx_float32_t * Fout,
     t4_r = (s3_r - s3_i) * TW_81;
     t4_i = (s3_r + s3_i) * TW_81;
     t5_r = (s7_r + s7_i) * TW_81;
-    t5_i = -(s7_r - s7_i) * TW_81;
+    t5_i = - (s7_r - s7_i) * TW_81;
 
     t0_r = s1_r + s5_i;
     t0_i = s1_i - s5_r;
@@ -225,404 +225,6 @@ static inline void ne10_fft8_backward_float32 (ne10_fft_cpx_float32_t * Fout,
     Fout[7].i = t0_i - t3_r;
 }
 
-static inline ne10_data_bitreversal_butterfly4_forward_float32 (ne10_fft_cpx_float32_t * out,
-        ne10_fft_cpx_float32_t * in,
-        ne10_int32_t fstride,
-        ne10_int32_t stride1)
-{
-    ne10_float32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
-    t2_r = in[0].r - in[fstride * 2].r;
-    t2_i = in[0].i - in[fstride * 2].i;
-    t3_r = in[0].r + in[fstride * 2].r;
-    t3_i = in[0].i + in[fstride * 2].i;
-    t0_r = in[fstride].r + in[fstride * 3].r;
-    t0_i = in[fstride].i + in[fstride * 3].i;
-    t1_r = in[fstride].r - in[fstride * 3].r;
-    t1_i = in[fstride].i - in[fstride * 3].i;
-    out[2].r = t3_r - t0_r;
-    out[2].i = t3_i - t0_i;
-    out[0].r = t3_r + t0_r;
-    out[0].i = t3_i + t0_i;
-    out[1].r = t2_r + t1_i;
-    out[1].i = t2_i - t1_r;
-    out[3].r = t2_r - t1_i;
-    out[3].i = t2_i + t1_r;
-}
-
-static inline ne10_data_bitreversal_butterfly4_backward_float32 (ne10_fft_cpx_float32_t * out,
-        ne10_fft_cpx_float32_t * in,
-        ne10_int32_t fstride,
-        ne10_int32_t stride1)
-{
-    ne10_float32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
-    t2_r = in[0].r - in[fstride * 2].r;
-    t2_i = in[0].i - in[fstride * 2].i;
-    t3_r = in[0].r + in[fstride * 2].r;
-    t3_i = in[0].i + in[fstride * 2].i;
-    t0_r = in[fstride].r + in[fstride * 3].r;
-    t0_i = in[fstride].i + in[fstride * 3].i;
-    t1_r = in[fstride].r - in[fstride * 3].r;
-    t1_i = in[fstride].i - in[fstride * 3].i;
-    out[2].r = t3_r - t0_r;
-    out[2].i = t3_i - t0_i;
-    out[0].r = t3_r + t0_r;
-    out[0].i = t3_i + t0_i;
-    out[1].r = t2_r - t1_i;
-    out[1].i = t2_i + t1_r;
-    out[3].r = t2_r + t1_i;
-    out[3].i = t2_i - t1_r;
-}
-
-#define  ne10_data_bitreversal64_butterfly4_float32(inverse) \
-static void ne10_data_bitreversal64_butterfly4_##inverse##_float32 (ne10_fft_cpx_float32_t * Fout, \
-        ne10_fft_cpx_float32_t * Fin) \
-{ \
-    ne10_int32_t i, p; \
-    ne10_int32_t fstride; \
-    ne10_int32_t stride1; \
-    ne10_int32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i; \
-    ne10_fft_cpx_float32_t * F; \
-    ne10_fft_cpx_float32_t * in; \
-    fstride = 16; \
-    F = Fout; \
-    in = Fin; \
-    stride1 = fstride >> 2; \
-    for (i = 0; i < 4; i++) \
-    { \
-        in = &Fin[i]; \
-        for (p = 0; p < 4; p++) \
-        { \
-            ne10_data_bitreversal_butterfly4_##inverse##_float32 (F, in, fstride, stride1); \
-            in += stride1; \
-            F += 4; \
-        } \
-    } \
-}
-
-#define  ne10_data_bitreversal256_butterfly4_float32(inverse) \
-static void ne10_data_bitreversal256_butterfly4_##inverse##_float32 (ne10_fft_cpx_float32_t * Fout, \
-        ne10_fft_cpx_float32_t * Fin) \
-{ \
-    ne10_int32_t i, j, p; \
-    ne10_int32_t fstride; \
-    ne10_int32_t stride1; \
-    ne10_int32_t stride2; \
-    ne10_int32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i; \
-    ne10_fft_cpx_float32_t * F; \
-    ne10_fft_cpx_float32_t * in; \
-    fstride = 64; \
-    F = Fout; \
-    in = Fin; \
-    stride1 = fstride >> 2; \
-    stride2 = stride1 >> 2; \
-    for (j = 0; j < 4; j++) \
-    { \
-        for (i = 0; i < 4; i++) \
-        { \
-            in = &Fin[j + i * stride2]; \
-            for (p = 0; p < 4; p++) \
-            { \
-                ne10_data_bitreversal_butterfly4_##inverse##_float32 (F, in, fstride, stride1); \
-                in += stride1; \
-                F += 4; \
-            } \
-        } \
-    } \
-}
-
-#define  ne10_data_bitreversal1024_butterfly4_float32(inverse) \
-static void ne10_data_bitreversal1024_butterfly4_##inverse##_float32 (ne10_fft_cpx_float32_t * Fout, \
-        ne10_fft_cpx_float32_t * Fin) \
-{ \
-    ne10_int32_t i, j, k, p; \
-    ne10_int32_t fstride; \
-    ne10_int32_t stride1; \
-    ne10_int32_t stride2; \
-    ne10_int32_t stride3; \
-    ne10_int32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i; \
-    ne10_fft_cpx_float32_t * F; \
-    ne10_fft_cpx_float32_t * in; \
-    fstride = 256; \
-    F = Fout; \
-    in = Fin; \
-    stride1 = fstride >> 2; \
-    stride2 = stride1 >> 2; \
-    stride3 = stride2 >> 2; \
-    for (k = 0; k < 4; k++) \
-    { \
-        for (j = 0; j < 4; j++) \
-        { \
-            for (i = 0; i < 4; i++) \
-            { \
-                in = &Fin[k + j * stride3 + i * stride2]; \
-                for (p = 0; p < 4; p++) \
-                { \
-                    ne10_data_bitreversal_butterfly4_##inverse##_float32 (F, in, fstride, stride1); \
-                    in += stride1; \
-                    F += 4; \
-                } \
-            } \
-        } \
-    } \
-}
-
-#define  ne10_data_bitreversal4096_butterfly4_float32(inverse) \
-static void ne10_data_bitreversal4096_butterfly4_##inverse##_float32 (ne10_fft_cpx_float32_t * Fout, \
-        ne10_fft_cpx_float32_t * Fin) \
-{ \
-    ne10_int32_t i, j, k, l, p; \
-    ne10_int32_t fstride; \
-    ne10_int32_t stride1; \
-    ne10_int32_t stride2; \
-    ne10_int32_t stride3; \
-    ne10_int32_t stride4; \
-    ne10_int32_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i; \
-    ne10_fft_cpx_float32_t * F; \
-    ne10_fft_cpx_float32_t * in; \
-    fstride = 1024; \
-    F = Fout; \
-    in = Fin; \
-    stride1 = fstride >> 2; \
-    stride2 = stride1 >> 2; \
-    stride3 = stride2 >> 2; \
-    stride4 = stride3 >> 2; \
-    for (l = 0; l < 4; l++) \
-    { \
-        for (k = 0; k < 4; k++) \
-        { \
-            for (j = 0; j < 4; j++) \
-            { \
-                for (i = 0; i < 4; i++) \
-                { \
-                    in = &Fin[l + k*stride4 + j * stride3 + i * stride2]; \
-                    for (p = 0; p < 4; p++) \
-                    { \
-                        ne10_data_bitreversal_butterfly4_##inverse##_float32 (F, in, fstride, stride1); \
-                        in += stride1; \
-                        F += 4; \
-                    } \
-                } \
-            } \
-        } \
-    } \
-}
-
-#define  ne10_butterfly_length_even_power2_float32_neon(inverse) \
-static void ne10_butterfly_##inverse##_length_even_power2_float32_neon (ne10_fft_cpx_float32_t * Fout, \
-        ne10_fft_cpx_float32_t * Fin, \
-        ne10_int32_t * factors, \
-        ne10_fft_cpx_float32_t * twiddles) \
-{ \
-    ne10_int32_t fstride = factors[1]; \
-    if (fstride == 16) \
-        ne10_data_bitreversal64_butterfly4_##inverse##_float32 (Fout, Fin); \
-    else if (fstride == 64) \
-        ne10_data_bitreversal256_butterfly4_##inverse##_float32 (Fout, Fin); \
-    else if (fstride == 256) \
-        ne10_data_bitreversal1024_butterfly4_##inverse##_float32 (Fout, Fin); \
-    else if (fstride == 1024) \
-        ne10_data_bitreversal4096_butterfly4_##inverse##_float32 (Fout, Fin); \
-    ne10_radix4_butterfly_##inverse##_float32_neon (Fout, factors, twiddles);\
-}
-
-ne10_data_bitreversal64_butterfly4_float32 (forward)
-ne10_data_bitreversal64_butterfly4_float32 (backward)
-ne10_data_bitreversal256_butterfly4_float32 (forward)
-ne10_data_bitreversal256_butterfly4_float32 (backward)
-ne10_data_bitreversal1024_butterfly4_float32 (forward)
-ne10_data_bitreversal1024_butterfly4_float32 (backward)
-ne10_data_bitreversal4096_butterfly4_float32 (forward)
-ne10_data_bitreversal4096_butterfly4_float32 (backward)
-
-ne10_butterfly_length_even_power2_float32_neon (forward)
-ne10_butterfly_length_even_power2_float32_neon (backward)
-
-static inline ne10_data_bitreversal_butterfly2_float32_neon (ne10_fft_cpx_float32_t * out,
-        ne10_fft_cpx_float32_t * in,
-        ne10_int32_t fstride,
-        ne10_int32_t stride1)
-{
-    float32x2_t d_in0_0, d_in0_1;
-    float32x2_t d_in1_0, d_in1_1;
-    float32x2_t d_in2_0, d_in2_1;
-    float32x2_t d_in3_0, d_in3_1;
-    float32x4_t q_in01_0, q_in01_1, q_in23_0, q_in23_1;
-    float32x4_t q_out01_0, q_out01_1, q_out23_0, q_out23_1;
-    /* load loop */
-    d_in0_0 = vld1_f32 ( (float32_t*) (&in[0]));
-    d_in0_1 = vld1_f32 ( (float32_t*) (&in[fstride]));
-    d_in1_0 = vld1_f32 ( (float32_t*) (&in[stride1]));
-    d_in1_1 = vld1_f32 ( (float32_t*) (&in[stride1 + fstride]));
-    d_in2_0 = vld1_f32 ( (float32_t*) (&in[stride1 * 2]));
-    d_in2_1 = vld1_f32 ( (float32_t*) (&in[stride1 * 2 + fstride]));
-    d_in3_0 = vld1_f32 ( (float32_t*) (&in[stride1 * 3]));
-    d_in3_1 = vld1_f32 ( (float32_t*) (&in[stride1 * 3 + fstride]));
-    /* calculate loop */
-    q_in01_0 = vcombine_f32 (d_in0_0, d_in1_0);
-    q_in01_1 = vcombine_f32 (d_in0_1, d_in1_1);
-    q_in23_0 = vcombine_f32 (d_in2_0, d_in3_0);
-    q_in23_1 = vcombine_f32 (d_in2_1, d_in3_1);
-    q_out01_0 = vaddq_f32 (q_in01_0, q_in01_1);
-    q_out01_1 = vsubq_f32 (q_in01_0, q_in01_1);
-    q_out23_0 = vaddq_f32 (q_in23_0, q_in23_1);
-    q_out23_1 = vsubq_f32 (q_in23_0, q_in23_1);
-    /* store loop */
-    vst1q_f32 ( (float32_t*) (&out[0]), vcombine_f32 (vget_low_f32 (q_out01_0), vget_low_f32 (q_out01_1)));
-    vst1q_f32 ( (float32_t*) (&out[2]), vcombine_f32 (vget_high_f32 (q_out01_0), vget_high_f32 (q_out01_1)));
-    vst1q_f32 ( (float32_t*) (&out[4]), vcombine_f32 (vget_low_f32 (q_out23_0), vget_low_f32 (q_out23_1)));
-    vst1q_f32 ( (float32_t*) (&out[6]), vcombine_f32 (vget_high_f32 (q_out23_0), vget_high_f32 (q_out23_1)));
-}
-
-static void ne10_data_bitreversal32_float32_neon (ne10_fft_cpx_float32_t * Fout,
-        ne10_fft_cpx_float32_t * Fin)
-{
-    ne10_int32_t i;
-    ne10_int32_t fstride;
-
-    ne10_fft_cpx_float32_t * F;
-    ne10_fft_cpx_float32_t * in;
-    ne10_int32_t stride1;
-    ne10_int32_t stride2;
-
-    // get the input, resort, calculate the first stage
-    fstride = 16;
-
-    F = Fout;
-    in = Fin;
-    stride1 = fstride >> 2;
-    stride2 = stride1 >> 2;
-    for (i = 0; i < 4; i++)
-    {
-        ne10_data_bitreversal_butterfly2_float32_neon (F, in, fstride, stride1);
-        F += 8;
-        in += stride2;
-    }
-}
-
-static void ne10_data_bitreversal128_float32_neon (ne10_fft_cpx_float32_t * Fout,
-        ne10_fft_cpx_float32_t * Fin)
-{
-    ne10_int32_t i, j;
-    ne10_int32_t fstride;
-    ne10_int32_t stride1;
-    ne10_int32_t stride2;
-
-    ne10_fft_cpx_float32_t * F;
-    ne10_fft_cpx_float32_t * in;
-
-    // get the input, resort, calculate the first stage
-    fstride = 64;
-    F = Fout;
-    stride1 = fstride >> 2;
-    stride2 = stride1 >> 2;
-    for (j = 0; j < 4; j++)
-    {
-        in = &Fin[j];
-        for (i = 0; i < 4; i++)
-        {
-            ne10_data_bitreversal_butterfly2_float32_neon (F, in, fstride, stride1);
-            F += 8;
-            in += stride2;
-        }
-    }
-}
-
-
-static void ne10_data_bitreversal512_float32_neon (ne10_fft_cpx_float32_t * Fout,
-        ne10_fft_cpx_float32_t * Fin)
-{
-    ne10_int32_t i, j, k;
-    ne10_int32_t fstride;
-    ne10_int32_t stride1;
-    ne10_int32_t stride2;
-    ne10_int32_t stride3;
-
-    ne10_fft_cpx_float32_t * F;
-    ne10_fft_cpx_float32_t * in;
-
-    // get the input, resort, calculate the first stage
-    fstride = 256;
-
-    F = Fout;
-    stride1 = fstride >> 2;
-    stride2 = stride1 >> 2;
-    stride3 = stride2 >> 2;
-    for (k = 0; k < 4; k++)
-    {
-        for (j = 0; j < 4; j++)
-        {
-            in = &Fin[k + j * stride3];
-            for (i = 0; i < 4; i++)
-            {
-                ne10_data_bitreversal_butterfly2_float32_neon (F, in, fstride, stride1);
-                F += 8;
-                in += stride2;
-            }
-        }
-    }
-}
-static void ne10_data_bitreversal2048_float32_neon (ne10_fft_cpx_float32_t * Fout,
-        ne10_fft_cpx_float32_t * Fin)
-{
-    ne10_int32_t i, j, k, l;
-    ne10_int32_t fstride;
-    ne10_int32_t stride1;
-    ne10_int32_t stride2;
-    ne10_int32_t stride3;
-    ne10_int32_t stride4;
-
-    ne10_fft_cpx_float32_t * F;
-    ne10_fft_cpx_float32_t * in;
-
-    // get the input, resort, calculate the first stage
-    fstride = 1024;
-
-    F = Fout;
-    stride1 = fstride >> 2;
-    stride2 = stride1 >> 2;
-    stride3 = stride2 >> 2;
-    stride4 = stride3 >> 2;
-    for (l = 0; l < 4; l++)
-    {
-        for (k = 0; k < 4; k++)
-        {
-            for (j = 0; j < 4; j++)
-            {
-                in = &Fin[l + k * stride4 + j * stride3];
-                for (i = 0; i < 4; i++)
-                {
-                    ne10_data_bitreversal_butterfly2_float32_neon (F, in, fstride, stride1);
-                    F += 8;
-                    in += stride2;
-                }
-            }
-        }
-    }
-}
-
-
-#define  ne10_butterfly_length_odd_power2_float32_neon(inverse) \
-static void ne10_butterfly_##inverse##_length_odd_power2_float32_neon (ne10_fft_cpx_float32_t * Fout, \
-        ne10_fft_cpx_float32_t * Fin, \
-        ne10_int32_t * factors, \
-        ne10_fft_cpx_float32_t * twiddles) \
-{ \
-    ne10_int32_t fstride = factors[1]; \
-    ne10_int32_t i; \
-    if (fstride == 16) \
-        ne10_data_bitreversal32_float32_neon (Fout, Fin); \
-    else if (fstride == 64) \
-        ne10_data_bitreversal128_float32_neon (Fout, Fin); \
-    else if (fstride == 256) \
-        ne10_data_bitreversal512_float32_neon (Fout, Fin); \
-    else if (fstride == 1024) \
-        ne10_data_bitreversal2048_float32_neon (Fout, Fin); \
-    ne10_radix2_butterfly_##inverse##_float32_neon (Fout, factors, twiddles); \
-}
-
-ne10_butterfly_length_odd_power2_float32_neon (forward)
-ne10_butterfly_length_odd_power2_float32_neon (backward)
-
 static void ne10_fft16_forward_float32_neon (ne10_fft_cpx_float32_t * Fout,
         ne10_fft_cpx_float32_t * Fin,
         ne10_fft_cpx_float32_t * twiddles)
@@ -857,46 +459,6 @@ static void ne10_fft16_backward_float32_neon (ne10_fft_cpx_float32_t * Fout,
     vst2q_f32 (p_dst3, q2_out_cdef);
 }
 
-static void ne10_mixed_radix_butterfly_forward_float32_neon (ne10_fft_cpx_float32_t * Fout,
-        ne10_int32_t * factors,
-        ne10_fft_cpx_float32_t * twiddles)
-{
-    ne10_int32_t stage_count;
-
-    // the first stage
-    stage_count = factors[0];
-    if (factors[2 * stage_count] == 2)
-    {
-        //radix 2/4, FFT length is 2^n (n is odd)
-        ne10_mixed_radix_butterfly_length_odd_power2_float32_neon (Fout, factors, twiddles);
-    }
-    else if (factors[2 * stage_count] == 4)
-    {
-        //radix 4, FFT length is 2^n (n is even)
-        ne10_mixed_radix_butterfly_length_even_power2_float32_neon (Fout, factors, twiddles);
-    }
-}
-
-static void ne10_mixed_radix_butterfly_backward_float32_neon (ne10_fft_cpx_float32_t * Fout,
-        ne10_int32_t * factors,
-        ne10_fft_cpx_float32_t * twiddles)
-{
-    ne10_int32_t stage_count;
-
-    stage_count = factors[0];
-    if (factors[2 * stage_count] == 2)
-    {
-        //radix 2/4, FFT length is 2^n (n is odd)
-        ne10_mixed_radix_butterfly_inverse_length_odd_power2_float32_neon (Fout, factors, twiddles);
-    }
-    else if (factors[2 * stage_count] == 4)
-    {
-        //radix 4, FFT length is 2^n (n is even)
-        ne10_mixed_radix_butterfly_inverse_length_even_power2_float32_neon (Fout, factors, twiddles);
-    }
-}
-
-
 void ne10_fft_split_r2c_1d_float32_neon (ne10_fft_cpx_float32_t *dst,
         const ne10_fft_cpx_float32_t *src,
         ne10_fft_cpx_float32_t *twiddles,
@@ -1104,6 +666,7 @@ void ne10_fft_split_c2r_1d_float32_neon (ne10_fft_cpx_float32_t *dst,
  * Otherwise, this FFT is an out-of-place algorithm. When you want to get an in-place FFT, it creates a temp buffer as
  *  output buffer and then copies the temp buffer back to input buffer. For the usage of this function, please check test/test_suite_fft_float32.c
  */
+
 void ne10_fft_c2c_1d_float32_neon (ne10_fft_cpx_float32_t *fout,
                                    ne10_fft_cpx_float32_t *fin,
                                    ne10_fft_cpx_float32_t *twiddles,
@@ -1124,21 +687,8 @@ void ne10_fft_c2c_1d_float32_neon (ne10_fft_cpx_float32_t *fout,
         case 16:
             ne10_fft16_backward_float32_neon (fout, fin, twiddles);
             break;
-        case 32:
-        case 128:
-        case 512:
-        case 2048:
-            ne10_butterfly_backward_length_odd_power2_float32_neon (fout, fin, factors, twiddles);
-            break;
-        case 64:
-        case 256:
-        case 1024:
-        case 4096:
-            ne10_butterfly_backward_length_even_power2_float32_neon (fout, fin, factors, twiddles);
-            break;
         default:
-            ne10_data_bitreversal_float32 (fout, fin, 1, &factors[2]);
-            ne10_mixed_radix_butterfly_backward_float32_neon (fout, factors, twiddles);
+            ne10_mixed_radix_fft_backward_float32_neon (fout, fin, factors, twiddles);
             break;
         }
     }
@@ -1155,21 +705,8 @@ void ne10_fft_c2c_1d_float32_neon (ne10_fft_cpx_float32_t *fout,
         case 16:
             ne10_fft16_forward_float32_neon (fout, fin, twiddles);
             break;
-        case 32:
-        case 128:
-        case 512:
-        case 2048:
-            ne10_butterfly_forward_length_odd_power2_float32_neon (fout, fin, factors, twiddles);
-            break;
-        case 64:
-        case 256:
-        case 1024:
-        case 4096:
-            ne10_butterfly_forward_length_even_power2_float32_neon (fout, fin, factors, twiddles);
-            break;
         default:
-            ne10_data_bitreversal_float32 (fout, fin, 1, &factors[2]);
-            ne10_mixed_radix_butterfly_forward_float32_neon (fout, factors, twiddles);
+            ne10_mixed_radix_fft_forward_float32_neon (fout, fin, factors, twiddles);
             break;
         }
     }
index 1c0874f..254552b 100644 (file)
         /* Registers define*/
         /*ARM Registers*/
         p_fout          .req   r0
-        p_factors       .req   r1
-        p_twiddles      .req   r2
-        p_fin           .req   r3
-        p_fout0         .req   r4
-        p_fout1         .req   r5
-        p_fout2         .req   r6
-        p_fout3         .req   r7
-        stage_count     .req   r8
-        fstride         .req   r9
-        mstride         .req   r10
-        count           .req   r1
-        count_f         .req   r1
-        count_m         .req   r12
-        p_tw1           .req   r3
-        p_tw2           .req   r11
-        p_tw3           .req   r14
-        radix           .req   r5
-        tmp0            .req   r12
+        p_fin           .req   r1
+        p_factors       .req   r2
+        p_twiddles      .req   r3
+        stage_count     .req   r4
+        fstride         .req   r5
+        mstride         .req   r6
+
+        radix           .req   r12
+        p_fin0          .req   r7
+        p_fin1          .req   r8
+        p_fin2          .req   r9
+        p_fin3          .req   r10
+        p_tmp           .req   r11
+        count           .req   r2
+        fstride1        .req   r2
+        fstep           .req   r7
+
+        p_out_ls        .req   r14
+        nstep           .req   r2
+        mstep           .req   r7
+        count_f         .req   r8
+        count_m         .req   r9
+        p_tw1           .req   r10
+        p_in1           .req   r11
+        p_out1          .req   r12
+        tmp0            .req   r9
 
         /*NEON variale Declaration for the first stage*/
-        d_in0_r01       .dn   d0
-        d_in0_i01       .dn   d2
-        d_in1_r01       .dn   d4
-        d_in1_i01       .dn   d6
-        d_in0_r23       .dn   d1
-        d_in0_i23       .dn   d3
-        d_in1_r23       .dn   d5
-        d_in1_i23       .dn   d7
-        q_in0_r0123     .qn   q0
-        q_in0_i0123     .qn   q1
-        q_in1_r0123     .qn   q2
-        q_in1_i0123     .qn   q3
-        d_out0_r01      .dn   d16
-        d_out0_i01      .dn   d18
-        d_out1_r01      .dn   d20
-        d_out1_i01      .dn   d22
-        d_out0_r23      .dn   d17
-        d_out0_i23      .dn   d19
-        d_out1_r23      .dn   d21
-        d_out1_i23      .dn   d23
-        q_out0_r0123    .qn   q8
-        q_out0_i0123    .qn   q9
-        q_out1_r0123    .qn   q10
-        q_out1_i0123    .qn   q11
-
-        d_in0_0         .dn   d0
-        d_in1_0         .dn   d1
-        d_in2_0         .dn   d2
-        d_in3_0         .dn   d3
-        d_in0_1         .dn   d4
-        d_in1_1         .dn   d5
-        d_in2_1         .dn   d6
-        d_in3_1         .dn   d7
         q_in0_01        .qn   q0
-        q_in1_01        .qn   q2
-        q_in2_01        .qn   q1
+        q_in1_01        .qn   q1
+        q_in2_01        .qn   q2
         q_in3_01        .qn   q3
-        d_out0_0        .dn   d16
-        d_out1_0        .dn   d17
-        d_out2_0        .dn   d18
-        d_out3_0        .dn   d19
-        d_out0_1        .dn   d20
-        d_out1_1        .dn   d21
-        d_out2_1        .dn   d22
-        d_out3_1        .dn   d23
-        q_out0_01       .qn   q8
-        q_out1_01       .qn   q10
-        q_out2_01       .qn   q9
-        q_out3_01       .qn   q11
-        d_s0            .dn   d24
-        q_s0_01         .qn   q12
-        d_s1            .dn   d26
-        q_s1_01         .qn   q13
-        d_s2            .dn   d28
-        q_s2_01         .qn   q14
+        q_s0_2          .qn   q4
+        q_s1_2          .qn   q5
+        q_s2_2          .qn   q6
+        q_s3_2          .qn   q7
+        d_s1_r2         .dn   d10
+        d_s1_i2         .dn   d11
+        d_s3_r2         .dn   d14
+        d_s3_i2         .dn   d15
+        q_out0_2        .qn   q8
+        q_out1_2        .qn   q9
+        q_out2_2        .qn   q10
+        q_out3_2        .qn   q11
+        d_out1_r15      .dn   d18
+        d_out1_i15      .dn   d19
+        d_out3_r37      .dn   d22
+        d_out3_i37      .dn   d23
+
+        d_in0_r         .dn   d0
+        d_in0_i         .dn   d1
+        d_in1_r         .dn   d2
+        d_in1_i         .dn   d3
+        d_in2_r         .dn   d4
+        d_in2_i         .dn   d5
+        d_in3_r         .dn   d6
+        d_in3_i         .dn   d7
+        d_in4_r         .dn   d8
+        d_in4_i         .dn   d9
+        d_in5_r         .dn   d10
+        d_in5_i         .dn   d11
+        d_in6_r         .dn   d12
+        d_in6_i         .dn   d13
+        d_in7_r         .dn   d14
+        d_in7_i         .dn   d15
+        q_in0           .qn   q0
+        q_in1           .qn   q1
+        q_in2           .qn   q2
+        q_in3           .qn   q3
+        q_in4           .qn   q4
+        q_in5           .qn   q5
+        q_in6           .qn   q6
+        q_in7           .qn   q7
+        q_sin0          .qn   q8
+        q_sin1          .qn   q9
+        q_sin2          .qn   q10
+        q_sin3          .qn   q11
+        q_sin4          .qn   q12
+        q_sin5          .qn   q13
+        q_sin6          .qn   q14
+        q_sin7          .qn   q15
+        d_sin3_r        .dn   d22
+        d_sin3_i        .dn   d23
+        d_sin5_r        .dn   d26
+        d_sin5_i        .dn   d27
+        d_sin7_r        .dn   d30
+        d_sin7_i        .dn   d31
+
+        d_tw_twn        .dn   d0
+        d_s3_r          .dn   d2
+        d_s3_i          .dn   d3
+        d_s7_r          .dn   d4
+        d_s7_i          .dn   d5
+        q_s3            .qn   q1
+        q_s7            .qn   q2
+        q_s8            .qn   q11
+        q_s9            .qn   q15
+        q_s10           .qn   q3
+        q_s11           .qn   q4
+        q_s12           .qn   q5
+        q_s13           .qn   q6
+        q_s14           .qn   q7
+        q_s15           .qn   q0
+        q_out0          .qn   q1
+        q_out1          .qn   q2
+        q_out2          .qn   q8
+        q_out3          .qn   q9
+        q_out4          .qn   q10
+        q_out5          .qn   q12
+        q_out6          .qn   q13
+        q_out7          .qn   q14
+        d_s10_r         .dn   d6
+        d_s10_i         .dn   d7
+        d_s11_r         .dn   d8
+        d_s11_i         .dn   d9
+        d_s14_r         .dn   d14
+        d_s14_i         .dn   d15
+        d_s15_r         .dn   d0
+        d_s15_i         .dn   d1
+        d_out2_r        .dn   d16
+        d_out2_i        .dn   d17
+        d_out3_r        .dn   d18
+        d_out3_i        .dn   d19
+        d_out6_r        .dn   d26
+        d_out6_i        .dn   d27
+        d_out7_r        .dn   d28
+        d_out7_i        .dn   d29
 
-        /*NEON variale Declaration for mstride loop */
-        q_fin0_r        .qn   q0
-        q_fin0_i        .qn   q1
-        q_fin1_r        .qn   q0
-        q_fin1_i        .qn   q1
-        q_tw1_r         .qn   q2
-        q_tw1_i         .qn   q3
-        q_fin2_r        .qn   q8
-        q_fin2_i        .qn   q9
-        q_tw2_r         .qn   q10
-        q_tw2_i         .qn   q11
-        q_fin3_r        .qn   q4
-        q_fin3_i        .qn   q5
-        q_tw3_r         .qn   q6
-        q_tw3_i         .qn   q7
-        q_s0_r          .qn   q12
-        q_s0_i          .qn   q13
-        q_s1_r          .qn   q14
-        q_s1_i          .qn   q15
-        q_s2_r          .qn   q2
-        q_s2_i          .qn   q10
-        q_s5_r          .qn   q4
-        q_s5_i          .qn   q5
-        q_s4_r          .qn   q6
-        q_s4_i          .qn   q7
-        q_s3_r          .qn   q8
-        q_s3_i          .qn   q9
-        q_fout0_r       .qn   q0
-        q_fout0_i       .qn   q1
-        q_fout2_r       .qn   q2
-        q_fout2_i       .qn   q3
-        q_fout1_r       .qn   q12
-        q_fout1_i       .qn   q13
-        q_fout3_r       .qn   q14
-        q_fout3_i       .qn   q15
-
-        /*NEON variale Declaration for mstride 2 loop */
-        d_tw1_r01       .dn   d16
-        d_tw2_r01       .dn   d17
-        d_tw1_i01       .dn   d18
-        d_tw2_i01       .dn   d19
-        d_tw3_r01       .dn   d20
-        d_tw3_i01       .dn   d21
-        q_fin0_r0123    .qn   q0
-        q_fin0_i0123    .qn   q1
-        d_fin0_r01      .dn   d0
-        d_fin1_r01      .dn   d1
-        d_fin0_i01      .dn   d2
-        d_fin1_i01      .dn   d3
-        d_fin2_r01      .dn   d4
-        d_fin3_r01      .dn   d5
-        d_fin2_i01      .dn   d6
-        d_fin3_i01      .dn   d7
-        d_fin0_r23      .dn   d22
-        d_fin1_r23      .dn   d23
-        d_fin0_i23      .dn   d24
-        d_fin1_i23      .dn   d25
-        d_fin2_r23      .dn   d26
-        d_fin3_r23      .dn   d27
-        d_fin2_i23      .dn   d28
-        d_fin3_i23      .dn   d29
-        q_s0_r0123      .qn   q13
-        q_s0_i0123      .qn   q14
-        d_s0_r01        .dn   d26
-        d_s0_r23        .dn   d27
-        d_s0_i01        .dn   d28
-        d_s0_i23        .dn   d29
-        q_s1_r0123      .qn   q5
-        q_s1_i0123      .qn   q6
-        d_s1_r01        .dn   d10
-        d_s1_r23        .dn   d11
-        d_s1_i01        .dn   d12
-        d_s1_i23        .dn   d13
-        q_s2_r0123      .qn   q15
-        q_s2_i0123      .qn   q4
-        d_s2_r01        .dn   d30
-        d_s2_r23        .dn   d31
-        d_s2_i01        .dn   d8
-        d_s2_i23        .dn   d9
-        q_s5_r0123      .qn   q11
-        q_s5_i0123      .qn   q12
-        q_s4_r0123      .qn   q5
-        q_s4_i0123      .qn   q10
-        q_s3_r0123      .qn   q6
-        q_s3_i0123      .qn   q7
-        q_fout0_r0123   .qn   q0
-        q_fout0_i0123   .qn   q1
-        q_fout2_r0123   .qn   q2
-        q_fout2_i0123   .qn   q3
-        q_fout1_r0123   .qn   q13
-        q_fout1_i0123   .qn   q14
-        q_fout3_r0123   .qn   q6
-        q_fout3_i0123   .qn   q7
-        d_fout0_r01     .dn   d0
-        d_fout1_r01     .dn   d1
-        d_fout0_i01     .dn   d2
-        d_fout1_i01     .dn   d3
-        d_fout2_r01     .dn   d4
-        d_fout3_r01     .dn   d5
-        d_fout2_i01     .dn   d6
-        d_fout3_i01     .dn   d7
-        d_fout0_r23     .dn   d26
-        d_fout1_r23     .dn   d27
-        d_fout0_i23     .dn   d28
-        d_fout1_i23     .dn   d29
-        d_fout2_r23     .dn   d12
-        d_fout3_r23     .dn   d13
-        d_fout2_i23     .dn   d14
-        d_fout3_i23     .dn   d15
-
-        d_tmp0          .dn   d30
-        d_tmp1          .dn   d31
-        q_tmp           .qn   q15
-        d_tmp2_0        .dn   d28
-        d_tmp2_1        .dn   d29
-        q_tmp2          .qn   q14
-
-        .macro RADIX4_BUTTERFLY_P4
-        vld2.32         {q_fin1_r, q_fin1_i}, [p_fout1]
-        vld2.32         {q_tw1_r, q_tw1_i}, [p_tw1]!
-        vld2.32         {q_fin2_r, q_fin2_i}, [p_fout2]
-        vld2.32         {q_tw2_r, q_tw2_i}, [p_tw2]!
-        vld2.32         {q_fin3_r, q_fin3_i}, [p_fout3]
-        vld2.32         {q_tw3_r, q_tw3_i}, [p_tw3]!
-
-        vmul.f32        q_s0_r, q_fin1_r, q_tw1_r
-        vmul.f32        q_s0_i, q_fin1_i, q_tw1_r
-        vmul.f32        q_s1_r, q_fin2_r, q_tw2_r
-        vmul.f32        q_s1_i, q_fin2_i, q_tw2_r
-        vmul.f32        q_s2_r, q_fin3_r, q_tw3_r
-        vmul.f32        q_s2_i, q_fin3_i, q_tw3_r
-        vmls.f32        q_s0_r, q_fin1_i, q_tw1_i
-        vmla.f32        q_s0_i, q_fin1_r, q_tw1_i
-        vmls.f32        q_s1_r, q_fin2_i, q_tw2_i
-        vmla.f32        q_s1_i, q_fin2_r, q_tw2_i
-        vld2.32         {q_fin0_r, q_fin0_i}, [p_fout0]
-        vmls.f32        q_s2_r, q_fin3_i, q_tw3_i
-        vmla.f32        q_s2_i, q_fin3_r, q_tw3_i
-
-        vsub.f32        q_s5_r, q_fin0_r, q_s1_r
-        vsub.f32        q_s5_i, q_fin0_i, q_s1_i
-        vadd.f32        q_fout0_r, q_fin0_r, q_s1_r
-        vadd.f32        q_fout0_i, q_fin0_i, q_s1_i
-
-        vadd.f32        q_s3_r, q_s0_r, q_s2_r
-        vadd.f32        q_s3_i, q_s0_i, q_s2_i
-        vsub.f32        q_s4_r, q_s0_r, q_s2_r
-        vsub.f32        q_s4_i, q_s0_i, q_s2_i
-
-        vsub.f32        q_fout2_r, q_fout0_r, q_s3_r
-        vsub.f32        q_fout2_i, q_fout0_i, q_s3_i
-        vadd.f32        q_fout0_r, q_fout0_r, q_s3_r
-        vadd.f32        q_fout0_i, q_fout0_i, q_s3_i
-
-        vadd.f32        q_fout1_r, q_s5_r, q_s4_i
-        vsub.f32        q_fout1_i, q_s5_i, q_s4_r
-        vsub.f32        q_fout3_r, q_s5_r, q_s4_i
-        vadd.f32        q_fout3_i, q_s5_i, q_s4_r
-
-        vst2.32         {q_fout2_r, q_fout2_i}, [p_fout2]!
-        vst2.32         {q_fout0_r, q_fout0_i}, [p_fout0]!
-        vst2.32         {q_fout1_r, q_fout1_i}, [p_fout1]!
-        vst2.32         {q_fout3_r, q_fout3_i}, [p_fout3]!
-        .endm
 
-        .macro RADIX4_BUTTERFLY_INVERSE_P4
-        vld2.32         {q_fin1_r, q_fin1_i}, [p_fout1]
-        vld2.32         {q_tw1_r, q_tw1_i}, [p_tw1]!
-        vld2.32         {q_fin2_r, q_fin2_i}, [p_fout2]
-        vld2.32         {q_tw2_r, q_tw2_i}, [p_tw2]!
-        vmul.f32        q_s0_r, q_fin1_r, q_tw1_r
-        vmul.f32        q_s0_i, q_fin1_i, q_tw1_r
-        vmla.f32        q_s0_r, q_fin1_i, q_tw1_i
-        vmls.f32        q_s0_i, q_fin1_r, q_tw1_i
-
-        vld2.32         {q_fin3_r, q_fin3_i}, [p_fout3]
-        vld2.32         {q_tw3_r, q_tw3_i}, [p_tw3]!
-        vmul.f32        q_s1_r, q_fin2_r, q_tw2_r
-        vmul.f32        q_s1_i, q_fin2_i, q_tw2_r
-        vmla.f32        q_s1_r, q_fin2_i, q_tw2_i
-        vmls.f32        q_s1_i, q_fin2_r, q_tw2_i
-
-        vld2.32         {q_fin0_r, q_fin0_i}, [p_fout0]
-        vmul.f32        q_s2_r, q_fin3_r, q_tw3_r
-        vmul.f32        q_s2_i, q_fin3_i, q_tw3_r
-        vmla.f32        q_s2_r, q_fin3_i, q_tw3_i
-        vmls.f32        q_s2_i, q_fin3_r, q_tw3_i
-
-        vsub.f32        q_s5_r, q_fin0_r, q_s1_r
-        vsub.f32        q_s5_i, q_fin0_i, q_s1_i
-        vadd.f32        q_fout0_r, q_fin0_r, q_s1_r
-        vadd.f32        q_fout0_i, q_fin0_i, q_s1_i
-
-        vadd.f32        q_s3_r, q_s0_r, q_s2_r
-        vadd.f32        q_s3_i, q_s0_i, q_s2_i
-        vsub.f32        q_s4_r, q_s0_r, q_s2_r
-        vsub.f32        q_s4_i, q_s0_i, q_s2_i
-
-        vsub.f32        q_fout2_r, q_fout0_r, q_s3_r
-        vsub.f32        q_fout2_i, q_fout0_i, q_s3_i
-        vadd.f32        q_fout0_r, q_fout0_r, q_s3_r
-        vadd.f32        q_fout0_i, q_fout0_i, q_s3_i
-        vst2.32         {q_fout2_r, q_fout2_i}, [p_fout2]!
-
-        vsub.f32        q_fout1_r, q_s5_r, q_s4_i
-        vadd.f32        q_fout1_i, q_s5_i, q_s4_r
-        vadd.f32        q_fout3_r, q_s5_r, q_s4_i
-        vsub.f32        q_fout3_i, q_s5_i, q_s4_r
-        vst2.32         {q_fout0_r, q_fout0_i}, [p_fout0]!
-        vst2.32         {q_fout1_r, q_fout1_i}, [p_fout1]!
-        vst2.32         {q_fout3_r, q_fout3_i}, [p_fout3]!
+        /*NEON variable Declaration for mstride loop */
+        d_fin0_r        .dn   d0
+        d_fin0_i        .dn   d1
+        d_fin1_r        .dn   d2
+        d_fin1_i        .dn   d3
+        d_fin2_r        .dn   d4
+        d_fin2_i        .dn   d5
+        d_fin3_r        .dn   d6
+        d_fin3_i        .dn   d7
+        d_tw0_r         .dn   d8
+        d_tw0_i         .dn   d9
+        d_tw1_r         .dn   d10
+        d_tw1_i         .dn   d11
+        d_tw2_r         .dn   d12
+        d_tw2_i         .dn   d13
+        q_fin0          .qn   q0
+        q_scr0          .qn   q15
+        q_scr1          .qn   q7
+        q_scr2          .qn   q8
+        q_scr3          .qn   q9
+        q_scr4          .qn   q10
+        q_scr5          .qn   q11
+        q_scr6          .qn   q12
+        q_scr7          .qn   q13
+        d_scr1_r        .dn   d14
+        d_scr1_i        .dn   d15
+        d_scr2_r        .dn   d16
+        d_scr2_i        .dn   d17
+        d_scr3_r        .dn   d18
+        d_scr3_i        .dn   d19
+        d_scr5_r        .dn   d22
+        d_scr5_i        .dn   d23
+        d_scr7_r        .dn   d26
+        d_scr7_i        .dn   d27
+        q_fout0         .qn   q7
+        q_fout2         .qn   q8
+        d_fout0_r       .dn   d14
+        d_fout0_i       .dn   d15
+        d_fout1_r       .dn   d28
+        d_fout1_i       .dn   d29
+        d_fout2_r       .dn   d16
+        d_fout2_i       .dn   d17
+        d_fout3_r       .dn   d30
+        d_fout3_i       .dn   d31
+
+        .macro BUTTERFLY4X2_WITHOUT_TWIDDLES inverse
+
+        /* radix 4 butterfly without twiddles */
+        vadd.f32        q_s0_2, q_in0_01, q_in2_01
+        vsub.f32        q_s1_2, q_in0_01, q_in2_01
+        vld2.32         {q_in0_01}, [p_fin0:64]!
+        vld2.32         {q_in2_01}, [p_fin2:64]!
+        vadd.f32        q_s2_2, q_in1_01, q_in3_01
+        vsub.f32        q_s3_2, q_in1_01, q_in3_01
+        vld2.32         {q_in1_01}, [p_fin1:64]!
+        vld2.32         {q_in3_01}, [p_fin3:64]!
+
+        vsub.f32        q_out2_2, q_s0_2, q_s2_2
+        vadd.f32        q_out0_2, q_s0_2, q_s2_2
+
+        .ifeqs "\inverse", "TRUE"
+        vsub.f32        d_out1_r15, d_s1_r2, d_s3_i2
+        vadd.f32        d_out1_i15, d_s1_i2, d_s3_r2
+        vadd.f32        d_out3_r37, d_s1_r2, d_s3_i2
+        vsub.f32        d_out3_i37, d_s1_i2, d_s3_r2
+        .else
+        vadd.f32        d_out1_r15, d_s1_r2, d_s3_i2
+        vsub.f32        d_out1_i15, d_s1_i2, d_s3_r2
+        vsub.f32        d_out3_r37, d_s1_r2, d_s3_i2
+        vadd.f32        d_out3_i37, d_s1_i2, d_s3_r2
+        .endif
+
+        vtrn.32         q_out0_2, q_out1_2
+        vtrn.32         q_out2_2, q_out3_2
+        vst2.32         {q_out0_2}, [p_tmp]!
+        vst2.32         {q_out2_2}, [p_tmp]!
+        vst2.32         {q_out1_2}, [p_tmp]!
+        vst2.32         {q_out3_2}, [p_tmp]!
         .endm
 
-        .macro RADIX24_BUTTERFLY_P4
-        vld2.32         {d_tw3_r01, d_tw3_i01}, [p_tw1]
-        vld2.32         {d_fin0_r01, d_fin1_r01, d_fin0_i01, d_fin1_i01}, [p_fout0]!
-        vld2.32         {d_fin2_r01, d_fin3_r01, d_fin2_i01, d_fin3_i01}, [p_fout0], tmp0
-        vld2.32         {d_fin0_r23, d_fin1_r23, d_fin0_i23, d_fin1_i23}, [p_fout1]!
-        vld2.32         {d_fin2_r23, d_fin3_r23, d_fin2_i23, d_fin3_i23}, [p_fout1], tmp0
-
-        vmul.f32       d_s2_r01, d_fin3_r01, d_tw3_r01
-        vmul.f32       d_s2_i01, d_fin3_r01, d_tw3_i01
-        vmul.f32       d_s2_r23, d_fin3_r23, d_tw3_r01
-        vmul.f32       d_s2_i23, d_fin3_r23, d_tw3_i01
-        vmls.f32       d_s2_r01, d_fin3_i01, d_tw3_i01
-        vmla.f32       d_s2_i01, d_fin3_i01, d_tw3_r01
-        vmls.f32       d_s2_r23, d_fin3_i23, d_tw3_i01
-        vmla.f32       d_s2_i23, d_fin3_i23, d_tw3_r01
-
-        vmul.f32       d_s1_r01, d_fin2_r01, d_tw2_r01
-        vmul.f32       d_s1_r23, d_fin2_r23, d_tw2_r01
-        vmul.f32       d_s1_i01, d_fin2_r01, d_tw2_i01
-        vmul.f32       d_s1_i23, d_fin2_r23, d_tw2_i01
-        vmls.f32       d_s1_r01, d_fin2_i01, d_tw2_i01
-        vmls.f32       d_s1_r23, d_fin2_i23, d_tw2_i01
-        vmla.f32       d_s1_i01, d_fin2_i01, d_tw2_r01
-        vmla.f32       d_s1_i23, d_fin2_i23, d_tw2_r01
-
-        vmul.f32       d_s0_r01, d_fin1_r01, d_tw1_r01
-        vmul.f32       d_s0_r23, d_fin1_r23, d_tw1_r01
-        vmul.f32       d_s0_i01, d_fin1_r01, d_tw1_i01
-        vmul.f32       d_s0_i23, d_fin1_r23, d_tw1_i01
-        vmls.f32       d_s0_r01, d_fin1_i01, d_tw1_i01
-        vmls.f32       d_s0_r23, d_fin1_i23, d_tw1_i01
-        vmla.f32       d_s0_i01, d_fin1_i01, d_tw1_r01
-        vmla.f32       d_s0_i23, d_fin1_i23, d_tw1_r01
-
-        vmov            d_fin1_r01, d_fin0_r23
-        vmov            d_fin1_i01, d_fin0_i23
-
-        vsub.f32        q_s5_r0123, q_fin0_r0123, q_s1_r0123
-        vsub.f32        q_s5_i0123, q_fin0_i0123, q_s1_i0123
-        vadd.f32        q_fout0_r0123, q_fin0_r0123, q_s1_r0123
-        vadd.f32        q_fout0_i0123, q_fin0_i0123, q_s1_i0123
-
-        vadd.f32        q_s3_r0123, q_s0_r0123, q_s2_r0123
-        vadd.f32        q_s3_i0123, q_s0_i0123, q_s2_i0123
-        vsub.f32        q_s4_r0123, q_s0_r0123, q_s2_r0123
-        vsub.f32        q_s4_i0123, q_s0_i0123, q_s2_i0123
-        vsub.f32        q_fout2_r0123, q_fout0_r0123, q_s3_r0123
-        vsub.f32        q_fout2_i0123, q_fout0_i0123, q_s3_i0123
-        vadd.f32        q_fout0_r0123, q_fout0_r0123, q_s3_r0123
-        vadd.f32        q_fout0_i0123, q_fout0_i0123, q_s3_i0123
-
-        vadd.f32        q_fout1_r0123, q_s5_r0123, q_s4_i0123
-        vsub.f32        q_fout1_i0123, q_s5_i0123, q_s4_r0123
-        vsub.f32        q_fout3_r0123, q_s5_r0123, q_s4_i0123
-        vadd.f32        q_fout3_i0123, q_s5_i0123, q_s4_r0123
-
-        vswp            d_fout1_r01, d_fout0_r23
-        vswp            d_fout1_i01, d_fout0_i23
-        vswp            d_fout3_r01, d_fout2_r23
-        vswp            d_fout3_i01, d_fout2_i23
-
-        vst2.32         {d_fout0_r01, d_fout1_r01, d_fout0_i01, d_fout1_i01}, [p_fout2]!
-        vst2.32         {d_fout0_r23, d_fout1_r23, d_fout0_i23, d_fout1_i23}, [p_fout3]!
-        vst2.32         {d_fout2_r01, d_fout3_r01, d_fout2_i01, d_fout3_i01}, [p_fout2], tmp0
-        vst2.32         {d_fout2_r23, d_fout3_r23, d_fout2_i23, d_fout3_i23}, [p_fout3], tmp0
-        .endm
+        .macro BUTTERFLY4X2_WITH_TWIDDLES inverse
+        /**
+         * Radix-4 butterfly with twiddle multiplication. Every d register
+         * holds two float32 lanes, so one expansion processes two
+         * butterflies side by side.
+         *
+         * inverse: "TRUE" multiplies by the conjugated twiddles and uses the
+         *          opposite rotation sign for outputs 1 and 3; any other
+         *          value expands to the forward arithmetic.
+         *
+         * On entry d_fin0..d_fin3 and d_tw0..d_tw2 must already hold the
+         * operands (loaded by the caller or by the previous expansion);
+         * this expansion reloads them for the next one.
+         * Inputs are read through p_in1 (post-incremented by nstep bytes),
+         * twiddles through p_tw1 (post-incremented by mstep bytes) and the
+         * results are written through p_out1 (post-incremented by mstep bytes).
+         *
+         * NOTE(review): q_X is used as if it were the pair {d_X_r, d_X_i};
+         * the register aliases are defined outside this view - confirm.
+         * NOTE(review): the reloads are unconditional, so the last expansion
+         * in a loop fetches operands that are never used - confirm the
+         * buffers tolerate reading past the processed data.
+         */
+        sub             p_in1, p_in1, nstep, lsl #2    /* undo the four nstep post-increments of the preceding input loads */
+        add             p_in1, p_in1, #16              /* step to the next two complex inputs (16 bytes) */
+        sub             p_tw1, p_tw1, mstep, lsl #1    /* undo the two mstep post-increments of the preceding twiddle loads */
+        add             p_tw1, p_tw1, #16              /* step to the next two complex twiddles (16 bytes) */
+        vmov            q_scr0, q_fin0                 /* input 0 is used without a twiddle */
+        vmul.f32        d_scr1_r, d_fin1_r, d_tw0_r    /* first half of the complex product: scr.r = fin.r * tw.r */
+        vmul.f32        d_scr1_i, d_fin1_i, d_tw0_r    /* first half of the complex product: scr.i = fin.i * tw.r */
+        vmul.f32        d_scr2_r, d_fin2_r, d_tw1_r
+        vmul.f32        d_scr2_i, d_fin2_i, d_tw1_r
+        vmul.f32        d_scr3_r, d_fin3_r, d_tw2_r
+        vmul.f32        d_scr3_i, d_fin3_i, d_tw2_r
+        vld2.32         {d_fin0_r, d_fin0_i}, [p_in1:64], nstep    /* preload next input 0; the current one already lives in q_scr0 */
+
+        .ifeqs "\inverse", "TRUE"
+        vmla.f32        d_scr1_r, d_fin1_i, d_tw0_i    /* inverse: scr = fin * conj(tw), so scr.r += fin.i * tw.i */
+        vmls.f32        d_scr1_i, d_fin1_r, d_tw0_i    /* inverse: scr.i -= fin.r * tw.i */
+        vmla.f32        d_scr2_r, d_fin2_i, d_tw1_i
+        vmls.f32        d_scr2_i, d_fin2_r, d_tw1_i
+        vmla.f32        d_scr3_r, d_fin3_i, d_tw2_i
+        vmls.f32        d_scr3_i, d_fin3_r, d_tw2_i
+        .else
+        vmls.f32        d_scr1_r, d_fin1_i, d_tw0_i    /* forward: scr = fin * tw, so scr.r -= fin.i * tw.i */
+        vmla.f32        d_scr1_i, d_fin1_r, d_tw0_i    /* forward: scr.i += fin.r * tw.i */
+        vmls.f32        d_scr2_r, d_fin2_i, d_tw1_i
+        vmla.f32        d_scr2_i, d_fin2_r, d_tw1_i
+        vmls.f32        d_scr3_r, d_fin3_i, d_tw2_i
+        vmla.f32        d_scr3_i, d_fin3_r, d_tw2_i
+        .endif
+
+        vld2.32         {d_fin1_r, d_fin1_i}, [p_in1:64], nstep    /* inputs 1..3 and the twiddles are consumed: preload the next set */
+        vld2.32         {d_tw0_r, d_tw0_i}, [p_tw1:64], mstep
+        vld2.32         {d_fin2_r, d_fin2_i}, [p_in1:64], nstep
+        vld2.32         {d_fin3_r, d_fin3_i}, [p_in1:64], nstep
+        vld2.32         {d_tw1_r, d_tw1_i}, [p_tw1:64], mstep
+        vld2.32         {d_tw2_r, d_tw2_i}, [p_tw1:64]             /* no post-increment here, hence the 2 * mstep rewind at the top */
+
+        vadd.f32        q_scr4, q_scr0, q_scr2         /* scr4 = scr0 + scr2 */
+        vsub.f32        q_scr5, q_scr0, q_scr2         /* scr5 = scr0 - scr2 */
+        vadd.f32        q_scr6, q_scr1, q_scr3         /* scr6 = scr1 + scr3 */
+        vsub.f32        q_scr7, q_scr1, q_scr3         /* scr7 = scr1 - scr3 */
+
+        vadd.f32        q_fout0, q_scr4, q_scr6        /* out0 = scr4 + scr6 */
+        vsub.f32        q_fout2, q_scr4, q_scr6        /* out2 = scr4 - scr6 */
+
+        .ifeqs "\inverse", "TRUE"
+        vsub.f32        d_fout1_r, d_scr5_r, d_scr7_i  /* inverse: out1 = scr5 + j * scr7 */
+        vadd.f32        d_fout1_i, d_scr5_i, d_scr7_r
+        vadd.f32        d_fout3_r, d_scr5_r, d_scr7_i  /* inverse: out3 = scr5 - j * scr7 */
+        vsub.f32        d_fout3_i, d_scr5_i, d_scr7_r
+        .else
+        vadd.f32        d_fout1_r, d_scr5_r, d_scr7_i  /* forward: out1 = scr5 - j * scr7 */
+        vsub.f32        d_fout1_i, d_scr5_i, d_scr7_r
+        vsub.f32        d_fout3_r, d_scr5_r, d_scr7_i  /* forward: out3 = scr5 + j * scr7 */
+        vadd.f32        d_fout3_i, d_scr5_i, d_scr7_r
+        .endif
+
+        vst2.32         {d_fout0_r, d_fout0_i}, [p_out1], mstep    /* the four outputs are written mstep bytes apart */
+        vst2.32         {d_fout1_r, d_fout1_i}, [p_out1], mstep
+        vst2.32         {d_fout2_r, d_fout2_i}, [p_out1], mstep
+        vst2.32         {d_fout3_r, d_fout3_i}, [p_out1], mstep
+        sub             p_out1, p_out1, mstep, lsl #2  /* undo the four mstep post-increments of the stores */
+        add             p_out1, p_out1, #16            /* step to the next two complex outputs (16 bytes) */
 
-        .macro RADIX24_BUTTERFLY_INVERSE_P4
-        vld2.32         {d_tw3_r01, d_tw3_i01}, [p_tw1]
-        vld2.32         {d_fin0_r01, d_fin1_r01, d_fin0_i01, d_fin1_i01}, [p_fout0]!
-        vld2.32         {d_fin2_r01, d_fin3_r01, d_fin2_i01, d_fin3_i01}, [p_fout0], tmp0
-        vld2.32         {d_fin0_r23, d_fin1_r23, d_fin0_i23, d_fin1_i23}, [p_fout1]!
-        vld2.32         {d_fin2_r23, d_fin3_r23, d_fin2_i23, d_fin3_i23}, [p_fout1], tmp0
-
-        vmul.f32       d_s2_r01, d_fin3_r01, d_tw3_r01
-        vmul.f32       d_s2_i01, d_fin3_i01, d_tw3_r01
-        vmul.f32       d_s2_r23, d_fin3_r23, d_tw3_r01
-        vmul.f32       d_s2_i23, d_fin3_i23, d_tw3_r01
-        vmla.f32       d_s2_r01, d_fin3_i01, d_tw3_i01
-        vmls.f32       d_s2_i01, d_fin3_r01, d_tw3_i01
-        vmla.f32       d_s2_r23, d_fin3_i23, d_tw3_i01
-        vmls.f32       d_s2_i23, d_fin3_r23, d_tw3_i01
-
-        vmul.f32       d_s1_r01, d_fin2_r01, d_tw2_r01
-        vmul.f32       d_s1_r23, d_fin2_r23, d_tw2_r01
-        vmul.f32       d_s1_i01, d_fin2_i01, d_tw2_r01
-        vmul.f32       d_s1_i23, d_fin2_i23, d_tw2_r01
-        vmla.f32       d_s1_r01, d_fin2_i01, d_tw2_i01
-        vmla.f32       d_s1_r23, d_fin2_i23, d_tw2_i01
-        vmls.f32       d_s1_i01, d_fin2_r01, d_tw2_i01
-        vmls.f32       d_s1_i23, d_fin2_r23, d_tw2_i01
-
-        vmul.f32       d_s0_r01, d_fin1_r01, d_tw1_r01
-        vmul.f32       d_s0_r23, d_fin1_r23, d_tw1_r01
-        vmul.f32       d_s0_i01, d_fin1_i01, d_tw1_r01
-        vmul.f32       d_s0_i23, d_fin1_i23, d_tw1_r01
-        vmla.f32       d_s0_r01, d_fin1_i01, d_tw1_i01
-        vmla.f32       d_s0_r23, d_fin1_i23, d_tw1_i01
-        vmls.f32       d_s0_i01, d_fin1_r01, d_tw1_i01
-        vmls.f32       d_s0_i23, d_fin1_r23, d_tw1_i01
-
-        vmov            d_fin1_r01, d_fin0_r23
-        vmov            d_fin1_i01, d_fin0_i23
-
-        vsub.f32        q_s5_r0123, q_fin0_r0123, q_s1_r0123
-        vsub.f32        q_s5_i0123, q_fin0_i0123, q_s1_i0123
-        vadd.f32        q_fout0_r0123, q_fin0_r0123, q_s1_r0123
-        vadd.f32        q_fout0_i0123, q_fin0_i0123, q_s1_i0123
-
-        vadd.f32        q_s3_r0123, q_s0_r0123, q_s2_r0123
-        vadd.f32        q_s3_i0123, q_s0_i0123, q_s2_i0123
-        vsub.f32        q_s4_r0123, q_s0_r0123, q_s2_r0123
-        vsub.f32        q_s4_i0123, q_s0_i0123, q_s2_i0123
-        vsub.f32        q_fout2_r0123, q_fout0_r0123, q_s3_r0123
-        vsub.f32        q_fout2_i0123, q_fout0_i0123, q_s3_i0123
-        vadd.f32        q_fout0_r0123, q_fout0_r0123, q_s3_r0123
-        vadd.f32        q_fout0_i0123, q_fout0_i0123, q_s3_i0123
-
-        vsub.f32        q_fout1_r0123, q_s5_r0123, q_s4_i0123
-        vadd.f32        q_fout1_i0123, q_s5_i0123, q_s4_r0123
-        vadd.f32        q_fout3_r0123, q_s5_r0123, q_s4_i0123
-        vsub.f32        q_fout3_i0123, q_s5_i0123, q_s4_r0123
-
-        vswp            d_fout1_r01, d_fout0_r23
-        vswp            d_fout1_i01, d_fout0_i23
-        vswp            d_fout3_r01, d_fout2_r23
-        vswp            d_fout3_i01, d_fout2_i23
-
-        vst2.32         {d_fout0_r01, d_fout1_r01, d_fout0_i01, d_fout1_i01}, [p_fout2]!
-        vst2.32         {d_fout0_r23, d_fout1_r23, d_fout0_i23, d_fout1_i23}, [p_fout3]!
-        vst2.32         {d_fout2_r01, d_fout3_r01, d_fout2_i01, d_fout3_i01}, [p_fout2], tmp0
-        vst2.32         {d_fout2_r23, d_fout3_r23, d_fout2_i23, d_fout3_i23}, [p_fout3], tmp0
         .endm
 
 
-        .align 4
-        .global ne10_radix4_butterfly_forward_float32_neon
-        .thumb
-        .thumb_func
-
-ne10_radix4_butterfly_forward_float32_neon:
-
-        push            {r4-r12,lr}
-
-        ldr             stage_count, [p_factors]   /* get factors[0]---stage_count */
-        ldr             fstride, [p_factors, #4]   /* get factors[1]---fstride */
-        add             p_factors, p_factors, stage_count, lsl #3 /* get the address of factors[2*stage_count] */
-        ldr             mstride, [p_factors, #-4]                  /* get factors[2*stage_count-1]--- mstride */
-        sub             stage_count, stage_count, #1
-
-        /* loop of the stages  */
-.L_ne10_radix4_butterfly_forward_stages:
-        lsr             fstride, fstride, #2
-
-        /* loop of fstride  */
-        mov             count_f, fstride
-.L_ne10_radix4_butterfly_forward_stages_fstride:
-        sub             tmp0, fstride, count_f
-        mul             tmp0, tmp0, mstride
-        add             p_fout0, p_fout, tmp0, lsl #5
-        add             p_fout2, p_fout0, mstride, lsl #4   /* get the address of F[mstride*2] */
-        add             p_fout1, p_fout0, mstride, lsl #3   /* get the address of F[mstride] */
-        add             p_fout3, p_fout2, mstride, lsl #3   /* get the address of F[mstride*3] */
-        mov             p_tw1, p_twiddles
-        add             p_tw2, p_tw1, mstride, lsl #3       /* get the address of tw2 */
-        add             p_tw3, p_tw1, mstride, lsl #4       /* get the address of tw3 */
-
-        /* loop of mstride  */
-        mov             count_m, mstride
-
-.L_ne10_radix4_butterfly_forward_stages_mstride:
-
-        RADIX4_BUTTERFLY_P4
-
-        subs            count_m, count_m, #4
-        bgt             .L_ne10_radix4_butterfly_forward_stages_mstride
-
-        /* end of mstride_loop */
+        .macro BUTTERFLY8X2_WITHOUT_TWIDDLES inverse
+        /**
+         *   Radix-8 butterfly that needs no twiddle table: the only
+         *   non-trivial factors are the two constants stored at TW_81.
+         *   Every d register holds two float32 lanes, so one expansion
+         *   processes two butterflies side by side.
+         *
+         *   inverse: "TRUE" expands to the inverse-transform rotations,
+         *            any other value to the forward ones.
+         *
+         *   The eight inputs are read through p_in1, fstep bytes apart, in
+         *   the register order in0, in2, in4, in6, in1, in3, in5, in7:
+         *
+         *   q_in0: Fin1[0]
+         *   q_in1: Fin1[0 + fstride]
+         *   q_in2: Fin1[fstride1]
+         *   q_in3: Fin1[fstride1 + fstride]
+         *   q_in4: Fin1[fstride1*2]
+         *   q_in5: Fin1[fstride1*2 + fstride]
+         *   q_in6: Fin1[fstride1*3]
+         *   q_in7: Fin1[fstride1*3 + fstride]
+         *
+         *   The sixteen results are written contiguously through p_out1
+         *   (128 bytes per expansion). tmp0 is clobbered.
+         *
+         *   NOTE(review): q_X is used as if it were the pair {d_X_r, d_X_i};
+         *   the register aliases are defined outside this view - confirm.
+         */
 
-        subs            count_f, count_f, #1
-        bgt             .L_ne10_radix4_butterfly_forward_stages_fstride
+        ldr             tmp0, =TW_81                               /* address of the two rotation constants */
+        vld2.32         {d_in0_r, d_in0_i}, [p_in1:64], fstep
+        vld2.32         {d_in2_r, d_in2_i}, [p_in1:64], fstep
+        vld2.32         {d_in4_r, d_in4_i}, [p_in1:64], fstep
+        vld2.32         {d_in6_r, d_in6_i}, [p_in1:64], fstep
+        vld2.32         {d_in1_r, d_in1_i}, [p_in1:64], fstep
+        vld2.32         {d_in3_r, d_in3_i}, [p_in1:64], fstep
+        vld2.32         {d_in5_r, d_in5_i}, [p_in1:64], fstep
+        vld2.32         {d_in7_r, d_in7_i}, [p_in1:64], fstep
+
+        // first pass: four sum/difference (radix 2) pairs, no twiddles
+        vadd.f32        q_sin0, q_in0, q_in1
+        vsub.f32        q_sin1, q_in0, q_in1
+        vld1.32         {d_tw_twn}, [tmp0]                         /* lane 0 = 0.70710678, lane 1 = -0.70710678 */
+        vadd.f32        q_sin2, q_in2, q_in3
+        vsub.f32        q_sin3, q_in2, q_in3
+        vadd.f32        q_sin4, q_in4, q_in5
+        vsub.f32        q_sin5, q_in4, q_in5
+        vadd.f32        q_sin6, q_in6, q_in7
+        vsub.f32        q_sin7, q_in6, q_in7
+
+        .ifeqs "\inverse", "TRUE"
+        vneg.f32        d_sin5_i, d_sin5_i                         /* inverse: together with the swap below, sin5 = j * sin5 */
+        vsub.f32        d_s3_r, d_sin3_r, d_sin3_i                 /* inverse: s3 = (1 + j) * sin3 */
+        vadd.f32        d_s3_i, d_sin3_i, d_sin3_r
+        vadd.f32        d_s7_r, d_sin7_r, d_sin7_i                 /* inverse: s7 = (1 - j) * sin7 */
+        vsub.f32        d_s7_i, d_sin7_i, d_sin7_r
+        .else
+        vneg.f32        d_sin5_r, d_sin5_r                         /* forward: together with the swap below, sin5 = -j * sin5 */
+        vadd.f32        d_s3_r, d_sin3_r, d_sin3_i                 /* forward: s3 = (1 - j) * sin3 */
+        vsub.f32        d_s3_i, d_sin3_i, d_sin3_r
+        vsub.f32        d_s7_r, d_sin7_r, d_sin7_i                 /* forward: s7 = (1 + j) * sin7 */
+        vadd.f32        d_s7_i, d_sin7_i, d_sin7_r
+        .endif
+        vswp            d_sin5_r, d_sin5_i                         /* completes the multiplication of sin5 by -j (forward) or j (inverse) */
+
+        vmul.f32        q_s3, q_s3, d_tw_twn[0]                    /* scale s3 by 0.70710678 */
+        vmul.f32        q_s7, q_s7, d_tw_twn[1]                    /* scale s7 by -0.70710678 */
+
+        // radix 2 butterfly on the pairs (sin0, sin4) and (sin1, sin5)
+        vadd.f32        q_s8, q_sin0, q_sin4
+        vadd.f32        q_s9, q_sin1, q_sin5
+        vsub.f32        q_s10, q_sin0, q_sin4
+        vsub.f32        q_s11, q_sin1, q_sin5
+
+        // radix 2 butterfly on the pairs (sin2, sin6) and (s3, s7)
+        vadd.f32        q_s12, q_sin2, q_sin6
+        vadd.f32        q_s13, q_s3, q_s7
+        vsub.f32        q_s14, q_sin2, q_sin6
+        vsub.f32        q_s15, q_s3, q_s7
+
+        vsub.f32        q_out4, q_s8, q_s12                        /* out4 = s8 - s12 */
+        vsub.f32        q_out5, q_s9, q_s13                        /* out5 = s9 - s13 */
+        vadd.f32        q_out0, q_s8, q_s12                        /* out0 = s8 + s12 */
+        vadd.f32        q_out1, q_s9, q_s13                        /* out1 = s9 + s13 */
+
+        .ifeqs "\inverse", "TRUE"
+        vsub.f32        d_out2_r, d_s10_r, d_s14_i                 /* inverse: out2 = s10 + j * s14 */
+        vadd.f32        d_out2_i, d_s10_i, d_s14_r
+        vsub.f32        d_out3_r, d_s11_r, d_s15_i                 /* inverse: out3 = s11 + j * s15 */
+        vadd.f32        d_out3_i, d_s11_i, d_s15_r
+        vadd.f32        d_out6_r, d_s10_r, d_s14_i                 /* inverse: out6 = s10 - j * s14 */
+        vsub.f32        d_out6_i, d_s10_i, d_s14_r
+        vadd.f32        d_out7_r, d_s11_r, d_s15_i                 /* inverse: out7 = s11 - j * s15 */
+        vsub.f32        d_out7_i, d_s11_i, d_s15_r
+        .else
+        vadd.f32        d_out2_r, d_s10_r, d_s14_i                 /* forward: out2 = s10 - j * s14 */
+        vsub.f32        d_out2_i, d_s10_i, d_s14_r
+        vadd.f32        d_out3_r, d_s11_r, d_s15_i                 /* forward: out3 = s11 - j * s15 */
+        vsub.f32        d_out3_i, d_s11_i, d_s15_r
+        vsub.f32        d_out6_r, d_s10_r, d_s14_i                 /* forward: out6 = s10 + j * s14 */
+        vadd.f32        d_out6_i, d_s10_i, d_s14_r
+        vsub.f32        d_out7_r, d_s11_r, d_s15_i                 /* forward: out7 = s11 + j * s15 */
+        vadd.f32        d_out7_i, d_s11_i, d_s15_r
+        .endif
+
+        vtrn.32         q_out0, q_out1                             /* regroup lanes: even register gets outputs n, n+1 of the first butterfly, odd register those of the second */
+        vtrn.32         q_out2, q_out3
+        vtrn.32         q_out4, q_out5
+        vtrn.32         q_out6, q_out7
+
+
+        vst2.32         {q_out0}, [p_out1]!                        /* first butterfly: outputs 0..7 stored contiguously */
+        vst2.32         {q_out2}, [p_out1]!
+        vst2.32         {q_out4}, [p_out1]!
+        vst2.32         {q_out6}, [p_out1]!
+        vst2.32         {q_out1}, [p_out1]!                        /* second butterfly: outputs 0..7 stored contiguously */
+        vst2.32         {q_out3}, [p_out1]!
+        vst2.32         {q_out5}, [p_out1]!
+        vst2.32         {q_out7}, [p_out1]!
+
+        sub             p_in1, p_in1, fstep, lsl #3                /* undo the eight fstep post-increments of the loads */
+        add             p_in1, p_in1, #16                          /* step to the next two complex inputs (16 bytes) */
 
-        add             p_twiddles, p_twiddles, mstride, lsl #4
-        add             p_twiddles, p_twiddles, mstride, lsl #3 /* get the address of twiddles += mstride*3 */
-        lsl             mstride, mstride, #2
+        .endm
 
-        subs            stage_count, stage_count, #1
-        bgt             .L_ne10_radix4_butterfly_forward_stages
+        .global TW_81                /* constants loaded by BUTTERFLY8X2_WITHOUT_TWIDDLES via ldr tmp0, =TW_81 */
+TW_81:
+.float 0.70710678                    /* lane 0: approximately sqrt(2)/2, scales s3 */
+.float -0.70710678                   /* lane 1: approximately -sqrt(2)/2, scales s7 */
 
-.L_ne10_radix4_butterfly_forward_end:
-        /*Return From Function*/
-        pop             {r4-r12,pc}
 
+        /**
+         * @details
+         * This function implements the radix4/8 forward FFT.
+         *
+         * The first stage runs without twiddles, either as radix 4 or, when
+         * the first radix read from the factors table is 2, as radix 8.
+         * The remaining stages are radix 4 with twiddles. After every stage
+         * except the last, p_fin and p_fout are swapped; the last stage always
+         * writes to the output pointer saved on entry (p_out_ls).
+         *
+         * @param[in/out] *Fout        points to the output buffer; it is also read back when it holds intermediate results
+         * @param[in]     *factors     factors pointer:
+                                        * 0: stage number
+                                        * 1: stride for the first stage
+                                        * others: factored-out powers of 4 and powers of 2; the code reads
+                                        *         the first radix from factors[2*stage_count] and the initial
+                                        *         mstride from factors[2*stage_count-1]
+         * @param[in]     *twiddles     twiddle coefficients of the FFT
+         *
+         * NOTE(review): the body also reads an input pointer p_fin that is not
+         * listed above, and because of the buffer swap the p_fin buffer can be
+         * overwritten with intermediate results. The register aliases and the
+         * argument order are defined outside this view - confirm and document.
+         */
 
         .align 4
-        .global ne10_radix2_butterfly_forward_float32_neon
+        .global ne10_mixed_radix_fft_forward_float32_neon
         .thumb
         .thumb_func
 
-ne10_radix2_butterfly_forward_float32_neon:
-
+ne10_mixed_radix_fft_forward_float32_neon:
         push            {r4-r12,lr}
         vpush           {q4-q7}
 
         ldr             stage_count, [p_factors]   /* get factors[0]---stage_count */
         ldr             fstride, [p_factors, #4]   /* get factors[1]---fstride */
         add             p_factors, p_factors, stage_count, lsl #3 /* get the address of factors[2*stage_count] */
+        ldr             radix, [p_factors]                         /* get factors[2*stage_count]--- the first radix */
         ldr             mstride, [p_factors, #-4]                  /* get factors[2*stage_count-1]--- mstride */
-        sub             stage_count, stage_count, #2
 
+        /* remember the caller's output buffer: the last stage writes there regardless of the buffer swaps  */
+        mov             p_out_ls, p_fout
 
-        /* loop of the second stages  */
-.L_ne10_radix2_butterfly_forwards_second_stage:
-        lsr             fstride, fstride, #2
+        /* ---------------the first stage---------------  */
+        /* a first radix of 2 selects the radix 8 first stage, anything else falls through to radix 4  */
+        cmp             radix, #2
+        beq             .L_ne10_radix8_butterfly_first_stage
 
-        /* loop of fstride  */
-        mov             count_f, fstride
-        mov             p_tw1, p_twiddles
-        mov             p_fout0, p_fout
-        add             p_fout1, p_fout, mstride, lsl #5
-        mov             p_fout2, p_fout
-        mov             p_fout3, p_fout1
-        mov             tmp0, #96
-        vld2.32         {d_tw1_r01, d_tw2_r01, d_tw1_i01, d_tw2_i01}, [p_tw1]!
-
-.L_ne10_radix2_butterfly_forwards_second_stage_fstride:
-        @RADIX24_BUTTERFLY_P4
-        vld2.32         {d_tw3_r01, d_tw3_i01}, [p_tw1]
-        vld2.32         {d_fin0_r01, d_fin1_r01, d_fin0_i01, d_fin1_i01}, [p_fout0]!
-        vld2.32         {d_fin2_r01, d_fin3_r01, d_fin2_i01, d_fin3_i01}, [p_fout0], tmp0
-        vld2.32         {d_fin0_r23, d_fin1_r23, d_fin0_i23, d_fin1_i23}, [p_fout1]!
-        vld2.32         {d_fin2_r23, d_fin3_r23, d_fin2_i23, d_fin3_i23}, [p_fout1], tmp0
-
-        vmul.f32       d_s2_r01, d_fin3_r01, d_tw3_r01
-        vmul.f32       d_s2_i01, d_fin3_r01, d_tw3_i01
-        vmul.f32       d_s2_r23, d_fin3_r23, d_tw3_r01
-        vmul.f32       d_s2_i23, d_fin3_r23, d_tw3_i01
-        vmls.f32       d_s2_r01, d_fin3_i01, d_tw3_i01
-        vmla.f32       d_s2_i01, d_fin3_i01, d_tw3_r01
-        vmls.f32       d_s2_r23, d_fin3_i23, d_tw3_i01
-        vmla.f32       d_s2_i23, d_fin3_i23, d_tw3_r01
-
-        vmul.f32       d_s1_r01, d_fin2_r01, d_tw2_r01
-        vmul.f32       d_s1_r23, d_fin2_r23, d_tw2_r01
-        vmul.f32       d_s1_i01, d_fin2_r01, d_tw2_i01
-        vmul.f32       d_s1_i23, d_fin2_r23, d_tw2_i01
-        vmls.f32       d_s1_r01, d_fin2_i01, d_tw2_i01
-        vmls.f32       d_s1_r23, d_fin2_i23, d_tw2_i01
-        vmla.f32       d_s1_i01, d_fin2_i01, d_tw2_r01
-        vmla.f32       d_s1_i23, d_fin2_i23, d_tw2_r01
-
-        vmul.f32       d_s0_r01, d_fin1_r01, d_tw1_r01
-        vmul.f32       d_s0_r23, d_fin1_r23, d_tw1_r01
-        vmul.f32       d_s0_i01, d_fin1_r01, d_tw1_i01
-        vmul.f32       d_s0_i23, d_fin1_r23, d_tw1_i01
-        vmls.f32       d_s0_r01, d_fin1_i01, d_tw1_i01
-        vmls.f32       d_s0_r23, d_fin1_i23, d_tw1_i01
-        vmla.f32       d_s0_i01, d_fin1_i01, d_tw1_r01
-        vmla.f32       d_s0_i23, d_fin1_i23, d_tw1_r01
-
-        vmov            d_fin1_r01, d_fin0_r23
-        vmov            d_fin1_i01, d_fin0_i23
-
-        vsub.f32        q_s5_r0123, q_fin0_r0123, q_s1_r0123
-        vsub.f32        q_s5_i0123, q_fin0_i0123, q_s1_i0123
-        vadd.f32        q_fout0_r0123, q_fin0_r0123, q_s1_r0123
-        vadd.f32        q_fout0_i0123, q_fin0_i0123, q_s1_i0123
-
-        vadd.f32        q_s3_r0123, q_s0_r0123, q_s2_r0123
-        vadd.f32        q_s3_i0123, q_s0_i0123, q_s2_i0123
-        vsub.f32        q_s4_r0123, q_s0_r0123, q_s2_r0123
-        vsub.f32        q_s4_i0123, q_s0_i0123, q_s2_i0123
-        vsub.f32        q_fout2_r0123, q_fout0_r0123, q_s3_r0123
-        vsub.f32        q_fout2_i0123, q_fout0_i0123, q_s3_i0123
-        vadd.f32        q_fout0_r0123, q_fout0_r0123, q_s3_r0123
-        vadd.f32        q_fout0_i0123, q_fout0_i0123, q_s3_i0123
-
-        vadd.f32        q_fout1_r0123, q_s5_r0123, q_s4_i0123
-        vsub.f32        q_fout1_i0123, q_s5_i0123, q_s4_r0123
-        vsub.f32        q_fout3_r0123, q_s5_r0123, q_s4_i0123
-        vadd.f32        q_fout3_i0123, q_s5_i0123, q_s4_r0123
-
-        vswp            d_fout1_r01, d_fout0_r23
-        vswp            d_fout1_i01, d_fout0_i23
-        vswp            d_fout3_r01, d_fout2_r23
-        vswp            d_fout3_i01, d_fout2_i23
-
-        vst2.32         {d_fout0_r01, d_fout1_r01, d_fout0_i01, d_fout1_i01}, [p_fout2]!
-        vst2.32         {d_fout0_r23, d_fout1_r23, d_fout0_i23, d_fout1_i23}, [p_fout3]!
-        vst2.32         {d_fout2_r01, d_fout3_r01, d_fout2_i01, d_fout3_i01}, [p_fout2], tmp0
-        vst2.32         {d_fout2_r23, d_fout3_r23, d_fout2_i23, d_fout3_i23}, [p_fout3], tmp0
-        subs            count_f, count_f, #2
-        bgt             .L_ne10_radix2_butterfly_forwards_second_stage_fstride
+        /* ---------------first stage: radix 4 (no twiddles), input read from p_fin, output written through p_tmp = p_fout  */
+        mov             count, fstride                    /* loop counter, reduced by 2 per iteration */
+        mov             p_fin0, p_fin
+        mov             p_tmp, p_fout
+        add             p_fin2, p_fin0, fstride, lsl #4   /* get the address of F[fstride*2] */
+        add             p_fin1, p_fin0, fstride, lsl #3   /* get the address of F[fstride] */
+        add             p_fin3, p_fin2, fstride, lsl #3   /* get the address of F[fstride*3] */
+        vld2.32         {q_in0_01}, [p_fin0:64]!          /* preload the four inputs of the first iteration */
+        vld2.32         {q_in2_01}, [p_fin2:64]!
+        vld2.32         {q_in1_01}, [p_fin1:64]!
+        vld2.32         {q_in3_01}, [p_fin3:64]!
+
+.L_ne10_radix4_butterfly_first_stage_fstride:
+        BUTTERFLY4X2_WITHOUT_TWIDDLES "FALSE"
 
-        add             p_twiddles, p_twiddles, mstride, lsl #4
-        add             p_twiddles, p_twiddles, mstride, lsl #3 /* get the address of twiddles += mstride*3 */
-        lsl             mstride, mstride, #2
+        subs            count, count, #2
+        bgt             .L_ne10_radix4_butterfly_first_stage_fstride
 
+        /* swap input/output buffer: the next stage reads what this stage wrote  */
+        mov             tmp0, p_fout
+        mov             p_fout, p_fin
+        mov             p_fin, tmp0
 
-        /* loop of the other stages  */
-.L_ne10_radix2_butterfly_forwards_stages:
+        /* (stage_count-2): the first stage is done and the last stage is handled separately after the loop.
+           NOTE(review): the branch below is unconditional, so if stage_count was 2 the other-stages loop
+           would still run once - confirm callers never use this path with a two-stage plan.  */
+        sub             stage_count, stage_count, #2
+        lsl             nstep, fstride, #3                /* nstep = fstride * 8, the byte stride used for the input loads of the later stages */
         lsr             fstride, fstride, #2
 
-        /* loop of fstride  */
-        mov             count_f, fstride
-.L_ne10_radix2_butterfly_forwards_stages_fstride:
-        sub             tmp0, fstride, count_f
-        mul             tmp0, tmp0, mstride
-        add             p_fout0, p_fout, tmp0, lsl #5
-        add             p_fout2, p_fout0, mstride, lsl #4   /* get the address of F[mstride*2] */
-        add             p_fout1, p_fout0, mstride, lsl #3   /* get the address of F[mstride] */
-        add             p_fout3, p_fout2, mstride, lsl #3   /* get the address of F[mstride*3] */
-        mov             p_tw1, p_twiddles
-        add             p_tw2, p_tw1, mstride, lsl #3       /* get the address of tw2 */
-        add             p_tw3, p_tw1, mstride, lsl #4       /* get the address of tw3 */
+        b               .L_ne10_butterfly_other_stages    /* jump over the radix 8 first-stage code */
+        /* ---------------end of first stage: radix 4  */
 
-        /* loop of mstride  */
-        mov             count_m, mstride
 
-.L_ne10_radix2_butterfly_forwards_stages_mstride:
-        RADIX4_BUTTERFLY_P4
 
-        subs            count_m, count_m, #4
-        bgt             .L_ne10_radix2_butterfly_forwards_stages_mstride
+        /* ---------------first stage: radix 8 (no twiddles), taken when the first radix is 2  */
+.L_ne10_radix8_butterfly_first_stage:
+        lsr             fstride1, fstride, #2             /* fstride1 = fstride / 4, also the loop counter (reduced by 2 per iteration) */
+        mov             p_in1, p_fin
+        mov             p_out1, p_fout
+        lsl             fstep, fstride, #1                /* fstep = fstride * 2, the byte stride between the eight inputs */
 
-        /* end of mstride_loop */
+.L_ne10_radix8_butterfly_first_stage_fstride1:
+        BUTTERFLY8X2_WITHOUT_TWIDDLES "FALSE"
 
-        subs            count_f, count_f, #1
-        bgt             .L_ne10_radix2_butterfly_forwards_stages_fstride
+        subs            fstride1, fstride1, #2
+        bgt             .L_ne10_radix8_butterfly_first_stage_fstride1
 
-        add             p_twiddles, p_twiddles, mstride, lsl #4
-        add             p_twiddles, p_twiddles, mstride, lsl #3 /* get the address of twiddles += mstride*3 */
+        lsl             nstep, fstride, #2                /* nstep = fstride * 4, the byte stride used for the input loads of the later stages */
+        sub             stage_count, stage_count, #2      /* NOTE(review): presumably the radix 8 stage covers two entries of the factors table - confirm */
+        lsr             fstride, fstride, #4
         lsl             mstride, mstride, #2
+        add             p_twiddles, p_twiddles, #48 /* twiddles += 6 complex values (48 bytes) */
 
-        subs            stage_count, stage_count, #1
-        bgt             .L_ne10_radix2_butterfly_forwards_stages
+        /* swap input/output buffer: the next stage reads what this stage wrote  */
+        mov             tmp0, p_fout
+        mov             p_fout, p_fin
+        mov             p_fin, tmp0
 
-.L_ne10_radix2_butterfly_forwards_end:
-        /*Return From Function*/
-        vpop            {q4-q7}
-        pop             {r4-r12,pc}
+        /* if only one stage remains it is the last stage: skip the other-stages loop  */
+        cmp            stage_count, #1
+        beq            .L_ne10_butterfly_last_stages
 
-        .align 4
-        .global ne10_radix2_butterfly_backward_float32_neon
-        .thumb
-        .thumb_func
+        /* (stage_count-1): the last stage is handled separately after the loop, so do not count it  */
+        sub            stage_count, stage_count, #1
+        /*--------------- end of first stage: radix 8  */
+        /* ---------------end of first stage---------------  */
 
-ne10_radix2_butterfly_backward_float32_neon:
 
-        push            {r4-r12,lr}
-        vpush           {q4-q7}
-
-        ldr             stage_count, [p_factors]   /* get factors[0]---stage_count */
-        ldr             fstride, [p_factors, #4]   /* get factors[1]---fstride */
-        add             p_factors, p_factors, stage_count, lsl #3 /* get the address of factors[2*stage_count] */
-        ldr             mstride, [p_factors, #-4]                  /* get factors[2*stage_count-1]--- mstride */
-        sub             stage_count, stage_count, #2
-
-
-        /* loop of the second stages  */
-.L_ne10_radix2_butterfly_backward_second_stage:
-        lsr             fstride, fstride, #2
+        /* ---------------other stages except the last stage: radix 4 with twiddles---------------  */
+        /* one pass per stage: reads from p_fin, writes to p_fout, then swaps the two  */
+.L_ne10_butterfly_other_stages:
+        lsl             mstep, mstride, #3                          /* mstep = mstride * 8, the byte stride for twiddles and outputs */
+        mov             p_in1, p_fin
+        vld2.32         {d_fin0_r, d_fin0_i}, [p_in1:64], nstep     /* preload the inputs of the first butterfly; the macro reloads them afterwards */
+        vld2.32         {d_fin1_r, d_fin1_i}, [p_in1:64], nstep
+        vld2.32         {d_fin2_r, d_fin2_i}, [p_in1:64], nstep
+        vld2.32         {d_fin3_r, d_fin3_i}, [p_in1:64], nstep
 
         /* loop of fstride  */
         mov             count_f, fstride
+.L_ne10_butterfly_other_stages_fstride:
         mov             p_tw1, p_twiddles
-        mov             p_fout0, p_fout
-        add             p_fout1, p_fout, mstride, lsl #5
-        mov             p_fout2, p_fout
-        mov             p_fout3, p_fout1
-        mov             tmp0, #96
-        vld2.32         {d_tw1_r01, d_tw2_r01, d_tw1_i01, d_tw2_i01}, [p_tw1]!
-
-.L_ne10_radix2_butterfly_backward_second_stage_fstride:
-        RADIX24_BUTTERFLY_INVERSE_P4
-
-        subs            count_f, count_f, #2
-        bgt             .L_ne10_radix2_butterfly_backward_second_stage_fstride
-
-        add             p_twiddles, p_twiddles, mstride, lsl #4
-        add             p_twiddles, p_twiddles, mstride, lsl #3 /* get the address of twiddles += mstride*3 */
-        lsl             mstride, mstride, #2
-
-
-        /* loop of the other stages  */
-.L_ne10_radix2_butterfly_backward_stages:
-        lsr             fstride, fstride, #2
-
-        /* loop of fstride  */
-        mov             count_f, fstride
-.L_ne10_radix2_butterfly_backward_stages_fstride:
         sub             tmp0, fstride, count_f
         mul             tmp0, tmp0, mstride
-        add             p_fout0, p_fout, tmp0, lsl #5
-        add             p_fout2, p_fout0, mstride, lsl #4   /* get the address of F[mstride*2] */
-        add             p_fout1, p_fout0, mstride, lsl #3   /* get the address of F[mstride] */
-        add             p_fout3, p_fout2, mstride, lsl #3   /* get the address of F[mstride*3] */
-        mov             p_tw1, p_twiddles
-        add             p_tw2, p_tw1, mstride, lsl #3       /* get the address of tw2 */
-        add             p_tw3, p_tw1, mstride, lsl #4       /* get the address of tw3 */
+        add             p_out1, p_fout, tmp0, lsl #5                /* output base = p_fout + (fstride - count_f) * mstride * 32 bytes */
+        vld2.32         {d_tw0_r, d_tw0_i}, [p_tw1:64], mstep       /* twiddles restart from p_twiddles for every fstride iteration */
+        vld2.32         {d_tw1_r, d_tw1_i}, [p_tw1:64], mstep
+        vld2.32         {d_tw2_r, d_tw2_i}, [p_tw1:64]
 
         /* loop of mstride  */
         mov             count_m, mstride
 
-.L_ne10_radix2_butterfly_backward_stages_mstride:
-        RADIX4_BUTTERFLY_INVERSE_P4
+.L_ne10_butterfly_other_stages_mstride:
+        BUTTERFLY4X2_WITH_TWIDDLES "FALSE"
 
-        subs            count_m, count_m, #4
-        bgt             .L_ne10_radix2_butterfly_backward_stages_mstride
-
-        /* end of mstride_loop */
+        subs            count_m, count_m, #2                        /* each macro expansion handles two butterflies */
+        bgt             .L_ne10_butterfly_other_stages_mstride
+        /* end of mstride loop; p_in1 is not reset, the inputs of the next fstride iteration were already preloaded by the macro */
 
         subs            count_f, count_f, #1
-        bgt             .L_ne10_radix2_butterfly_backward_stages_fstride
+        bgt             .L_ne10_butterfly_other_stages_fstride
 
         add             p_twiddles, p_twiddles, mstride, lsl #4
         add             p_twiddles, p_twiddles, mstride, lsl #3 /* get the address of twiddles += mstride*3 */
         lsl             mstride, mstride, #2
+        lsr             fstride, fstride, #2                        /* the next stage has a quarter as many fstride iterations */
 
-        subs            stage_count, stage_count, #1
-        bgt             .L_ne10_radix2_butterfly_backward_stages
-
-
-.L_ne10_radix2_butterfly_backward_end:
-        /*Return From Function*/
-        vpop            {q4-q7}
-        pop             {r4-r12,pc}
-
-
-        .align 4
-        .global ne10_radix4_butterfly_backward_float32_neon
-        .thumb
-        .thumb_func
-
-ne10_radix4_butterfly_backward_float32_neon:
-
-        push            {r4-r12,lr}
-        vpush           {q4-q7}
+        /* swap input/output buffer: the next stage reads what this stage wrote  */
+        mov             tmp0, p_fout
+        mov             p_fout, p_fin
+        mov             p_fin, tmp0
 
-        ldr             stage_count, [p_factors]   /* get factors[0]---stage_count */
-        ldr             fstride, [p_factors, #4]   /* get factors[1]---fstride */
-        add             p_factors, p_factors, stage_count, lsl #3 /* get the address of factors[2*stage_count] */
-        ldr             mstride, [p_factors, #-4]                  /* get factors[2*stage_count-1]--- mstride */
-        sub             stage_count, stage_count, #1
+        subs            stage_count, stage_count, #1
+        bgt             .L_ne10_butterfly_other_stages
+        /* ---------------end of other stages except the last stage---------------  */
 
-        /* loop of the stages  */
-.L_ne10_radix4_butterfly_backward_stages:
-        lsr             fstride, fstride, #2
 
-        /* loop of fstride  */
-        mov             count_f, fstride
-.L_ne10_radix4_butterfly_backward_stages_fstride:
-        sub             tmp0, fstride, count_f
-        mul             tmp0, tmp0, mstride
-        add             p_fout0, p_fout, tmp0, lsl #5
-        add             p_fout2, p_fout0, mstride, lsl #4   /* get the address of F[mstride*2] */
-        add             p_fout1, p_fout0, mstride, lsl #3   /* get the address of F[mstride] */
-        add             p_fout3, p_fout2, mstride, lsl #3   /* get the address of F[mstride*3] */
+        /* ---------------last stage: radix 4 with twiddles, written to the output buffer saved on entry---------------  */
+.L_ne10_butterfly_last_stages:
+        mov             p_in1, p_fin
+        mov             p_out1, p_out_ls                            /* always the original p_fout, whatever the swaps left in p_fout */
         mov             p_tw1, p_twiddles
-        add             p_tw2, p_tw1, mstride, lsl #3       /* get the address of tw2 */
-        add             p_tw3, p_tw1, mstride, lsl #4       /* get the address of tw3 */
+        mov             mstep, nstep                                /* here twiddles and outputs use the same byte stride as the inputs */
+        vld2.32         {d_fin0_r, d_fin0_i}, [p_in1:64], nstep     /* preload inputs and twiddles of the first butterfly; the macro reloads them afterwards */
+        vld2.32         {d_fin1_r, d_fin1_i}, [p_in1:64], nstep
+        vld2.32         {d_fin2_r, d_fin2_i}, [p_in1:64], nstep
+        vld2.32         {d_fin3_r, d_fin3_i}, [p_in1:64], nstep
+        vld2.32         {d_tw0_r, d_tw0_i}, [p_tw1:64], mstep
+        vld2.32         {d_tw1_r, d_tw1_i}, [p_tw1:64], mstep
+        vld2.32         {d_tw2_r, d_tw2_i}, [p_tw1:64]
 
         /* loop of mstride  */
         mov             count_m, mstride
+.L_ne10_butterfly_last_stages_mstride:
+        BUTTERFLY4X2_WITH_TWIDDLES "FALSE"
 
-.L_ne10_radix4_butterfly_backward_stages_mstride:
-        RADIX4_BUTTERFLY_INVERSE_P4
-        subs            count_m, count_m, #4
-        bgt             .L_ne10_radix4_butterfly_backward_stages_mstride
-
-        /* end of mstride_loop */
-
-        subs            count_f, count_f, #1
-        bgt             .L_ne10_radix4_butterfly_backward_stages_fstride
-
-        add             p_twiddles, p_twiddles, mstride, lsl #4
-        add             p_twiddles, p_twiddles, mstride, lsl #3 /* get the address of twiddles += mstride*3 */
-        lsl             mstride, mstride, #2
-
-        subs            stage_count, stage_count, #1
-        bgt             .L_ne10_radix4_butterfly_backward_stages
-
+        subs            count_m, count_m, #2                        /* each macro expansion handles two butterflies */
+        bgt             .L_ne10_butterfly_last_stages_mstride
+        /* end of mstride loop; there is no fstride loop in the last stage */
+        /* ---------------end of last stage---------------  */
 
-.L_ne10_radix4_inverse_butterfly_backward_end:
+.L_ne10_butterfly_end:
         /*Return From Function*/
         vpop            {q4-q7}
         pop             {r4-r12,pc}
 
-
+        /* end of ne10_mixed_radix_fft_forward_float32_neon */
 
         /**
          * @details
-         * This function implements the 4 butterfly
+         * This function implements the radix4/8 backward FFT
          *
          * @param[in/out] *Fout        points to input/output pointers
          * @param[in]     *factors     factors pointer:
@@ -827,11 +622,11 @@ ne10_radix4_butterfly_backward_float32_neon:
          */
 
         .align 4
-        .global ne10_mixed_radix_butterfly_length_odd_power2_float32_neon
+        .global ne10_mixed_radix_fft_backward_float32_neon
         .thumb
         .thumb_func
 
-ne10_mixed_radix_butterfly_length_odd_power2_float32_neon:
+ne10_mixed_radix_fft_backward_float32_neon:
         push            {r4-r12,lr}
         vpush           {q4-q7}
 
@@ -841,420 +636,162 @@ ne10_mixed_radix_butterfly_length_odd_power2_float32_neon:
         ldr             radix, [p_factors]                         /* get factors[2*stage_count]--- the first radix */
         ldr             mstride, [p_factors, #-4]                  /* get factors[2*stage_count-1]--- mstride */
 
-        mov             p_fin, p_fout
-        mov             p_fout0, p_fout
-        mov             count, fstride
-
-        /* the first stage  */
-.L_ne10_butterfly_length_odd_power2_first_stage:
-        vld4.32         {d_in0_r01, d_in0_i01, d_in1_r01, d_in1_i01}, [p_fin]!
-        vld4.32         {d_in0_r23, d_in0_i23, d_in1_r23, d_in1_i23}, [p_fin]!
-        vsub.f32        q_out1_r0123, q_in0_r0123, q_in1_r0123
-        vsub.f32        q_out1_i0123, q_in0_i0123, q_in1_i0123
-        vadd.f32        q_out0_r0123, q_in0_r0123, q_in1_r0123
-        vadd.f32        q_out0_i0123, q_in0_i0123, q_in1_i0123
-        subs            count, count, #4
-        vst4.32         {d_out0_r01, d_out0_i01, d_out1_r01, d_out1_i01}, [p_fout0]!
-        vst4.32         {d_out0_r23, d_out0_i23, d_out1_r23, d_out1_i23}, [p_fout0]!
-
-        bgt             .L_ne10_butterfly_length_odd_power2_first_stage
-
-        /* the second stages  */
-        subs            stage_count, stage_count, #1
-        lsr             fstride, fstride, #2
-
-        /* loop of fstride  */
-        mov             count_f, fstride
-        mov             p_tw1, p_twiddles
-        mov             p_fout0, p_fout
-        add             p_fout1, p_fout, mstride, lsl #5
-        mov             p_fout2, p_fout
-        mov             p_fout3, p_fout1
-        mov             tmp0, #96
-        vld2.32         {d_tw1_r01, d_tw2_r01, d_tw1_i01, d_tw2_i01}, [p_tw1]!
+        /* save the output buffer for the last stage  */
+        mov             p_out_ls, p_fout
 
-.L_ne10_butterfly_length_odd_power2_second_stage:
-        RADIX24_BUTTERFLY_P4
+        /* ---------------the first stage---------------  */
+        /* check the first radix: 2 selects the radix-8 first stage, otherwise radix 4  */
+        cmp             radix, #2
+        beq             .L_ne10_radix8_butterfly_inverse_first_stage
 
-        subs            count_f, count_f, #2
-        bgt             .L_ne10_butterfly_length_odd_power2_second_stage
+        /* ---------------first stage: radix 4  */
+        mov             count, fstride
+        mov             p_fin0, p_fin
+        mov             p_tmp, p_fout
+        add             p_fin2, p_fin0, fstride, lsl #4   /* get the address of F[fstride*2] */
+        add             p_fin1, p_fin0, fstride, lsl #3   /* get the address of F[fstride] */
+        add             p_fin3, p_fin2, fstride, lsl #3   /* get the address of F[fstride*3] */
+        vld2.32         {q_in0_01}, [p_fin0:64]!
+        vld2.32         {q_in2_01}, [p_fin2:64]!
+        vld2.32         {q_in1_01}, [p_fin1:64]!
+        vld2.32         {q_in3_01}, [p_fin3:64]!
+
+.L_ne10_radix4_butterfly_inverse_first_stage_fstride:
+        BUTTERFLY4X2_WITHOUT_TWIDDLES "TRUE"
 
-        add             p_twiddles, p_twiddles, mstride, lsl #4
-        add             p_twiddles, p_twiddles, mstride, lsl #3 /* get the address of twiddles += mstride*3 */
-        lsl             mstride, mstride, #2
+        subs            count, count, #2
+        bgt             .L_ne10_radix4_butterfly_inverse_first_stage_fstride
 
-        /* other stages  */
-        subs            stage_count, stage_count, #1
+        /* swap input/output buffer  */
+        mov             tmp0, p_fout
+        mov             p_fout, p_fin
+        mov             p_fin, tmp0
 
-        /* loop of other stages  */
-.L_ne10_butterfly_length_odd_power2_other_stages:
+        /* (stage_count-2): reduce the counter for the last stage  */
+        sub             stage_count, stage_count, #2
+        lsl             nstep, fstride, #3
         lsr             fstride, fstride, #2
 
-        /* loop of fstride  */
-        mov             count_f, fstride
-.L_ne10_butterfly_length_odd_power2_other_stages_fstride:
-        sub             tmp0, fstride, count_f
-        mul             tmp0, tmp0, mstride
-        add             p_fout0, p_fout, tmp0, lsl #5
-        add             p_fout2, p_fout0, mstride, lsl #4   /* get the address of F[mstride*2] */
-        add             p_fout1, p_fout0, mstride, lsl #3   /* get the address of F[mstride] */
-        add             p_fout3, p_fout2, mstride, lsl #3   /* get the address of F[mstride*3] */
-        mov             p_tw1, p_twiddles
-        add             p_tw2, p_tw1, mstride, lsl #3       /* get the address of tw2 */
-        add             p_tw3, p_tw1, mstride, lsl #4       /* get the address of tw3 */
+        b               .L_ne10_butterfly_inverse_other_stages
+        /* ---------------end of first stage: radix 4  */
 
-        /* loop of mstride  */
-        mov             count_m, mstride
 
-.L_ne10_butterfly_length_odd_power2_other_stages_mstride:
-        RADIX4_BUTTERFLY_P4
 
-        subs            count_m, count_m, #4
-        bgt             .L_ne10_butterfly_length_odd_power2_other_stages_mstride
-        /* end of mstride loop */
+        /* ---------------first stage: radix 8  */
+.L_ne10_radix8_butterfly_inverse_first_stage:
+        lsr             fstride1, fstride, #2
+        mov             p_in1, p_fin
+        mov             p_out1, p_fout
+        lsl             fstep, fstride, #1
 
-        subs            count_f, count_f, #1
-        bgt             .L_ne10_butterfly_length_odd_power2_other_stages_fstride
+.L_ne10_radix8_butterfly_inverse_first_stage_fstride1:
+        BUTTERFLY8X2_WITHOUT_TWIDDLES "TRUE"
 
-        add             p_twiddles, p_twiddles, mstride, lsl #4
-        add             p_twiddles, p_twiddles, mstride, lsl #3 /* get the address of twiddles += mstride*3 */
-        lsl             mstride, mstride, #2
+        subs            fstride1, fstride1, #2
+        bgt             .L_ne10_radix8_butterfly_inverse_first_stage_fstride1
 
-        subs            stage_count, stage_count, #1
-        bgt             .L_ne10_butterfly_length_odd_power2_other_stages
-
-.L_ne10_butterfly_length_odd_power2_end:
-        /*Return From Function*/
-        vpop            {q4-q7}
-        pop             {r4-r12,pc}
-
-        /* end of ne10_butterfly_length_odd_power2_float32_neon */
-
-
-        /**
-         * @details
-         * This function implements the 4 butterfly
-         *
-         * @param[in/out] *Fout        points to input/output pointers
-         * @param[in]     *factors     factors pointer:
-                                        * 0: stage number
-                                        * 1: stride for the first stage
-                                        * others: factor out powers of 4, powers of 2
-         * @param[in]     *twiddles     twiddles coeffs of FFT
-         */
-
-        .align 4
-        .global ne10_mixed_radix_butterfly_length_even_power2_float32_neon
-        .thumb
-        .thumb_func
-
-ne10_mixed_radix_butterfly_length_even_power2_float32_neon:
-        push            {r4-r12,lr}
-        vpush           {q4-q7}
+        lsl             nstep, fstride, #2
+        sub             stage_count, stage_count, #2
+        lsr             fstride, fstride, #4
+        lsl             mstride, mstride, #2
+        add             p_twiddles, p_twiddles, #48 /* get the address of twiddles += 6 */
 
-        ldr             stage_count, [p_factors]   /* get factors[0]---stage_count */
-        ldr             fstride, [p_factors, #4]   /* get factors[1]---fstride */
-        add             p_factors, p_factors, stage_count, lsl #3 /* get the address of factors[2*stage_count] */
-        ldr             radix, [p_factors]                         /* get factors[2*stage_count]--- the first radix */
-        ldr             mstride, [p_factors, #-4]                  /* get factors[2*stage_count-1]--- mstride */
+        /* swap input/output buffer  */
+        mov             tmp0, p_fout
+        mov             p_fout, p_fin
+        mov             p_fin, tmp0
 
-        mov             p_fin, p_fout
-        mov             p_fout0, p_fout
-        mov             count, fstride
+        /* if only one stage remains, go straight to the last stage  */
+        cmp            stage_count, #1
+        beq            .L_ne10_butterfly_inverse_last_stages
 
-        /* the first stage  */
-.L_ne10_butterfly_length_even_power2_first_stage:
-        vld1.32         {d_in0_0, d_in1_0, d_in2_0, d_in3_0}, [p_fin]!
-        vld1.32         {d_in0_1, d_in1_1, d_in2_1, d_in3_1}, [p_fin]!
-        vswp            d_in1_0, d_in0_1
-        vswp            d_in3_0, d_in2_1
-        vsub.f32        q_s2_01, q_in0_01, q_in2_01
-        vadd.f32        q_out0_01, q_in0_01, q_in2_01
-        vadd.f32        q_s0_01, q_in1_01, q_in3_01
-        vsub.f32        q_s1_01, q_in1_01, q_in3_01
-        vsub.f32        q_out2_01, q_out0_01, q_s0_01
-        vrev64.32       q_s1_01, q_s1_01
-        vadd.f32        q_out0_01, q_out0_01, q_s0_01
-        vadd.f32        q_out1_01, q_s2_01, q_s1_01
-        vsub.f32        q_out3_01, q_s2_01, q_s1_01
-        vrev64.32       q_tmp, q_out1_01
-        vrev64.32       q_tmp2, q_out3_01
-        vtrn.32         q_out3_01, q_tmp
-        vtrn.32         q_out1_01, q_tmp2
-        vswp            d_out1_0, d_out0_1
-        vswp            d_out3_0, d_out2_1
-        subs            count, count, #2
-        vst1.32         {d_out0_0, d_out1_0, d_out2_0, d_out3_0}, [p_fout0]!
-        vst1.32         {d_out0_1, d_out1_1, d_out2_1, d_out3_1}, [p_fout0]!
-        bgt             .L_ne10_butterfly_length_even_power2_first_stage
+        /* (stage_count-1): reduce the counter for the last stage  */
+        sub            stage_count, stage_count, #1
+        /*--------------- end of first stage: radix 8  */
+        /* ---------------end of first stage---------------  */
 
-        /* other stages  */
-        subs            stage_count, stage_count, #1
 
+        /* ---------------other stages  except last stage---------------  */
         /* loop of other stages  */
-.L_ne10_butterfly_length_even_power2_other_stages:
-        lsr             fstride, fstride, #2
+.L_ne10_butterfly_inverse_other_stages:
+        lsl             mstep, mstride, #3
+        mov             p_in1, p_fin
+        vld2.32         {d_fin0_r, d_fin0_i}, [p_in1:64], nstep
+        vld2.32         {d_fin1_r, d_fin1_i}, [p_in1:64], nstep
+        vld2.32         {d_fin2_r, d_fin2_i}, [p_in1:64], nstep
+        vld2.32         {d_fin3_r, d_fin3_i}, [p_in1:64], nstep
 
         /* loop of fstride  */
         mov             count_f, fstride
-.L_ne10_butterfly_length_even_power2_other_stages_fstride:
+.L_ne10_butterfly_inverse_other_stages_fstride:
+        mov             p_tw1, p_twiddles
         sub             tmp0, fstride, count_f
         mul             tmp0, tmp0, mstride
-        add             p_fout0, p_fout, tmp0, lsl #5
-        add             p_fout2, p_fout0, mstride, lsl #4   /* get the address of F[mstride*2] */
-        add             p_fout1, p_fout0, mstride, lsl #3   /* get the address of F[mstride] */
-        add             p_fout3, p_fout2, mstride, lsl #3   /* get the address of F[mstride*3] */
-        mov             p_tw1, p_twiddles
-        add             p_tw2, p_tw1, mstride, lsl #3       /* get the address of tw2 */
-        add             p_tw3, p_tw1, mstride, lsl #4       /* get the address of tw3 */
+        add             p_out1, p_fout, tmp0, lsl #5
+        vld2.32         {d_tw0_r, d_tw0_i}, [p_tw1:64], mstep
+        vld2.32         {d_tw1_r, d_tw1_i}, [p_tw1:64], mstep
+        vld2.32         {d_tw2_r, d_tw2_i}, [p_tw1:64]
 
         /* loop of mstride  */
         mov             count_m, mstride
 
-.L_ne10_butterfly_length_even_power2_other_stages_mstride:
-        RADIX4_BUTTERFLY_P4        
+.L_ne10_butterfly_inverse_other_stages_mstride:
+        BUTTERFLY4X2_WITH_TWIDDLES "TRUE"
 
-        subs            count_m, count_m, #4
-        bgt             .L_ne10_butterfly_length_even_power2_other_stages_mstride
+        subs            count_m, count_m, #2
+        bgt             .L_ne10_butterfly_inverse_other_stages_mstride
         /* end of mstride loop */
 
         subs            count_f, count_f, #1
-        bgt             .L_ne10_butterfly_length_even_power2_other_stages_fstride
+        bgt             .L_ne10_butterfly_inverse_other_stages_fstride
 
         add             p_twiddles, p_twiddles, mstride, lsl #4
         add             p_twiddles, p_twiddles, mstride, lsl #3 /* get the address of twiddles += mstride*3 */
         lsl             mstride, mstride, #2
-
-        subs            stage_count, stage_count, #1
-        bgt             .L_ne10_butterfly_length_even_power2_other_stages
-
-.L_ne10_butterfly_length_even_power2_end:
-        /*Return From Function*/
-        vpop            {q4-q7}
-        pop             {r4-r12,pc}
-
-        /* end of ne10_butterfly_length_even_power2_float32_neon */
-
-
-        /**
-         * @details
-         * This function implements the 4 butterfly
-         *
-         * @param[in/out] *Fout        points to input/output pointers
-         * @param[in]     *factors     factors pointer:
-                                        * 0: stage number
-                                        * 1: stride for the first stage
-                                        * others: factor out powers of 4, powers of 2
-         * @param[in]     *twiddles     twiddles coeffs of FFT
-         */
-
-        .align 4
-        .global ne10_mixed_radix_butterfly_inverse_length_odd_power2_float32_neon
-        .thumb
-        .thumb_func
-
-ne10_mixed_radix_butterfly_inverse_length_odd_power2_float32_neon:
-        push            {r4-r12,lr}
-        vpush           {q4-q7}
-
-        ldr             stage_count, [p_factors]   /* get factors[0]---stage_count */
-        ldr             fstride, [p_factors, #4]   /* get factors[1]---fstride */
-        add             p_factors, p_factors, stage_count, lsl #3 /* get the address of factors[2*stage_count] */
-        ldr             radix, [p_factors]                         /* get factors[2*stage_count]--- the first radix */
-        ldr             mstride, [p_factors, #-4]                  /* get factors[2*stage_count-1]--- mstride */
-
-        mov             p_fin, p_fout
-        mov             p_fout0, p_fout
-        mov             count, fstride
-
-        /* the first stage  */
-.L_ne10_butterfly_inverse_length_odd_power2_first_stage:
-        vld4.32         {d_in0_r01, d_in0_i01, d_in1_r01, d_in1_i01}, [p_fin]!
-        vld4.32         {d_in0_r23, d_in0_i23, d_in1_r23, d_in1_i23}, [p_fin]!
-        vsub.f32        q_out1_r0123, q_in0_r0123, q_in1_r0123
-        vsub.f32        q_out1_i0123, q_in0_i0123, q_in1_i0123
-        vadd.f32        q_out0_r0123, q_in0_r0123, q_in1_r0123
-        vadd.f32        q_out0_i0123, q_in0_i0123, q_in1_i0123
-        subs            count, count, #4
-        vst4.32         {d_out0_r01, d_out0_i01, d_out1_r01, d_out1_i01}, [p_fout0]!
-        vst4.32         {d_out0_r23, d_out0_i23, d_out1_r23, d_out1_i23}, [p_fout0]!
-
-        bgt             .L_ne10_butterfly_inverse_length_odd_power2_first_stage
-
-        /* the second stages  */
-        subs            stage_count, stage_count, #1
         lsr             fstride, fstride, #2
 
-        /* loop of fstride  */
-        mov             count_f, fstride
-        mov             p_tw1, p_twiddles
-        mov             p_fout0, p_fout
-        add             p_fout1, p_fout, mstride, lsl #5
-        mov             p_fout2, p_fout
-        mov             p_fout3, p_fout1
-        mov             tmp0, #96
-        vld2.32         {d_tw1_r01, d_tw2_r01, d_tw1_i01, d_tw2_i01}, [p_tw1]!
-
-.L_ne10_butterfly_inverse_length_odd_power2_second_stage:
-        RADIX24_BUTTERFLY_INVERSE_P4
-
-        subs            count_f, count_f, #2
-        bgt             .L_ne10_butterfly_inverse_length_odd_power2_second_stage
-
-        add             p_twiddles, p_twiddles, mstride, lsl #4
-        add             p_twiddles, p_twiddles, mstride, lsl #3 /* get the address of twiddles += mstride*3 */
-        lsl             mstride, mstride, #2
+        /* swap input/output buffer  */
+        mov             tmp0, p_fout
+        mov             p_fout, p_fin
+        mov             p_fin, tmp0
 
-        /* other stages  */
         subs            stage_count, stage_count, #1
+        bgt             .L_ne10_butterfly_inverse_other_stages
+        /* ---------------end other stages  except last stage---------------  */
 
-        /* loop of other stages  */
-.L_ne10_butterfly_inverse_length_odd_power2_other_stages:
-        lsr             fstride, fstride, #2
 
-        /* loop of fstride  */
-        mov             count_f, fstride
-.L_ne10_butterfly_inverse_length_odd_power2_other_stages_fstride:
-        sub             tmp0, fstride, count_f
-        mul             tmp0, tmp0, mstride
-        add             p_fout0, p_fout, tmp0, lsl #5
-        add             p_fout2, p_fout0, mstride, lsl #4   /* get the address of F[mstride*2] */
-        add             p_fout1, p_fout0, mstride, lsl #3   /* get the address of F[mstride] */
-        add             p_fout3, p_fout2, mstride, lsl #3   /* get the address of F[mstride*3] */
+        /* ---------------last stage---------------  */
+.L_ne10_butterfly_inverse_last_stages:
+        mov             p_in1, p_fin
+        mov             p_out1, p_out_ls
         mov             p_tw1, p_twiddles
-        add             p_tw2, p_tw1, mstride, lsl #3       /* get the address of tw2 */
-        add             p_tw3, p_tw1, mstride, lsl #4       /* get the address of tw3 */
+        mov             mstep, nstep
+        vld2.32         {d_fin0_r, d_fin0_i}, [p_in1:64], nstep
+        vld2.32         {d_fin1_r, d_fin1_i}, [p_in1:64], nstep
+        vld2.32         {d_fin2_r, d_fin2_i}, [p_in1:64], nstep
+        vld2.32         {d_fin3_r, d_fin3_i}, [p_in1:64], nstep
+        vld2.32         {d_tw0_r, d_tw0_i}, [p_tw1:64], mstep
+        vld2.32         {d_tw1_r, d_tw1_i}, [p_tw1:64], mstep
+        vld2.32         {d_tw2_r, d_tw2_i}, [p_tw1:64]
 
         /* loop of mstride  */
         mov             count_m, mstride
+.L_ne10_butterfly_inverse_last_stages_mstride:
+        BUTTERFLY4X2_WITH_TWIDDLES "TRUE"
 
-.L_ne10_butterfly_inverse_length_odd_power2_other_stages_mstride:
-        RADIX4_BUTTERFLY_INVERSE_P4
-
-        subs            count_m, count_m, #4
-        bgt             .L_ne10_butterfly_inverse_length_odd_power2_other_stages_mstride
+        subs            count_m, count_m, #2
+        bgt             .L_ne10_butterfly_inverse_last_stages_mstride
         /* end of mstride loop */
+        /* ---------------end of last stage---------------  */
 
-        subs            count_f, count_f, #1
-        bgt             .L_ne10_butterfly_inverse_length_odd_power2_other_stages_fstride
-
-        add             p_twiddles, p_twiddles, mstride, lsl #4
-        add             p_twiddles, p_twiddles, mstride, lsl #3 /* get the address of twiddles += mstride*3 */
-        lsl             mstride, mstride, #2
-
-        subs            stage_count, stage_count, #1
-        bgt             .L_ne10_butterfly_inverse_length_odd_power2_other_stages
-
-.L_ne10_butterfly_inverse_length_odd_power2_end:
+.L_ne10_butterfly_inverse_end:
         /*Return From Function*/
         vpop            {q4-q7}
         pop             {r4-r12,pc}
 
-        /* end of ne10_butterfly_inverse_length_odd_power2_float32_neon */
-
-
-        /**
-         * @details
-         * This function implements the 4 butterfly
-         *
-         * @param[in/out] *Fout        points to input/output pointers
-         * @param[in]     *factors     factors pointer:
-                                        * 0: stage number
-                                        * 1: stride for the first stage
-                                        * others: factor out powers of 4, powers of 2
-         * @param[in]     *twiddles     twiddles coeffs of FFT
-         */
-
-        .align 4
-        .global ne10_mixed_radix_butterfly_inverse_length_even_power2_float32_neon
-        .thumb
-        .thumb_func
-
-ne10_mixed_radix_butterfly_inverse_length_even_power2_float32_neon:
-        push            {r4-r12,lr}
-        vpush           {q4-q7}
+        /* end of ne10_mixed_radix_fft_backward_float32_neon */
 
-        ldr             stage_count, [p_factors]   /* get factors[0]---stage_count */
-        ldr             fstride, [p_factors, #4]   /* get factors[1]---fstride */
-        add             p_factors, p_factors, stage_count, lsl #3 /* get the address of factors[2*stage_count] */
-        ldr             radix, [p_factors]                         /* get factors[2*stage_count]--- the first radix */
-        ldr             mstride, [p_factors, #-4]                  /* get factors[2*stage_count-1]--- mstride */
-
-        mov             p_fin, p_fout
-        mov             p_fout0, p_fout
-        mov             count, fstride
 
-        /* the first stage  */
-.L_ne10_butterfly_inverse_length_even_power2_first_stage:
-        vld1.32         {d_in0_0, d_in1_0, d_in2_0, d_in3_0}, [p_fin]!
-        vld1.32         {d_in0_1, d_in1_1, d_in2_1, d_in3_1}, [p_fin]!
-        vswp            d_in1_0, d_in0_1
-        vswp            d_in3_0, d_in2_1
-        vsub.f32        q_s2_01, q_in0_01, q_in2_01
-        vadd.f32        q_out0_01, q_in0_01, q_in2_01
-        vadd.f32        q_s0_01, q_in1_01, q_in3_01
-        vsub.f32        q_s1_01, q_in1_01, q_in3_01
-        vsub.f32        q_out2_01, q_out0_01, q_s0_01
-        vrev64.32       q_s1_01, q_s1_01
-        vadd.f32        q_out0_01, q_out0_01, q_s0_01
-        vsub.f32        q_out1_01, q_s2_01, q_s1_01
-        vadd.f32        q_out3_01, q_s2_01, q_s1_01
-        vrev64.32       q_tmp, q_out1_01
-        vrev64.32       q_tmp2, q_out3_01
-        vtrn.32         q_out3_01, q_tmp
-        vtrn.32         q_out1_01, q_tmp2
-        vswp            d_out1_0, d_out0_1
-        vswp            d_out3_0, d_out2_1
-        subs            count, count, #2
-        vst1.32         {d_out0_0, d_out1_0, d_out2_0, d_out3_0}, [p_fout0]!
-        vst1.32         {d_out0_1, d_out1_1, d_out2_1, d_out3_1}, [p_fout0]!
-        bgt             .L_ne10_butterfly_inverse_length_even_power2_first_stage
-
-        /* other stages  */
-        subs            stage_count, stage_count, #1
-
-        /* loop of other stages  */
-.L_ne10_butterfly_inverse_length_even_power2_other_stages:
-        lsr             fstride, fstride, #2
-
-        /* loop of fstride  */
-        mov             count_f, fstride
-.L_ne10_butterfly_inverse_length_even_power2_other_stages_fstride:
-        sub             tmp0, fstride, count_f
-        mul             tmp0, tmp0, mstride
-        add             p_fout0, p_fout, tmp0, lsl #5
-        add             p_fout2, p_fout0, mstride, lsl #4   /* get the address of F[mstride*2] */
-        add             p_fout1, p_fout0, mstride, lsl #3   /* get the address of F[mstride] */
-        add             p_fout3, p_fout2, mstride, lsl #3   /* get the address of F[mstride*3] */
-        mov             p_tw1, p_twiddles
-        add             p_tw2, p_tw1, mstride, lsl #3       /* get the address of tw2 */
-        add             p_tw3, p_tw1, mstride, lsl #4       /* get the address of tw3 */
-
-        /* loop of mstride  */
-        mov             count_m, mstride
-
-.L_ne10_butterfly_inverse_length_even_power2_other_stages_mstride:
-        RADIX4_BUTTERFLY_INVERSE_P4
-
-        subs            count_m, count_m, #4
-        bgt             .L_ne10_butterfly_inverse_length_even_power2_other_stages_mstride
-        /* end of mstride loop */
-
-        subs            count_f, count_f, #1
-        bgt             .L_ne10_butterfly_inverse_length_even_power2_other_stages_fstride
-
-        add             p_twiddles, p_twiddles, mstride, lsl #4
-        add             p_twiddles, p_twiddles, mstride, lsl #3 /* get the address of twiddles += mstride*3 */
-        lsl             mstride, mstride, #2
-
-        subs            stage_count, stage_count, #1
-        bgt             .L_ne10_butterfly_inverse_length_even_power2_other_stages
-
-.L_ne10_butterfly_inverse_length_even_power2_end:
-        /*Return From Function*/
-        vpop            {q4-q7}
-        pop             {r4-r12,pc}
-        /* end of ne10_butterfly_inverse_length_even_power2_float32_neon */
 
 
         /* end of the file */
index 8ed1002..2240ba2 100644 (file)
@@ -33,10 +33,6 @@ ne10_result_t ne10_init_dsp (ne10_int32_t is_NEON_available)
 {
     if (NE10_OK == is_NEON_available)
     {
-        ne10_radix4_butterfly_float = ne10_radix4_butterfly_float_neon;
-        ne10_radix4_butterfly_inverse_float = ne10_radix4_butterfly_inverse_float_neon;
-        ne10_rfft_float = ne10_rfft_float_neon;
-
         ne10_fft_c2c_1d_float32 = ne10_fft_c2c_1d_float32_neon;
         ne10_fft_c2c_1d_int32_unscaled = ne10_fft_c2c_1d_int32_unscaled_neon;
         ne10_fft_c2c_1d_int32_scaled = ne10_fft_c2c_1d_int32_scaled_neon;
@@ -60,10 +56,6 @@ ne10_result_t ne10_init_dsp (ne10_int32_t is_NEON_available)
     }
     else
     {
-        ne10_radix4_butterfly_float = ne10_radix4_butterfly_float_c;
-        ne10_radix4_butterfly_inverse_float = ne10_radix4_butterfly_inverse_float_c;
-        ne10_rfft_float = ne10_rfft_float_c;
-
         ne10_fft_c2c_1d_float32 = ne10_fft_c2c_1d_float32_c;
         ne10_fft_c2c_1d_int32_unscaled = ne10_fft_c2c_1d_int32_unscaled_c;
         ne10_fft_c2c_1d_int32_scaled = ne10_fft_c2c_1d_int32_scaled_c;
@@ -89,22 +81,6 @@ ne10_result_t ne10_init_dsp (ne10_int32_t is_NEON_available)
 }
 
 // These are actual definitions of our function pointers that are declared in inc/NE10_dsp.h
-void (*ne10_radix4_butterfly_float) (ne10_float32_t *pDst,
-                                     ne10_float32_t *pSrc,
-                                     ne10_uint16_t N,
-                                     ne10_float32_t *pCoef);
-
-void (*ne10_radix4_butterfly_inverse_float) (ne10_float32_t *pDst,
-        ne10_float32_t *pSrc,
-        ne10_uint16_t N,
-        ne10_float32_t *pCoef,
-        ne10_float32_t onebyN);
-
-void (*ne10_rfft_float) (const ne10_rfft_instance_f32_t * S,
-                         ne10_float32_t * pSrc,
-                         ne10_float32_t * pDst,
-                         ne10_float32_t * pTemp);
-
 void (*ne10_fft_c2c_1d_float32) (ne10_fft_cpx_float32_t *fout,
                                  ne10_fft_cpx_float32_t *fin,
                                  ne10_fft_cpx_float32_t *twiddles,
diff --git a/modules/dsp/NE10_rfft.c b/modules/dsp/NE10_rfft.c
deleted file mode 100644 (file)
index af6737b..0000000
+++ /dev/null
@@ -1,292 +0,0 @@
-/*
- *  Copyright 2012-14 ARM Limited
- *  All rights reserved.
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions are met:
- *    * Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *    * Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *    * Neither the name of ARM Limited nor the
- *      names of its contributors may be used to endorse or promote products
- *      derived from this software without specific prior written permission.
- *
- *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
- *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
- *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * NE10 Library : dsp/NE10_rfft.c
- */
-
-#include "NE10_types.h"
-#include "NE10_dsp.h"
-
-/**
- * @ingroup groupDSPs
- */
-
-/**
- * @defgroup RFFT_RIFFT Real FFT
- *
- * \par
- * Complex FFT/IFFT typically assumes complex input and output. However many applications use real valued data in time domain.
- * Real FFT/IFFT efficiently process real valued sequences with the advantage of requirement of low memory and with less complexity.
- *
- * \par
- * This set of functions implements Real Fast Fourier Transforms(RFFT) and Real Inverse Fast Fourier Transform(RIFFT)
- * for floating-point data types.
- *
- *
- * \par Algorithm:
- *
- * <b>Real Fast Fourier Transform:</b>
- * \par
- * Real FFT of N-point is calculated using CFFT of N/2-point and Split RFFT process as shown below figure.
- * \par
- * \image html RFFT.gif "Real Fast Fourier Transform"
- * \par
- * The RFFT functions operate on blocks of input and output data and each call to the function processes
- * <code>fftLenR</code> samples through the transform.  <code>pSrc</code>  points to input array containing <code>fftLenR</code> values.
- * <code>pDst</code>  points to output array containing <code>2*fftLenR</code> values. \n
- * Input for real FFT is in the order of
- * <pre>{real[0], real[1], real[2], real[3], ..}</pre>
- * Output for real FFT is complex and are in the order of
- * <pre>{real(0), imag(0), real(1), imag(1), ...}</pre>
- *
- * <b>Real Inverse Fast Fourier Transform:</b>
- * \par
- * Real IFFT of N-point is calculated using Split RIFFT process and CFFT of N/2-point as shown below figure.
- * \par
- * \image html RIFFT.gif "Real Inverse Fast Fourier Transform"
- * \par
- * The RIFFT functions operate on blocks of input and output data and each call to the function processes
- * <code>2*fftLenR</code> samples through the transform.  <code>pSrc</code>  points to input array containing <code>2*fftLenR</code> values.
- * <code>pDst</code>  points to output array containing <code>fftLenR</code> values. \n
- * Input for real IFFT is complex and are in the order of
- * <pre>{real(0), imag(0), real(1), imag(1), ...}</pre>
- *  Output for real IFFT is real and in the order of
- * <pre>{real[0], real[1], real[2], real[3], ..}</pre>
- *
- * \par Lengths supported by the transform:
- * \par
- * Real FFT/IFFT supports the lengths [128, 512, 2048], as it internally uses CFFT/CIFFT.
- *
- * \par Instance Structure
- * A separate instance structure must be defined for each Instance but the twiddle factors can be reused.
- * There are separate instance structure declarations for each of the 3 supported data types.
- *
- * \par Initialization Functions
- * There is also an associated initialization function for each data type.
- * The initialization function performs the following operations:
- * - Sets the values of the internal structure fields.
- * - Initializes twiddle factor tables.
- * - Initializes CFFT data structure fields.
- * \par
- * Use of the initialization function is optional.
- * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
- * To place an instance structure into a const data section, the instance structure must be manually initialized.
- * Manually initialize the instance structure as follows:
- * <pre>
- *ne10_rfft_instance_f32_t S = {fft_len_real, fft_len_by2, ifft_flag_r, bit_reverse_flag_r, twid_coef_r_modifier, p_twiddle_A_real, p_twiddle_B_real, p_cfft};
- * </pre>
- * where <code>fft_len_real</code> length of RFFT/RIFFT; <code>fft_len_by2</code> length of CFFT/CIFFT.
- * <code>ifft_flag_r</code> Flag for selection of RFFT or RIFFT(Set ifftFlagR to calculate RIFFT otherwise calculates RFFT);
- * <code>bit_reverse_flag_r</code> Flag for selection of output order(Set bitReverseFlagR to output in normal order otherwise output in bit reversed order);
- * <code>twid_coef_r_modifier</code> modifier for twiddle factor table which supports 128, 512, 2048 RFFT lengths with same table;
- * <code>p_twiddle_A_real</code>points to A array of twiddle coefficients; <code>p_twiddle_B_real</code>points to B array of twiddle coefficients;
- * <code>p_cfft</code> points to the CFFT Instance structure. The CFFT structure also needs to be initialized, refer to arm_cfft_radix4_f32() for details regarding
- * static initialization of cfft structure.
- *
- */
-
-/**
- * @brief  Core Real FFT process
- * @param[in]   *pSrc                points to the Input buffer
- * @param[in]   N                    length of Real FFT
- * @param[in]   *pATable             points to the twiddle Coef A buffer
- * @param[in]   *pBTable             points to the twiddle Coef B buffer
- * @param[out]  *pDst                points to the Output buffer
- * @return none.
- * The function implements a Real FFT
- */
-
-static void ne10_split_rfft_float_c(
-                     ne10_float32_t * pSrc,
-                     ne10_uint32_t N,
-                     ne10_float32_t * pReTable,
-                     ne10_float32_t * pImTable,
-                     ne10_float32_t * pDst)
-{
-    ne10_uint32_t k;                                    /* Loop Counter */
-    ne10_float32_t uRe,vRe,uIm,vIm;                    /* Temporary variables for output */
-    ne10_float32_t reTwd,imTwd,reTmp,imTmp;
-    ne10_float32_t *pCoefRe,*pCoefIm;                             /* Temporary pointers for twiddle factors */
-    ne10_uint32_t NBy2 = N>>1;
-
-    pCoefRe = pReTable;
-    pCoefIm = pImTable;
-
-    /*First Result*/
-    pDst[0] = pSrc[0] + pSrc[1];
-    pDst[1] = 0;
-    /*N/2 th Result*/
-    pDst[N] = pSrc[0] - pSrc[1];
-    pDst[N+1] = 0;
-
-    /*for k=1 to N/4-1 and k=N/4+1 to K=N/2-1*/
-    for(k=1;k<(N>>2);k++)
-    {
-        /*uRe = (a[k]+a[N/2-k])/2*/
-        uRe = (pSrc[2*k]+pSrc[2*(NBy2-k)])*0.5;
-        /*uIm = (b[k]-b[N/2-k])/2*/
-        uIm = (pSrc[2*k+1]-pSrc[2*(NBy2-k)+1])*0.5;
-        /*VRe = (b[k]+b[N/2-k])/2*/
-        vRe = (pSrc[2*k+1]+pSrc[2*(NBy2-k)+1])*0.5;
-        /*Vim = -(a[k]-a[N/2-k])/2*/
-        vIm = (pSrc[2*(NBy2-k)]-pSrc[2*k])*0.5;
-        reTwd = pCoefRe[k];
-        imTwd = pCoefIm[k];
-        reTmp = vRe*reTwd + vIm*imTwd;
-        imTmp = vIm*reTwd - vRe*imTwd;
-        pDst[2*k] = uRe + reTmp;
-        pDst[2*k+1] = uIm + imTmp;
-        pDst[2*(NBy2-k)] = uRe-reTmp;
-        pDst[2*(NBy2-k)+1] = imTmp-uIm;
-
-        /*Out Put from K=N/2+1 till k=N-1*/
-        /*y[N-k] = conjugate(y[k] k=0 to k<N/2)*/
-        pDst[2*(N-k)] = uRe + reTmp;
-        pDst[2*(N-k)+1] = -(uIm + imTmp);
-        pDst[2*(NBy2+k)] = uRe-reTmp;
-        pDst[2*(NBy2+k)+1] = uIm-imTmp;
-    }
-    /*y[N/4] = a[N/4]-jb[N/4]; y[3*N/4] = a[N/4] + jb[N/4]*/
-    pDst[NBy2] = pSrc[NBy2];
-    pDst[NBy2+1] = -pSrc[NBy2+1];
-    pDst[N+NBy2] = pSrc[NBy2];
-    pDst[N+NBy2+1] = pSrc[NBy2+1];
-
-}
-
-
-/**
- * @brief  Core Real IFFT process
- * @param[in]   *pSrc                 points to the Input buffer
- * @param[in]   N                      length of Real FFT
- * @param[in]   *pATable             points to the twiddle Coef A buffer
- * @param[in]   *pBTable             points to the twiddle Coef B buffer
- * @param[out]  *pDst                 points to the Output buffer
- * @return none.
- * The function implements a Real FFT
- */
-
-
-static void ne10_split_rifft_float_c(
-                     ne10_float32_t * pSrc,
-                     ne10_uint32_t N,
-                     ne10_float32_t * pReTable,
-                     ne10_float32_t * pImTable,
-                     ne10_float32_t * pDst)
-{
-    ne10_uint32_t k;                                    /* Loop Counter */
-    ne10_float32_t uRe,vRe,uIm,vIm;                    /* Temporary variables for output */
-    ne10_float32_t reTwd,imTwd,reTmp,imTmp;
-    ne10_float32_t *pCoefRe,*pCoefIm;                             /* Temporary pointers for twiddle factors */
-    ne10_uint32_t NBy2 = N>>1;
-
-    pCoefRe = pReTable;
-    pCoefIm = pImTable;
-
-    /*First Result*/
-    pDst[0] = (pSrc[0] + pSrc[N])*0.5;
-    pDst[1] = (pSrc[0] - pSrc[N])*0.5;
-
-    /*for k=1 to N/4-1 and k=N/4+1 to K=N/2-1*/
-    for(k=1;k< (N>>2);k++)
-    {
-        /*uRe = (a[k]+a[N/2-k])/2*/
-        uRe = (pSrc[2*k] + pSrc[2*(NBy2-k)])*0.5;
-        /*uIm = (b[k]-b[N/2-k])/2*/
-        uIm = (pSrc[2*k+1] - pSrc[2*(NBy2-k)+1])*0.5;
-
-        reTmp = (pSrc[2*k] - pSrc[2*(NBy2-k)])*0.5;
-        imTmp = (pSrc[2*k+1] + pSrc[2*(NBy2-k)+1])*0.5;
-
-        reTwd = pCoefRe[k];
-        imTwd = pCoefIm[k];
-
-        /*VRe = (b[k]+b[N/2-k])/2*/
-        vRe =  reTmp*reTwd - imTmp*imTwd;
-        /*Vim = -(a[k]-a[N/2-k])/2*/
-        vIm =  imTmp*reTwd + reTmp*imTwd;
-        pDst[2*k] = (uRe-vIm);
-        pDst[2*k+1] = (uIm+vRe);
-
-        pDst[2*(NBy2-k)] = (uRe+vIm);
-        pDst[2*(NBy2-k)+1] = (vRe-uIm);
-    }
-    /*y[N/4] = a[N/4]-jb[N/4]*/
-    pDst[NBy2] = pSrc[NBy2];
-    pDst[NBy2+1] = -pSrc[NBy2+1];
-
-}
-
-/**
- * @addtogroup RFFT_RIFFT
- * @{
- */
-
-/**
- * @brief  Real FFT process
- * @param[in]  *S is an instance for the structure
- * @param[in]  *pSrc point to the input buffer (out-of-place: it's also a tmp buffer, so the input buffer is destroyed)
- * @param[out]  *pDst point to the output buffer (out-of-place)
- * @param[in]  *pTemp point to the temp buffer (used for intermedia buffer)
- * @return none.
- * The function implements a Real FFT/ Real IFFT depending
- * on the direction flag
- * Can support FFT lengths of 128, 512, 2048
- *
- */
-void ne10_rfft_float_c(
-                     const ne10_rfft_instance_f32_t * S,
-                     ne10_float32_t * pSrc,
-                     ne10_float32_t * pDst,
-                     ne10_float32_t * pTemp)
-{
-    const ne10_cfft_radix4_instance_f32_t *S_CFFT = S->p_cfft;
-
-    /* Caluclation of Real IFFT of input */
-    if(S->ifft_flag_r == 1u)
-    {
-        /*  Real IFFT core process */
-        ne10_split_rifft_float_c(pSrc, S->fft_len_real, S->p_twiddle_A_real,
-                S->p_twiddle_B_real, pTemp);
-        /* Complex radix-4 IFFT process */
-        ne10_radix4_butterfly_inverse_float_c(pDst, pTemp, S_CFFT->fft_len, S_CFFT->p_twiddle, S_CFFT->one_by_fft_len);
-    }
-    else
-    {
-        /* Complex radix-4 FFT process */
-        ne10_radix4_butterfly_float_c(pTemp, pSrc, S_CFFT->fft_len, S_CFFT->p_twiddle);
-        /*  Real FFT core process */
-        ne10_split_rfft_float_c(pTemp, S->fft_len_real, S->p_twiddle_A_real,
-                S->p_twiddle_B_real, pDst);
-    }
-
-}
-
-/**
- * @} end of RFFT_RIFFT group
- */
diff --git a/modules/dsp/NE10_rfft.neon.c b/modules/dsp/NE10_rfft.neon.c
deleted file mode 100644 (file)
index d6518d8..0000000
+++ /dev/null
@@ -1,508 +0,0 @@
-/*
- *  Copyright 2012-14 ARM Limited
- *  All rights reserved.
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions are met:
- *    * Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *    * Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *    * Neither the name of ARM Limited nor the
- *      names of its contributors may be used to endorse or promote products
- *      derived from this software without specific prior written permission.
- *
- *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
- *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
- *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * NE10 Library : dsp/NE10_rfft.neon.c
- */
-#include <arm_neon.h>
-
-#include "NE10_types.h"
-#include "NE10_mask_table.h"
-#include "NE10_dsp.h"
-/**
- * @brief  Core Real FFT process
- * @param[in]   *pSrc                points to the Input buffer
- * @param[in]   N                    length of Real FFT
- * @param[in]   *pATable             points to the twiddle Coef A buffer
- * @param[in]   *pBTable             points to the twiddle Coef B buffer
- * @param[out]  *pDst                points to the Output buffer
- * @return none.
- * The function implements a Real FFT
- */
-
-static void ne10_split_rfft_float_neon(
-                     ne10_float32_t * pSrc,
-                     ne10_uint32_t N,
-                     ne10_float32_t * pReTable,
-                     ne10_float32_t * pImTable,
-                     ne10_float32_t * pDst)
-{
-  ne10_uint32_t k,Cnt;                                   /* Loop Counter */
-  ne10_float32_t *pCoefRe,*pCoefIm,*pOut1,*pIn1,*pOut2,*pIn2;       /* Temporary pointers for twiddle factors */
-  ne10_uint32_t NBy2 = N>>1;
-  /*NEON Variable Declarations*/
-  float32x4x2_t vin1q2_f32,vin2q2_f32,vtmpq2_f32;
-  float32x4_t vtmp1q_f32,vtmp2q_f32;
-  float32x4_t vureq_f32,vuimq_f32,vvreq_f32,vvimq_f32;
-  float32x4_t vretwdq_f32,vimtwdq_f32;
-  float32x4_t vhalfq_f32;
-  uint32x4_t vmaskq_u32,vmask1q_u32;
-
-  /*Mask value to select three entries*/
-  vmaskq_u32 = vld1q_u32(ne10_qMaskTable32+12);
-  vmask1q_u32 = vld1q_u32(ne10_qMaskTable32+4);
-
-
-
-  pCoefRe = pReTable+1;
-  pCoefIm = pImTable+1;
-
-  /*First Result*/
-  pDst[0] = pSrc[0] + pSrc[1];
-  pDst[1] = 0;
-  /*N/2 th Result*/
-  pDst[N] = pSrc[0] - pSrc[1];
-  pDst[N+1] = 0;
-
-  pOut1=pDst+2;
-  pOut2=pDst+N-8;
-  pIn1 = pSrc+2;
-  pIn2 = pSrc +N -8;
-
-  Cnt = ((N>>2)-1)>>2;
-  vhalfq_f32 = vdupq_n_f32(0.5);
-
-  /*for k=1 to N/4-1 and k=N/4+1 to K=N/2-1*/
-  for(k=0;k<Cnt;k++)
-  {
-      /*b[4] b[3] b[2] b[1]   a[4] a[3] a[2] a[1]*/
-      vin1q2_f32 = vld2q_f32(pIn1);
-      pIn1+=8;
-      /*b[N/2-1] b[N/2-2] b[N/2-3] b[N/2-4]   a[N/2-1] a[N/2-2] a[N/2-3] a[N/2-4]*/
-      vin2q2_f32 = vld2q_f32(pIn2);
-      pIn2-=8;
-      /* a[N/2-2] a[N/2-1] a[N/2-4] a[N/2-3]*/
-      vtmp1q_f32 =vrev64q_f32(vin2q2_f32.val[0]);
-      /* b[N/2-2] b[N/2-1] b[N/2-4] b[N/2-3]*/
-      vtmp2q_f32 =vrev64q_f32(vin2q2_f32.val[1]);
-      /*a[N/2-4] a[N/2-3] a[N/2-2] a[N/2-1]*/
-      vtmp1q_f32 = vcombine_f32(vget_high_f32(vtmp1q_f32),vget_low_f32(vtmp1q_f32));
-      /* b[N/2-4] b[N/2-3] b[N/2-2] b[N/2-1]*/
-      vtmp2q_f32 = vcombine_f32(vget_high_f32(vtmp2q_f32),vget_low_f32(vtmp2q_f32));
-
-      /*uRe = (a[k]+a[N/2-k])/2*/
-      vureq_f32 = vaddq_f32(vin1q2_f32.val[0],vtmp1q_f32);
-      /*uIm = (b[k]-b[N/2-k])/2*/
-      vuimq_f32 = vsubq_f32(vin1q2_f32.val[1],vtmp2q_f32);
-      /*VRe = (b[k]+b[N/2-k])/2*/
-      vvreq_f32 = vaddq_f32(vin1q2_f32.val[1],vtmp2q_f32);
-      /*Vim = -(a[k]-a[N/2-k])/2*/
-      vvimq_f32 = vsubq_f32(vtmp1q_f32,vin1q2_f32.val[0]);
-
-      vureq_f32 = vmulq_f32(vureq_f32,vhalfq_f32);
-      vuimq_f32 = vmulq_f32(vuimq_f32,vhalfq_f32);
-      vvreq_f32 = vmulq_f32(vvreq_f32,vhalfq_f32);
-      vvimq_f32 = vmulq_f32(vvimq_f32,vhalfq_f32);
-
-      vretwdq_f32 = vld1q_f32(pCoefRe);
-      vimtwdq_f32 = vld1q_f32(pCoefIm);
-      pCoefRe+=4;
-      pCoefIm+=4;
-
-      /*reTmp = vRe*reTwd + vIm*imTwd */
-      vtmp1q_f32 = vmulq_f32(vvreq_f32,vretwdq_f32);
-      vtmp1q_f32 = vmlaq_f32(vtmp1q_f32,vvimq_f32,vimtwdq_f32);
-      /*imTmp = vIm*reTwd - vRe*imTwd */
-      vtmp2q_f32 = vmulq_f32(vvimq_f32,vretwdq_f32);
-      vtmp2q_f32 = vmlsq_f32(vtmp2q_f32,vvreq_f32,vimtwdq_f32);
-      //pDst[2*k] = uRe + reTmp;
-      //pDst[2*k+1] = uIm + imTmp;
-      vin1q2_f32.val[0] = vaddq_f32(vureq_f32,vtmp1q_f32);
-      vin1q2_f32.val[1] = vaddq_f32(vuimq_f32,vtmp2q_f32);
-      //pDst[2*(NBy2+k)] = uRe-reTmp;
-      //pDst[2*(NBy2+k)+1] = uIm-imTmp;
-      vin2q2_f32.val[0] = vsubq_f32(vureq_f32,vtmp1q_f32);
-      vin2q2_f32.val[1] = vsubq_f32(vuimq_f32,vtmp2q_f32);
-      vst2q_f32(pOut1,vin1q2_f32);
-      vst2q_f32(pOut1+N,vin2q2_f32);
-      pOut1+=8;
-
-      //pDst[2*(NBy2-k)] = uRe-reTmp;
-      //pDst[2*(NBy2-k)+1] = imTmp-uIm;
-      vtmp2q_f32 = vsubq_f32(vtmp2q_f32,vuimq_f32);
-      vtmp1q_f32 = vrev64q_f32(vin2q2_f32.val[0]);
-      vtmp2q_f32 = vrev64q_f32(vtmp2q_f32);
-
-      vin2q2_f32.val[0] = vcombine_f32(vget_high_f32(vtmp1q_f32),vget_low_f32(vtmp1q_f32));
-      vin2q2_f32.val[1] = vcombine_f32(vget_high_f32(vtmp2q_f32),vget_low_f32(vtmp2q_f32));
-
-      //pDst[2*(N-k)] = uRe + reTmp;
-      //pDst[2*(N-k)+1] = -(uIm + imTmp);
-      vtmp2q_f32 = vnegq_f32(vin1q2_f32.val[1]);
-      vtmp1q_f32 = vrev64q_f32(vin1q2_f32.val[0]);
-      vtmp2q_f32 = vrev64q_f32(vtmp2q_f32);
-
-      vin1q2_f32.val[0] = vcombine_f32(vget_high_f32(vtmp1q_f32),vget_low_f32(vtmp1q_f32));
-      vin1q2_f32.val[1] = vcombine_f32(vget_high_f32(vtmp2q_f32),vget_low_f32(vtmp2q_f32));
-
-      vst2q_f32(pOut2,vin2q2_f32);
-      vst2q_f32(pOut2+N,vin1q2_f32);
-      pOut2-=8;
-  }
-  /*Lst Three VAlues*/
-  /*b[4] b[3] b[2] b[1]   a[4] a[3] a[2] a[1]*/
-  vin1q2_f32 = vld2q_f32(pIn1);
-  pIn1+=8;
-  /*b[N/2-1] b[N/2-2] b[N/2-3] b[N/2-4]   a[N/2-1] a[N/2-2] a[N/2-3] a[N/2-4]*/
-  vin2q2_f32 = vld2q_f32(pIn2);
-  pIn2-=8;
-  /* a[N/2-2] a[N/2-1] a[N/2-4] a[N/2-3]*/
-  vtmp1q_f32 =vrev64q_f32(vin2q2_f32.val[0]);
-  /* b[N/2-2] b[N/2-1] b[N/2-4] b[N/2-3]*/
-  vtmp2q_f32 =vrev64q_f32(vin2q2_f32.val[1]);
-  /*a[N/2-4] a[N/2-3] a[N/2-2] a[N/2-1]*/
-  vtmp1q_f32 = vcombine_f32(vget_high_f32(vtmp1q_f32),vget_low_f32(vtmp1q_f32));
-  /* b[N/2-4] b[N/2-3] b[N/2-2] b[N/2-1]*/
-  vtmp2q_f32 = vcombine_f32(vget_high_f32(vtmp2q_f32),vget_low_f32(vtmp2q_f32));
-
-  /*uRe = (a[k]+a[N/2-k])/2*/
-  vureq_f32 = vaddq_f32(vin1q2_f32.val[0],vtmp1q_f32);
-  /*uIm = (b[k]-b[N/2-k])/2*/
-  vuimq_f32 = vsubq_f32(vin1q2_f32.val[1],vtmp2q_f32);
-  /*VRe = (b[k]+b[N/2-k])/2*/
-  vvreq_f32 = vaddq_f32(vin1q2_f32.val[1],vtmp2q_f32);
-  /*Vim = -(a[k]-a[N/2-k])/2*/
-  vvimq_f32 = vsubq_f32(vtmp1q_f32,vin1q2_f32.val[0]);
-
-  vureq_f32 = vmulq_f32(vureq_f32,vhalfq_f32);
-  vuimq_f32 = vmulq_f32(vuimq_f32,vhalfq_f32);
-  vvreq_f32 = vmulq_f32(vvreq_f32,vhalfq_f32);
-  vvimq_f32 = vmulq_f32(vvimq_f32,vhalfq_f32);
-
-  vretwdq_f32 = vld1q_f32(pCoefRe);
-  vimtwdq_f32 = vld1q_f32(pCoefIm);
-  pCoefRe+=4;
-  pCoefIm+=4;
-
-  /*reTmp = vRe*reTwd + vIm*imTwd */
-  vtmp1q_f32 = vmulq_f32(vvreq_f32,vretwdq_f32);
-  vtmp1q_f32 = vmlaq_f32(vtmp1q_f32,vvimq_f32,vimtwdq_f32);
-  /*imTmp = vIm*reTwd - vRe*imTwd */
-  vtmp2q_f32 = vmulq_f32(vvimq_f32,vretwdq_f32);
-  vtmp2q_f32 = vmlsq_f32(vtmp2q_f32,vvreq_f32,vimtwdq_f32);
-  //pDst[2*k] = uRe + reTmp;
-  //pDst[2*k+1] = uIm + imTmp;
-  vin1q2_f32.val[0] = vaddq_f32(vureq_f32,vtmp1q_f32);
-  vin1q2_f32.val[1] = vaddq_f32(vuimq_f32,vtmp2q_f32);
-
-  vtmpq2_f32 = vld2q_f32(pOut1);
-  vin1q2_f32.val[0] = vbslq_f32(vmaskq_u32,vin1q2_f32.val[0],vtmpq2_f32.val[0]);
-  vin1q2_f32.val[1] = vbslq_f32(vmaskq_u32,vin1q2_f32.val[1],vtmpq2_f32.val[1]);
-  //pDst[2*(NBy2+k)] = uRe-reTmp;
-  //pDst[2*(NBy2+k)+1] = uIm-imTmp;
-  vin2q2_f32.val[0] = vsubq_f32(vureq_f32,vtmp1q_f32);
-  vin2q2_f32.val[1] = vsubq_f32(vuimq_f32,vtmp2q_f32);
-
-  vtmpq2_f32 = vld2q_f32(pOut1+N);
-  vin2q2_f32.val[0] = vbslq_f32(vmaskq_u32,vin2q2_f32.val[0],vtmpq2_f32.val[0]);
-  vin2q2_f32.val[1] = vbslq_f32(vmaskq_u32,vin2q2_f32.val[1],vtmpq2_f32.val[1]);
-
-  vst2q_f32(pOut1,vin1q2_f32);
-  vst2q_f32(pOut1+N,vin2q2_f32);
-  pOut1+=8;
-
-
-  //pDst[2*(NBy2-k)] = uRe-reTmp;
-  //pDst[2*(NBy2-k)+1] = imTmp-uIm;
-  vtmp2q_f32 = vnegq_f32(vin2q2_f32.val[1]);
-  vtmp1q_f32 = vrev64q_f32(vin2q2_f32.val[0]);
-  vtmp2q_f32 = vrev64q_f32(vtmp2q_f32);
-
-  vin2q2_f32.val[0] = vcombine_f32(vget_high_f32(vtmp1q_f32),vget_low_f32(vtmp1q_f32));
-  vin2q2_f32.val[1] = vcombine_f32(vget_high_f32(vtmp2q_f32),vget_low_f32(vtmp2q_f32));
-
-  vtmpq2_f32 = vld2q_f32(pOut2);
-  vin2q2_f32.val[0] = vbslq_f32(vmask1q_u32,vtmpq2_f32.val[0],vin2q2_f32.val[0]);
-  vin2q2_f32.val[1] = vbslq_f32(vmask1q_u32,vtmpq2_f32.val[1],vin2q2_f32.val[1]);
-
-
-  //pDst[2*(N-k)] = uRe + reTmp;
-  //pDst[2*(N-k)+1] = -(uIm + imTmp);
-  vtmp2q_f32 = vnegq_f32(vin1q2_f32.val[1]);
-  vtmp1q_f32 = vrev64q_f32(vin1q2_f32.val[0]);
-  vtmp2q_f32 = vrev64q_f32(vtmp2q_f32);
-
-  vin1q2_f32.val[0] = vcombine_f32(vget_high_f32(vtmp1q_f32),vget_low_f32(vtmp1q_f32));
-  vin1q2_f32.val[1] = vcombine_f32(vget_high_f32(vtmp2q_f32),vget_low_f32(vtmp2q_f32));
-
-  vtmpq2_f32 = vld2q_f32(pOut2+N);
-  vin1q2_f32.val[0] = vbslq_f32(vmask1q_u32,vtmpq2_f32.val[0],vin1q2_f32.val[0]);
-  vin1q2_f32.val[1] = vbslq_f32(vmask1q_u32,vtmpq2_f32.val[1],vin1q2_f32.val[1]);
-
-  vst2q_f32(pOut2,vin2q2_f32);
-  vst2q_f32(pOut2+N,vin1q2_f32);
-  pOut2-=8;
-
-
-
-  /*y[N/4] = a[N/4]-jb[N/4]; y[3*N/4] = a[N/4] + jb[N/4]*/
-  pDst[NBy2] = pSrc[NBy2];
-  pDst[NBy2+1] = -pSrc[NBy2+1];
-  pDst[N+NBy2] = pSrc[NBy2];
-  pDst[N+NBy2+1] = pSrc[NBy2+1];
-
-
-}
-
-
-/**
- * @brief  Core Real IFFT process
- * @param[in]   *pSrc                 points to the Input buffer
- * @param[in]   N                      length of Real FFT
- * @param[in]   *pATable             points to the twiddle Coef A buffer
- * @param[in]   *pBTable             points to the twiddle Coef B buffer
- * @param[out]  *pDst                 points to the Output buffer
- * @return none.
- * The function implements a Real FFT
- */
-
-
-static void ne10_split_rifft_float_neon(
-                     ne10_float32_t * pSrc,
-                     ne10_uint32_t N,
-                     ne10_float32_t * pReTable,
-                     ne10_float32_t * pImTable,
-                     ne10_float32_t * pDst)
-{
-    ne10_uint32_t k,Cnt;                                             /* Loop Counter */
-    ne10_float32_t *pCoefRe,*pCoefIm,*pOut1,*pOut2,*pIn1,*pIn2;      /* Temporary pointers for twiddle factors */
-    ne10_uint32_t NBy2 = N>>1;
-
-    /*NEON Variable Declarations*/
-    float32x4x2_t vin1q2_f32,vin2q2_f32,vtmpq2_f32;
-    float32x4_t vtmp1q_f32,vtmp2q_f32;
-    float32x4_t vureq_f32,vuimq_f32,vvreq_f32,vvimq_f32;
-    float32x4_t vretwdq_f32,vimtwdq_f32;
-    float32x4_t vhalfq_f32;
-    uint32x4_t vmaskq_u32,vmask1q_u32;
-
-    /*Mask value to select three entries*/
-    vmaskq_u32 = vld1q_u32(ne10_qMaskTable32+12);
-    vmask1q_u32 = vld1q_u32(ne10_qMaskTable32+4);
-
-    pCoefRe = pReTable+1;
-    pCoefIm = pImTable+1;
-
-    /*First Result*/
-    pDst[0] = (pSrc[0] + pSrc[N])*0.5;
-    pDst[1] = (pSrc[0] - pSrc[N])*0.5;
-
-    pOut1=pDst+2;
-    pOut2=pDst+N-8;
-    pIn1 = pSrc+2;
-    pIn2 = pSrc +N -8;
-
-    Cnt = ((N>>2)-1)>>2;
-    vhalfq_f32 = vdupq_n_f32(0.5);
-
-    /*for k=1 to N/4-1 and k=N/4+1 to K=N/2-1*/
-    for(k=0;k<Cnt;k++)
-    {
-        /*b[4] b[3] b[2] b[1]   a[4] a[3] a[2] a[1]*/
-        vin1q2_f32 = vld2q_f32(pIn1);
-        pIn1+=8;
-        /*b[N/2-1] b[N/2-2] b[N/2-3] b[N/2-4]   a[N/2-1] a[N/2-2] a[N/2-3] a[N/2-4]*/
-        vin2q2_f32 = vld2q_f32(pIn2);
-        pIn2-=8;
-        /* a[N/2-2] a[N/2-1] a[N/2-4] a[N/2-3]*/
-        vtmp1q_f32 =vrev64q_f32(vin2q2_f32.val[0]);
-        /* b[N/2-2] b[N/2-1] b[N/2-4] b[N/2-3]*/
-        vtmp2q_f32 =vrev64q_f32(vin2q2_f32.val[1]);
-        /*a[N/2-4] a[N/2-3] a[N/2-2] a[N/2-1]*/
-        vtmp1q_f32 = vcombine_f32(vget_high_f32(vtmp1q_f32),vget_low_f32(vtmp1q_f32));
-        /* b[N/2-4] b[N/2-3] b[N/2-2] b[N/2-1]*/
-        vtmp2q_f32 = vcombine_f32(vget_high_f32(vtmp2q_f32),vget_low_f32(vtmp2q_f32));
-
-        /*uRe = (a[k]+a[N/2-k])/2*/
-        vureq_f32 = vaddq_f32(vin1q2_f32.val[0],vtmp1q_f32);
-        /*uIm = (b[k]-b[N/2-k])/2*/
-        vuimq_f32 = vsubq_f32(vin1q2_f32.val[1],vtmp2q_f32);
-
-        /*VRe = (a[k]-a[N/2-k])/2*/
-        vvreq_f32 = vsubq_f32(vin1q2_f32.val[0],vtmp1q_f32);
-        /*Vim = (b[k]+b[N/2-k])/2*/
-        vvimq_f32 = vaddq_f32(vin1q2_f32.val[1],vtmp2q_f32);
-
-        vureq_f32 = vmulq_f32(vureq_f32,vhalfq_f32);
-        vuimq_f32 = vmulq_f32(vuimq_f32,vhalfq_f32);
-        vvreq_f32 = vmulq_f32(vvreq_f32,vhalfq_f32);
-        vvimq_f32 = vmulq_f32(vvimq_f32,vhalfq_f32);
-
-        vretwdq_f32 = vld1q_f32(pCoefRe);
-        vimtwdq_f32 = vld1q_f32(pCoefIm);
-        pCoefRe+=4;
-        pCoefIm+=4;
-
-        /*reTmp = vRe*reTwd - vIm*imTwd */
-        vtmp1q_f32 = vmulq_f32(vvreq_f32,vretwdq_f32);
-        vtmp1q_f32 = vmlsq_f32(vtmp1q_f32,vvimq_f32,vimtwdq_f32);
-        /*imTmp = vIm*reTwd + vRe*imTwd */
-        vtmp2q_f32 = vmulq_f32(vvimq_f32,vretwdq_f32);
-        vtmp2q_f32 = vmlaq_f32(vtmp2q_f32,vvreq_f32,vimtwdq_f32);
-        //pDst[2*k] = uRe - imTmp;
-        //pDst[2*k+1] = uIm + reTmp;
-        vin1q2_f32.val[0] = vsubq_f32(vureq_f32,vtmp2q_f32);
-        vin1q2_f32.val[1] = vaddq_f32(vuimq_f32,vtmp1q_f32);
-
-        //pDst[2*(NBy2-k)] = uRe+imTmp;
-        //pDst[2*(NBy2-k)+1] = reTmp-uIm;
-        vtmp2q_f32 = vaddq_f32(vtmp2q_f32,vureq_f32);
-        vtmp1q_f32 = vsubq_f32(vtmp1q_f32,vuimq_f32);
-        vtmp2q_f32 = vrev64q_f32(vtmp2q_f32);
-        vtmp1q_f32 = vrev64q_f32(vtmp1q_f32);
-
-        vin2q2_f32.val[1] = vcombine_f32(vget_high_f32(vtmp1q_f32),vget_low_f32(vtmp1q_f32));
-        vin2q2_f32.val[0] = vcombine_f32(vget_high_f32(vtmp2q_f32),vget_low_f32(vtmp2q_f32));
-
-
-        vst2q_f32(pOut1,vin1q2_f32);
-        vst2q_f32(pOut2,vin2q2_f32);
-        pOut1+=8;
-        pOut2-=8;
-    }
-    /*b[4] b[3] b[2] b[1]   a[4] a[3] a[2] a[1]*/
-    vin1q2_f32 = vld2q_f32(pIn1);
-    pIn1+=8;
-    /*b[N/2-1] b[N/2-2] b[N/2-3] b[N/2-4]   a[N/2-1] a[N/2-2] a[N/2-3] a[N/2-4]*/
-    vin2q2_f32 = vld2q_f32(pIn2);
-    pIn2-=8;
-    /* a[N/2-2] a[N/2-1] a[N/2-4] a[N/2-3]*/
-    vtmp1q_f32 =vrev64q_f32(vin2q2_f32.val[0]);
-    /* b[N/2-2] b[N/2-1] b[N/2-4] b[N/2-3]*/
-    vtmp2q_f32 =vrev64q_f32(vin2q2_f32.val[1]);
-    /*a[N/2-4] a[N/2-3] a[N/2-2] a[N/2-1]*/
-    vtmp1q_f32 = vcombine_f32(vget_high_f32(vtmp1q_f32),vget_low_f32(vtmp1q_f32));
-    /* b[N/2-4] b[N/2-3] b[N/2-2] b[N/2-1]*/
-    vtmp2q_f32 = vcombine_f32(vget_high_f32(vtmp2q_f32),vget_low_f32(vtmp2q_f32));
-
-    /*uRe = (a[k]+a[N/2-k])/2*/
-    vureq_f32 = vaddq_f32(vin1q2_f32.val[0],vtmp1q_f32);
-    /*uIm = (b[k]-b[N/2-k])/2*/
-    vuimq_f32 = vsubq_f32(vin1q2_f32.val[1],vtmp2q_f32);
-
-    /*VRe = (a[k]-a[N/2-k])/2*/
-    vvreq_f32 = vsubq_f32(vin1q2_f32.val[0],vtmp1q_f32);
-    /*Vim = (b[k]+b[N/2-k])/2*/
-    vvimq_f32 = vaddq_f32(vin1q2_f32.val[1],vtmp2q_f32);
-
-    vureq_f32 = vmulq_f32(vureq_f32,vhalfq_f32);
-    vuimq_f32 = vmulq_f32(vuimq_f32,vhalfq_f32);
-    vvreq_f32 = vmulq_f32(vvreq_f32,vhalfq_f32);
-    vvimq_f32 = vmulq_f32(vvimq_f32,vhalfq_f32);
-
-    vretwdq_f32 = vld1q_f32(pCoefRe);
-    vimtwdq_f32 = vld1q_f32(pCoefIm);
-    pCoefRe+=4;
-    pCoefIm+=4;
-
-    /*reTmp = vRe*reTwd - vIm*imTwd */
-    vtmp1q_f32 = vmulq_f32(vvreq_f32,vretwdq_f32);
-    vtmp1q_f32 = vmlsq_f32(vtmp1q_f32,vvimq_f32,vimtwdq_f32);
-    /*imTmp = vIm*reTwd + vRe*imTwd */
-    vtmp2q_f32 = vmulq_f32(vvimq_f32,vretwdq_f32);
-    vtmp2q_f32 = vmlaq_f32(vtmp2q_f32,vvreq_f32,vimtwdq_f32);
-    //pDst[2*k] = uRe - imTmp;
-    //pDst[2*k+1] = uIm + reTmp;
-    vin1q2_f32.val[0] = vsubq_f32(vureq_f32,vtmp2q_f32);
-    vin1q2_f32.val[1] = vaddq_f32(vuimq_f32,vtmp1q_f32);
-
-    vtmpq2_f32 = vld2q_f32(pOut1);
-    vin1q2_f32.val[0] = vbslq_f32(vmaskq_u32,vin1q2_f32.val[0],vtmpq2_f32.val[0]);
-    vin1q2_f32.val[1] = vbslq_f32(vmaskq_u32,vin1q2_f32.val[1],vtmpq2_f32.val[1]);
-
-    //pDst[2*(NBy2-k)] = uRe+imTmp;
-    //pDst[2*(NBy2-k)+1] = reTmp-uIm;
-    vtmp2q_f32 = vaddq_f32(vtmp2q_f32,vureq_f32);
-    vtmp1q_f32 = vsubq_f32(vtmp1q_f32,vuimq_f32);
-    vtmp2q_f32 = vrev64q_f32(vtmp2q_f32);
-    vtmp1q_f32 = vrev64q_f32(vtmp1q_f32);
-
-    vin2q2_f32.val[1] = vcombine_f32(vget_high_f32(vtmp1q_f32),vget_low_f32(vtmp1q_f32));
-    vin2q2_f32.val[0] = vcombine_f32(vget_high_f32(vtmp2q_f32),vget_low_f32(vtmp2q_f32));
-
-    vtmpq2_f32 = vld2q_f32(pOut2);
-    vin2q2_f32.val[0] = vbslq_f32(vmask1q_u32,vtmpq2_f32.val[0],vin2q2_f32.val[0]);
-    vin2q2_f32.val[1] = vbslq_f32(vmask1q_u32,vtmpq2_f32.val[1],vin2q2_f32.val[1]);
-
-    vst2q_f32(pOut1,vin1q2_f32);
-    vst2q_f32(pOut2,vin2q2_f32);
-    pOut1+=6;
-    pOut2-=6;
-    /*y[N/4] = a[N/4]-jb[N/4]*/
-    pDst[NBy2] = pSrc[NBy2];
-    pDst[NBy2+1] = -pSrc[NBy2+1];
-
-}
-
-/**
- * @addtogroup RFFT_RIFFT
- * @{
- */
-
-/**
- * @brief  Real FFT process
- * @param[in]  *S is an instance for the structure
- * @param[in]  *pSrc point to the input buffer (out-of-place: it's also a tmp buffer, so the input buffer is destroyed)
- * @param[out]  *pDst point to the output buffer (out-of-place)
- * @param[in]  *pTemp point to the temp buffer (used for intermedia buffer)
- * @return none.
- * The function implements a Real FFT/ Real IFFT depending
- * on the direction flag
- * Can support FFT lengths of 128, 512, 2048
- *
- */
-void ne10_rfft_float_neon(
-                     const ne10_rfft_instance_f32_t * S,
-                     ne10_float32_t * pSrc,
-                     ne10_float32_t * pDst,
-                     ne10_float32_t * pTemp)
-{
-    const ne10_cfft_radix4_instance_f32_t *S_CFFT = S->p_cfft;
-
-    /* Caluclation of Real IFFT of input */
-    if(S->ifft_flag_r == 1u)
-    {
-        /*  Real IFFT core process */
-        ne10_split_rifft_float_neon(pSrc, S->fft_len_real, S->p_twiddle_A_real,
-                S->p_twiddle_B_real, pTemp);
-        /* Complex radix-4 IFFT process */
-        ne10_radix4_butterfly_inverse_float_neon(pDst, pTemp, S_CFFT->fft_len, S_CFFT->p_twiddle, S_CFFT->one_by_fft_len);
-    }
-    else
-    {
-        /* Complex radix-4 FFT process */
-        ne10_radix4_butterfly_float_neon(pTemp, pSrc, S_CFFT->fft_len, S_CFFT->p_twiddle);
-        /*  Real FFT core process */
-        ne10_split_rfft_float_neon(pTemp, S->fft_len_real, S->p_twiddle_A_real,
-                S->p_twiddle_B_real, pDst);
-    }
-
-}
-/**
- * @} end of RFFT_RIFFT group
- */
-
diff --git a/modules/dsp/NE10_rfft_init.c b/modules/dsp/NE10_rfft_init.c
deleted file mode 100644 (file)
index b028129..0000000
+++ /dev/null
@@ -1,1180 +0,0 @@
-/*
- *  Copyright 2012-14 ARM Limited
- *  All rights reserved.
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions are met:
- *    * Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *    * Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *    * Neither the name of ARM Limited nor the
- *      names of its contributors may be used to endorse or promote products
- *      derived from this software without specific prior written permission.
- *
- *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
- *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
- *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "NE10_types.h"
-#include "NE10_dsp.h"
-
-/*
-* @brief  Twiddle factors Table
-*/
-
-/** Pseudo code for Twiddle factor Tables Generation:
-
-for i=1 to N    (table index is zero-based)
-    rfft_twiddle_coef(2*(i-1)) = cos((i-1) * 2*PI/(float)N)
-    rfft_twiddle_coef(2*(i-1) + 1) = sin((i-1) * 2*PI/(float)N)
-end
-
-where N = 1024    and PI = 3.14159265358979
-
-N is the maximum FFT Size supported and
-Cos and Sin values are stored in interleaved fashion
-*/
-
-static const ne10_float32_t rfft_twiddle_coef[2048] = {
-  1.000000000000000000f, 0.000000000000000000f,
-  0.999981175282601110f, 0.006135884649154475f,
-  0.999924701839144500f, 0.012271538285719925f,
-  0.999830581795823400f, 0.018406729905804820f,
-  0.999698818696204250f, 0.024541228522912288f,
-  0.999529417501093140f, 0.030674803176636626f,
-  0.999322384588349540f, 0.036807222941358832f,
-  0.999077727752645360f, 0.042938256934940820f,
-  0.998795456205172410f, 0.049067674327418015f,
-  0.998475580573294770f, 0.055195244349689934f,
-  0.998118112900149180f, 0.061320736302208578f,
-  0.997723066644191640f, 0.067443919563664051f,
-  0.997290456678690210f, 0.073564563599667426f,
-  0.996820299291165670f, 0.079682437971430126f,
-  0.996312612182778000f, 0.085797312344439894f,
-  0.995767414467659820f, 0.091908956497132724f,
-  0.995184726672196930f, 0.098017140329560604f,
-  0.994564570734255420f, 0.104121633872054590f,
-  0.993906970002356060f, 0.110222207293883060f,
-  0.993211949234794500f, 0.116318630911904750f,
-  0.992479534598709970f, 0.122410675199216200f,
-  0.991709753669099530f, 0.128498110793793170f,
-  0.990902635427780010f, 0.134580708507126170f,
-  0.990058210262297120f, 0.140658239332849210f,
-  0.989176509964781010f, 0.146730474455361750f,
-  0.988257567730749460f, 0.152797185258443440f,
-  0.987301418157858430f, 0.158858143333861450f,
-  0.986308097244598670f, 0.164913120489969890f,
-  0.985277642388941220f, 0.170961888760301220f,
-  0.984210092386929030f, 0.177004220412148750f,
-  0.983105487431216290f, 0.183039887955140950f,
-  0.981963869109555240f, 0.189068664149806190f,
-  0.980785280403230430f, 0.195090322016128250f,
-  0.979569765685440520f, 0.201104634842091900f,
-  0.978317370719627650f, 0.207111376192218560f,
-  0.977028142657754390f, 0.213110319916091360f,
-  0.975702130038528570f, 0.219101240156869800f,
-  0.974339382785575860f, 0.225083911359792830f,
-  0.972939952205560180f, 0.231058108280671110f,
-  0.971503890986251780f, 0.237023605994367200f,
-  0.970031253194543970f, 0.242980179903263870f,
-  0.968522094274417380f, 0.248927605745720150f,
-  0.966976471044852070f, 0.254865659604514570f,
-  0.965394441697689400f, 0.260794117915275510f,
-  0.963776065795439840f, 0.266712757474898370f,
-  0.962121404269041580f, 0.272621355449948980f,
-  0.960430519415565790f, 0.278519689385053060f,
-  0.958703474895871600f, 0.284407537211271880f,
-  0.956940335732208820f, 0.290284677254462330f,
-  0.955141168305770780f, 0.296150888243623790f,
-  0.953306040354193860f, 0.302005949319228080f,
-  0.951435020969008340f, 0.307849640041534870f,
-  0.949528180593036670f, 0.313681740398891520f,
-  0.947585591017741090f, 0.319502030816015690f,
-  0.945607325380521280f, 0.325310292162262930f,
-  0.943593458161960390f, 0.331106305759876430f,
-  0.941544065183020810f, 0.336889853392220050f,
-  0.939459223602189920f, 0.342660717311994380f,
-  0.937339011912574960f, 0.348418680249434560f,
-  0.935183509938947610f, 0.354163525420490340f,
-  0.932992798834738960f, 0.359895036534988110f,
-  0.930766961078983710f, 0.365612997804773850f,
-  0.928506080473215590f, 0.371317193951837540f,
-  0.926210242138311380f, 0.377007410216418260f,
-  0.923879532511286740f, 0.382683432365089780f,
-  0.921514039342042010f, 0.388345046698826250f,
-  0.919113851690057770f, 0.393992040061048100f,
-  0.916679059921042700f, 0.399624199845646790f,
-  0.914209755703530690f, 0.405241314004989860f,
-  0.911706032005429880f, 0.410843171057903910f,
-  0.909167983090522380f, 0.416429560097637150f,
-  0.906595704514915330f, 0.422000270799799680f,
-  0.903989293123443340f, 0.427555093430282080f,
-  0.901348847046022030f, 0.433093818853151960f,
-  0.898674465693953820f, 0.438616238538527660f,
-  0.895966249756185220f, 0.444122144570429200f,
-  0.893224301195515320f, 0.449611329654606540f,
-  0.890448723244757880f, 0.455083587126343840f,
-  0.887639620402853930f, 0.460538710958240010f,
-  0.884797098430937790f, 0.465976495767966180f,
-  0.881921264348355050f, 0.471396736825997640f,
-  0.879012226428633530f, 0.476799230063322090f,
-  0.876070094195406600f, 0.482183772079122720f,
-  0.873094978418290090f, 0.487550160148436000f,
-  0.870086991108711460f, 0.492898192229784040f,
-  0.867046245515692650f, 0.498227666972781870f,
-  0.863972856121586810f, 0.503538383725717580f,
-  0.860866938637767310f, 0.508830142543106990f,
-  0.857728610000272120f, 0.514102744193221660f,
-  0.854557988365400530f, 0.519355990165589640f,
-  0.851355193105265200f, 0.524589682678468950f,
-  0.848120344803297230f, 0.529803624686294610f,
-  0.844853565249707120f, 0.534997619887097150f,
-  0.841554977436898440f, 0.540171472729892850f,
-  0.838224705554838080f, 0.545324988422046460f,
-  0.834862874986380010f, 0.550457972936604810f,
-  0.831469612302545240f, 0.555570233019602180f,
-  0.828045045257755800f, 0.560661576197336030f,
-  0.824589302785025290f, 0.565731810783613120f,
-  0.821102514991104650f, 0.570780745886967260f,
-  0.817584813151583710f, 0.575808191417845340f,
-  0.814036329705948410f, 0.580813958095764530f,
-  0.810457198252594770f, 0.585797857456438860f,
-  0.806847553543799330f, 0.590759701858874160f,
-  0.803207531480644940f, 0.595699304492433360f,
-  0.799537269107905010f, 0.600616479383868970f,
-  0.795836904608883570f, 0.605511041404325550f,
-  0.792106577300212390f, 0.610382806276309480f,
-  0.788346427626606340f, 0.615231590580626820f,
-  0.784556597155575240f, 0.620057211763289100f,
-  0.780737228572094490f, 0.624859488142386340f,
-  0.776888465673232440f, 0.629638238914926980f,
-  0.773010453362736990f, 0.634393284163645490f,
-  0.769103337645579700f, 0.639124444863775730f,
-  0.765167265622458960f, 0.643831542889791390f,
-  0.761202385484261780f, 0.648514401022112440f,
-  0.757208846506484570f, 0.653172842953776760f,
-  0.753186799043612520f, 0.657806693297078640f,
-  0.749136394523459370f, 0.662415777590171780f,
-  0.745057785441466060f, 0.666999922303637470f,
-  0.740951125354959110f, 0.671558954847018330f,
-  0.736816568877369900f, 0.676092703575315920f,
-  0.732654271672412820f, 0.680600997795453020f,
-  0.728464390448225200f, 0.685083667772700360f,
-  0.724247082951467000f, 0.689540544737066830f,
-  0.720002507961381650f, 0.693971460889654000f,
-  0.715730825283818590f, 0.698376249408972920f,
-  0.711432195745216430f, 0.702754744457225300f,
-  0.707106781186547570f, 0.707106781186547460f,
-  0.702754744457225300f, 0.711432195745216430f,
-  0.698376249408972920f, 0.715730825283818590f,
-  0.693971460889654000f, 0.720002507961381650f,
-  0.689540544737066940f, 0.724247082951466890f,
-  0.685083667772700360f, 0.728464390448225200f,
-  0.680600997795453130f, 0.732654271672412820f,
-  0.676092703575316030f, 0.736816568877369790f,
-  0.671558954847018330f, 0.740951125354959110f,
-  0.666999922303637470f, 0.745057785441465950f,
-  0.662415777590171780f, 0.749136394523459260f,
-  0.657806693297078640f, 0.753186799043612410f,
-  0.653172842953776760f, 0.757208846506484460f,
-  0.648514401022112550f, 0.761202385484261780f,
-  0.643831542889791500f, 0.765167265622458960f,
-  0.639124444863775730f, 0.769103337645579590f,
-  0.634393284163645490f, 0.773010453362736990f,
-  0.629638238914927100f, 0.776888465673232440f,
-  0.624859488142386450f, 0.780737228572094380f,
-  0.620057211763289210f, 0.784556597155575240f,
-  0.615231590580626820f, 0.788346427626606230f,
-  0.610382806276309480f, 0.792106577300212390f,
-  0.605511041404325550f, 0.795836904608883460f,
-  0.600616479383868970f, 0.799537269107905010f,
-  0.595699304492433470f, 0.803207531480644830f,
-  0.590759701858874280f, 0.806847553543799220f,
-  0.585797857456438860f, 0.810457198252594770f,
-  0.580813958095764530f, 0.814036329705948300f,
-  0.575808191417845340f, 0.817584813151583710f,
-  0.570780745886967370f, 0.821102514991104650f,
-  0.565731810783613230f, 0.824589302785025290f,
-  0.560661576197336030f, 0.828045045257755800f,
-  0.555570233019602290f, 0.831469612302545240f,
-  0.550457972936604810f, 0.834862874986380010f,
-  0.545324988422046460f, 0.838224705554837970f,
-  0.540171472729892970f, 0.841554977436898330f,
-  0.534997619887097260f, 0.844853565249707010f,
-  0.529803624686294830f, 0.848120344803297120f,
-  0.524589682678468840f, 0.851355193105265200f,
-  0.519355990165589530f, 0.854557988365400530f,
-  0.514102744193221660f, 0.857728610000272120f,
-  0.508830142543106990f, 0.860866938637767310f,
-  0.503538383725717580f, 0.863972856121586700f,
-  0.498227666972781870f, 0.867046245515692650f,
-  0.492898192229784090f, 0.870086991108711350f,
-  0.487550160148436050f, 0.873094978418290090f,
-  0.482183772079122830f, 0.876070094195406600f,
-  0.476799230063322250f, 0.879012226428633410f,
-  0.471396736825997810f, 0.881921264348354940f,
-  0.465976495767966130f, 0.884797098430937790f,
-  0.460538710958240010f, 0.887639620402853930f,
-  0.455083587126343840f, 0.890448723244757880f,
-  0.449611329654606600f, 0.893224301195515320f,
-  0.444122144570429260f, 0.895966249756185110f,
-  0.438616238538527710f, 0.898674465693953820f,
-  0.433093818853152010f, 0.901348847046022030f,
-  0.427555093430282200f, 0.903989293123443340f,
-  0.422000270799799790f, 0.906595704514915330f,
-  0.416429560097637320f, 0.909167983090522270f,
-  0.410843171057903910f, 0.911706032005429880f,
-  0.405241314004989860f, 0.914209755703530690f,
-  0.399624199845646790f, 0.916679059921042700f,
-  0.393992040061048100f, 0.919113851690057770f,
-  0.388345046698826300f, 0.921514039342041900f,
-  0.382683432365089840f, 0.923879532511286740f,
-  0.377007410216418310f, 0.926210242138311270f,
-  0.371317193951837600f, 0.928506080473215480f,
-  0.365612997804773960f, 0.930766961078983710f,
-  0.359895036534988280f, 0.932992798834738850f,
-  0.354163525420490510f, 0.935183509938947500f,
-  0.348418680249434510f, 0.937339011912574960f,
-  0.342660717311994380f, 0.939459223602189920f,
-  0.336889853392220050f, 0.941544065183020810f,
-  0.331106305759876430f, 0.943593458161960390f,
-  0.325310292162262980f, 0.945607325380521280f,
-  0.319502030816015750f, 0.947585591017741090f,
-  0.313681740398891570f, 0.949528180593036670f,
-  0.307849640041534980f, 0.951435020969008340f,
-  0.302005949319228200f, 0.953306040354193750f,
-  0.296150888243623960f, 0.955141168305770670f,
-  0.290284677254462330f, 0.956940335732208940f,
-  0.284407537211271820f, 0.958703474895871600f,
-  0.278519689385053060f, 0.960430519415565790f,
-  0.272621355449948980f, 0.962121404269041580f,
-  0.266712757474898420f, 0.963776065795439840f,
-  0.260794117915275570f, 0.965394441697689400f,
-  0.254865659604514630f, 0.966976471044852070f,
-  0.248927605745720260f, 0.968522094274417270f,
-  0.242980179903263980f, 0.970031253194543970f,
-  0.237023605994367340f, 0.971503890986251780f,
-  0.231058108280671280f, 0.972939952205560070f,
-  0.225083911359792780f, 0.974339382785575860f,
-  0.219101240156869770f, 0.975702130038528570f,
-  0.213110319916091360f, 0.977028142657754390f,
-  0.207111376192218560f, 0.978317370719627650f,
-  0.201104634842091960f, 0.979569765685440520f,
-  0.195090322016128330f, 0.980785280403230430f,
-  0.189068664149806280f, 0.981963869109555240f,
-  0.183039887955141060f, 0.983105487431216290f,
-  0.177004220412148860f, 0.984210092386929030f,
-  0.170961888760301360f, 0.985277642388941220f,
-  0.164913120489970090f, 0.986308097244598670f,
-  0.158858143333861390f, 0.987301418157858430f,
-  0.152797185258443410f, 0.988257567730749460f,
-  0.146730474455361750f, 0.989176509964781010f,
-  0.140658239332849240f, 0.990058210262297120f,
-  0.134580708507126220f, 0.990902635427780010f,
-  0.128498110793793220f, 0.991709753669099530f,
-  0.122410675199216280f, 0.992479534598709970f,
-  0.116318630911904880f, 0.993211949234794500f,
-  0.110222207293883180f, 0.993906970002356060f,
-  0.104121633872054730f, 0.994564570734255420f,
-  0.098017140329560770f, 0.995184726672196820f,
-  0.091908956497132696f, 0.995767414467659820f,
-  0.085797312344439880f, 0.996312612182778000f,
-  0.079682437971430126f, 0.996820299291165670f,
-  0.073564563599667454f, 0.997290456678690210f,
-  0.067443919563664106f, 0.997723066644191640f,
-  0.061320736302208648f, 0.998118112900149180f,
-  0.055195244349690031f, 0.998475580573294770f,
-  0.049067674327418126f, 0.998795456205172410f,
-  0.042938256934940959f, 0.999077727752645360f,
-  0.036807222941358991f, 0.999322384588349540f,
-  0.030674803176636581f, 0.999529417501093140f,
-  0.024541228522912264f, 0.999698818696204250f,
-  0.018406729905804820f, 0.999830581795823400f,
-  0.012271538285719944f, 0.999924701839144500f,
-  0.006135884649154515f, 0.999981175282601110f,
-  0.000000000000000061f, 1.000000000000000000f,
-  -0.006135884649154393f, 0.999981175282601110f,
-  -0.012271538285719823f, 0.999924701839144500f,
-  -0.018406729905804695f, 0.999830581795823400f,
-  -0.024541228522912142f, 0.999698818696204250f,
-  -0.030674803176636459f, 0.999529417501093140f,
-  -0.036807222941358866f, 0.999322384588349540f,
-  -0.042938256934940834f, 0.999077727752645360f,
-  -0.049067674327418008f, 0.998795456205172410f,
-  -0.055195244349689913f, 0.998475580573294770f,
-  -0.061320736302208530f, 0.998118112900149180f,
-  -0.067443919563663982f, 0.997723066644191640f,
-  -0.073564563599667329f, 0.997290456678690210f,
-  -0.079682437971430015f, 0.996820299291165780f,
-  -0.085797312344439755f, 0.996312612182778000f,
-  -0.091908956497132571f, 0.995767414467659820f,
-  -0.098017140329560645f, 0.995184726672196930f,
-  -0.104121633872054600f, 0.994564570734255420f,
-  -0.110222207293883060f, 0.993906970002356060f,
-  -0.116318630911904750f, 0.993211949234794500f,
-  -0.122410675199216150f, 0.992479534598709970f,
-  -0.128498110793793110f, 0.991709753669099530f,
-  -0.134580708507126110f, 0.990902635427780010f,
-  -0.140658239332849130f, 0.990058210262297120f,
-  -0.146730474455361640f, 0.989176509964781010f,
-  -0.152797185258443300f, 0.988257567730749460f,
-  -0.158858143333861280f, 0.987301418157858430f,
-  -0.164913120489969950f, 0.986308097244598670f,
-  -0.170961888760301240f, 0.985277642388941220f,
-  -0.177004220412148750f, 0.984210092386929030f,
-  -0.183039887955140920f, 0.983105487431216290f,
-  -0.189068664149806160f, 0.981963869109555240f,
-  -0.195090322016128190f, 0.980785280403230430f,
-  -0.201104634842091820f, 0.979569765685440520f,
-  -0.207111376192218450f, 0.978317370719627650f,
-  -0.213110319916091250f, 0.977028142657754390f,
-  -0.219101240156869660f, 0.975702130038528570f,
-  -0.225083911359792670f, 0.974339382785575860f,
-  -0.231058108280671140f, 0.972939952205560180f,
-  -0.237023605994367230f, 0.971503890986251780f,
-  -0.242980179903263870f, 0.970031253194543970f,
-  -0.248927605745720120f, 0.968522094274417380f,
-  -0.254865659604514520f, 0.966976471044852070f,
-  -0.260794117915275460f, 0.965394441697689400f,
-  -0.266712757474898310f, 0.963776065795439840f,
-  -0.272621355449948870f, 0.962121404269041580f,
-  -0.278519689385052950f, 0.960430519415565900f,
-  -0.284407537211271710f, 0.958703474895871600f,
-  -0.290284677254462160f, 0.956940335732208940f,
-  -0.296150888243623840f, 0.955141168305770670f,
-  -0.302005949319228080f, 0.953306040354193860f,
-  -0.307849640041534870f, 0.951435020969008340f,
-  -0.313681740398891410f, 0.949528180593036670f,
-  -0.319502030816015640f, 0.947585591017741200f,
-  -0.325310292162262870f, 0.945607325380521390f,
-  -0.331106305759876320f, 0.943593458161960390f,
-  -0.336889853392219940f, 0.941544065183020810f,
-  -0.342660717311994270f, 0.939459223602189920f,
-  -0.348418680249434400f, 0.937339011912574960f,
-  -0.354163525420490400f, 0.935183509938947610f,
-  -0.359895036534988170f, 0.932992798834738850f,
-  -0.365612997804773850f, 0.930766961078983710f,
-  -0.371317193951837490f, 0.928506080473215590f,
-  -0.377007410216418200f, 0.926210242138311380f,
-  -0.382683432365089730f, 0.923879532511286740f,
-  -0.388345046698826190f, 0.921514039342042010f,
-  -0.393992040061047990f, 0.919113851690057770f,
-  -0.399624199845646680f, 0.916679059921042700f,
-  -0.405241314004989750f, 0.914209755703530690f,
-  -0.410843171057903800f, 0.911706032005429880f,
-  -0.416429560097636990f, 0.909167983090522490f,
-  -0.422000270799799680f, 0.906595704514915330f,
-  -0.427555093430281860f, 0.903989293123443450f,
-  -0.433093818853151900f, 0.901348847046022030f,
-  -0.438616238538527380f, 0.898674465693953930f,
-  -0.444122144570429140f, 0.895966249756185220f,
-  -0.449611329654606710f, 0.893224301195515210f,
-  -0.455083587126343720f, 0.890448723244757990f,
-  -0.460538710958240060f, 0.887639620402853930f,
-  -0.465976495767966010f, 0.884797098430937900f,
-  -0.471396736825997700f, 0.881921264348355050f,
-  -0.476799230063321920f, 0.879012226428633530f,
-  -0.482183772079122720f, 0.876070094195406600f,
-  -0.487550160148435720f, 0.873094978418290200f,
-  -0.492898192229783980f, 0.870086991108711460f,
-  -0.498227666972781590f, 0.867046245515692760f,
-  -0.503538383725717460f, 0.863972856121586810f,
-  -0.508830142543107100f, 0.860866938637767200f,
-  -0.514102744193221660f, 0.857728610000272120f,
-  -0.519355990165589640f, 0.854557988365400530f,
-  -0.524589682678468730f, 0.851355193105265200f,
-  -0.529803624686294720f, 0.848120344803297230f,
-  -0.534997619887097040f, 0.844853565249707230f,
-  -0.540171472729892850f, 0.841554977436898440f,
-  -0.545324988422046240f, 0.838224705554838190f,
-  -0.550457972936604700f, 0.834862874986380120f,
-  -0.555570233019601960f, 0.831469612302545460f,
-  -0.560661576197335920f, 0.828045045257755800f,
-  -0.565731810783613230f, 0.824589302785025180f,
-  -0.570780745886967140f, 0.821102514991104760f,
-  -0.575808191417845340f, 0.817584813151583710f,
-  -0.580813958095764420f, 0.814036329705948520f,
-  -0.585797857456438860f, 0.810457198252594770f,
-  -0.590759701858874050f, 0.806847553543799450f,
-  -0.595699304492433360f, 0.803207531480644940f,
-  -0.600616479383868750f, 0.799537269107905240f,
-  -0.605511041404325430f, 0.795836904608883570f,
-  -0.610382806276309590f, 0.792106577300212280f,
-  -0.615231590580626710f, 0.788346427626606340f,
-  -0.620057211763289210f, 0.784556597155575130f,
-  -0.624859488142386230f, 0.780737228572094600f,
-  -0.629638238914927100f, 0.776888465673232440f,
-  -0.634393284163645380f, 0.773010453362737100f,
-  -0.639124444863775730f, 0.769103337645579590f,
-  -0.643831542889791280f, 0.765167265622459070f,
-  -0.648514401022112440f, 0.761202385484261890f,
-  -0.653172842953776530f, 0.757208846506484680f,
-  -0.657806693297078640f, 0.753186799043612520f,
-  -0.662415777590171890f, 0.749136394523459260f,
-  -0.666999922303637360f, 0.745057785441466060f,
-  -0.671558954847018440f, 0.740951125354958990f,
-  -0.676092703575315810f, 0.736816568877370020f,
-  -0.680600997795453020f, 0.732654271672412820f,
-  -0.685083667772700240f, 0.728464390448225310f,
-  -0.689540544737066940f, 0.724247082951466890f,
-  -0.693971460889653780f, 0.720002507961381770f,
-  -0.698376249408972800f, 0.715730825283818710f,
-  -0.702754744457225080f, 0.711432195745216660f,
-  -0.707106781186547460f, 0.707106781186547570f,
-  -0.711432195745216540f, 0.702754744457225190f,
-  -0.715730825283818590f, 0.698376249408972920f,
-  -0.720002507961381650f, 0.693971460889654000f,
-  -0.724247082951466780f, 0.689540544737067050f,
-  -0.728464390448225200f, 0.685083667772700360f,
-  -0.732654271672412700f, 0.680600997795453240f,
-  -0.736816568877369900f, 0.676092703575315920f,
-  -0.740951125354958880f, 0.671558954847018550f,
-  -0.745057785441465950f, 0.666999922303637580f,
-  -0.749136394523459150f, 0.662415777590172010f,
-  -0.753186799043612410f, 0.657806693297078750f,
-  -0.757208846506484570f, 0.653172842953776640f,
-  -0.761202385484261670f, 0.648514401022112550f,
-  -0.765167265622458960f, 0.643831542889791390f,
-  -0.769103337645579480f, 0.639124444863775840f,
-  -0.773010453362736990f, 0.634393284163645490f,
-  -0.776888465673232330f, 0.629638238914927210f,
-  -0.780737228572094490f, 0.624859488142386340f,
-  -0.784556597155575020f, 0.620057211763289430f,
-  -0.788346427626606230f, 0.615231590580626930f,
-  -0.792106577300212170f, 0.610382806276309700f,
-  -0.795836904608883460f, 0.605511041404325660f,
-  -0.799537269107905120f, 0.600616479383868860f,
-  -0.803207531480644830f, 0.595699304492433470f,
-  -0.806847553543799330f, 0.590759701858874160f,
-  -0.810457198252594660f, 0.585797857456438980f,
-  -0.814036329705948410f, 0.580813958095764530f,
-  -0.817584813151583600f, 0.575808191417845450f,
-  -0.821102514991104650f, 0.570780745886967260f,
-  -0.824589302785025070f, 0.565731810783613450f,
-  -0.828045045257755690f, 0.560661576197336140f,
-  -0.831469612302545350f, 0.555570233019602180f,
-  -0.834862874986380010f, 0.550457972936604920f,
-  -0.838224705554838080f, 0.545324988422046350f,
-  -0.841554977436898330f, 0.540171472729892970f,
-  -0.844853565249707120f, 0.534997619887097150f,
-  -0.848120344803297120f, 0.529803624686294830f,
-  -0.851355193105265200f, 0.524589682678468950f,
-  -0.854557988365400420f, 0.519355990165589750f,
-  -0.857728610000272010f, 0.514102744193221770f,
-  -0.860866938637767090f, 0.508830142543107320f,
-  -0.863972856121586700f, 0.503538383725717690f,
-  -0.867046245515692760f, 0.498227666972781760f,
-  -0.870086991108711350f, 0.492898192229784150f,
-  -0.873094978418290090f, 0.487550160148435880f,
-  -0.876070094195406490f, 0.482183772079122890f,
-  -0.879012226428633530f, 0.476799230063322090f,
-  -0.881921264348354940f, 0.471396736825997860f,
-  -0.884797098430937790f, 0.465976495767966180f,
-  -0.887639620402853820f, 0.460538710958240230f,
-  -0.890448723244757880f, 0.455083587126343890f,
-  -0.893224301195515210f, 0.449611329654606870f,
-  -0.895966249756185110f, 0.444122144570429310f,
-  -0.898674465693953930f, 0.438616238538527550f,
-  -0.901348847046021920f, 0.433093818853152070f,
-  -0.903989293123443340f, 0.427555093430282030f,
-  -0.906595704514915330f, 0.422000270799799850f,
-  -0.909167983090522380f, 0.416429560097637150f,
-  -0.911706032005429770f, 0.410843171057904130f,
-  -0.914209755703530690f, 0.405241314004989920f,
-  -0.916679059921042590f, 0.399624199845647070f,
-  -0.919113851690057770f, 0.393992040061048150f,
-  -0.921514039342041790f, 0.388345046698826580f,
-  -0.923879532511286740f, 0.382683432365089890f,
-  -0.926210242138311380f, 0.377007410216418150f,
-  -0.928506080473215480f, 0.371317193951837710f,
-  -0.930766961078983710f, 0.365612997804773800f,
-  -0.932992798834738850f, 0.359895036534988330f,
-  -0.935183509938947610f, 0.354163525420490400f,
-  -0.937339011912574850f, 0.348418680249434790f,
-  -0.939459223602189920f, 0.342660717311994430f,
-  -0.941544065183020700f, 0.336889853392220330f,
-  -0.943593458161960390f, 0.331106305759876480f,
-  -0.945607325380521170f, 0.325310292162263260f,
-  -0.947585591017741090f, 0.319502030816015800f,
-  -0.949528180593036670f, 0.313681740398891410f,
-  -0.951435020969008340f, 0.307849640041535030f,
-  -0.953306040354193860f, 0.302005949319228030f,
-  -0.955141168305770670f, 0.296150888243624010f,
-  -0.956940335732208820f, 0.290284677254462390f,
-  -0.958703474895871490f, 0.284407537211272100f,
-  -0.960430519415565790f, 0.278519689385053170f,
-  -0.962121404269041470f, 0.272621355449949250f,
-  -0.963776065795439840f, 0.266712757474898480f,
-  -0.965394441697689290f, 0.260794117915275850f,
-  -0.966976471044852070f, 0.254865659604514680f,
-  -0.968522094274417380f, 0.248927605745720090f,
-  -0.970031253194543970f, 0.242980179903264070f,
-  -0.971503890986251780f, 0.237023605994367170f,
-  -0.972939952205560070f, 0.231058108280671330f,
-  -0.974339382785575860f, 0.225083911359792830f,
-  -0.975702130038528460f, 0.219101240156870050f,
-  -0.977028142657754390f, 0.213110319916091420f,
-  -0.978317370719627540f, 0.207111376192218840f,
-  -0.979569765685440520f, 0.201104634842092010f,
-  -0.980785280403230430f, 0.195090322016128610f,
-  -0.981963869109555240f, 0.189068664149806360f,
-  -0.983105487431216290f, 0.183039887955140900f,
-  -0.984210092386929030f, 0.177004220412148940f,
-  -0.985277642388941220f, 0.170961888760301220f,
-  -0.986308097244598560f, 0.164913120489970140f,
-  -0.987301418157858430f, 0.158858143333861470f,
-  -0.988257567730749460f, 0.152797185258443690f,
-  -0.989176509964781010f, 0.146730474455361800f,
-  -0.990058210262297010f, 0.140658239332849540f,
-  -0.990902635427780010f, 0.134580708507126280f,
-  -0.991709753669099530f, 0.128498110793793090f,
-  -0.992479534598709970f, 0.122410675199216350f,
-  -0.993211949234794500f, 0.116318630911904710f,
-  -0.993906970002356060f, 0.110222207293883240f,
-  -0.994564570734255420f, 0.104121633872054570f,
-  -0.995184726672196820f, 0.098017140329560826f,
-  -0.995767414467659820f, 0.091908956497132752f,
-  -0.996312612182778000f, 0.085797312344440158f,
-  -0.996820299291165670f, 0.079682437971430195f,
-  -0.997290456678690210f, 0.073564563599667732f,
-  -0.997723066644191640f, 0.067443919563664176f,
-  -0.998118112900149180f, 0.061320736302208488f,
-  -0.998475580573294770f, 0.055195244349690094f,
-  -0.998795456205172410f, 0.049067674327417966f,
-  -0.999077727752645360f, 0.042938256934941021f,
-  -0.999322384588349540f, 0.036807222941358832f,
-  -0.999529417501093140f, 0.030674803176636865f,
-  -0.999698818696204250f, 0.024541228522912326f,
-  -0.999830581795823400f, 0.018406729905805101f,
-  -0.999924701839144500f, 0.012271538285720007f,
-  -0.999981175282601110f, 0.006135884649154799f,
-  -1.000000000000000000f, 0.000000000000000122f,
-  -0.999981175282601110f, -0.006135884649154554f,
-  -0.999924701839144500f, -0.012271538285719762f,
-  -0.999830581795823400f, -0.018406729905804858f,
-  -0.999698818696204250f, -0.024541228522912080f,
-  -0.999529417501093140f, -0.030674803176636619f,
-  -0.999322384588349540f, -0.036807222941358582f,
-  -0.999077727752645360f, -0.042938256934940779f,
-  -0.998795456205172410f, -0.049067674327417724f,
-  -0.998475580573294770f, -0.055195244349689851f,
-  -0.998118112900149180f, -0.061320736302208245f,
-  -0.997723066644191640f, -0.067443919563663926f,
-  -0.997290456678690210f, -0.073564563599667496f,
-  -0.996820299291165780f, -0.079682437971429945f,
-  -0.996312612182778000f, -0.085797312344439922f,
-  -0.995767414467659820f, -0.091908956497132516f,
-  -0.995184726672196930f, -0.098017140329560590f,
-  -0.994564570734255530f, -0.104121633872054320f,
-  -0.993906970002356060f, -0.110222207293883000f,
-  -0.993211949234794610f, -0.116318630911904470f,
-  -0.992479534598709970f, -0.122410675199216100f,
-  -0.991709753669099530f, -0.128498110793792840f,
-  -0.990902635427780010f, -0.134580708507126060f,
-  -0.990058210262297120f, -0.140658239332849290f,
-  -0.989176509964781010f, -0.146730474455361580f,
-  -0.988257567730749460f, -0.152797185258443440f,
-  -0.987301418157858430f, -0.158858143333861220f,
-  -0.986308097244598670f, -0.164913120489969890f,
-  -0.985277642388941330f, -0.170961888760300970f,
-  -0.984210092386929140f, -0.177004220412148690f,
-  -0.983105487431216400f, -0.183039887955140650f,
-  -0.981963869109555240f, -0.189068664149806110f,
-  -0.980785280403230430f, -0.195090322016128360f,
-  -0.979569765685440520f, -0.201104634842091760f,
-  -0.978317370719627650f, -0.207111376192218590f,
-  -0.977028142657754390f, -0.213110319916091200f,
-  -0.975702130038528570f, -0.219101240156869800f,
-  -0.974339382785575860f, -0.225083911359792610f,
-  -0.972939952205560180f, -0.231058108280671080f,
-  -0.971503890986251890f, -0.237023605994366950f,
-  -0.970031253194543970f, -0.242980179903263820f,
-  -0.968522094274417380f, -0.248927605745719870f,
-  -0.966976471044852180f, -0.254865659604514460f,
-  -0.965394441697689400f, -0.260794117915275630f,
-  -0.963776065795439950f, -0.266712757474898250f,
-  -0.962121404269041580f, -0.272621355449949030f,
-  -0.960430519415565900f, -0.278519689385052890f,
-  -0.958703474895871600f, -0.284407537211271820f,
-  -0.956940335732208940f, -0.290284677254462110f,
-  -0.955141168305770780f, -0.296150888243623790f,
-  -0.953306040354193970f, -0.302005949319227810f,
-  -0.951435020969008450f, -0.307849640041534810f,
-  -0.949528180593036790f, -0.313681740398891180f,
-  -0.947585591017741200f, -0.319502030816015580f,
-  -0.945607325380521280f, -0.325310292162262980f,
-  -0.943593458161960390f, -0.331106305759876260f,
-  -0.941544065183020810f, -0.336889853392220110f,
-  -0.939459223602190030f, -0.342660717311994210f,
-  -0.937339011912574960f, -0.348418680249434560f,
-  -0.935183509938947720f, -0.354163525420490120f,
-  -0.932992798834738960f, -0.359895036534988110f,
-  -0.930766961078983820f, -0.365612997804773580f,
-  -0.928506080473215590f, -0.371317193951837430f,
-  -0.926210242138311490f, -0.377007410216417930f,
-  -0.923879532511286850f, -0.382683432365089670f,
-  -0.921514039342041900f, -0.388345046698826360f,
-  -0.919113851690057770f, -0.393992040061047930f,
-  -0.916679059921042700f, -0.399624199845646840f,
-  -0.914209755703530690f, -0.405241314004989690f,
-  -0.911706032005429880f, -0.410843171057903910f,
-  -0.909167983090522490f, -0.416429560097636930f,
-  -0.906595704514915450f, -0.422000270799799630f,
-  -0.903989293123443450f, -0.427555093430281810f,
-  -0.901348847046022030f, -0.433093818853151850f,
-  -0.898674465693954040f, -0.438616238538527330f,
-  -0.895966249756185220f, -0.444122144570429090f,
-  -0.893224301195515320f, -0.449611329654606650f,
-  -0.890448723244757990f, -0.455083587126343670f,
-  -0.887639620402853930f, -0.460538710958240060f,
-  -0.884797098430937900f, -0.465976495767965960f,
-  -0.881921264348355050f, -0.471396736825997640f,
-  -0.879012226428633640f, -0.476799230063321870f,
-  -0.876070094195406600f, -0.482183772079122660f,
-  -0.873094978418290200f, -0.487550160148435660f,
-  -0.870086991108711460f, -0.492898192229783930f,
-  -0.867046245515692870f, -0.498227666972781540f,
-  -0.863972856121586810f, -0.503538383725717460f,
-  -0.860866938637767310f, -0.508830142543107100f,
-  -0.857728610000272120f, -0.514102744193221550f,
-  -0.854557988365400530f, -0.519355990165589640f,
-  -0.851355193105265310f, -0.524589682678468730f,
-  -0.848120344803297230f, -0.529803624686294610f,
-  -0.844853565249707230f, -0.534997619887096930f,
-  -0.841554977436898440f, -0.540171472729892850f,
-  -0.838224705554838190f, -0.545324988422046130f,
-  -0.834862874986380120f, -0.550457972936604700f,
-  -0.831469612302545460f, -0.555570233019601960f,
-  -0.828045045257755800f, -0.560661576197335920f,
-  -0.824589302785025290f, -0.565731810783613230f,
-  -0.821102514991104760f, -0.570780745886967140f,
-  -0.817584813151583710f, -0.575808191417845340f,
-  -0.814036329705948520f, -0.580813958095764300f,
-  -0.810457198252594770f, -0.585797857456438860f,
-  -0.806847553543799450f, -0.590759701858873940f,
-  -0.803207531480644940f, -0.595699304492433250f,
-  -0.799537269107905240f, -0.600616479383868640f,
-  -0.795836904608883570f, -0.605511041404325430f,
-  -0.792106577300212280f, -0.610382806276309480f,
-  -0.788346427626606340f, -0.615231590580626710f,
-  -0.784556597155575240f, -0.620057211763289210f,
-  -0.780737228572094600f, -0.624859488142386230f,
-  -0.776888465673232440f, -0.629638238914926980f,
-  -0.773010453362737100f, -0.634393284163645270f,
-  -0.769103337645579700f, -0.639124444863775730f,
-  -0.765167265622459070f, -0.643831542889791280f,
-  -0.761202385484261890f, -0.648514401022112330f,
-  -0.757208846506484790f, -0.653172842953776530f,
-  -0.753186799043612630f, -0.657806693297078530f,
-  -0.749136394523459260f, -0.662415777590171780f,
-  -0.745057785441466060f, -0.666999922303637360f,
-  -0.740951125354959110f, -0.671558954847018440f,
-  -0.736816568877370020f, -0.676092703575315810f,
-  -0.732654271672412820f, -0.680600997795453020f,
-  -0.728464390448225420f, -0.685083667772700130f,
-  -0.724247082951467000f, -0.689540544737066830f,
-  -0.720002507961381880f, -0.693971460889653780f,
-  -0.715730825283818710f, -0.698376249408972800f,
-  -0.711432195745216660f, -0.702754744457225080f,
-  -0.707106781186547680f, -0.707106781186547460f,
-  -0.702754744457225300f, -0.711432195745216430f,
-  -0.698376249408973030f, -0.715730825283818480f,
-  -0.693971460889654000f, -0.720002507961381650f,
-  -0.689540544737067050f, -0.724247082951466780f,
-  -0.685083667772700360f, -0.728464390448225200f,
-  -0.680600997795453240f, -0.732654271672412590f,
-  -0.676092703575316030f, -0.736816568877369790f,
-  -0.671558954847018660f, -0.740951125354958880f,
-  -0.666999922303637580f, -0.745057785441465840f,
-  -0.662415777590172010f, -0.749136394523459040f,
-  -0.657806693297078750f, -0.753186799043612410f,
-  -0.653172842953777090f, -0.757208846506484230f,
-  -0.648514401022112220f, -0.761202385484262000f,
-  -0.643831542889791500f, -0.765167265622458960f,
-  -0.639124444863775950f, -0.769103337645579480f,
-  -0.634393284163645930f, -0.773010453362736660f,
-  -0.629638238914926870f, -0.776888465673232550f,
-  -0.624859488142386450f, -0.780737228572094380f,
-  -0.620057211763289430f, -0.784556597155575020f,
-  -0.615231590580627260f, -0.788346427626605890f,
-  -0.610382806276309360f, -0.792106577300212390f,
-  -0.605511041404325660f, -0.795836904608883460f,
-  -0.600616479383869310f, -0.799537269107904790f,
-  -0.595699304492433130f, -0.803207531480645050f,
-  -0.590759701858874280f, -0.806847553543799220f,
-  -0.585797857456439090f, -0.810457198252594660f,
-  -0.580813958095764970f, -0.814036329705948080f,
-  -0.575808191417845230f, -0.817584813151583820f,
-  -0.570780745886967370f, -0.821102514991104650f,
-  -0.565731810783613450f, -0.824589302785025070f,
-  -0.560661576197336480f, -0.828045045257755460f,
-  -0.555570233019602180f, -0.831469612302545240f,
-  -0.550457972936604920f, -0.834862874986380010f,
-  -0.545324988422046800f, -0.838224705554837860f,
-  -0.540171472729892740f, -0.841554977436898550f,
-  -0.534997619887097260f, -0.844853565249707010f,
-  -0.529803624686294940f, -0.848120344803297120f,
-  -0.524589682678469390f, -0.851355193105264860f,
-  -0.519355990165589420f, -0.854557988365400640f,
-  -0.514102744193221770f, -0.857728610000272010f,
-  -0.508830142543107320f, -0.860866938637767090f,
-  -0.503538383725718020f, -0.863972856121586470f,
-  -0.498227666972781810f, -0.867046245515692650f,
-  -0.492898192229784200f, -0.870086991108711350f,
-  -0.487550160148436330f, -0.873094978418289870f,
-  -0.482183772079122550f, -0.876070094195406710f,
-  -0.476799230063322140f, -0.879012226428633410f,
-  -0.471396736825997860f, -0.881921264348354940f,
-  -0.465976495767966630f, -0.884797098430937570f,
-  -0.460538710958239890f, -0.887639620402854050f,
-  -0.455083587126343950f, -0.890448723244757880f,
-  -0.449611329654606930f, -0.893224301195515210f,
-  -0.444122144570429760f, -0.895966249756184880f,
-  -0.438616238538527600f, -0.898674465693953820f,
-  -0.433093818853152120f, -0.901348847046021920f,
-  -0.427555093430282470f, -0.903989293123443120f,
-  -0.422000270799799520f, -0.906595704514915450f,
-  -0.416429560097637210f, -0.909167983090522380f,
-  -0.410843171057904190f, -0.911706032005429770f,
-  -0.405241314004990360f, -0.914209755703530470f,
-  -0.399624199845646730f, -0.916679059921042700f,
-  -0.393992040061048210f, -0.919113851690057660f,
-  -0.388345046698826630f, -0.921514039342041790f,
-  -0.382683432365090340f, -0.923879532511286520f,
-  -0.377007410216418200f, -0.926210242138311380f,
-  -0.371317193951837770f, -0.928506080473215480f,
-  -0.365612997804774300f, -0.930766961078983600f,
-  -0.359895036534987940f, -0.932992798834738960f,
-  -0.354163525420490450f, -0.935183509938947610f,
-  -0.348418680249434840f, -0.937339011912574850f,
-  -0.342660717311994880f, -0.939459223602189700f,
-  -0.336889853392219940f, -0.941544065183020810f,
-  -0.331106305759876540f, -0.943593458161960270f,
-  -0.325310292162263310f, -0.945607325380521170f,
-  -0.319502030816015410f, -0.947585591017741200f,
-  -0.313681740398891460f, -0.949528180593036670f,
-  -0.307849640041535090f, -0.951435020969008340f,
-  -0.302005949319228530f, -0.953306040354193750f,
-  -0.296150888243623680f, -0.955141168305770780f,
-  -0.290284677254462440f, -0.956940335732208820f,
-  -0.284407537211272150f, -0.958703474895871490f,
-  -0.278519689385053610f, -0.960430519415565680f,
-  -0.272621355449948870f, -0.962121404269041580f,
-  -0.266712757474898530f, -0.963776065795439840f,
-  -0.260794117915275900f, -0.965394441697689290f,
-  -0.254865659604514350f, -0.966976471044852180f,
-  -0.248927605745720150f, -0.968522094274417270f,
-  -0.242980179903264120f, -0.970031253194543970f,
-  -0.237023605994367670f, -0.971503890986251670f,
-  -0.231058108280670940f, -0.972939952205560180f,
-  -0.225083911359792920f, -0.974339382785575860f,
-  -0.219101240156870100f, -0.975702130038528460f,
-  -0.213110319916091920f, -0.977028142657754280f,
-  -0.207111376192218480f, -0.978317370719627650f,
-  -0.201104634842092070f, -0.979569765685440520f,
-  -0.195090322016128660f, -0.980785280403230320f,
-  -0.189068664149805970f, -0.981963869109555350f,
-  -0.183039887955140950f, -0.983105487431216290f,
-  -0.177004220412149000f, -0.984210092386929030f,
-  -0.170961888760301690f, -0.985277642388941110f,
-  -0.164913120489969760f, -0.986308097244598670f,
-  -0.158858143333861530f, -0.987301418157858320f,
-  -0.152797185258443740f, -0.988257567730749460f,
-  -0.146730474455362300f, -0.989176509964780900f,
-  -0.140658239332849160f, -0.990058210262297120f,
-  -0.134580708507126360f, -0.990902635427780010f,
-  -0.128498110793793590f, -0.991709753669099530f,
-  -0.122410675199215960f, -0.992479534598710080f,
-  -0.116318630911904770f, -0.993211949234794500f,
-  -0.110222207293883310f, -0.993906970002356060f,
-  -0.104121633872055070f, -0.994564570734255420f,
-  -0.098017140329560451f, -0.995184726672196930f,
-  -0.091908956497132821f, -0.995767414467659820f,
-  -0.085797312344440227f, -0.996312612182778000f,
-  -0.079682437971430695f, -0.996820299291165670f,
-  -0.073564563599667357f, -0.997290456678690210f,
-  -0.067443919563664231f, -0.997723066644191640f,
-  -0.061320736302208995f, -0.998118112900149180f,
-  -0.055195244349689712f, -0.998475580573294770f,
-  -0.049067674327418029f, -0.998795456205172410f,
-  -0.042938256934941084f, -0.999077727752645360f,
-  -0.036807222941359331f, -0.999322384588349430f,
-  -0.030674803176636484f, -0.999529417501093140f,
-  -0.024541228522912389f, -0.999698818696204250f,
-  -0.018406729905805164f, -0.999830581795823400f,
-  -0.012271538285720512f, -0.999924701839144500f,
-  -0.006135884649154416f, -0.999981175282601110f,
-  -0.000000000000000184f, -1.000000000000000000f,
-  0.006135884649154049f, -0.999981175282601110f,
-  0.012271538285720144f, -0.999924701839144500f,
-  0.018406729905804796f, -0.999830581795823400f,
-  0.024541228522912021f, -0.999698818696204250f,
-  0.030674803176636116f, -0.999529417501093140f,
-  0.036807222941358964f, -0.999322384588349540f,
-  0.042938256934940716f, -0.999077727752645360f,
-  0.049067674327417661f, -0.998795456205172410f,
-  0.055195244349689344f, -0.998475580573294770f,
-  0.061320736302208627f, -0.998118112900149180f,
-  0.067443919563663871f, -0.997723066644191640f,
-  0.073564563599666982f, -0.997290456678690210f,
-  0.079682437971430334f, -0.996820299291165670f,
-  0.085797312344439852f, -0.996312612182778000f,
-  0.091908956497132446f, -0.995767414467659820f,
-  0.098017140329560090f, -0.995184726672196930f,
-  0.104121633872054700f, -0.994564570734255420f,
-  0.110222207293882930f, -0.993906970002356060f,
-  0.116318630911904410f, -0.993211949234794610f,
-  0.122410675199215600f, -0.992479534598710080f,
-  0.128498110793793220f, -0.991709753669099530f,
-  0.134580708507125970f, -0.990902635427780010f,
-  0.140658239332848790f, -0.990058210262297120f,
-  0.146730474455361940f, -0.989176509964780900f,
-  0.152797185258443380f, -0.988257567730749460f,
-  0.158858143333861170f, -0.987301418157858430f,
-  0.164913120489969390f, -0.986308097244598780f,
-  0.170961888760301330f, -0.985277642388941220f,
-  0.177004220412148640f, -0.984210092386929140f,
-  0.183039887955140590f, -0.983105487431216400f,
-  0.189068664149805610f, -0.981963869109555350f,
-  0.195090322016128300f, -0.980785280403230430f,
-  0.201104634842091710f, -0.979569765685440630f,
-  0.207111376192218120f, -0.978317370719627770f,
-  0.213110319916091560f, -0.977028142657754280f,
-  0.219101240156869740f, -0.975702130038528570f,
-  0.225083911359792550f, -0.974339382785575970f,
-  0.231058108280670580f, -0.972939952205560290f,
-  0.237023605994367310f, -0.971503890986251780f,
-  0.242980179903263760f, -0.970031253194543970f,
-  0.248927605745719790f, -0.968522094274417380f,
-  0.254865659604513960f, -0.966976471044852290f,
-  0.260794117915275510f, -0.965394441697689400f,
-  0.266712757474898200f, -0.963776065795439950f,
-  0.272621355449948530f, -0.962121404269041690f,
-  0.278519689385053280f, -0.960430519415565790f,
-  0.284407537211271770f, -0.958703474895871600f,
-  0.290284677254462050f, -0.956940335732208940f,
-  0.296150888243623290f, -0.955141168305770890f,
-  0.302005949319228140f, -0.953306040354193860f,
-  0.307849640041534760f, -0.951435020969008450f,
-  0.313681740398891130f, -0.949528180593036790f,
-  0.319502030816015080f, -0.947585591017741310f,
-  0.325310292162262930f, -0.945607325380521280f,
-  0.331106305759876210f, -0.943593458161960390f,
-  0.336889853392219610f, -0.941544065183020920f,
-  0.342660717311994540f, -0.939459223602189810f,
-  0.348418680249434510f, -0.937339011912574960f,
-  0.354163525420490070f, -0.935183509938947720f,
-  0.359895036534987610f, -0.932992798834739070f,
-  0.365612997804773960f, -0.930766961078983710f,
-  0.371317193951837380f, -0.928506080473215590f,
-  0.377007410216417870f, -0.926210242138311490f,
-  0.382683432365090000f, -0.923879532511286630f,
-  0.388345046698826300f, -0.921514039342041900f,
-  0.393992040061047880f, -0.919113851690057880f,
-  0.399624199845646400f, -0.916679059921042820f,
-  0.405241314004990030f, -0.914209755703530580f,
-  0.410843171057903860f, -0.911706032005429880f,
-  0.416429560097636870f, -0.909167983090522490f,
-  0.422000270799799180f, -0.906595704514915560f,
-  0.427555093430282140f, -0.903989293123443340f,
-  0.433093818853151790f, -0.901348847046022140f,
-  0.438616238538527270f, -0.898674465693954040f,
-  0.444122144570429420f, -0.895966249756185000f,
-  0.449611329654606600f, -0.893224301195515320f,
-  0.455083587126343610f, -0.890448723244757990f,
-  0.460538710958239560f, -0.887639620402854160f,
-  0.465976495767966290f, -0.884797098430937680f,
-  0.471396736825997590f, -0.881921264348355050f,
-  0.476799230063321870f, -0.879012226428633640f,
-  0.482183772079122220f, -0.876070094195406930f,
-  0.487550160148436000f, -0.873094978418290090f,
-  0.492898192229783870f, -0.870086991108711460f,
-  0.498227666972781480f, -0.867046245515692870f,
-  0.503538383725717800f, -0.863972856121586590f,
-  0.508830142543106990f, -0.860866938637767310f,
-  0.514102744193221550f, -0.857728610000272230f,
-  0.519355990165589200f, -0.854557988365400760f,
-  0.524589682678469060f, -0.851355193105265080f,
-  0.529803624686294610f, -0.848120344803297340f,
-  0.534997619887096930f, -0.844853565249707230f,
-  0.540171472729892410f, -0.841554977436898780f,
-  0.545324988422046460f, -0.838224705554837970f,
-  0.550457972936604700f, -0.834862874986380120f,
-  0.555570233019601840f, -0.831469612302545460f,
-  0.560661576197336250f, -0.828045045257755690f,
-  0.565731810783613120f, -0.824589302785025290f,
-  0.570780745886967030f, -0.821102514991104870f,
-  0.575808191417844890f, -0.817584813151584040f,
-  0.580813958095764640f, -0.814036329705948300f,
-  0.585797857456438750f, -0.810457198252594880f,
-  0.590759701858873940f, -0.806847553543799450f,
-  0.595699304492432910f, -0.803207531480645280f,
-  0.600616479383868970f, -0.799537269107905010f,
-  0.605511041404325320f, -0.795836904608883680f,
-  0.610382806276309140f, -0.792106577300212610f,
-  0.615231590580627040f, -0.788346427626606120f,
-  0.620057211763289100f, -0.784556597155575240f,
-  0.624859488142386120f, -0.780737228572094600f,
-  0.629638238914926650f, -0.776888465673232780f,
-  0.634393284163645600f, -0.773010453362736880f,
-  0.639124444863775620f, -0.769103337645579700f,
-  0.643831542889791160f, -0.765167265622459180f,
-  0.648514401022112000f, -0.761202385484262220f,
-  0.653172842953776760f, -0.757208846506484570f,
-  0.657806693297078530f, -0.753186799043612630f,
-  0.662415777590171450f, -0.749136394523459590f,
-  0.666999922303637690f, -0.745057785441465840f,
-  0.671558954847018330f, -0.740951125354959110f,
-  0.676092703575315700f, -0.736816568877370020f,
-  0.680600997795452690f, -0.732654271672413150f,
-  0.685083667772700470f, -0.728464390448225090f,
-  0.689540544737066830f, -0.724247082951467000f,
-  0.693971460889653780f, -0.720002507961381880f,
-  0.698376249408972360f, -0.715730825283819040f,
-  0.702754744457225300f, -0.711432195745216430f,
-  0.707106781186547350f, -0.707106781186547680f,
-  0.711432195745216100f, -0.702754744457225630f,
-  0.715730825283818820f, -0.698376249408972690f,
-  0.720002507961381540f, -0.693971460889654000f,
-  0.724247082951466670f, -0.689540544737067160f,
-  0.728464390448224860f, -0.685083667772700800f,
-  0.732654271672412930f, -0.680600997795453020f,
-  0.736816568877369790f, -0.676092703575316030f,
-  0.740951125354958880f, -0.671558954847018660f,
-  0.745057785441465500f, -0.666999922303638030f,
-  0.749136394523459370f, -0.662415777590171780f,
-  0.753186799043612300f, -0.657806693297078860f,
-  0.757208846506484230f, -0.653172842953777090f,
-  0.761202385484261890f, -0.648514401022112330f,
-  0.765167265622458850f, -0.643831542889791500f,
-  0.769103337645579480f, -0.639124444863775950f,
-  0.773010453362736660f, -0.634393284163645930f,
-  0.776888465673232550f, -0.629638238914926980f,
-  0.780737228572094380f, -0.624859488142386450f,
-  0.784556597155575020f, -0.620057211763289540f,
-  0.788346427626605890f, -0.615231590580627370f,
-  0.792106577300212390f, -0.610382806276309480f,
-  0.795836904608883340f, -0.605511041404325660f,
-  0.799537269107904790f, -0.600616479383869310f,
-  0.803207531480645050f, -0.595699304492433250f,
-  0.806847553543799220f, -0.590759701858874280f,
-  0.810457198252594660f, -0.585797857456439090f,
-  0.814036329705948080f, -0.580813958095764970f,
-  0.817584813151583710f, -0.575808191417845230f,
-  0.821102514991104540f, -0.570780745886967370f,
-  0.824589302785025070f, -0.565731810783613560f,
-  0.828045045257755350f, -0.560661576197336590f,
-  0.831469612302545240f, -0.555570233019602180f,
-  0.834862874986379900f, -0.550457972936605030f,
-  0.838224705554837750f, -0.545324988422046800f,
-  0.841554977436898440f, -0.540171472729892740f,
-  0.844853565249707010f, -0.534997619887097260f,
-  0.848120344803297120f, -0.529803624686294940f,
-  0.851355193105264860f, -0.524589682678469390f,
-  0.854557988365400530f, -0.519355990165589530f,
-  0.857728610000272010f, -0.514102744193221880f,
-  0.860866938637767090f, -0.508830142543107430f,
-  0.863972856121586360f, -0.503538383725718130f,
-  0.867046245515692650f, -0.498227666972781870f,
-  0.870086991108711350f, -0.492898192229784260f,
-  0.873094978418289870f, -0.487550160148436380f,
-  0.876070094195406710f, -0.482183772079122610f,
-  0.879012226428633410f, -0.476799230063322200f,
-  0.881921264348354830f, -0.471396736825997920f,
-  0.884797098430937460f, -0.465976495767966680f,
-  0.887639620402853930f, -0.460538710958239950f,
-  0.890448723244757770f, -0.455083587126344000f,
-  0.893224301195515100f, -0.449611329654606980f,
-  0.895966249756184880f, -0.444122144570429810f,
-  0.898674465693953820f, -0.438616238538527660f,
-  0.901348847046021920f, -0.433093818853152180f,
-  0.903989293123443120f, -0.427555093430282530f,
-  0.906595704514915450f, -0.422000270799799570f,
-  0.909167983090522380f, -0.416429560097637260f,
-  0.911706032005429660f, -0.410843171057904240f,
-  0.914209755703530470f, -0.405241314004990420f,
-  0.916679059921042700f, -0.399624199845646790f,
-  0.919113851690057660f, -0.393992040061048270f,
-  0.921514039342041790f, -0.388345046698826690f,
-  0.923879532511286520f, -0.382683432365090390f,
-  0.926210242138311380f, -0.377007410216418260f,
-  0.928506080473215480f, -0.371317193951837820f,
-  0.930766961078983490f, -0.365612997804774350f,
-  0.932992798834738960f, -0.359895036534988000f,
-  0.935183509938947500f, -0.354163525420490510f,
-  0.937339011912574850f, -0.348418680249434900f,
-  0.939459223602189700f, -0.342660717311994930f,
-  0.941544065183020810f, -0.336889853392220000f,
-  0.943593458161960270f, -0.331106305759876600f,
-  0.945607325380521170f, -0.325310292162263370f,
-  0.947585591017741200f, -0.319502030816015470f,
-  0.949528180593036670f, -0.313681740398891520f,
-  0.951435020969008340f, -0.307849640041535140f,
-  0.953306040354193640f, -0.302005949319228580f,
-  0.955141168305770780f, -0.296150888243623730f,
-  0.956940335732208820f, -0.290284677254462500f,
-  0.958703474895871490f, -0.284407537211272210f,
-  0.960430519415565680f, -0.278519689385053670f,
-  0.962121404269041580f, -0.272621355449948980f,
-  0.963776065795439840f, -0.266712757474898590f,
-  0.965394441697689290f, -0.260794117915275960f,
-  0.966976471044852180f, -0.254865659604514410f,
-  0.968522094274417270f, -0.248927605745720200f,
-  0.970031253194543970f, -0.242980179903264180f,
-  0.971503890986251670f, -0.237023605994367730f,
-  0.972939952205560180f, -0.231058108280671000f,
-  0.974339382785575860f, -0.225083911359792970f,
-  0.975702130038528460f, -0.219101240156870160f,
-  0.977028142657754170f, -0.213110319916091970f,
-  0.978317370719627650f, -0.207111376192218530f,
-  0.979569765685440520f, -0.201104634842092120f,
-  0.980785280403230320f, -0.195090322016128720f,
-  0.981963869109555350f, -0.189068664149806030f,
-  0.983105487431216290f, -0.183039887955141010f,
-  0.984210092386929030f, -0.177004220412149050f,
-  0.985277642388941110f, -0.170961888760301770f,
-  0.986308097244598670f, -0.164913120489969810f,
-  0.987301418157858320f, -0.158858143333861580f,
-  0.988257567730749460f, -0.152797185258443800f,
-  0.989176509964780900f, -0.146730474455362390f,
-  0.990058210262297120f, -0.140658239332849210f,
-  0.990902635427780010f, -0.134580708507126420f,
-  0.991709753669099410f, -0.128498110793793640f,
-  0.992479534598709970f, -0.122410675199216030f,
-  0.993211949234794500f, -0.116318630911904840f,
-  0.993906970002356060f, -0.110222207293883360f,
-  0.994564570734255420f, -0.104121633872055130f,
-  0.995184726672196930f, -0.098017140329560506f,
-  0.995767414467659820f, -0.091908956497132877f,
-  0.996312612182778000f, -0.085797312344440282f,
-  0.996820299291165670f, -0.079682437971430750f,
-  0.997290456678690210f, -0.073564563599667412f,
-  0.997723066644191640f, -0.067443919563664287f,
-  0.998118112900149180f, -0.061320736302209057f,
-  0.998475580573294770f, -0.055195244349689775f,
-  0.998795456205172410f, -0.049067674327418091f,
-  0.999077727752645360f, -0.042938256934941139f,
-  0.999322384588349430f, -0.036807222941359394f,
-  0.999529417501093140f, -0.030674803176636543f,
-  0.999698818696204250f, -0.024541228522912448f,
-  0.999830581795823400f, -0.018406729905805226f,
-  0.999924701839144500f, -0.012271538285720572f,
-  0.999981175282601110f, -0.006135884649154477f
-};
-
-static ne10_float32_t rfft_twiddle_coef_re[1024];
-static ne10_float32_t rfft_twiddle_coef_im[1024];
-
-
-/**
-* @brief  Initialize a real FFT instance and the complex FFT instance attached to it.
-* @param[in,out] *S         Real FFT instance whose fields are filled in.
-* @param[in,out] *S_CFFT    Complex FFT instance; stored in S->p_cfft and initialized with S->fft_len_by2.
-* @param[in] fftLen         Real FFT length (cast to ne10_uint16_t); the switch handles 2048, 512 and 128.
-* @param[in] ifftFlagR      0 = forward Real FFT.  1 = inverse Real FFT
-* @return NE10_OK when the length is handled by the switch, NE10_ERR otherwise.
-* Note: twiddle values are copied into the file-static rfft_twiddle_coef_re/_im buffers, so every
-* call overwrites them; the complex FFT init call is made even when NE10_ERR is returned.
-*/
-
-ne10_result_t ne10_rfft_init_float(
-  ne10_rfft_instance_f32_t * S,
-  ne10_cfft_radix4_instance_f32_t * S_CFFT,
-  ne10_uint32_t fftLen,
-  ne10_uint32_t ifftFlagR)
-{
-    ne10_uint32_t i,j;
-
-  /*  Assume success; the default case of the switch below changes this */
-  ne10_result_t status = NE10_OK;
-
-  /*  Store the real FFT length (narrowed by the cast) */
-  S->fft_len_real = (ne10_uint16_t) fftLen;
-
-  /*  Store the complex FFT length; the cast applies to fftLen before the division by 2 */
-  S->fft_len_by2 = (ne10_uint16_t) fftLen / 2u;
-
-  /*  Store the flag selecting RFFT or RIFFT */
-  S->ifft_flag_r = (ne10_uint8_t) ifftFlagR;
-
-  /*  Disabled: this function has no bitReverseFlag parameter to read from */
-  //S->bit_reverse_flag_r = (ne10_uint8_t) bitReverseFlag;
-
-  S->twid_coef_r_modifier = 1u;
-
-  /*  Fill the twiddle buffers according to the stored real FFT length */
-  switch (S->fft_len_real)
-  {
-      /* Each case copies entries 2*j and 2*j+1 of rfft_twiddle_coef, stepping j by 1, 2 or 8 */
-      case 2048u:
-          for(i=0,j=0;i<1024;i++)
-          {
-              rfft_twiddle_coef_re[i] = rfft_twiddle_coef[2*j];
-              rfft_twiddle_coef_im[i] = rfft_twiddle_coef[2*j+1];
-              j= j+ 1;
-          }
-          /*  Twiddle A points at the buffer holding the even-indexed (2*j) entries */
-          S->p_twiddle_A_real = (ne10_float32_t *) rfft_twiddle_coef_re;
-          /*  Twiddle B points at the buffer holding the odd-indexed (2*j+1) entries */
-          S->p_twiddle_B_real = (ne10_float32_t *) rfft_twiddle_coef_im;
-          break;
-      case 512u:
-          for(i=0,j=0;i<512;i++)
-          {
-              rfft_twiddle_coef_re[i] = rfft_twiddle_coef[2*j];
-              rfft_twiddle_coef_im[i] = rfft_twiddle_coef[2*j+1];
-              j= j+ 2;
-          }
-          /*  Twiddle A points at the buffer holding the even-indexed (2*j) entries */
-          S->p_twiddle_A_real = (ne10_float32_t *) rfft_twiddle_coef_re;
-          /*  Twiddle B points at the buffer holding the odd-indexed (2*j+1) entries */
-          S->p_twiddle_B_real = (ne10_float32_t *) rfft_twiddle_coef_im;
-          break;
-      case 128u:
-          for(i=0,j=0;i<128;i++)
-          {
-              rfft_twiddle_coef_re[i] = rfft_twiddle_coef[2*j];
-              rfft_twiddle_coef_im[i] = rfft_twiddle_coef[2*j+1];
-              j= j+ 8;
-          }
-          /*  Twiddle A points at the buffer holding the even-indexed (2*j) entries */
-          S->p_twiddle_A_real = (ne10_float32_t *) rfft_twiddle_coef_re;
-          /*  Twiddle B points at the buffer holding the odd-indexed (2*j+1) entries */
-          S->p_twiddle_B_real = (ne10_float32_t *) rfft_twiddle_coef_im;
-          break;
-      default:
-          /*  Unhandled length: report an error; the twiddle pointers are not assigned here */
-          status = NE10_ERR;
-          break;
-  }
-
-  /* Attach the caller's complex FFT instance (also done when status is NE10_ERR) */
-  S->p_cfft = S_CFFT;
-
-  if(S->ifft_flag_r)
-  {
-    /* Inverse: initialize the complex FFT for fft_len_by2 with flag 1 */
-    ne10_cfft_radix4_init_float(S->p_cfft, S->fft_len_by2, 1u);
-  }
-  else
-  {
-    /* Forward: initialize the complex FFT for fft_len_by2 with flag 0 */
-    ne10_cfft_radix4_init_float(S->p_cfft, S->fft_len_by2, 0u);
-  }
-
-  /* Return the status set by the length switch above */
-  return (status);
-
-}
-
-
index 34588f1..2054e74 100644 (file)
 
 #include "seatest.h"
 
-void test_fixture_cfft (void);
-void test_fixture_rfft (void);
-void test_fixture_fft_c2c_1d_float32(void);
-void test_fixture_fft_c2c_1d_int32(void);
-void test_fixture_fft_c2c_1d_int16(void);
-void test_fixture_fft_r2c_1d_float32(void);
-void test_fixture_fft_r2c_1d_int32(void);
-void test_fixture_fft_r2c_1d_int16(void);
+void test_fixture_fft_c2c_1d_float32 (void);
+void test_fixture_fft_c2c_1d_int32 (void);
+void test_fixture_fft_c2c_1d_int16 (void);
+void test_fixture_fft_r2c_1d_float32 (void);
+void test_fixture_fft_r2c_1d_int32 (void);
+void test_fixture_fft_r2c_1d_int16 (void);
 void test_fixture_fir (void);
 void test_fixture_fir_decimate (void);
 void test_fixture_fir_interpolate (void);
@@ -48,8 +46,6 @@ void test_fixture_iir_lattice (void);
 
 void all_tests (void)
 {
-    test_fixture_cfft();
-    test_fixture_rfft();
     test_fixture_fft_c2c_1d_float32();
     test_fixture_fft_c2c_1d_int32();
     test_fixture_fft_c2c_1d_int16();
diff --git a/modules/dsp/test/test_suite_cfft.c b/modules/dsp/test/test_suite_cfft.c
deleted file mode 100644 (file)
index cc640bc..0000000
+++ /dev/null
@@ -1,648 +0,0 @@
-/*
- *  Copyright 2012-14 ARM Limited
- *  All rights reserved.
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions are met:
- *    * Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *    * Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *    * Neither the name of ARM Limited nor the
- *      names of its contributors may be used to endorse or promote products
- *      derived from this software without specific prior written permission.
- *
- *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
- *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
- *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * NE10 Library : test_suite_cfft.c
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <string.h>
-
-#include "NE10_dsp.h"
-#include "seatest.h"
-#include "unit_test_common.h"
-
-/* ----------------------------------------------------------------------
-** Global defines
-** ------------------------------------------------------------------- */
-
-/* Maximum FFT length is 1024 complex points; each point needs two floats (real and imaginary), hence the factor of 2 */
-#define TEST_LENGTH_SAMPLES (1024 * 2)
-
-#define TEST_COUNT 5000
-
-/* ----------------------------------------------------------------------
-** Test input data for F32
-** Generated in MATLAB. NOTE(review): the values are signed and exceed 1 in
-** magnitude, so they look like randn() output rather than rand() output
-** ------------------------------------------------------------------- */
-
-static ne10_float32_t testInput_f32[TEST_LENGTH_SAMPLES] =
-{
-    -0.432565,    0.864397,    -1.665584,    0.094203,    0.125332,    -0.851909,    0.287676,    0.873504,
-    -1.146471,    -0.438039,    1.190915,    -0.429661,    1.189164,    -1.102729,    -0.037633,    0.396247,
-    0.327292,    -0.964925,    0.174639,    0.168449,    -0.186709,    -1.965359,    0.725791,    -0.744302,
-    -0.588317,    -0.552307,    2.183186,    -0.819726,    -0.136396,    1.109142,    0.113931,    -0.614946,
-    1.066768,    -0.254635,    0.059281,    -0.269830,    -0.095648,    -1.671994,    -0.832349,    -1.876045,
-    0.294411,    0.575006,    -1.336182,    -0.866133,    0.714325,    -2.116523,    1.623562,    -0.964466,
-    -0.691776,    0.212729,    0.857997,    0.477917,    1.254001,    0.100658,    -1.593730,    0.297433,
-    -1.440964,    0.570148,    0.571148,    -1.624496,    -0.399886,    0.643443,    0.689997,    0.681861,
-    0.815622,    0.014655,    0.711908,    -1.301541,    1.290250,    -1.284587,    0.668601,    0.812213,
-    1.190838,    0.838548,    -1.202457,    1.420321,    -0.019790,    -0.989752,    -0.156717,    -1.183229,
-    -1.604086,    -0.466259,    0.257304,    -0.365943,    -1.056473,    1.118333,    1.415141,    -0.465615,
-    -0.805090,    -1.560800,    0.528743,    -0.283103,    0.219321,    -1.322941,    -0.921902,    -0.196238,
-    -2.170674,    0.419039,    -0.059188,    0.742318,    -1.010634,    -0.143032,    0.614463,    -2.161943,
-    0.507741,    -0.644226,    1.692430,    1.439590,    0.591283,    -0.846917,    -0.643595,    0.057340,
-    0.380337,    0.643408,    -1.009116,    -0.670431,    -0.019511,    -0.003142,    -0.048221,    0.352931,
-    0.000043,    1.179502,    -0.317859,    -0.685902,    1.095004,    1.676789,    -1.873990,    -0.255309,
-    0.428183,    -0.647548,    0.895638,    -0.182214,    0.730957,    0.851800,    0.577857,    -0.306550,
-    0.040314,    -0.440529,    0.677089,    -0.611472,    0.568900,    -0.485207,    -0.255645,    1.197019,
-    -0.377469,    1.394788,    -0.295887,    0.165368,    -1.475135,    -0.509967,    -0.234004,    1.377717,
-    0.118445,    1.298518,    0.314809,    -0.130117,    1.443508,    0.740249,    -0.350975,    1.332017,
-    0.623234,    -0.278071,    0.799049,    -0.327993,    0.940890,    -0.012527,    -0.992092,    0.903179,
-    0.212035,    -1.112463,    0.237882,    -0.839211,    -1.007763,    0.035534,    -0.742045,    -1.246529,
-    1.082295,    0.884505,    -0.131500,    2.538334,    0.389880,    1.316795,    0.087987,    1.442213,
-    -0.635465,    1.466919,    -0.559573,    -1.107052,    0.443653,    -0.460936,    -0.949904,    -0.020296,
-    0.781182,    -0.045998,    0.568961,    -0.544487,    -0.821714,    0.917035,    -0.265607,    -0.019418,
-    -1.187777,    0.774630,    -2.202321,    -0.594053,    0.986337,    1.820276,    -0.518635,    0.524719,
-    0.327368,    0.685938,    0.234057,    -0.901304,    0.021466,    2.136023,    -1.003944,    0.320126,
-    -0.947146,    -1.584119,    -0.374429,    -0.502514,    -1.185886,    0.737926,    -1.055903,    -0.525392,
-    1.472480,    -1.532115,    0.055744,    -0.153786,    -1.217317,    -0.646732,    -0.041227,    -1.341450,
-    -1.128344,    0.271534,    -1.349278,    0.339541,    -0.261102,    1.674580,    0.953465,    0.335636,
-    0.128644,    -0.550556,    0.656468,    -0.286507,    -1.167819,    -0.814791,    -0.460605,    0.053508,
-    -0.262440,    -0.427841,    -1.213152,    0.463860,    -1.319437,    0.416588,    0.931218,    0.191634,
-    0.011245,    -1.284328,    -0.645146,    -1.006709,    0.805729,    0.041786,    0.231626,    -0.757276,
-    -0.989760,    2.278871,    1.339586,    -1.800414,    0.289502,    0.176299,    1.478917,    -0.263794,
-    1.138028,    -0.833888,    -0.684139,    0.220767,    -1.291936,    -0.882230,    -0.072926,    0.856510,
-    -0.330599,    -0.925690,    -0.843628,    -0.914070,    0.497770,    -1.327629,    1.488490,    1.611727,
-    -0.546476,    -0.561827,    -0.846758,    0.276041,    -0.246337,    -0.227653,    0.663024,    0.184183,
-    -0.854197,    0.082830,    -1.201315,    0.452035,    -0.119869,    0.101411,    -0.065294,    -0.365760,
-    0.485296,    -0.091035,    -0.595491,    0.739457,    -0.149668,    0.940328,    -0.434752,    -0.028961,
-    -0.079330,    -0.928710,    1.535152,    0.745038,    -0.606483,    2.488098,    -1.347363,    0.691925,
-    0.469383,    -0.941710,    -0.903567,    0.384997,    0.035880,    -0.278887,    -0.627531,    -0.982944,
-    0.535398,    1.620751,    0.552884,    -3.051825,    -0.203690,    -0.048454,    -2.054325,    0.318202,
-    0.132561,    -0.635514,    1.592941,    -1.028736,    1.018412,    1.641380,    -1.580402,    0.019495,
-    -0.078662,    -2.047269,    -0.681657,    -1.129305,    -1.024553,    -2.355586,    -1.234353,    -0.561249,
-    0.288807,    -0.087973,    -0.429303,    1.073777,    0.055801,    -0.311909,    -0.367874,    -1.478774,
-    -0.464973,    -0.043979,    0.370961,    -0.799868,    0.728283,    -0.865158,    2.112160,    -0.119007,
-    -1.357298,    -0.214830,    -1.022610,    0.007315,    1.037834,    -1.039472,    -0.389800,    0.832836,
-    -1.381266,    -0.746695,    0.315543,    0.349276,    1.553243,    0.484013,    0.707894,    -1.007859,
-    1.957385,    1.003469,    0.504542,    -2.676089,    1.864529,    0.016822,    -0.339812,    -1.443245,
-    -1.139779,    0.106502,    -0.211123,    -0.523471,    1.190245,    0.968581,    -1.116209,    -0.675762,
-    0.635274,    -1.086512,    -0.601412,    0.792917,    0.551185,    1.607967,    -1.099840,    -1.386200,
-    0.085991,    0.858656,    -2.004563,    0.207575,    -0.493088,    1.048865,    0.462048,    -0.784071,
-    -0.321005,    -0.326146,    1.236556,    -0.415365,    -0.631280,    -0.340785,    -2.325211,    0.565016,
-    -1.231637,    0.441829,    1.055648,    -0.109207,    -0.113224,    0.430549,    0.379224,    0.693041,
-    0.944200,    -0.547589,    -2.120427,    0.944736,    -0.644679,    -0.792557,    -0.704302,    0.280168,
-    -1.018137,    -1.642974,    -0.182082,    0.314746,    1.521013,    1.030286,    -0.038439,    1.751701,
-    1.227448,    -0.251608,    -0.696205,    1.819214,    0.007524,    1.234399,    -0.782893,    -2.339612,
-    0.586939,    -0.038625,    -0.251207,    0.007293,    0.480136,    -0.565029,    0.668155,    1.108257,
-    -0.078321,    0.520474,    0.889173,    -0.497671,    2.309287,    -0.177898,    0.524639,    1.091016,
-    -0.011787,    1.159731,    0.913141,    0.675004,    0.055941,    2.291756,    -1.107070,    -1.398845,
-    0.485498,    -1.532820,    -0.005005,    0.403012,    -0.276218,    -0.466509,    1.276452,    0.428272,
-    1.863401,    -1.390515,    -0.522559,    -0.613866,    0.103424,    -0.995531,    -0.807649,    -1.106047,
-    0.680439,    0.345156,    -2.364590,    1.638406,    0.990115,    -0.550912,    0.218899,    1.664607,
-    0.261662,    -0.048037,    1.213444,    0.662008,    -0.274667,    -0.296988,    -0.133134,    1.660689,
-    -1.270500,    0.057131,    -1.663606,    -2.227418,    -0.703554,    1.245199,    0.280880,    -1.158628,
-    -0.541209,    0.867397,    -1.333531,    -0.801315,    1.072686,    -0.263610,    -0.712085,    0.751058,
-    -0.011286,    1.795228,    -0.000817,    0.984351,    -0.249436,    0.046669,    0.396575,    0.323443,
-    -0.264013,    0.522442,    -1.664011,    -0.788527,    -1.028975,    0.734071,    0.243095,    0.080416,
-    -1.256590,    -0.543988,    -0.347183,    0.316257,    -0.941372,    -1.408710,    -1.174560,    0.186814,
-    -1.021142,    -2.262433,    -0.401667,    0.500375,    0.173666,    -0.224826,    -0.116118,    -1.455474,
-    1.064119,    -0.015503,    -0.245386,    -0.437796,    -1.517539,    0.907150,    0.009734,    1.284133,
-    0.071373,    -0.730091,    0.316536,    -1.472669,    0.499826,    -1.594354,    1.278084,    0.497586,
-    -0.547816,    0.741050,    0.260808,    -0.355039,    -0.013177,    -0.810574,    -0.580264,    0.238212,
-    2.136308,    1.505073,    -0.257617,    -1.189561,    -1.409528,    -0.194823,    1.770101,    0.624787,
-    0.325546,    -1.278067,    -1.119040,    0.100259,    0.620350,    -0.342182,    1.269782,    -0.002015,
-    -0.896043,    -0.498406,    0.135175,    1.049755,    -0.139040,    -1.670559,    -1.163395,    -2.014370,
-    1.183720,    0.986616,    -0.015430,    -0.060483,    0.536219,    1.192941,    -0.716429,    2.685580,
-    -0.655559,    0.853734,    0.314363,    1.005549,    0.106814,    -0.000982,    1.848216,    -0.560458,
-    -0.275106,    -0.191396,    2.212554,    -0.048913,    1.508526,    0.600460,    -1.945079,    -1.994642,
-    -1.680543,    -0.965134,    -0.573534,    -0.943199,    -0.185817,    -0.200671,    0.008934,    0.556167,
-    0.836950,    2.018381,    -0.722271,    1.813736,    -0.721490,    -0.112448,    -0.201181,    -0.889976,
-    -0.020464,    -0.726843,    0.278890,    0.763502,    1.058295,    -0.598514,    0.621673,    0.723730,
-    -1.750615,    -0.867938,    0.697348,    0.841673,    0.811486,    -0.850938,    0.636345,    0.933427,
-    1.310080,    0.485960,    0.327098,    -0.216203,    -0.672993,    -0.381497,    -0.149327,    -1.427041,
-    -2.449018,    -1.487669,    0.473286,    -2.515103,    0.116946,    -1.306210,    -0.591104,    -0.376950,
-    -0.654708,    -1.107504,    -1.080662,    0.312778,    -0.047731,    -0.845240,    0.379345,    0.237598,
-    -0.330361,    -0.918767,    -0.499898,    2.441691,    -0.035979,    0.083121,    -0.174760,    0.266263,
-    -0.957265,    -0.762727,    1.292548,    -2.492805,    0.440910,    -0.163872,    1.280941,    0.701879,
-    -0.497730,    -0.855063,    -1.118717,    0.373834,    0.807650,    -0.504156,    0.041200,    -1.074581,
-    -0.756209,    -0.632952,    -0.089129,    1.854859,    -2.008850,    0.467423,    1.083918,    1.316068,
-    -0.981191,    1.779038,    -0.688489,    -0.384638,    1.339479,    0.895129,    -0.909243,    0.473642,
-    -0.412858,    -0.023571,    -0.506163,    1.612449,    1.619748,    0.839672,    0.080901,    0.247906,
-    -1.081056,    -0.540454,    -1.124518,    -1.808434,    1.735676,    -0.266203,    1.937459,    0.769024,
-    1.635068,    0.076724,    -1.255940,    0.078595,    -0.213538,    1.063096,    -0.198932,    0.349197,
-    0.307499,    0.755430,    -0.572325,    -0.624003,    -0.977648,    -0.421374,    -0.446809,    0.596029,
-    1.082092,    -1.389987,    2.372648,    -2.634668,    0.229288,    -0.806934,    -0.266623,    0.091930,
-    0.701672,    2.371014,    -0.487590,    -0.008736,    1.862480,    2.122155,    1.106851,    -0.684231,
-    -1.227566,    -0.413033,    -0.669885,    -0.857683,    1.340929,    0.970899,    0.388083,    -1.064209,
-    0.393059,    1.500750,    -1.707334,    -0.470707,    0.227859,    1.549526,    0.685633,    0.089955,
-    -0.636790,    -1.859541,    -1.002606,    -1.408604,    -0.185621,    0.115434,    -1.054033,    -0.480661,
-    -0.071539,    1.236739,    0.279198,    -2.015435,    1.373275,    0.563520,    0.179841,    -0.043520,
-    -0.542017,    0.460448,    1.634191,    0.282654,    0.825215,    1.060032,    0.230761,    0.547056,
-    0.671634,    0.220117,    -0.508078,    -1.909701,    0.856352,    1.117189,    0.268503,    -1.607931,
-    0.624975,    -1.443700,    -1.047338,    -0.314551,    1.535670,    0.766433,    0.434426,    0.174865,
-    -1.917136,    1.316849,    0.469940,    0.958586,    1.274351,    0.647691,    0.638542,    0.092485,
-    1.380782,    -0.411274,    1.319843,    0.346629,    -0.909429,    -0.348980,    -2.305605,    -0.200402,
-    1.788730,    0.393261,    0.390798,    -1.852647,    0.020324,    0.996919,    -0.405977,    -0.481047,
-    -1.534895,    -0.295456,    0.221373,    -0.309043,    -1.374479,    -0.383007,    -0.839286,    1.023837,
-    -0.208643,    1.360480,    0.755913,    -0.705832,    0.375734,    -0.609368,    -1.345413,    -0.112009,
-    1.481876,    0.905851,    0.032736,    -0.592901,    1.870453,    2.144165,    -1.208991,    0.748569,
-    -0.782632,    -1.654092,    -0.767299,    -0.977911,    -0.107200,    -0.347368,    -0.977057,    -0.107734,
-    -0.963988,    -0.402626,    -2.379172,    -1.065617,    -0.838188,    0.878523,    0.257346,    0.460551,
-    -0.183834,    -1.078622,    -0.167615,    0.644741,    -0.116989,    0.605399,    0.168488,    0.055073,
-    -0.501206,    -0.005505,    -0.705076,    -0.099485,    0.508165,    -0.225578,    -0.420922,    -1.026005,
-    0.229133,    -0.732352,    -0.959497,    -1.405453,    -0.146043,    -1.119476,    0.744538,    0.186157,
-    -0.890496,    -0.314564,    0.139062,    -0.088767,    -0.236144,    -0.160919,    -0.075459,    -1.936278,
-    -0.358572,    2.751755,    -2.077635,    1.292404,    -0.143546,    -0.233895,    1.393341,    -0.193140,
-    0.651804,    -0.104019,    -0.377134,    -0.814926,    -0.661443,    -0.108576,    0.248958,    -1.569143,
-    -0.383516,    0.212114,    -0.528480,    1.678775,    0.055388,    0.379010,    1.253769,    -0.668419,
-    -2.520004,    1.727974,    0.584856,    1.693388,    -1.008064,    -0.787045,    0.944285,    -1.874471,
-    -2.423957,    0.023853,    -0.223831,    1.518454,    0.058070,    0.534477,    -0.424614,    -1.355467,
-    -0.202918,    0.280923,    -1.513077,    0.182100,    -1.126352,    -0.256567,    -0.815002,    0.858411,
-    0.366614,    0.057070,    -0.586107,    -1.462498,    1.537409,    -2.326166,    0.140072,    2.562645,
-    -1.862767,    -0.639321,    -0.454193,    0.706010,    -0.652074,    0.627374,    0.103318,    -1.465271,
-    -0.220632,    0.548954,    -0.279043,    1.894620,    -0.733662,    0.901939,    -0.064534,    -0.684842,
-    -1.444004,    -0.410065,    0.612340,    -1.834344,    -1.323503,    -0.357176,    -0.661577,    -0.081545,
-    -0.146115,    -0.557160,    0.248085,    -1.778299,    -0.076633,    0.038674,    1.738170,    1.603402,
-    1.621972,    0.428308,    0.626436,    -0.321679,    0.091814,    0.158667,    -0.807607,    -1.831225,
-    -0.461337,    1.083138,    -1.405969,    -0.442318,    -0.374530,    0.213002,    -0.470911,    -0.429068,
-    1.751296,    1.112692,    0.753225,    1.054038,    0.064989,    0.192183,    -0.292764,    -0.175647,
-    0.082823,    0.561421,    0.766191,    1.251021,    2.236850,    -0.419377,    0.326887,    -1.464906,
-    0.863304,    -0.953308,    0.679387,    1.384259,    0.554758,    -0.966553,    1.001630,    -0.002071,
-    1.259365,    0.508627,    0.044151,    0.346342,    -0.314138,    -1.396941,    0.226708,    0.520130,
-    0.996692,    -0.349830,    1.215912,    0.530292,    -0.542702,    -0.256369,    0.912228,    -1.617286,
-    -0.172141,    1.556859,    -0.335955,    0.821068,    0.541487,    0.206095,    0.932111,    -1.697353,
-    -0.570253,    -0.168337,    -1.498605,    0.828194,    -0.050346,    0.047643,    0.553025,    -0.815924,
-    0.083498,    0.927294,    1.577524,    1.072150,    -0.330774,    0.775039,    0.795155,    -1.018418,
-    -0.784800,    -1.575652,    -1.263121,    1.943766,    0.666655,    1.479345,    -1.392632,    1.581105,
-    -1.300562,    -0.514692,    -0.605022,    -0.907108,    -1.488565,    2.258803,    0.558543,    0.040773,
-    -0.277354,    0.242866,    -1.293685,    -0.346606,    -0.888435,    1.047313,    -0.986520,    -0.267101,
-    -0.071618,    -0.821778,    -2.414591,    0.035640,    -0.694349,    1.483087,    -1.391389,    0.361272,
-    0.329648,    0.623759,    0.598544,    -0.910249,    0.147175,    -2.556832,    -0.101439,    1.665057,
-    -2.634981,    -0.959581,    0.028053,    -0.516870,    -0.876310,    -0.004631,    -0.265477,    -0.435447,
-    -0.327578,    0.881754,    -1.158247,    0.497467,    0.580053,    -0.853947,    0.239756,    0.541670,
-    -0.350885,    0.551414,    0.892098,    -0.137816,    1.578299,    -0.643850,    -1.108174,    -1.300456,
-    -0.025931,    -1.254519,    -1.110628,    1.840194,    0.750834,    -0.658852,    0.500167,    -0.275497,
-    -0.517261,    1.482824,    -0.559209,    -0.008348,    -0.753371,    0.090242,    0.925813,    -1.871995,
-    -0.248520,    -2.196485,    -0.149835,    -1.042585,    -1.258415,    0.545135,    0.312620,    -1.164465,
-    2.690277,    0.796787,    0.289696,    -0.250295,    -1.422803,    -1.112213,    0.246786,    -0.273161,
-    -1.435773,    -1.013451,    0.148573,    0.872165,    -1.693073,    -1.055581,    0.719188,    0.848015,
-    1.141773,    0.301299,    1.551936,    -0.682287,    1.383630,    -0.507902,    -0.758092,    -1.029466,
-    0.442663,    -0.285836,    0.911098,    -1.676208,    -1.074086,    -0.497489,    0.201762,    -0.386898,
-    0.762863,    0.043459,    -1.288187,    -0.655169,    -0.952962,    -0.146682,    0.778175,    0.085724,
-    -0.006331,    -0.961628,    0.524487,    0.459634,    1.364272,    -0.516323,    0.482039,    -0.735290,
-    -0.787066,    1.470784,    0.751999,    0.997273,    -0.166888,    1.306983,    -0.816228,    0.101254,
-    2.094065,    1.577574,    0.080153,    2.966203,    -0.937295,    -0.293681,    0.635739,    1.343905,
-    1.682028,    -0.749792,    0.593634,    -0.698793,    0.790153,    -1.302117,    0.105254,    -0.171760,
-    -0.158579,    0.711281,    0.870907,    -0.161837,    -0.194759,    0.203779,    0.075474,    0.314225,
-    -0.526635,    0.216177,    -0.685484,    0.249631,    -0.268388,    -1.610941,    -1.188346,    -0.451156,
-    0.248579,    -1.600001,    0.102452,    -0.145813,    -0.041007,    1.192038,    -2.247582,    0.285689,
-    -0.510776,    0.951135,    0.249243,    -0.965380,    0.369197,    -1.109424,    0.179197,    -0.616816,
-    -0.037283,    -1.160418,    -1.603310,    0.271828,    0.339372,    -1.964992,    -0.131135,    -0.199710,
-    0.485190,    1.792235,    0.598751,    -0.079401,    -0.086031,    0.764729,    0.325292,    0.660399,
-    -0.335143,    -1.688575,    -0.322449,    -0.429974,    -0.382374,    0.072841,    -0.953371,    1.479787,
-    0.233576,    -0.178427,    1.235245,    -1.206583,    -0.578532,    0.391987,    -0.501537,    -0.046549,
-    0.722864,    0.952528,    0.039498,    0.492656,    1.541279,    0.307890,    -1.701053,    -1.667987,
-    -1.033741,    0.978541,    -0.763708,    -0.857147,    2.176426,    -0.442284,    0.431612,    0.503775,
-    -0.443765,    -0.188553,    0.029996,    -0.521717,    -0.315671,    0.211892,    0.977846,    -0.686392,
-    0.018295,    -0.884268,    0.817963,    -0.059569,    0.702341,    -2.475835,    -0.231271,    0.565874,
-    -0.113690,    -0.925429,    0.127941,    -0.941007,    -0.799410,    -0.190420,    -0.238612,    0.128090,
-    -0.089463,    -0.067882,    -1.023264,    1.471262,    0.937538,    1.067682,    -1.131719,    0.229875,
-    -0.710702,    -0.005993,    -1.169501,    -1.168195,    1.065437,    -0.901779,    -0.680394,    0.323208,
-    -1.725773,    -0.012327,    0.813200,    0.554138,    1.441867,    0.062695,    0.672272,    -0.642997,
-    0.138665,    -0.331304,    -0.859534,    -0.267175,    -0.752251,    -0.247761,    1.229615,    0.777400,
-    1.150754,    0.343907,    -0.608025,    0.863760,    0.806158,    0.858534,    0.217133,    0.687307,
-    -0.373461,    -1.299311,    -0.832030,    0.603825,    0.286866,    -1.623527,    -1.818892,    -0.620491,
-    -1.573051,    0.643601,    2.015666,    -1.145666,    -0.071982,    0.844191,    2.628909,    -0.042906,
-    -0.243317,    -0.504335,    0.173276,    -0.443272,    0.923207,    2.083052,    -0.178553,    1.858875,
-    -0.521705,    0.926594,    1.431962,    0.295415,    -0.870117,    -0.266329,    0.807542,    0.742388,
-    -0.510635,    -0.080934,    0.743514,    0.935612,    0.847898,    -0.835204,    -0.829901,    -0.745189,
-    0.532994,    1.361685,    1.032848,    -0.306150,    -1.052024,    0.878438,    0.362114,    -1.100646,
-    -0.036787,    -0.489116,    -1.227636,    -1.350240,    -0.275099,    0.787780,    -0.160435,    0.823409,
-    -1.083575,    -0.679319,    -1.954213,    0.597177,    -0.909487,    -1.171166,    -0.005579,    2.037004,
-    -1.723490,    -0.440698,    1.263077,    -0.278440,    -0.600433,    0.270728,    -2.063925,    0.400994,
-    0.110911,    0.073894,    1.487614,    -1.040991,    0.053002,    -1.453535,    0.161981,    0.234838,
-    -0.026878,    1.049677,    0.173576,    0.341401,    0.882168,    -0.992679,    0.182294,    -1.617417,
-    0.755295,    -0.444344,    0.508035,    -1.055734,    0.131880,    -1.498971,    0.280104,    0.178499,
-    -0.982848,    -0.957286,    -0.944087,    1.314400,    -0.013058,    0.030501,    0.354345,    0.072074,
-    -0.894709,    0.555023,    0.812111,    -0.729819,    0.109537,    1.096371,    2.731644,    1.335793,
-    0.411079,    0.411439,    -1.306862,    1.632891,    0.383806,    0.243401,    0.499504,    -0.003108,
-    -0.510786,    -0.738833,    0.234922,    -1.767899,    -0.597825,    1.794224,    0.020771,    1.281544,
-    0.419443,    0.128371,    1.191104,    -0.214895,    0.771214,    -0.370359,    -2.644222,    -1.158590,
-    0.285430,    -1.478329,    0.826093,    -1.475635,    -0.008122,    0.651251,    0.858438,    -0.092348,
-    0.774788,    -0.367252,    1.305945,    0.817150,    1.231503,    1.235605,    0.958564,    0.336264,
-    -1.654548,    0.231398,    -0.990396,    0.046288,    0.685236,    -0.313591,    -0.974870,    -1.073320,
-    -0.606726,    -0.063315,    0.686794,    0.915108,    0.020049,    -1.675039,    1.063801,    0.918174,
-    -1.341050,    1.023589,    0.479510,    -0.904933,    -1.633974,    -1.921451,    -1.442665,    -0.136733,
-    0.293781,    1.363955,    -0.140364,    0.783375,    -1.130341,    0.527358,    -0.292538,    -0.746975,
-    -0.582536,    1.711351,    -0.896348,    -0.151251,    0.248601,    1.519014,    -1.489663,    -0.399837,
-    0.313509,    -2.012764,    -2.025084,    0.714259,    0.528990,    -1.927481,    0.343471,    -0.873411,
-    0.758193,    -0.361042,    -0.691940,    -1.607898,    0.680179,    -0.776993,    -1.072541,    -0.320873,
-    0.899772,    -1.313487,    -2.123092,    -0.108506,    0.284712,    -1.017612,    -0.733323,    1.300697,
-    -0.773376,    1.216150,    0.151842,    -1.046754,    -0.336843,    0.123953,    0.970761,    -1.106525,
-    -0.107236,    0.490938,    1.013492,    -1.681596,    -0.475347,    -0.171544,    0.068948,    0.723101,
-    0.398592,    -0.777245,    1.116326,    -0.093156,    0.620451,    0.167638,    -0.287674,    -0.637968,
-    -1.371773,    -0.104036,    -0.685868,    0.631968,    0.331685,    -1.687695,    -0.997722,    -0.517832,
-    0.291418,    0.086520,    1.107078,    2.199959,    0.244959,    0.760919,    0.164976,    -1.456448,
-    0.406231,    -1.774895,    1.215981,    0.295850,    1.448424,    1.018757,    -1.025137,    -0.643993,
-    0.205418,    -1.111593,    0.588882,    1.458524,    -0.264024,    0.103186,    2.495318,    -0.638423,
-    0.855948,    -0.025377,    -0.850954,    -1.301284,    0.811879,    0.344693,    0.700242,    -1.360544,
-    0.759938,    0.235772,    -1.712909,    2.432551,    1.537021,    -0.352882,    -1.609847,    -0.253408,
-    1.109526,    -0.078679,    -1.109704,    -1.203886,    0.385469,    0.454205,    0.965231,    0.669661,
-    0.818297,    -0.402472,    0.037049,    0.759026,    -0.926012,    1.281841,    -0.111919,    0.803598,
-    -0.803030,    -1.204083,    -1.665006,    -0.826183,    -0.901401,    -0.711036,    0.588350,    0.436303,
-    0.554159,    1.021926,    -0.415173,    -0.362657,    0.061795,    -0.298298,    0.457432,    0.733463,
-    0.199014,    0.340668,    0.257558,    -1.106307,    2.080730,    -2.043328,    -2.277237,    -0.358905,
-    0.339022,    0.595400,    0.289894,    0.375452,    0.662261,    1.202134,    -0.580860,    0.543575,
-    0.887752,    0.288461,    0.171871,    -0.665957,    0.848821,    -0.151442,    0.963769,    -0.659762,
-    1.321918,    -1.980876,    -0.064345,    -1.824813,    1.317053,    -0.255301,    0.228017,    -0.826776,
-    -1.429637,    1.532493,    -0.149701,    1.704903,    -0.504968,    -0.214990,    -1.729141,    1.705440,
-    -0.417472,    0.371870,    -0.614969,    -0.264290,    0.720777,    2.503227,    0.339364,    0.735706,
-    0.882845,    -1.099957,    0.284245,    -1.292489,    -0.145541,    1.249176,    -0.089646,    0.198285,
-    0.289161,    -0.704900,    1.164831,    0.384689,    0.805729,    -0.744461,    -1.355643,    -0.085510,
-    0.120893,    -0.760827,    -0.222178,    0.588159,    0.571732,    -0.488786,    -0.300140,    -0.790720,
-    1.134277,    0.186925,    -0.179356,    1.323236,    -1.467067,    -0.252240,    1.395346,    0.394448,
-    0.440836,    1.221421,    0.565384,    -0.630894,    -0.693623,    -0.172785,    0.833869,    0.590400,
-    -2.237378,    0.485708,    1.097644,    -0.345472,    -0.001617,    0.387311,    -1.614573,    0.004570,
-    -1.228727,    0.384520,    0.207405,    -1.412140,    0.220942,    -1.196011,    -1.006073,    0.047957,
-    -0.453067,    0.422308,    1.399453,    1.080871,    -0.461964,    -0.072034,    0.032716,    -0.752875,
-    0.798783,    -0.555757,    0.896816,    -1.304965,    0.137892,    -0.112053,    -1.619146,    0.367034,
-    -1.646606,    -0.327046,    0.428707,    -0.336445,    -0.737231,    -0.388655,    0.564926,    1.680910,
-    -1.384167,    0.707246,    0.460268,    1.030518,    0.629384,    0.305059,    0.379847,    -1.121984,
-    -1.013330,    -0.122902,    -0.347243,    -0.693724,    0.441912,    0.875911,    -1.590240,    -1.094234,
-    -0.701417,    0.925002,    -1.077601,    -0.229572,    1.002220,    0.225260,    1.729481,    -0.335907,
-    0.709032,    1.218315,    -0.747897,    -0.096137,    0.228862,    0.120568,    -0.223497,    1.004884,
-    -0.853275,    -0.657371,    0.345627,    0.405173,    0.109764,    0.890271,    -1.133039,    1.449045,
-    -0.683124,    1.382923,    -0.277856,    1.176089,    0.654790,    -1.729798,    -1.248394,    0.104649,
-    -0.597539,    -1.487626,    -0.481813,    -1.743067,    0.983372,    -0.510919,    1.762121,    -0.067293,
-    1.427402,    -0.063941,    0.911763,    -2.196356,    0.326823,    1.106144,    0.069619,    1.526127,
-    -1.499763,    -0.687166,    -0.418223,    1.160927,    -0.021037,    -0.425076,    0.228425,    -0.060661,
-    -1.008196,    -1.899981,    -0.664622,    1.219038,    0.558177,    0.901112,    -1.188542,    0.823237,
-    -0.775481,    1.882210,    0.271042,    0.238406,    1.534976,    -0.429217,    -1.052283,    -1.797562,
-    0.625559,    1.467291,    -0.797626,    1.030351,    -0.313522,    0.892838,    -0.602210,    1.395587,
-    1.259060,    0.416488,    0.858484,    1.545120,    -2.105292,    0.664929,    -0.360937,    0.706299,
-    0.553557,    2.759293,    -1.556384,    -0.051700,    -0.206666,    -0.839668,    -0.425568,    1.555326,
-    0.493778,    0.149258,    -0.870908,    -1.684651,    0.079828,    -0.569951,    -0.521619,    0.488593,
-    -1.413861,    -0.029233,    -0.384293,    -2.238255,    -0.457922,    -2.117238,    -0.291471,    0.152666,
-    -0.301224,    -1.353589,    -1.588594,    -0.206453,    1.094287,    -1.204119,    1.324167,    -0.436854,
-    -0.126480,    0.047149,    -0.737164,    2.478964,    0.213719,    -1.288683,    -0.400529,    0.565879,
-    0.064938,    -0.489134,    -1.757996,    0.571975,    1.686748,    -0.533281,    0.327400,    0.764733,
-    0.715967,    -1.748576,    1.598648,    -0.729925,    -2.064741,    -0.004472,    -0.743632,    0.535993,
-    0.176185,    -0.021122,    0.527839,    -0.669683,    -0.553153,    -0.056435,    0.298280,    -0.213079,
-    -1.226607,    0.432893,    -0.189676,    -0.065721,    -0.301713,    -2.272297,    0.956956,    -1.046249,
-    -0.533366,    -0.478385,    -0.901082,    -0.765758,    -0.892552,    -0.093739,    0.278717,    -1.139068,
-    -0.745807,    -0.691504,    1.603464,    -3.596550,    0.574270,    0.463068,    0.320655,    -1.966329,
-    -0.151383,    1.222704,    0.315762,    0.237313,    1.343703,    -1.015985,    -2.237832,    0.640365
-};
-
-/* ----------------------------------------------------------------------
-** Defines each of the tests performed
-** ------------------------------------------------------------------- */
-
-typedef struct
-{
-    ne10_uint32_t fftSize;      /* FFT length passed to ne10_cfft_radix4_init_float(); the test copies 2 * fftSize floats of input */
-    ne10_uint32_t ifftFlag;     /* third argument of ne10_cfft_radix4_init_float(); every entry in this file uses 0 */
-    ne10_uint32_t doBitReverse; /* NOTE(review): not read in the visible part of test_cfft_case0 -- confirm whether anything consumes it */
-    ne10_float32_t *inputF32;   /* input samples for the case; every CONFIG_CFFT / CONFIG_CFFT_PERF entry points at &testInput_f32[0] */
-} test_config_cfft;
-
-static test_config_cfft CONFIG_CFFT[] = /* cases walked by the SMOKE_TEST / REGRESSION_TEST loop in test_cfft_case0 */
-{
-    {1024, 0, 1, &testInput_f32[0]}, /* field order: fftSize, ifftFlag, doBitReverse, inputF32 */
-    {256, 0, 1, &testInput_f32[0]},
-    {64, 0, 1, &testInput_f32[0]},
-    {16, 0, 1, &testInput_f32[0]},
-};
-static test_config_cfft CONFIG_CFFT_PERF[] = /* same four sizes; presumably the PERFORMANCE_TEST cases -- usage not visible here, confirm */
-{
-    {1024, 0, 1, &testInput_f32[0]},
-    {256, 0, 1, &testInput_f32[0]},
-    {64, 0, 1, &testInput_f32[0]},
-    {16, 0, 1, &testInput_f32[0]},
-};
-
-#define CFFT_NUM_TESTS (sizeof(CONFIG_CFFT) / sizeof(CONFIG_CFFT[0]) ) /* number of entries in CONFIG_CFFT */
-#define CFFT_NUM_PERF_TESTS (sizeof(CONFIG_CFFT_PERF) / sizeof(CONFIG_CFFT_PERF[0]) ) /* number of entries in CONFIG_CFFT_PERF */
-
-// Input and output buffers, one set for the C routine and one for the NEON routine.
-static ne10_float32_t * guarded_in_c = NULL;     // second argument of NE10_SRC_ALLOC for in_c; presumably the padded raw allocation -- macro not visible, confirm
-static ne10_float32_t * guarded_in_neon = NULL;  // same role for in_neon
-static ne10_float32_t * in_c = NULL;             // input handed to ne10_radix4_butterfly_float_c
-static ne10_float32_t * in_neon = NULL;          // input handed to ne10_radix4_butterfly_float_neon
-
-static ne10_float32_t * guarded_out_c = NULL;    // second argument of NE10_DST_ALLOC for out_c
-static ne10_float32_t * guarded_out_neon = NULL; // second argument of NE10_DST_ALLOC for out_neon
-static ne10_float32_t * out_c = NULL;            // output of the C routine; wrapped by GUARD_ARRAY / CHECK_ARRAY_GUARD
-static ne10_float32_t * out_neon = NULL;         // output of the NEON routine; wrapped by GUARD_ARRAY / CHECK_ARRAY_GUARD
-
-static ne10_float32_t snr = 0.0f;                // NOTE(review): presumably the signal-to-noise ratio between C and NEON outputs -- use not visible here
-
-#ifdef PERFORMANCE_TEST                          // timing state is compiled only for performance builds
-static ne10_int64_t time_c = 0;
-static ne10_int64_t time_neon = 0;
-static ne10_int64_t time_overhead_c = 0;
-static ne10_int64_t time_overhead_neon = 0;
-static ne10_float32_t time_speedup = 0.0f;
-static ne10_float32_t time_savings = 0.0f;
-#endif
-
-/*
- * Conformance and performance test for the radix-4 complex float butterflies.
- *
- * Conformance (built when SMOKE_TEST or REGRESSION_TEST is defined): for every
- * entry of CONFIG_CFFT the forward butterfly is run through both the C and the
- * NEON implementation on the same input, the guard regions around the output
- * buffers are checked, and the two outputs are compared by SNR and element by
- * element. The forward outputs are then fed to the inverse butterflies and
- * compared the same way.
- *
- * Performance (built when PERFORMANCE_TEST is defined): for every entry of
- * CONFIG_CFFT_PERF the forward and inverse butterflies are timed over
- * TEST_COUNT iterations; the cost of the input-copy loop is timed separately
- * and subtracted before the results are logged with ne10_log().
- */
-void test_cfft_case0 (void)
-{
-    ne10_cfft_radix4_instance_f32_t S;
-
-    ne10_uint16_t loop = 0;
-    ne10_uint32_t k = 0; /* 32-bit so the timing loops terminate even if TEST_COUNT exceeds 65535 */
-    ne10_uint16_t i = 0;
-    ne10_uint16_t pos = 0;
-
-    test_config_cfft *config;
-    ne10_result_t status = NE10_OK;
-
-    fprintf (stdout, "----------%30s start\n", __FUNCTION__);
-
-    /* init input memory */
-    NE10_SRC_ALLOC (in_c, guarded_in_c, TEST_LENGTH_SAMPLES); // 16 extra bytes at the beginning and 16 extra bytes at the end
-    NE10_SRC_ALLOC (in_neon, guarded_in_neon, TEST_LENGTH_SAMPLES); // 16 extra bytes at the beginning and 16 extra bytes at the end
-
-    /* init dst memory */
-    NE10_DST_ALLOC (out_c, guarded_out_c, TEST_LENGTH_SAMPLES);
-    NE10_DST_ALLOC (out_neon, guarded_out_neon, TEST_LENGTH_SAMPLES);
-
-    /* both macros must go through defined(): a bare (REGRESSION_TEST) tests the
-       macro's value and does not preprocess when it is defined empty */
-#if defined (SMOKE_TEST) || defined (REGRESSION_TEST)
-    for (loop = 0; loop < CFFT_NUM_TESTS; loop++)
-    {
-        config = &CONFIG_CFFT[loop];
-
-        /* Initialize the CFFT/CIFFT module */
-        status = ne10_cfft_radix4_init_float (&S, config->fftSize, config->ifftFlag);
-
-        if (status == NE10_ERR)
-        {
-            printf ("fft init error!\n");
-            /* S cannot be trusted after a failed init; skip this configuration */
-            continue;
-        }
-
-        /* copy input to input buffer and clear the output buffer */
-        for (i = 0; i < 2 * config->fftSize; i++)
-        {
-            in_c[i] = testInput_f32[i];
-            in_neon[i] = testInput_f32[i];
-        }
-
-        /* FFT test */
-        GUARD_ARRAY (out_c, config->fftSize * 2);
-        GUARD_ARRAY (out_neon, config->fftSize * 2);
-
-        ne10_radix4_butterfly_float_c (out_c, in_c, S.fft_len, S.p_twiddle);
-        ne10_radix4_butterfly_float_neon (out_neon, in_neon, S.fft_len, S.p_twiddle);
-
-        CHECK_ARRAY_GUARD (out_c, config->fftSize * 2);
-        CHECK_ARRAY_GUARD (out_neon, config->fftSize * 2);
-
-        //conformance test 1: compare snr
-        snr = CAL_SNR_FLOAT32 (out_c, out_neon, 2 * config->fftSize);
-        assert_false ( (snr < SNR_THRESHOLD));
-
-        //conformance test 2: compare output of C and neon
-#if defined (DEBUG_TRACE)
-        printf ("--------------------config %d\n", loop);
-        printf ("fftSize: %d ifftFlag: %d\n", config->fftSize, config->ifftFlag);
-#endif
-        for (pos = 0; pos < config->fftSize * 2; pos++)
-        {
-#if defined (DEBUG_TRACE)
-            printf ("pos %d \n", pos);
-            printf ("c %f (0x%04X) neon %f (0x%04X)\n", out_c[pos], * (ne10_uint32_t*) &out_c[pos], out_neon[pos], * (ne10_uint32_t*) &out_neon[pos]);
-#endif
-            assert_float_vec_equal (&out_c[pos], &out_neon[pos], ERROR_MARGIN_LARGE, 1);
-        }
-
-        /* IFFT test */
-        /* the forward outputs become the inputs of the inverse transform */
-        for (i = 0; i < 2 * config->fftSize; i++)
-        {
-            in_c[i] = out_c[i];
-            in_neon[i] = out_neon[i];
-        }
-
-        GUARD_ARRAY (out_c, config->fftSize * 2);
-        GUARD_ARRAY (out_neon, config->fftSize * 2);
-
-        ne10_radix4_butterfly_inverse_float_c (out_c, in_c, S.fft_len, S.p_twiddle, S.one_by_fft_len);
-        ne10_radix4_butterfly_inverse_float_neon (out_neon, in_neon, S.fft_len, S.p_twiddle, S.one_by_fft_len);
-
-        CHECK_ARRAY_GUARD (out_c, config->fftSize * 2);
-        CHECK_ARRAY_GUARD (out_neon, config->fftSize * 2);
-
-        //conformance test 1: compare snr
-        snr = CAL_SNR_FLOAT32 (out_c, out_neon, 2 * config->fftSize);
-        assert_false ( (snr < SNR_THRESHOLD));
-
-        //conformance test 2: compare output of C and neon
-#if defined (DEBUG_TRACE)
-        printf ("--------------------config %d\n", loop);
-        printf ("fftSize: %d ifftFlag: %d\n", config->fftSize, config->ifftFlag);
-        printf ("snr: %f\n", snr);
-#endif
-        for (pos = 0; pos < config->fftSize * 2; pos++)
-        {
-#if defined (DEBUG_TRACE)
-            printf ("pos %d \n", pos);
-            printf ("c %f (0x%04X) neon %f (0x%04X)\n", out_c[pos], * (ne10_uint32_t*) &out_c[pos], out_neon[pos], * (ne10_uint32_t*) &out_neon[pos]);
-#endif
-            assert_float_vec_equal (&out_c[pos], &out_neon[pos], ERROR_MARGIN_LARGE, 1);
-        }
-    }
-#endif
-
-#ifdef PERFORMANCE_TEST
-    fprintf (stdout, "%25s%20s%20s%20s%20s\n", "FFT Length", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
-    for (loop = 0; loop < CFFT_NUM_PERF_TESTS; loop++)
-    {
-        config = &CONFIG_CFFT_PERF[loop];
-
-        /* Initialize the CFFT/CIFFT module */
-        status = ne10_cfft_radix4_init_float (&S, config->fftSize, config->ifftFlag);
-
-        if (status == NE10_ERR)
-        {
-            printf ("fft init error!\n");
-            /* S cannot be trusted after a failed init; skip this configuration */
-            continue;
-        }
-
-        /* FFT test */
-        /* time the input copy alone so it can be subtracted from the measurement below */
-        GET_TIME
-        (
-            time_overhead_c,
-        {
-            for (k = 0; k < TEST_COUNT; k++)
-            {
-                for (i = 0; i < 2 * config->fftSize; i++)
-                {
-                    in_c[i] = testInput_f32[i];
-                }
-            }
-        }
-        );
-
-        GET_TIME
-        (
-            time_c,
-        {
-            for (k = 0; k < TEST_COUNT; k++)
-            {
-                for (i = 0; i < 2 * config->fftSize; i++)
-                {
-                    in_c[i] = testInput_f32[i];
-                }
-                ne10_radix4_butterfly_float_c (out_c, in_c, S.fft_len, S.p_twiddle);
-            }
-        }
-        );
-
-        GET_TIME
-        (
-            time_overhead_neon,
-        {
-            for (k = 0; k < TEST_COUNT; k++)
-            {
-                for (i = 0; i < 2 * config->fftSize; i++)
-                {
-                    in_neon[i] = testInput_f32[i];
-                }
-            }
-        }
-        );
-
-        GET_TIME
-        (
-            time_neon,
-        {
-            for (k = 0; k < TEST_COUNT; k++)
-            {
-                for (i = 0; i < 2 * config->fftSize; i++)
-                {
-                    in_neon[i] = testInput_f32[i];
-                }
-                ne10_radix4_butterfly_float_neon (out_neon, in_neon, S.fft_len, S.p_twiddle);
-            }
-        }
-        );
-
-        time_c = time_c - time_overhead_c;
-        time_neon = time_neon - time_overhead_neon;
-        time_speedup = (ne10_float32_t) time_c / time_neon;
-        time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
-        ne10_log (__FUNCTION__, "CFFT%21d%20lld%20lld%19.2f%%%18.2f:1\n", S.fft_len, time_c, time_neon, time_savings, time_speedup);
-
-        /* IFFT test */
-        GET_TIME
-        (
-            time_overhead_c,
-        {
-            for (k = 0; k < TEST_COUNT; k++)
-            {
-                for (i = 0; i < 2 * config->fftSize; i++)
-                {
-                    in_c[i] = out_c[i];
-                }
-            }
-        }
-        );
-
-        GET_TIME
-        (
-            time_c,
-        {
-            for (k = 0; k < TEST_COUNT; k++)
-            {
-                for (i = 0; i < 2 * config->fftSize; i++)
-                {
-                    in_c[i] = out_c[i];
-                }
-                ne10_radix4_butterfly_inverse_float_c (out_c, in_c, S.fft_len, S.p_twiddle, S.one_by_fft_len);
-            }
-        }
-        );
-
-        GET_TIME
-        (
-            time_overhead_neon,
-        {
-            for (k = 0; k < TEST_COUNT; k++)
-            {
-                for (i = 0; i < 2 * config->fftSize; i++)
-                {
-                    in_neon[i] = out_neon[i];
-                }
-            }
-        }
-        );
-
-        GET_TIME
-        (
-            time_neon,
-        {
-            for (k = 0; k < TEST_COUNT; k++)
-            {
-                for (i = 0; i < 2 * config->fftSize; i++)
-                {
-                    in_neon[i] = out_neon[i];
-                }
-                ne10_radix4_butterfly_inverse_float_neon (out_neon, in_neon, S.fft_len, S.p_twiddle, S.one_by_fft_len);
-            }
-        }
-        );
-
-        time_c = time_c - time_overhead_c;
-        time_neon = time_neon - time_overhead_neon;
-        time_speedup = (ne10_float32_t) time_c / time_neon;
-        time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
-        ne10_log (__FUNCTION__, "CIFFT%20d%20lld%20lld%19.2f%%%18.2f:1\n", S.fft_len, time_c, time_neon, time_savings, time_speedup);
-    }
-#endif
-
-    /* release through the guarded_* pointers, not the in_* / out_* pointers used above */
-    free (guarded_in_c);
-    free (guarded_in_neon);
-    free (guarded_out_c);
-    free (guarded_out_neon);
-    fprintf (stdout, "----------%30s end\n", __FUNCTION__);
-}
-
-/* Test entry point passed to run_test(); runs the single CFFT test case. */
-void test_cfft (void)
-{
-    test_cfft_case0();
-}
-
-/* Setup function registered with fixture_setup(): assigns ne10_log_buffer to ne10_log_buffer_ptr. */
-static void my_test_setup (void)
-{
-    ne10_log_buffer_ptr = ne10_log_buffer;
-}
-
-/* Test fixture for the CFFT suite: registers my_test_setup and runs test_cfft. */
-void test_fixture_cfft (void)
-{
-    /* open the fixture */
-    test_fixture_start();
-
-    /* register the setup function */
-    fixture_setup (my_test_setup);
-
-    /* execute the test */
-    run_test (test_cfft);
-
-    /* close the fixture */
-    test_fixture_end();
-}
index 597abf4..05a108c 100644 (file)
 ** ------------------------------------------------------------------- */
 
 /* Max FFT Length and double buffer for real and imag */
-#define TEST_LENGTH_SAMPLES (16384)
+#define TEST_LENGTH_SAMPLES (32768)
 #define MIN_LENGTH_SAMPLES_CPX (4)
 #define MIN_LENGTH_SAMPLES_REAL (MIN_LENGTH_SAMPLES_CPX*2)
 
-#define TEST_COUNT 250000
+#define TEST_COUNT 10000000
 
 /* ----------------------------------------------------------------------
 ** Test input data for F32
@@ -62,18 +62,26 @@ static ne10_float32_t testInput_f32[TEST_LENGTH_SAMPLES * 2];
 ** ------------------------------------------------------------------- */
 
 //input and output
-static ne10_float32_t * in = NULL;
 static ne10_float32_t * guarded_in_c = NULL;
 static ne10_float32_t * guarded_in_neon = NULL;
 static ne10_float32_t * in_c = NULL;
 static ne10_float32_t * in_neon = NULL;
+static ne10_float32_t * in_c2 = NULL;
+static ne10_float32_t * in_neon2 = NULL;
+static ne10_float32_t * guarded_in_c2 = NULL;
+static ne10_float32_t * guarded_in_neon2 = NULL;
 
 static ne10_float32_t * guarded_out_c = NULL;
 static ne10_float32_t * guarded_out_neon = NULL;
 static ne10_float32_t * out_c = NULL;
 static ne10_float32_t * out_neon = NULL;
+static ne10_float32_t * guarded_out_c2 = NULL;
+static ne10_float32_t * guarded_out_neon2 = NULL;
+static ne10_float32_t * out_c2 = NULL;
+static ne10_float32_t * out_neon2 = NULL;
 
 static ne10_float32_t snr = 0.0f;
+static ne10_float32_t snr2 = 0.0f;
 
 static ne10_int64_t time_c = 0;
 static ne10_int64_t time_neon = 0;
@@ -81,6 +89,8 @@ static ne10_int64_t time_overhead_c = 0;
 static ne10_int64_t time_overhead_neon = 0;
 static ne10_float32_t time_speedup = 0.0f;
 static ne10_float32_t time_savings = 0.0f;
+static ne10_int64_t time_c2 = 0;
+static ne10_int64_t time_neon2 = 0;
 
 void test_fft_c2c_1d_float32_conformance()
 {
@@ -190,7 +200,7 @@ void test_fft_c2c_1d_float32_performance()
         memcpy (in_c, testInput_f32, 2 * fftSize * sizeof (ne10_float32_t));
         memcpy (in_neon, testInput_f32, 2 * fftSize * sizeof (ne10_float32_t));
         cfg = ne10_fft_alloc_c2c_float32 (fftSize);
-        test_loop = TEST_COUNT/fftSize;
+        test_loop = TEST_COUNT / fftSize;
 
         GET_TIME
         (
@@ -365,7 +375,7 @@ void test_fft_r2c_1d_float32_performance()
         memcpy (in_c, testInput_f32, fftSize * sizeof (ne10_float32_t));
         memcpy (in_neon, testInput_f32, fftSize * sizeof (ne10_float32_t));
         cfg = ne10_fft_alloc_r2c_float32 (fftSize);
-        test_loop = TEST_COUNT/fftSize;
+        test_loop = TEST_COUNT / fftSize;
 
         GET_TIME
         (
index 5ebcd0a..cad4882 100644 (file)
@@ -119,9 +119,9 @@ void test_fft_c2c_1d_int16_conformance()
         GUARD_ARRAY_UINT8 ( (ne10_uint8_t*) out_neon, fftSize * 2 * sizeof (ne10_int16_t));
 
         ne10_fft_c2c_1d_int16_scaled_c ( (ne10_fft_cpx_int16_t*) out_c, (ne10_fft_cpx_int16_t*) in_c,
-                                  cfg->twiddles, cfg->factors, fftSize, 0);
+                                         cfg->twiddles, cfg->factors, fftSize, 0);
         ne10_fft_c2c_1d_int16_scaled_neon ( (ne10_fft_cpx_int16_t*) out_neon, (ne10_fft_cpx_int16_t*) in_neon,
-                                     cfg->twiddles, cfg->factors, fftSize, 0);
+                                            cfg->twiddles, cfg->factors, fftSize, 0);
 
         CHECK_ARRAY_GUARD_UINT8 ( (ne10_uint8_t*) out_c, fftSize * 2 * sizeof (ne10_int16_t));
         CHECK_ARRAY_GUARD_UINT8 ( (ne10_uint8_t*) out_neon, fftSize * 2 * sizeof (ne10_int16_t));
@@ -143,9 +143,9 @@ void test_fft_c2c_1d_int16_conformance()
         GUARD_ARRAY_UINT8 ( (ne10_uint8_t*) out_neon, fftSize * 2 * sizeof (ne10_int16_t));
 
         ne10_fft_c2c_1d_int16_scaled_c ( (ne10_fft_cpx_int16_t*) out_c, (ne10_fft_cpx_int16_t*) in_c,
-                                  cfg->twiddles, cfg->factors, fftSize, 1);
+                                         cfg->twiddles, cfg->factors, fftSize, 1);
         ne10_fft_c2c_1d_int16_scaled_neon ( (ne10_fft_cpx_int16_t*) out_neon, (ne10_fft_cpx_int16_t*) in_neon,
-                                     cfg->twiddles, cfg->factors, fftSize, 1);
+                                            cfg->twiddles, cfg->factors, fftSize, 1);
 
         CHECK_ARRAY_GUARD_UINT8 ( (ne10_uint8_t*) out_c, fftSize * 2 * sizeof (ne10_int16_t));
         CHECK_ARRAY_GUARD_UINT8 ( (ne10_uint8_t*) out_neon, fftSize * 2 * sizeof (ne10_int16_t));
@@ -205,7 +205,7 @@ void test_fft_c2c_1d_int16_performance()
         memcpy (in_c, testInput_i16, 2 * fftSize * sizeof (ne10_int16_t));
         memcpy (in_neon, testInput_i16, 2 * fftSize * sizeof (ne10_int16_t));
         cfg = ne10_fft_alloc_c2c_int16 (fftSize);
-        test_loop = TEST_COUNT/fftSize;
+        test_loop = TEST_COUNT / fftSize;
 
         GET_TIME
         (
@@ -397,7 +397,7 @@ void test_fft_r2c_1d_int16_performance()
         memcpy (in_c, testInput_i16, fftSize * sizeof (ne10_int16_t));
         memcpy (in_neon, testInput_i16, fftSize * sizeof (ne10_int16_t));
         cfg = ne10_fft_alloc_r2c_int16 (fftSize);
-        test_loop = TEST_COUNT/fftSize;
+        test_loop = TEST_COUNT / fftSize;
 
         GET_TIME
         (
index 9dd8531..3cc77f9 100644 (file)
@@ -106,7 +106,7 @@ void test_fft_c2c_1d_int32_conformance()
     for (i = 0; i < TEST_LENGTH_SAMPLES * 2; i++)
     {
         testInput_i32[i] = (ne10_int32_t) (drand48() * 8192) - 4096;
-        testInput2_i32[i] = (ne10_int32_t) (drand48() * NE10_F2I32_MAX) - NE10_F2I32_MAX/2;
+        testInput2_i32[i] = (ne10_int32_t) (drand48() * NE10_F2I32_MAX) - NE10_F2I32_MAX / 2;
     }
     for (fftSize = MIN_LENGTH_SAMPLES_CPX; fftSize <= TEST_LENGTH_SAMPLES; fftSize *= 2)
     {
@@ -121,9 +121,9 @@ void test_fft_c2c_1d_int32_conformance()
         GUARD_ARRAY_UINT8 ( (ne10_uint8_t*) out_c, fftSize * 2 * sizeof (ne10_int32_t));
         GUARD_ARRAY_UINT8 ( (ne10_uint8_t*) out_neon, fftSize * 2 * sizeof (ne10_int32_t));
         ne10_fft_c2c_1d_int32_unscaled_c ( (ne10_fft_cpx_int32_t*) out_c, (ne10_fft_cpx_int32_t*) in_c,
-                                  cfg->twiddles, cfg->factors, fftSize, 0);
+                                           cfg->twiddles, cfg->factors, fftSize, 0);
         ne10_fft_c2c_1d_int32_unscaled_neon ( (ne10_fft_cpx_int32_t*) out_neon, (ne10_fft_cpx_int32_t*) in_neon,
-                                     cfg->twiddles, cfg->factors, fftSize, 0);
+                                              cfg->twiddles, cfg->factors, fftSize, 0);
         CHECK_ARRAY_GUARD_UINT8 ( (ne10_uint8_t*) out_c, fftSize * 2 * sizeof (ne10_int32_t));
         CHECK_ARRAY_GUARD_UINT8 ( (ne10_uint8_t*) out_neon, fftSize * 2 * sizeof (ne10_int32_t));
 
@@ -143,9 +143,9 @@ void test_fft_c2c_1d_int32_conformance()
         GUARD_ARRAY_UINT8 ( (ne10_uint8_t*) out_c, fftSize * 2 * sizeof (ne10_int32_t));
         GUARD_ARRAY_UINT8 ( (ne10_uint8_t*) out_neon, fftSize * 2 * sizeof (ne10_int32_t));
         ne10_fft_c2c_1d_int32_unscaled_c ( (ne10_fft_cpx_int32_t*) out_c, (ne10_fft_cpx_int32_t*) in_c,
-                                  cfg->twiddles, cfg->factors, fftSize, 1);
+                                           cfg->twiddles, cfg->factors, fftSize, 1);
         ne10_fft_c2c_1d_int32_unscaled_neon ( (ne10_fft_cpx_int32_t*) out_neon, (ne10_fft_cpx_int32_t*) in_neon,
-                                     cfg->twiddles, cfg->factors, fftSize, 1);
+                                              cfg->twiddles, cfg->factors, fftSize, 1);
         CHECK_ARRAY_GUARD_UINT8 ( (ne10_uint8_t*) out_c, fftSize * 2 * sizeof (ne10_int32_t));
         CHECK_ARRAY_GUARD_UINT8 ( (ne10_uint8_t*) out_neon, fftSize * 2 * sizeof (ne10_int32_t));
 
@@ -165,9 +165,9 @@ void test_fft_c2c_1d_int32_conformance()
         GUARD_ARRAY_UINT8 ( (ne10_uint8_t*) out_c, fftSize * 2 * sizeof (ne10_int32_t));
         GUARD_ARRAY_UINT8 ( (ne10_uint8_t*) out_neon, fftSize * 2 * sizeof (ne10_int32_t));
         ne10_fft_c2c_1d_int32_scaled_c ( (ne10_fft_cpx_int32_t*) out_c, (ne10_fft_cpx_int32_t*) in_c,
-                                  cfg->twiddles, cfg->factors, fftSize, 0);
+                                         cfg->twiddles, cfg->factors, fftSize, 0);
         ne10_fft_c2c_1d_int32_scaled_neon ( (ne10_fft_cpx_int32_t*) out_neon, (ne10_fft_cpx_int32_t*) in_neon,
-                                     cfg->twiddles, cfg->factors, fftSize, 0);
+                                            cfg->twiddles, cfg->factors, fftSize, 0);
         CHECK_ARRAY_GUARD_UINT8 ( (ne10_uint8_t*) out_c, fftSize * 2 * sizeof (ne10_int32_t));
         CHECK_ARRAY_GUARD_UINT8 ( (ne10_uint8_t*) out_neon, fftSize * 2 * sizeof (ne10_int32_t));
 
@@ -187,9 +187,9 @@ void test_fft_c2c_1d_int32_conformance()
         GUARD_ARRAY_UINT8 ( (ne10_uint8_t*) out_c, fftSize * 2 * sizeof (ne10_int32_t));
         GUARD_ARRAY_UINT8 ( (ne10_uint8_t*) out_neon, fftSize * 2 * sizeof (ne10_int32_t));
         ne10_fft_c2c_1d_int32_scaled_c ( (ne10_fft_cpx_int32_t*) out_c, (ne10_fft_cpx_int32_t*) in_c,
-                                  cfg->twiddles, cfg->factors, fftSize, 1);
+                                         cfg->twiddles, cfg->factors, fftSize, 1);
         ne10_fft_c2c_1d_int32_scaled_neon ( (ne10_fft_cpx_int32_t*) out_neon, (ne10_fft_cpx_int32_t*) in_neon,
-                                     cfg->twiddles, cfg->factors, fftSize, 1);
+                                            cfg->twiddles, cfg->factors, fftSize, 1);
         CHECK_ARRAY_GUARD_UINT8 ( (ne10_uint8_t*) out_c, fftSize * 2 * sizeof (ne10_int32_t));
         CHECK_ARRAY_GUARD_UINT8 ( (ne10_uint8_t*) out_neon, fftSize * 2 * sizeof (ne10_int32_t));
 
@@ -239,7 +239,7 @@ void test_fft_c2c_1d_int32_performance()
     for (i = 0; i < TEST_LENGTH_SAMPLES * 2; i++)
     {
         testInput_i32[i] = (ne10_int32_t) (drand48() * 8192) - 4096;
-        testInput2_i32[i] = (ne10_int32_t) (drand48() * NE10_F2I32_MAX) - NE10_F2I32_MAX/2;
+        testInput2_i32[i] = (ne10_int32_t) (drand48() * NE10_F2I32_MAX) - NE10_F2I32_MAX / 2;
     }
     for (fftSize = MIN_LENGTH_SAMPLES_CPX; fftSize <= TEST_LENGTH_SAMPLES; fftSize *= 2)
     {
@@ -249,7 +249,7 @@ void test_fft_c2c_1d_int32_performance()
         memcpy (in_c, testInput_i32, 2 * fftSize * sizeof (ne10_int32_t));
         memcpy (in_neon, testInput_i32, 2 * fftSize * sizeof (ne10_int32_t));
         cfg = ne10_fft_alloc_c2c_int32 (fftSize);
-        test_loop = TEST_COUNT/fftSize;
+        test_loop = TEST_COUNT / fftSize;
 
         GET_TIME
         (
@@ -489,7 +489,7 @@ void test_fft_r2c_1d_int32_performance()
         memcpy (in_c, testInput_i32, fftSize * sizeof (ne10_int32_t));
         memcpy (in_neon, testInput_i32, fftSize * sizeof (ne10_int32_t));
         cfg = ne10_fft_alloc_r2c_int32 (fftSize);
-        test_loop = TEST_COUNT/fftSize;
+        test_loop = TEST_COUNT / fftSize;
 
         GET_TIME
         (
diff --git a/modules/dsp/test/test_suite_rfft.c b/modules/dsp/test/test_suite_rfft.c
deleted file mode 100644 (file)
index 7542843..0000000
+++ /dev/null
@@ -1,663 +0,0 @@
-/*
- *  Copyright 2012-14 ARM Limited
- *  All rights reserved.
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions are met:
- *    * Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *    * Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *    * Neither the name of ARM Limited nor the
- *      names of its contributors may be used to endorse or promote products
- *      derived from this software without specific prior written permission.
- *
- *  THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
- *  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- *  DISCLAIMED. IN NO EVENT SHALL ARM LIMITED BE LIABLE FOR ANY
- *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * NE10 Library : test_suite_rfft.c
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-
-#include "NE10_dsp.h"
-#include "seatest.h"
-
-
-/* ----------------------------------------------------------------------
-** Global defines
-** ------------------------------------------------------------------- */
-
-/* Max FFT Length 1024 and double buffer for real and imag */
-#define TEST_LENGTH_SAMPLES (1024 * 2)
-
-#define TEST_COUNT 5000
-
-/* ----------------------------------------------------------------------
-** Test input data for F32
-** Generated by the MATLAB rand() function
-** ------------------------------------------------------------------- */
-
-static ne10_float32_t testInput_f32[TEST_LENGTH_SAMPLES] =
-{
-    -0.432565,    -1.665584,    0.125332,    0.287676,    -1.146471,    1.190915,    1.189164,    -0.037633,
-    0.327292,    0.174639,    -0.186709,    0.725791,    -0.588317,    2.183186,    -0.136396,    0.113931,
-    1.066768,    0.059281,    -0.095648,    -0.832349,    0.294411,    -1.336182,    0.714325,    1.623562,
-    -0.691776,    0.857997,    1.254001,    -1.593730,    -1.440964,    0.571148,    -0.399886,    0.689997,
-    0.815622,    0.711908,    1.290250,    0.668601,    1.190838,    -1.202457,    -0.019790,    -0.156717,
-    -1.604086,    0.257304,    -1.056473,    1.415141,    -0.805090,    0.528743,    0.219321,    -0.921902,
-    -2.170674,    -0.059188,    -1.010634,    0.614463,    0.507741,    1.692430,    0.591283,    -0.643595,
-    0.380337,    -1.009116,    -0.019511,    -0.048221,    0.000043,    -0.317859,    1.095004,    -1.873990,
-    0.428183,    0.895638,    0.730957,    0.577857,    0.040314,    0.677089,    0.568900,    -0.255645,
-    -0.377469,    -0.295887,    -1.475135,    -0.234004,    0.118445,    0.314809,    1.443508,    -0.350975,
-    0.623234,    0.799049,    0.940890,    -0.992092,    0.212035,    0.237882,    -1.007763,    -0.742045,
-    1.082295,    -0.131500,    0.389880,    0.087987,    -0.635465,    -0.559573,    0.443653,    -0.949904,
-    0.781182,    0.568961,    -0.821714,    -0.265607,    -1.187777,    -2.202321,    0.986337,    -0.518635,
-    0.327368,    0.234057,    0.021466,    -1.003944,    -0.947146,    -0.374429,    -1.185886,    -1.055903,
-    1.472480,    0.055744,    -1.217317,    -0.041227,    -1.128344,    -1.349278,    -0.261102,    0.953465,
-    0.128644,    0.656468,    -1.167819,    -0.460605,    -0.262440,    -1.213152,    -1.319437,    0.931218,
-    0.011245,    -0.645146,    0.805729,    0.231626,    -0.989760,    1.339586,    0.289502,    1.478917,
-    1.138028,    -0.684139,    -1.291936,    -0.072926,    -0.330599,    -0.843628,    0.497770,    1.488490,
-    -0.546476,    -0.846758,    -0.246337,    0.663024,    -0.854197,    -1.201315,    -0.119869,    -0.065294,
-    0.485296,    -0.595491,    -0.149668,    -0.434752,    -0.079330,    1.535152,    -0.606483,    -1.347363,
-    0.469383,    -0.903567,    0.035880,    -0.627531,    0.535398,    0.552884,    -0.203690,    -2.054325,
-    0.132561,    1.592941,    1.018412,    -1.580402,    -0.078662,    -0.681657,    -1.024553,    -1.234353,
-    0.288807,    -0.429303,    0.055801,    -0.367874,    -0.464973,    0.370961,    0.728283,    2.112160,
-    -1.357298,    -1.022610,    1.037834,    -0.389800,    -1.381266,    0.315543,    1.553243,    0.707894,
-    1.957385,    0.504542,    1.864529,    -0.339812,    -1.139779,    -0.211123,    1.190245,    -1.116209,
-    0.635274,    -0.601412,    0.551185,    -1.099840,    0.085991,    -2.004563,    -0.493088,    0.462048,
-    -0.321005,    1.236556,    -0.631280,    -2.325211,    -1.231637,    1.055648,    -0.113224,    0.379224,
-    0.944200,    -2.120427,    -0.644679,    -0.704302,    -1.018137,    -0.182082,    1.521013,    -0.038439,
-    1.227448,    -0.696205,    0.007524,    -0.782893,    0.586939,    -0.251207,    0.480136,    0.668155,
-    -0.078321,    0.889173,    2.309287,    0.524639,    -0.011787,    0.913141,    0.055941,    -1.107070,
-    0.485498,    -0.005005,    -0.276218,    1.276452,    1.863401,    -0.522559,    0.103424,    -0.807649,
-    0.680439,    -2.364590,    0.990115,    0.218899,    0.261662,    1.213444,    -0.274667,    -0.133134,
-    -1.270500,    -1.663606,    -0.703554,    0.280880,    -0.541209,    -1.333531,    1.072686,    -0.712085,
-    -0.011286,    -0.000817,    -0.249436,    0.396575,    -0.264013,    -1.664011,    -1.028975,    0.243095,
-    -1.256590,    -0.347183,    -0.941372,    -1.174560,    -1.021142,    -0.401667,    0.173666,    -0.116118,
-    1.064119,    -0.245386,    -1.517539,    0.009734,    0.071373,    0.316536,    0.499826,    1.278084,
-    -0.547816,    0.260808,    -0.013177,    -0.580264,    2.136308,    -0.257617,    -1.409528,    1.770101,
-    0.325546,    -1.119040,    0.620350,    1.269782,    -0.896043,    0.135175,    -0.139040,    -1.163395,
-    1.183720,    -0.015430,    0.536219,    -0.716429,    -0.655559,    0.314363,    0.106814,    1.848216,
-    -0.275106,    2.212554,    1.508526,    -1.945079,    -1.680543,    -0.573534,    -0.185817,    0.008934,
-    0.836950,    -0.722271,    -0.721490,    -0.201181,    -0.020464,    0.278890,    1.058295,    0.621673,
-    -1.750615,    0.697348,    0.811486,    0.636345,    1.310080,    0.327098,    -0.672993,    -0.149327,
-    -2.449018,    0.473286,    0.116946,    -0.591104,    -0.654708,    -1.080662,    -0.047731,    0.379345,
-    -0.330361,    -0.499898,    -0.035979,    -0.174760,    -0.957265,    1.292548,    0.440910,    1.280941,
-    -0.497730,    -1.118717,    0.807650,    0.041200,    -0.756209,    -0.089129,    -2.008850,    1.083918,
-    -0.981191,    -0.688489,    1.339479,    -0.909243,    -0.412858,    -0.506163,    1.619748,    0.080901,
-    -1.081056,    -1.124518,    1.735676,    1.937459,    1.635068,    -1.255940,    -0.213538,    -0.198932,
-    0.307499,    -0.572325,    -0.977648,    -0.446809,    1.082092,    2.372648,    0.229288,    -0.266623,
-    0.701672,    -0.487590,    1.862480,    1.106851,    -1.227566,    -0.669885,    1.340929,    0.388083,
-    0.393059,    -1.707334,    0.227859,    0.685633,    -0.636790,    -1.002606,    -0.185621,    -1.054033,
-    -0.071539,    0.279198,    1.373275,    0.179841,    -0.542017,    1.634191,    0.825215,    0.230761,
-    0.671634,    -0.508078,    0.856352,    0.268503,    0.624975,    -1.047338,    1.535670,    0.434426,
-    -1.917136,    0.469940,    1.274351,    0.638542,    1.380782,    1.319843,    -0.909429,    -2.305605,
-    1.788730,    0.390798,    0.020324,    -0.405977,    -1.534895,    0.221373,    -1.374479,    -0.839286,
-    -0.208643,    0.755913,    0.375734,    -1.345413,    1.481876,    0.032736,    1.870453,    -1.208991,
-    -0.782632,    -0.767299,    -0.107200,    -0.977057,    -0.963988,    -2.379172,    -0.838188,    0.257346,
-    -0.183834,    -0.167615,    -0.116989,    0.168488,    -0.501206,    -0.705076,    0.508165,    -0.420922,
-    0.229133,    -0.959497,    -0.146043,    0.744538,    -0.890496,    0.139062,    -0.236144,    -0.075459,
-    -0.358572,    -2.077635,    -0.143546,    1.393341,    0.651804,    -0.377134,    -0.661443,    0.248958,
-    -0.383516,    -0.528480,    0.055388,    1.253769,    -2.520004,    0.584856,    -1.008064,    0.944285,
-    -2.423957,    -0.223831,    0.058070,    -0.424614,    -0.202918,    -1.513077,    -1.126352,    -0.815002,
-    0.366614,    -0.586107,    1.537409,    0.140072,    -1.862767,    -0.454193,    -0.652074,    0.103318,
-    -0.220632,    -0.279043,    -0.733662,    -0.064534,    -1.444004,    0.612340,    -1.323503,    -0.661577,
-    -0.146115,    0.248085,    -0.076633,    1.738170,    1.621972,    0.626436,    0.091814,    -0.807607,
-    -0.461337,    -1.405969,    -0.374530,    -0.470911,    1.751296,    0.753225,    0.064989,    -0.292764,
-    0.082823,    0.766191,    2.236850,    0.326887,    0.863304,    0.679387,    0.554758,    1.001630,
-    1.259365,    0.044151,    -0.314138,    0.226708,    0.996692,    1.215912,    -0.542702,    0.912228,
-    -0.172141,    -0.335955,    0.541487,    0.932111,    -0.570253,    -1.498605,    -0.050346,    0.553025,
-    0.083498,    1.577524,    -0.330774,    0.795155,    -0.784800,    -1.263121,    0.666655,    -1.392632,
-    -1.300562,    -0.605022,    -1.488565,    0.558543,    -0.277354,    -1.293685,    -0.888435,    -0.986520,
-    -0.071618,    -2.414591,    -0.694349,    -1.391389,    0.329648,    0.598544,    0.147175,    -0.101439,
-    -2.634981,    0.028053,    -0.876310,    -0.265477,    -0.327578,    -1.158247,    0.580053,    0.239756,
-    -0.350885,    0.892098,    1.578299,    -1.108174,    -0.025931,    -1.110628,    0.750834,    0.500167,
-    -0.517261,    -0.559209,    -0.753371,    0.925813,    -0.248520,    -0.149835,    -1.258415,    0.312620,
-    2.690277,    0.289696,    -1.422803,    0.246786,    -1.435773,    0.148573,    -1.693073,    0.719188,
-    1.141773,    1.551936,    1.383630,    -0.758092,    0.442663,    0.911098,    -1.074086,    0.201762,
-    0.762863,    -1.288187,    -0.952962,    0.778175,    -0.006331,    0.524487,    1.364272,    0.482039,
-    -0.787066,    0.751999,    -0.166888,    -0.816228,    2.094065,    0.080153,    -0.937295,    0.635739,
-    1.682028,    0.593634,    0.790153,    0.105254,    -0.158579,    0.870907,    -0.194759,    0.075474,
-    -0.526635,    -0.685484,    -0.268388,    -1.188346,    0.248579,    0.102452,    -0.041007,    -2.247582,
-    -0.510776,    0.249243,    0.369197,    0.179197,    -0.037283,    -1.603310,    0.339372,    -0.131135,
-    0.485190,    0.598751,    -0.086031,    0.325292,    -0.335143,    -0.322449,    -0.382374,    -0.953371,
-    0.233576,    1.235245,    -0.578532,    -0.501537,    0.722864,    0.039498,    1.541279,    -1.701053,
-    -1.033741,    -0.763708,    2.176426,    0.431612,    -0.443765,    0.029996,    -0.315671,    0.977846,
-    0.018295,    0.817963,    0.702341,    -0.231271,    -0.113690,    0.127941,    -0.799410,    -0.238612,
-    -0.089463,    -1.023264,    0.937538,    -1.131719,    -0.710702,    -1.169501,    1.065437,    -0.680394,
-    -1.725773,    0.813200,    1.441867,    0.672272,    0.138665,    -0.859534,    -0.752251,    1.229615,
-    1.150754,    -0.608025,    0.806158,    0.217133,    -0.373461,    -0.832030,    0.286866,    -1.818892,
-    -1.573051,    2.015666,    -0.071982,    2.628909,    -0.243317,    0.173276,    0.923207,    -0.178553,
-    -0.521705,    1.431962,    -0.870117,    0.807542,    -0.510635,    0.743514,    0.847898,    -0.829901,
-    0.532994,    1.032848,    -1.052024,    0.362114,    -0.036787,    -1.227636,    -0.275099,    -0.160435,
-    -1.083575,    -1.954213,    -0.909487,    -0.005579,    -1.723490,    1.263077,    -0.600433,    -2.063925,
-    0.110911,    1.487614,    0.053002,    0.161981,    -0.026878,    0.173576,    0.882168,    0.182294,
-    0.755295,    0.508035,    0.131880,    0.280104,    -0.982848,    -0.944087,    -0.013058,    0.354345,
-    -0.894709,    0.812111,    0.109537,    2.731644,    0.411079,    -1.306862,    0.383806,    0.499504,
-    -0.510786,    0.234922,    -0.597825,    0.020771,    0.419443,    1.191104,    0.771214,    -2.644222,
-    0.285430,    0.826093,    -0.008122,    0.858438,    0.774788,    1.305945,    1.231503,    0.958564,
-    -1.654548,    -0.990396,    0.685236,    -0.974870,    -0.606726,    0.686794,    0.020049,    1.063801,
-    -1.341050,    0.479510,    -1.633974,    -1.442665,    0.293781,    -0.140364,    -1.130341,    -0.292538,
-    -0.582536,    -0.896348,    0.248601,    -1.489663,    0.313509,    -2.025084,    0.528990,    0.343471,
-    0.758193,    -0.691940,    0.680179,    -1.072541,    0.899772,    -2.123092,    0.284712,    -0.733323,
-    -0.773376,    0.151842,    -0.336843,    0.970761,    -0.107236,    1.013492,    -0.475347,    0.068948,
-    0.398592,    1.116326,    0.620451,    -0.287674,    -1.371773,    -0.685868,    0.331685,    -0.997722,
-    0.291418,    1.107078,    0.244959,    0.164976,    0.406231,    1.215981,    1.448424,    -1.025137,
-    0.205418,    0.588882,    -0.264024,    2.495318,    0.855948,    -0.850954,    0.811879,    0.700242,
-    0.759938,    -1.712909,    1.537021,    -1.609847,    1.109526,    -1.109704,    0.385469,    0.965231,
-    0.818297,    0.037049,    -0.926012,    -0.111919,    -0.803030,    -1.665006,    -0.901401,    0.588350,
-    0.554159,    -0.415173,    0.061795,    0.457432,    0.199014,    0.257558,    2.080730,    -2.277237,
-    0.339022,    0.289894,    0.662261,    -0.580860,    0.887752,    0.171871,    0.848821,    0.963769,
-    1.321918,    -0.064345,    1.317053,    0.228017,    -1.429637,    -0.149701,    -0.504968,    -1.729141,
-    -0.417472,    -0.614969,    0.720777,    0.339364,    0.882845,    0.284245,    -0.145541,    -0.089646,
-    0.289161,    1.164831,    0.805729,    -1.355643,    0.120893,    -0.222178,    0.571732,    -0.300140,
-    1.134277,    -0.179356,    -1.467067,    1.395346,    0.440836,    0.565384,    -0.693623,    0.833869,
-    -2.237378,    1.097644,    -0.001617,    -1.614573,    -1.228727,    0.207405,    0.220942,    -1.006073,
-    -0.453067,    1.399453,    -0.461964,    0.032716,    0.798783,    0.896816,    0.137892,    -1.619146,
-    -1.646606,    0.428707,    -0.737231,    0.564926,    -1.384167,    0.460268,    0.629384,    0.379847,
-    -1.013330,    -0.347243,    0.441912,    -1.590240,    -0.701417,    -1.077601,    1.002220,    1.729481,
-    0.709032,    -0.747897,    0.228862,    -0.223497,    -0.853275,    0.345627,    0.109764,    -1.133039,
-    -0.683124,    -0.277856,    0.654790,    -1.248394,    -0.597539,    -0.481813,    0.983372,    1.762121,
-    1.427402,    0.911763,    0.326823,    0.069619,    -1.499763,    -0.418223,    -0.021037,    0.228425,
-    -1.008196,    -0.664622,    0.558177,    -1.188542,    -0.775481,    0.271042,    1.534976,    -1.052283,
-    0.625559,    -0.797626,    -0.313522,    -0.602210,    1.259060,    0.858484,    -2.105292,    -0.360937,
-    0.553557,    -1.556384,    -0.206666,    -0.425568,    0.493778,    -0.870908,    0.079828,    -0.521619,
-    -1.413861,    -0.384293,    -0.457922,    -0.291471,    -0.301224,    -1.588594,    1.094287,    1.324167,
-    -0.126480,    -0.737164,    0.213719,    -0.400529,    0.064938,    -1.757996,    1.686748,    0.327400,
-    0.715967,    1.598648,    -2.064741,    -0.743632,    0.176185,    0.527839,    -0.553153,    0.298280,
-    -1.226607,    -0.189676,    -0.301713,    0.956956,    -0.533366,    -0.901082,    -0.892552,    0.278717,
-    -0.745807,    1.603464,    0.574270,    0.320655,    -0.151383,    0.315762,    1.343703,    -2.237832,
-    1.292906,    -0.378459,    0.002521,    0.884641,    0.582450,    -1.614244,    -1.503666,    0.573586,
-    -0.910537,    -1.631277,    -0.359138,    -0.397616,    -1.161307,    -1.109838,    0.290672,    -1.910239,
-    1.314768,    0.665319,    -0.275115,    -0.023022,    -0.907976,    -1.043657,    0.373516,    0.901532,
-    1.278539,    -0.128456,    0.612821,    1.956518,    2.266326,    -0.373959,    2.238039,    -0.159580,
-    -0.703281,    0.563477,    -0.050296,    1.163593,    0.658808,    -1.550089,    -3.029118,    0.540578,
-    -1.008998,    0.908047,    1.582303,    -0.979088,    1.007902,    0.158491,    -0.586927,    1.574082,
-    -0.516649,    1.227800,    1.583876,    -2.088950,    2.949545,    1.356125,    1.050068,    -0.767170,
-    -0.257653,    -1.371845,    -1.267656,    -0.894948,    0.589089,    1.842629,    1.347967,    -0.491253,
-    -2.177568,    0.237000,    -0.735411,    -1.779419,    0.448030,    0.581214,    0.856607,    -0.266263,
-    -0.417470,    -0.205806,    -0.174323,    0.217577,    1.684295,    0.119528,    0.650667,    2.080061,
-    -0.339225,    0.730113,    0.293969,    -0.849109,    -2.533858,    -2.378941,    -0.346276,    -0.610937,
-    -0.408192,    -1.415611,    0.227122,    0.207974,    -0.719718,    0.757762,    -1.643135,    -1.056813,
-    -0.251662,    -1.298441,    1.233255,    1.494625,    0.235938,    -1.404359,    0.658791,    -2.556613,
-    -0.534945,    3.202525,    0.439198,    -1.149901,    0.886765,    -0.283386,    1.035336,    -0.364878,
-    1.341987,    1.008872,    0.213874,    -0.299264,    0.255849,    -0.190826,    -0.079060,    0.699851,
-    -0.796540,    -0.801284,    -0.007599,    -0.726810,    -1.490902,    0.870335,    -0.265675,    -1.566695,
-    -0.394636,    -0.143855,    -2.334247,    -1.357539,    -1.815689,    1.108422,    -0.142115,    1.112757,
-    0.559264,    0.478370,    -0.679385,    0.284967,    -1.332935,    -0.723980,    -0.663600,    0.198443,
-    -1.794868,    -1.387673,    0.197768,    1.469328,    0.366493,    -0.442775,    -0.048563,    0.077709,
-    1.957910,    -0.072848,    0.938810,    -0.079608,    -0.800959,    0.309424,    1.051826,    -1.664211,
-    -1.090792,    -0.191731,    0.463401,    -0.924147,    -0.649657,    0.622893,    -1.335107,    1.047689,
-    0.863327,    -0.642411,    0.660010,    1.294116,    0.314579,    0.859573,    0.128670,    0.016568,
-    -0.072801,    -0.994310,    -0.747358,    -0.030814,    0.988355,    -0.599017,    1.476644,    -0.813801,
-    0.645040,    -1.309919,    -0.867425,    -0.474233,    0.222417,    1.871323,    0.110001,    -0.411341,
-    0.511242,    -1.199117,    -0.096361,    0.445817,    -0.295825,    -0.167996,    0.179543,    0.421118,
-    1.677678,    1.996949,    0.696964,    -1.366382,    0.363045,    -0.567044,    -1.044154,    0.697139,
-    0.484026,    -0.193751,    -0.378095,    -0.886374,    -1.840197,    -1.628195,    -1.173789,    -0.415411,
-    0.175088,    0.229433,    -1.240889,    0.700004,    0.426877,    1.454803,    -0.510186,    -0.006657,
-    -0.525496,    0.717698,    1.088374,    0.500552,    2.771790,    -0.160309,    0.429489,    -1.966817,
-    -0.546019,    -1.888395,    -0.107952,    -1.316144,    -0.672632,    -0.902365,    -0.154798,    0.947242,
-    1.550375,    0.429040,    -0.560795,    0.179304,    -0.771509,    -0.943390,    -1.407569,    -1.906131,
-    -0.065293,    0.672149,    0.206147,    -0.008124,    0.020042,    -0.558447,    1.886079,    -0.219975,
-    -1.414395,    -0.302811,    -0.569574,    -0.121495,    -0.390171,    -0.844287,    -1.737757,    -0.449520,
-    -1.547933,    -0.095776,    0.907714,    2.369602,    0.519768,    0.410525,    1.052585,    0.428784,
-    1.295088,    -0.186053,    0.130733,    -0.657627,    -0.759267,    -0.595170,    0.812400,    0.069541,
-    -1.833687,    1.827363,    0.654075,    -1.544769,    -0.375109,    0.207688,    -0.765615,    -0.106355,
-    0.338769,    1.033461,    -1.404822,    -1.030570,    -0.643372,    0.170787,    1.344839,    1.936273,
-    0.741336,    0.811980,    -0.142808,    -0.099858,    -0.800131,    0.493249,    1.237574,    1.295951,
-    -0.278196,    0.217127,    0.630728,    -0.548549,    0.229632,    0.355311,    0.521284,    -0.615971,
-    1.345803,    0.974922,    -2.377934,    -1.092319,    -0.325710,    -2.012228,    1.567660,    0.233337,
-    0.646420,    -1.129412,    0.197038,    1.696870,    0.726034,    0.792526,    0.603357,    -0.058405,
-    -1.108666,    2.144229,    -1.352821,    0.457021,    0.391175,    2.073013,    -0.323318,    1.468132,
-    -0.502399,    0.209593,    0.754800,    -0.948189,    0.613157,    1.760503,    0.088762,    2.595570,
-    -0.675470,    2.786804,    -0.016827,    0.271651,    -0.914102,    -1.951371,    -0.317418,    0.588333,
-    0.828996,    -1.674851,    -1.922293,    -0.436662,    0.044974,    2.416609,    -0.309892,    0.187583,
-    0.947699,    -0.525703,    -1.115605,    -1.592320,    1.174844,    0.485144,    1.645480,    -0.454233,
-    1.008768,    2.049403,    0.602020,    0.017860,    -1.610426,    1.238752,    0.683587,    -0.780716,
-    0.530979,    2.134498,    0.354361,    0.231700,    1.287980,    -0.013488,    -1.333345,    -0.556343,
-    0.755597,    -0.911854,    1.371684,    0.245580,    0.118845,    0.384690,    -0.070152,    -0.578309,
-    0.469308,    1.299687,    1.634798,    -0.702809,    0.807253,    -1.027451,    1.294496,    0.014930,
-    0.218705,    1.713188,    -2.078805,    0.112917,    -1.086491,    -1.558311,    0.637406,    -0.404576,
-    -0.403325,    0.084076,    -0.435349,    -0.562623,    0.878062,    -0.814650,    -0.258363,    0.493299,
-    -0.802694,    -0.008329,    0.627571,    0.154382,    2.580735,    -1.306246,    1.023526,    0.777795,
-    -0.833884,    -0.586663,    0.065664,    -0.012342,    -0.076987,    -1.558587,    1.702607,    -0.468984,
-    0.094619,    0.287071,    0.919354,    0.510136,    0.245440,    -1.400519,    0.969571,    1.593698,
-    -1.437917,    -1.534230,    -0.074710,    0.081459,    -0.843240,    -0.564640,    -0.028207,    -1.243702,
-    0.733039,    0.059580,    0.149144,    1.595857,    -0.777250,    1.550277,    1.055002,    -0.166654,
-    0.314484,    1.419571,    0.327348,    0.475653,    0.398754,    -0.072770,    1.314784,    0.978279,
-    1.722114,    -0.412302,    0.565133,    0.739851,    0.220138,    1.312807,    0.629152,    -1.107987,
-    -0.447001,    -0.725993,    0.354045,    -0.506772,    -2.103747,    -0.664684,    1.450110,    -0.329805,
-    2.701872,    -1.634939,    -0.536325,    0.547223,    1.492603,    -0.455243,    -0.496416,    1.235260,
-    0.040926,    0.748467,    1.230764,    0.304903,    1.077771,    0.765151,    -1.319580,    -0.509191,
-    0.555116,    -1.957625,    -0.760453,    -2.443886,    -0.659366,    -0.114779,    0.300079,    -0.583996,
-    -3.073745,    1.551042,    -0.407369,    1.428095,    -1.353242,    0.903970,    0.541671,    -0.465020,
-    2.430415,    2.020479,    0.797287,    0.030996,    0.540738,    0.683921,    -0.590052,    -0.261084,
-    1.517068,    1.007259,    0.303421,    -0.817081,    -0.491192,    0.867467,    0.360790,    -0.080371,
-    0.749301,    -1.791968,    1.213226,    -0.060524,    -0.392520,    0.609547,    0.643580,    1.019521,
-    0.934437,    1.228582,    -0.249486,    -0.707583,    -0.593824,    -0.262310,    1.242847,    -1.548902,
-    -0.386760,    0.275098,    0.826154,    -0.979279,    -0.104297,    0.127849,    0.062544,    0.371624,
-    -0.103963,    -0.696775,    -0.386823,    0.016134,    1.369212,    0.416877,    0.068741,    0.294187,
-    0.472633,    1.782735,    0.260577,    1.510728,    0.316968,    0.803473,    0.580874,    1.778584,
-    -0.938075,    -0.916672,    0.376006,    0.909780,    0.154250,    -0.202264,    1.488708,    -0.621639,
-    0.809537,    1.928793,    0.396057,    -0.861399,    2.431936,    -0.840518,    0.280451,    0.820416,
-    1.227828,    -0.063565,    0.645265,    -1.771318,    0.059612,    -0.760177,    -1.690901,    1.103672,
-    1.462500,    0.236213,    -1.097691,    2.415233,    -0.402112,    0.914131,    -0.135959,    1.314193,
-    0.322361,    -0.476496,    0.076162,    -0.105147,    1.417013,    0.707911,    0.367918,    -0.602844,
-    -0.852110,    0.655122,    1.470184,    -0.810403,    -1.276157,    1.722268,    0.101878,    -0.801997,
-    -1.250837,    1.237717,    1.528165,    1.776923,    0.631168,    0.083259,    2.140043,    1.263469,
-    -1.750645,    -0.014432,    2.468102,    -0.669158,    0.259927,    -0.372328,    1.318554,    -0.653081,
-    0.062179,    -0.735873,    -0.179324,    1.084675,    0.136915,    -0.015608,    -0.938491,    -1.478085,
-    0.361931,    0.477791,    0.321742,    -1.877574,    0.680526,    0.233398,    1.239492,    0.125661,
-    0.179721,    -0.605061,    -1.036850,    -0.295278,    1.456114,    1.802525,    -1.333614,    0.387257,
-    -0.022809,    0.110596,    0.812811,    -1.009099,    -1.004572,    0.282958,    0.289750,    -0.247297,
-    -0.218864,    0.898687,    -0.642213,    -0.180445,    0.717913,    0.301386,    1.548895,    -0.044242,
-    -0.029651,    -0.382110,    -0.553929,    0.932358,    -1.315840,    -0.301519,    -2.599588,    0.780078,
-    0.602941,    0.942799,    -1.023913,    -0.067830,    0.081760,    -1.767027,    -1.781264,    -0.660354,
-    1.351417,    2.136370,    0.166783,    -1.705227,    0.276528,    0.394512,    -0.098555,    0.176450,
-    -1.837854,    -1.502291,    0.819197,    -0.234568,    -1.631598,    -0.317939,    -0.796289,    0.690800,
-    -0.042010,    0.324041,    0.506456,    -1.028590,    0.099426,    -0.116351,    0.689239,    1.883291,
-    0.325435,    -0.095213,    0.031172,    -0.613800,    -1.731258,    0.478775,    -0.447835,    0.386815,
-    0.052959,    -0.486085,    0.244473,    0.718309,    0.153485,    0.133783,    -1.006194,    1.306469,
-    1.199137,    -2.577336,    -2.086270,    0.386132,    -0.861031,    -1.230808,    2.641554,    -0.904404,
-    -1.223338,    0.303205,    -0.730097,    -1.143570,    -1.413193,    -0.591818,    0.518888,    -1.492811,
-    -0.086684,    -0.012620,    -0.345858,    0.986311,    0.643256,    2.919944,    -1.248585,    0.157115,
-    0.788733,    -0.577083,    0.527634,    1.671694,    0.800079,    0.883787,    -0.224185,    0.296991,
-    -0.521008,    -0.155359,    -0.098498,    0.997170,    0.434470,    -0.025721,    -0.379934,    -0.242396,
-    -1.165114,    0.756605,    1.164162,    -1.023455,    1.701589,    -0.494172,    0.172714,    0.354061,
-    -0.246258,    -0.145741,    -1.169008,    -0.022011,    0.618278,    1.865865,    0.081875,    1.607995,
-    -0.380666,    -1.299588,    -0.723958,    -0.564984,    0.621664,    -1.335471,    -0.123108,    -1.102815,
-    -2.753176,    0.252017,    -0.858148,    1.135363,    -0.297908,    1.154331,    1.046076,    2.126874,
-    -0.655774,    -1.142368,    0.949039,    -0.404608,    -0.384329,    0.482020,    0.443774,    0.381100,
-    1.102348,    0.856447,    -1.178509,    0.401970,    -0.584228,    -0.979486,    0.115106,    0.068471,
-    -0.529900,    0.541112,    0.681720,    0.538565,    -0.510035,    -1.322111,    -0.610659,    -0.565309,
-    0.086175,    0.691501,    2.133751,    -0.002864,    -0.089523,    -0.254982,    -0.874212,    0.422928,
-    -0.133399,    0.539578,    0.875171,    -1.250776,    0.868311,    -0.804806,    -0.752693,    -0.745812,
-    -0.309654,    -1.521891,    0.826531,    -0.612987,    0.959728,    1.972988,    0.294958,    -0.392651,
-    0.575927,    -1.141419,    0.061069,    0.012318,    -0.168118,    -0.687349,    -0.990650,    -0.049762,
-    0.719301,    -0.283063,    -1.424966,    0.461549,    1.091484,    -1.044295,    -2.842784,    0.996824,
-    0.076534,    -1.866737,    -0.613614,    1.169354,    -0.575013,    -0.264795,    0.004722,    -0.039410,
-    -0.505393,    -1.157832,    0.710427,    0.728172,    0.866884,    2.431569,    0.110204,    0.026449,
-    0.970324,    -0.005260,    1.409542,    1.757851,    0.885011,    1.140862,    0.403216,    0.191009,
-    -0.693627,    0.011036,    -1.105586,    1.907973,    -0.165412,    -0.732430,    -0.990741,    0.894305,
-    0.448227,    0.889219,    1.073337,    -0.104734,    1.547319,    0.169834,    0.804048,    -1.724029,
-    0.174133,    -0.484085,    -0.731627,    -2.131905,    -1.810366,    -0.052338,    -0.086212,    -1.189738,
-    -0.754141,    0.947278,    -0.182628,    -0.066268,    0.905018,    1.458216,    -1.117984,    1.813295,
-    0.150753,    -0.282994,    1.650122,    0.666378,    -0.346362,    -0.264042,    -0.644349,    -0.905540,
-    0.716679,    -0.007336,    -2.814799,    -0.149546,    0.577495,    0.753117,    -0.166985,    -0.581816,
-    0.365758,    -0.548919,    0.578737,    -1.955799,    0.522006,    1.601135,    0.732559,    0.555747,
-    -0.813346,    -0.538975,    1.307876,    -0.482579,    -1.752447,    -0.926570,    0.922440,    0.041001,
-    0.413647,    0.597244,    1.924270,    0.714119,    -2.312337,    1.380715,    1.390703,    -0.453904,
-    -0.628305,    1.023225,    -0.489111,    -0.402405,    1.399683,    0.280561,    1.880872,    -0.799673,
-    -0.560699,    1.708875,    -0.644810,    -1.422496,    -0.755937,    0.157520,    0.378346,    0.178665,
-    -0.602775,    -0.993406,    1.188948,    2.388009,    2.265523,    2.301073,    -0.270076,    0.502837,
-    -0.119191,    -0.001889,    -0.432649,    -0.194822,    0.985351,    0.468596,    -1.364901,    0.273689,
-    2.646683,    -0.053754,    0.472511,    -2.080034,    -0.802494,    -0.456793,    0.193857,    0.889525,
-    -1.591669,    -0.321976,    -0.703798,    -0.744287,    0.371287,    1.437276,    0.459913,    0.660738,
-    1.124368,    0.979412,    -1.316431,    -0.023211,    0.134547,    2.408125,    0.901705,    0.076185,
-    0.361743,    -2.058669,    -2.332033,    -0.370905,    1.285684,    0.557046,    -0.180229,    -0.035676
-};
-ne10_float32_t tmp_buffer[TEST_LENGTH_SAMPLES];
-/* ----------------------------------------------------------------------
-** Defines each of the tests performed
-** ------------------------------------------------------------------- */
-
-typedef struct
-{
-    ne10_uint32_t fftSize;
-    ne10_uint32_t ifftFlag;
-    ne10_uint32_t doBitReverse;
-    ne10_float32_t *inputF32;
-} test_config_rfft;
-
-static test_config_rfft CONFIG_RFFT[] =
-{
-    {128, 0, 1, &testInput_f32[0]},
-    {512, 0, 1, &testInput_f32[0]},
-};
-
-static test_config_rfft CONFIG_RFFT_PERF[] =
-{
-    {128, 0, 1, &testInput_f32[0]},
-    {512, 0, 1, &testInput_f32[0]},
-};
-
-#define RFFT_NUM_TESTS (sizeof(CONFIG_RFFT) / sizeof(CONFIG_RFFT[0]) )
-#define RFFT_NUM_PERF_TESTS (sizeof(CONFIG_RFFT_PERF) / sizeof(CONFIG_RFFT_PERF[0]) )
-
-//input and output
-static ne10_float32_t * guarded_in_c = NULL;
-static ne10_float32_t * guarded_in_neon = NULL;
-static ne10_float32_t * in_c = NULL;
-static ne10_float32_t * in_neon = NULL;
-
-static ne10_float32_t * guarded_out_c = NULL;
-static ne10_float32_t * guarded_out_neon = NULL;
-static ne10_float32_t * out_c = NULL;
-static ne10_float32_t * out_neon = NULL;
-
-static ne10_float32_t snr = 0.0f;
-
-#ifdef PERFORMANCE_TEST
-static ne10_int64_t time_c = 0;
-static ne10_int64_t time_neon = 0;
-static ne10_int64_t time_overhead_c = 0;
-static ne10_int64_t time_overhead_neon = 0;
-static ne10_float32_t time_speedup = 0.0f;
-static ne10_float32_t time_savings = 0.0f;
-#endif
-
-void test_rfft_case0()
-{
-    ne10_float32_t *p_src = testInput_f32;
-    ne10_rfft_instance_f32_t S;
-    ne10_cfft_radix4_instance_f32_t  S_CFFT;
-
-    ne10_uint16_t loop = 0;
-    ne10_uint16_t k = 0;
-    ne10_uint16_t i = 0;
-    ne10_uint16_t pos = 0;
-
-    test_config_rfft *config;
-    ne10_result_t status = NE10_OK;
-
-    fprintf (stdout, "----------%30s start\n", __FUNCTION__);
-
-    /* init input memory */
-    NE10_SRC_ALLOC (in_c, guarded_in_c, TEST_LENGTH_SAMPLES); // 16 extra bytes at the begining and 16 extra bytes at the end
-    NE10_SRC_ALLOC (in_neon, guarded_in_neon, TEST_LENGTH_SAMPLES); // 16 extra bytes at the begining and 16 extra bytes at the end
-
-    /* init dst memory */
-    NE10_DST_ALLOC (out_c, guarded_out_c, TEST_LENGTH_SAMPLES);
-    NE10_DST_ALLOC (out_neon, guarded_out_neon, TEST_LENGTH_SAMPLES);
-
-#if defined (SMOKE_TEST)||(REGRESSION_TEST)
-    for (loop = 0; loop < RFFT_NUM_TESTS; loop++)
-    {
-        config = &CONFIG_RFFT[loop];
-
-        /* Initialize the RFFT/RIFFT module */
-        status = ne10_rfft_init_float (&S, &S_CFFT, config->fftSize, config->ifftFlag);
-
-        if (status == NE10_ERR)
-        {
-            printf ("fft init error!\n");
-        }
-
-        /* copy input to input buffer and clear the output buffer */
-        for (i = 0; i < config->fftSize; i++)
-        {
-            in_c[i] = testInput_f32[i];
-            in_neon[i] = testInput_f32[i];
-        }
-
-        /* FFT test */
-        GUARD_ARRAY (out_c, config->fftSize * 2);
-        GUARD_ARRAY (out_neon, config->fftSize * 2);
-
-        ne10_rfft_float_c (&S, in_c, out_c, tmp_buffer);
-        ne10_rfft_float_neon (&S, in_neon, out_neon, tmp_buffer);
-
-
-        CHECK_ARRAY_GUARD (out_c, config->fftSize * 2);
-        CHECK_ARRAY_GUARD (out_neon, config->fftSize * 2);
-
-        //conformance test 1: compare snr
-        snr = CAL_SNR_FLOAT32 (out_c, out_neon, config->fftSize * 2);
-        assert_false ( (snr < SNR_THRESHOLD));
-
-        //conformance test 2: compare output of C and neon
-#if defined (DEBUG_TRACE)
-        printf ("-----------RFFT------------\n");
-        printf ("--------------------config %d\n", loop);
-        printf ("fftSize: %d\n", config->fftSize);
-        printf ("snr: %f\n", snr);
-#endif
-        for (pos = 0; pos < config->fftSize * 2; pos++)
-        {
-#if defined (DEBUG_TRACE)
-            printf ("pos %d \n", pos);
-            printf ("c %f (0x%04X) neon %f (0x%04X)\n", out_c[pos], * (ne10_uint32_t*) &out_c[pos], out_neon[pos], * (ne10_uint32_t*) &out_neon[pos]);
-#endif
-            assert_float_vec_equal (&out_c[pos], &out_neon[pos], ERROR_MARGIN_LARGE, 1);
-        }
-
-        /* IFFT test */
-        /* Initialize the RFFT/RIFFT module */
-        status = ne10_rfft_init_float (&S, &S_CFFT, config->fftSize, 1);
-
-        if (status == NE10_ERR)
-        {
-            printf ("fft init error!\n");
-        }
-
-        /* copy input to input buffer and clear the output buffer */
-        for (i = 0; i < config->fftSize * 2; i++)
-        {
-            in_c[i] = out_c[i];
-            in_neon[i] = out_neon[i];
-        }
-
-        GUARD_ARRAY (out_c, config->fftSize * 2);
-        GUARD_ARRAY (out_neon, config->fftSize * 2);
-
-        ne10_rfft_float_c (&S, in_c, out_c, tmp_buffer);
-        ne10_rfft_float_neon (&S, in_neon, out_neon, tmp_buffer);
-
-        CHECK_ARRAY_GUARD (out_c, config->fftSize * 2);
-        CHECK_ARRAY_GUARD (out_neon, config->fftSize * 2);
-
-        //conformance test 1: compare snr
-        snr = CAL_SNR_FLOAT32 (out_c, out_neon, config->fftSize);
-        assert_false ( (snr < SNR_THRESHOLD));
-
-        //conformance test 2: compare output of C and neon
-#if defined (DEBUG_TRACE)
-        printf ("-----------RIFFT------------\n");
-        printf ("--------------------config %d\n", loop);
-        printf ("fftSize: %d\n", config->fftSize);
-        printf ("snr: %f\n", snr);
-#endif
-        for (pos = 0; pos < config->fftSize; pos++)
-        {
-#if defined (DEBUG_TRACE)
-            printf ("pos %d \n", pos);
-            printf ("c %f (0x%04X) neon %f (0x%04X)\n", out_c[pos], * (ne10_uint32_t*) &out_c[pos], out_neon[pos], * (ne10_uint32_t*) &out_neon[pos]);
-#endif
-            assert_float_vec_equal (&out_c[pos], &out_neon[pos], ERROR_MARGIN_LARGE, 1);
-        }
-    }
-#endif
-
-#ifdef PERFORMANCE_TEST
-    fprintf (stdout, "%25s%20s%20s%20s%20s\n", "FFT Length", "C Time in ms", "NEON Time in ms", "Time Savings", "Performance Ratio");
-    for (loop = 0; loop < RFFT_NUM_PERF_TESTS; loop++)
-    {
-        config = &CONFIG_RFFT_PERF[loop];
-
-        /* Initialize the RFFT/RIFFT module */
-        status = ne10_rfft_init_float (&S, &S_CFFT, config->fftSize, config->ifftFlag);
-
-        if (status == NE10_ERR)
-        {
-            printf ("fft init error!\n");
-        }
-
-        /* FFT test */
-        /* Initialize the RFFT/RIFFT module */
-        status = ne10_rfft_init_float (&S, &S_CFFT, config->fftSize, config->ifftFlag);
-
-        GET_TIME
-        (
-            time_overhead_c,
-        {
-            for (k = 0; k < TEST_COUNT; k++)
-            {
-                for (i = 0; i < config->fftSize; i++)
-                {
-                    in_c[i] = testInput_f32[i];
-                }
-            }
-        }
-        );
-
-        GET_TIME
-        (
-            time_c,
-        {
-            for (k = 0; k < TEST_COUNT; k++)
-            {
-                for (i = 0; i < config->fftSize; i++)
-                {
-                    in_c[i] = testInput_f32[i];
-                }
-                ne10_rfft_float_c (&S, in_c, out_c, tmp_buffer);
-            }
-        }
-        );
-
-        GET_TIME
-        (
-            time_overhead_neon,
-        {
-            for (k = 0; k < TEST_COUNT; k++)
-            {
-                for (i = 0; i < config->fftSize; i++)
-                {
-                    in_neon[i] = testInput_f32[i];
-                }
-            }
-        }
-        );
-
-        GET_TIME
-        (
-            time_neon,
-        {
-            for (k = 0; k < TEST_COUNT; k++)
-            {
-                for (i = 0; i < config->fftSize; i++)
-                {
-                    in_neon[i] = testInput_f32[i];
-                }
-                ne10_rfft_float_neon (&S, in_neon, out_neon, tmp_buffer);
-            }
-        }
-        );
-
-        time_c = time_c - time_overhead_c;
-        time_neon = time_neon - time_overhead_neon;
-        time_speedup = (ne10_float32_t) time_c / time_neon;
-        time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
-        ne10_log (__FUNCTION__, "RFFT%21d%20lld%20lld%19.2f%%%18.2f:1\n", config->fftSize, time_c, time_neon, time_savings, time_speedup);
-
-        /* IFFT test */
-        /* Initialize the RFFT/RIFFT module */
-        status = ne10_rfft_init_float (&S, &S_CFFT, config->fftSize, 1);
-
-        GET_TIME
-        (
-            time_overhead_c,
-        {
-            for (k = 0; k < TEST_COUNT; k++)
-            {
-                for (i = 0; i < 2 * config->fftSize; i++)
-                {
-                    in_c[i] = out_c[i];
-                }
-            }
-        }
-        );
-
-        GET_TIME
-        (
-            time_c,
-        {
-            for (k = 0; k < TEST_COUNT; k++)
-            {
-                for (i = 0; i < 2 * config->fftSize; i++)
-                {
-                    in_c[i] = out_c[i];
-                }
-                ne10_rfft_float_c (&S, in_c, out_c, tmp_buffer);
-            }
-        }
-        );
-
-        GET_TIME
-        (
-            time_overhead_neon,
-        {
-            for (k = 0; k < TEST_COUNT; k++)
-            {
-                for (i = 0; i < 2 * config->fftSize; i++)
-                {
-                    in_neon[i] = out_neon[i];
-                }
-            }
-        }
-        );
-
-        GET_TIME
-        (
-            time_neon,
-        {
-            for (k = 0; k < TEST_COUNT; k++)
-            {
-                for (i = 0; i < 2 * config->fftSize; i++)
-                {
-                    in_neon[i] = out_neon[i];
-                }
-                ne10_rfft_float_neon (&S, in_neon, out_neon, tmp_buffer);
-            }
-        }
-        );
-
-        time_c = time_c - time_overhead_c;
-        time_neon = time_neon - time_overhead_neon;
-        time_speedup = (ne10_float32_t) time_c / time_neon;
-        time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
-        ne10_log (__FUNCTION__, "RIFFT%20d%20lld%20lld%19.2f%%%18.2f:1\n", config->fftSize, time_c, time_neon, time_savings, time_speedup);
-    }
-#endif
-
-    free (guarded_in_c);
-    free (guarded_in_neon);
-    free (guarded_out_c);
-    free (guarded_out_neon);
-    fprintf (stdout, "----------%30s end\n", __FUNCTION__);
-}
-
-void test_rfft()
-{
-    test_rfft_case0();
-}
-
-static void my_test_setup (void)
-{
-    ne10_log_buffer_ptr = ne10_log_buffer;
-}
-
-void test_fixture_rfft (void)
-{
-    test_fixture_start();               // starts a fixture
-
-    fixture_setup (my_test_setup);
-
-    run_test (test_rfft);
-
-    test_fixture_end();                 // ends a fixture
-}
index b217a12..7b466e3 100644 (file)
@@ -97,8 +97,6 @@ if(NE10_ENABLE_DSP)
     # Define dsp test files.
     set(NE10_TEST_DSP_SRCS
         ${PROJECT_SOURCE_DIR}/modules/dsp/test/test_main.c
-        ${PROJECT_SOURCE_DIR}/modules/dsp/test/test_suite_cfft.c
-        ${PROJECT_SOURCE_DIR}/modules/dsp/test/test_suite_rfft.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/test/test_suite_fft_float32.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/test/test_suite_fft_int32.c
         ${PROJECT_SOURCE_DIR}/modules/dsp/test/test_suite_fft_int16.c