NE10/FFT/backward-complex-non-power-of-2 C
author Phil.Wang <phil.wang@arm.com>
Wed, 17 Dec 2014 03:42:40 +0000 (11:42 +0800)
committer Phil.Wang <phil.wang@arm.com>
Wed, 17 Dec 2014 03:44:44 +0000 (11:44 +0800)
ARM 32-bit (Cortex-A9)
complex backward unscaled float GCC 4.9
        Time in ms      |
   |kiss|opus|pffft|NE10|
   |   C|   C| NEON|   C|
 60| 195| 172|   NA| 144|
120| 234| 200|   NA| 173|
240| 231| 203|  148| 175|
480| 267| 231|  176| 215|

ARM 64-bit (Cortex-A57)
complex backward unscaled float GCC 4.9
        Time in ms      |
   |kiss|opus|pffft|NE10|
   |   C|   C| NEON|   C|
 60| 125|  89|   NA|  87|
120| 141| 104|   NA| 103|
240| 146| 106|   52| 109|
480| 163| 120|   58| 127|

SNR > 100 dB for all sizes of the form 2^M*3^N*5^K

Change-Id: Ie4bb27d053213bfbf2dbdd0020f9fda5db4312f9

modules/dsp/NE10_fft_generic_float32.c

index 4afb47e..699fd4f 100644 (file)
@@ -216,13 +216,14 @@ static inline void FFT5_FCU (ne10_fft_cpx_float32_t Fout[5],
 ////////////////////////////////////
 // Following are butterfly functions
 ////////////////////////////////////
-static inline void ne10_radix_2_butterfly_forward_float32_c (ne10_fft_cpx_float32_t *Fout,
+static inline void ne10_radix_2_butterfly_float32_c (ne10_fft_cpx_float32_t *Fout,
         const ne10_fft_cpx_float32_t *Fin,
         const ne10_fft_cpx_float32_t *twiddles,
         const ne10_int32_t fstride,
         const ne10_int32_t out_step,
         const ne10_int32_t nfft,
-        const ne10_int32_t is_first_stage)
+        const ne10_int32_t is_first_stage,
+        const ne10_int32_t is_inverse)
 {
     ne10_fft_cpx_float32_t scratch_in[2];
     ne10_fft_cpx_float32_t scratch_out[2];
@@ -238,6 +239,24 @@ static inline void ne10_radix_2_butterfly_forward_float32_c (ne10_fft_cpx_float3
             scratch_in[0] = Fin[0 * in_step];
             scratch_in[1] = Fin[1 * in_step];
 
+            if (is_inverse)
+            {
+                scratch_in[0].i = -scratch_in[0].i;
+                scratch_in[1].i = -scratch_in[1].i;
+            }
+
+#ifdef NE10_DSP_CFFT_SCALING
+            if (is_inverse && is_first_stage)
+            {
+                const ne10_float32_t one_by_nfft = 1.0 / nfft;
+
+                scratch_in[0].r *= one_by_nfft;
+                scratch_in[0].i *= one_by_nfft;
+                scratch_in[1].r *= one_by_nfft;
+                scratch_in[1].i *= one_by_nfft;
+            }
+#endif
+
             if (!is_first_stage)
             {
                 ne10_fft_cpx_float32_t scratch_tw[1];
@@ -253,6 +272,12 @@ static inline void ne10_radix_2_butterfly_forward_float32_c (ne10_fft_cpx_float3
 
             FFT2_FCU (scratch_out, scratch_in);
 
+            if (is_inverse)
+            {
+                scratch_out[0].i = -scratch_out[0].i;
+                scratch_out[1].i = -scratch_out[1].i;
+            }
+
             Fout[0 * out_step] = scratch_out[0];
             Fout[1 * out_step] = scratch_out[1];
 
@@ -276,13 +301,14 @@ static inline void ne10_radix_2_butterfly_forward_float32_c (ne10_fft_cpx_float3
     }
 }
 
-static inline void ne10_radix_4_butterfly_forward_float32_c (ne10_fft_cpx_float32_t *Fout,
+static inline void ne10_radix_4_butterfly_float32_c (ne10_fft_cpx_float32_t *Fout,
         const ne10_fft_cpx_float32_t *Fin,
         const ne10_fft_cpx_float32_t *twiddles,
         const ne10_int32_t fstride,
         const ne10_int32_t out_step,
         const ne10_int32_t nfft,
-        const ne10_int32_t is_first_stage)
+        const ne10_int32_t is_first_stage,
+        const ne10_int32_t is_inverse)
 {
     ne10_fft_cpx_float32_t scratch_in[4];
     ne10_fft_cpx_float32_t scratch_out[4];
@@ -300,6 +326,30 @@ static inline void ne10_radix_4_butterfly_forward_float32_c (ne10_fft_cpx_float3
             scratch_in[2] = Fin[2 * in_step];
             scratch_in[3] = Fin[3 * in_step];
 
+            if (is_inverse)
+            {
+                scratch_in[0].i = -scratch_in[0].i;
+                scratch_in[1].i = -scratch_in[1].i;
+                scratch_in[2].i = -scratch_in[2].i;
+                scratch_in[3].i = -scratch_in[3].i;
+            }
+
+#ifdef NE10_DSP_CFFT_SCALING
+            if (is_inverse && is_first_stage)
+            {
+                const ne10_float32_t one_by_nfft = 1.0 / nfft;
+
+                scratch_in[0].r *= one_by_nfft;
+                scratch_in[0].i *= one_by_nfft;
+                scratch_in[1].r *= one_by_nfft;
+                scratch_in[1].i *= one_by_nfft;
+                scratch_in[2].r *= one_by_nfft;
+                scratch_in[2].i *= one_by_nfft;
+                scratch_in[3].r *= one_by_nfft;
+                scratch_in[3].i *= one_by_nfft;
+            }
+#endif
+
             if (!is_first_stage)
             {
                 ne10_fft_cpx_float32_t scratch_tw[3];
@@ -319,6 +369,14 @@ static inline void ne10_radix_4_butterfly_forward_float32_c (ne10_fft_cpx_float3
 
             FFT4_FCU (scratch_out, scratch_in);
 
+            if (is_inverse)
+            {
+                scratch_out[0].i = -scratch_out[0].i;
+                scratch_out[1].i = -scratch_out[1].i;
+                scratch_out[2].i = -scratch_out[2].i;
+                scratch_out[3].i = -scratch_out[3].i;
+            }
+
             Fout[0 * out_step] = scratch_out[0];
             Fout[1 * out_step] = scratch_out[1];
             Fout[2 * out_step] = scratch_out[2];
@@ -344,13 +402,14 @@ static inline void ne10_radix_4_butterfly_forward_float32_c (ne10_fft_cpx_float3
     }
 }
 
-static inline void ne10_radix_3_butterfly_forward_float32_c (ne10_fft_cpx_float32_t *Fout,
+static inline void ne10_radix_3_butterfly_float32_c (ne10_fft_cpx_float32_t *Fout,
         const ne10_fft_cpx_float32_t *Fin,
         const ne10_fft_cpx_float32_t *twiddles,
         const ne10_int32_t fstride,
         const ne10_int32_t out_step,
         const ne10_int32_t nfft,
-        const ne10_int32_t is_first_stage)
+        const ne10_int32_t is_first_stage,
+        const ne10_int32_t is_inverse)
 {
     ne10_fft_cpx_float32_t scratch_in[3];
     ne10_fft_cpx_float32_t scratch_out[3];
@@ -367,6 +426,27 @@ static inline void ne10_radix_3_butterfly_forward_float32_c (ne10_fft_cpx_float3
             scratch_in[1] = Fin[1 * in_step];
             scratch_in[2] = Fin[2 * in_step];
 
+            if (is_inverse)
+            {
+                scratch_in[0].i = -scratch_in[0].i;
+                scratch_in[1].i = -scratch_in[1].i;
+                scratch_in[2].i = -scratch_in[2].i;
+            }
+
+#ifdef NE10_DSP_CFFT_SCALING
+            if (is_first_stage && is_inverse)
+            {
+                const ne10_float32_t one_by_nfft = 1.0 / nfft;
+
+                scratch_in[0].r *= one_by_nfft;
+                scratch_in[0].i *= one_by_nfft;
+                scratch_in[1].r *= one_by_nfft;
+                scratch_in[1].i *= one_by_nfft;
+                scratch_in[2].r *= one_by_nfft;
+                scratch_in[2].i *= one_by_nfft;
+            }
+#endif
+
             if (!is_first_stage)
             {
                 ne10_fft_cpx_float32_t scratch_tw[2];
@@ -384,6 +464,13 @@ static inline void ne10_radix_3_butterfly_forward_float32_c (ne10_fft_cpx_float3
 
             FFT3_FCU (scratch_out, scratch_in);
 
+            if (is_inverse)
+            {
+                scratch_out[0].i = -scratch_out[0].i;
+                scratch_out[1].i = -scratch_out[1].i;
+                scratch_out[2].i = -scratch_out[2].i;
+            }
+
             Fout[0 * out_step] = scratch_out[0];
             Fout[1 * out_step] = scratch_out[1];
             Fout[2 * out_step] = scratch_out[2];
@@ -408,13 +495,14 @@ static inline void ne10_radix_3_butterfly_forward_float32_c (ne10_fft_cpx_float3
     }
 }
 
-static inline void ne10_radix_5_butterfly_forward_float32_c (ne10_fft_cpx_float32_t *Fout,
+static inline void ne10_radix_5_butterfly_float32_c (ne10_fft_cpx_float32_t *Fout,
         const ne10_fft_cpx_float32_t *Fin,
         const ne10_fft_cpx_float32_t *twiddles,
         const ne10_int32_t fstride,
         const ne10_int32_t out_step,
         const ne10_int32_t nfft,
-        const ne10_int32_t is_first_stage)
+        const ne10_int32_t is_first_stage,
+        const ne10_int32_t is_inverse)
 {
     ne10_fft_cpx_float32_t scratch_in[5];
     ne10_fft_cpx_float32_t scratch_out[5];
@@ -433,6 +521,33 @@ static inline void ne10_radix_5_butterfly_forward_float32_c (ne10_fft_cpx_float3
             scratch_in[3] = Fin[3 * in_step];
             scratch_in[4] = Fin[4 * in_step];
 
+            if (is_inverse)
+            {
+                scratch_in[0].i = -scratch_in[0].i;
+                scratch_in[1].i = -scratch_in[1].i;
+                scratch_in[2].i = -scratch_in[2].i;
+                scratch_in[3].i = -scratch_in[3].i;
+                scratch_in[4].i = -scratch_in[4].i;
+            }
+
+#ifdef NE10_DSP_CFFT_SCALING
+            if (is_first_stage && is_inverse)
+            {
+                const ne10_float32_t one_by_nfft = 1.0 / nfft;
+
+                scratch_in[0].r *= one_by_nfft;
+                scratch_in[0].i *= one_by_nfft;
+                scratch_in[1].r *= one_by_nfft;
+                scratch_in[1].i *= one_by_nfft;
+                scratch_in[2].r *= one_by_nfft;
+                scratch_in[2].i *= one_by_nfft;
+                scratch_in[3].r *= one_by_nfft;
+                scratch_in[3].i *= one_by_nfft;
+                scratch_in[4].r *= one_by_nfft;
+                scratch_in[4].i *= one_by_nfft;
+            }
+#endif
+
             if (!is_first_stage)
             {
                 ne10_fft_cpx_float32_t scratch_tw[4];
@@ -454,6 +569,15 @@ static inline void ne10_radix_5_butterfly_forward_float32_c (ne10_fft_cpx_float3
 
             FFT5_FCU (scratch_out, scratch_in);
 
+            if (is_inverse)
+            {
+                scratch_out[0].i = -scratch_out[0].i;
+                scratch_out[1].i = -scratch_out[1].i;
+                scratch_out[2].i = -scratch_out[2].i;
+                scratch_out[3].i = -scratch_out[3].i;
+                scratch_out[4].i = -scratch_out[4].i;
+            }
+
             Fout[0 * out_step] = scratch_out[0];
             Fout[1 * out_step] = scratch_out[1];
             Fout[2 * out_step] = scratch_out[2];
@@ -480,12 +604,13 @@ static inline void ne10_radix_5_butterfly_forward_float32_c (ne10_fft_cpx_float3
     }
 }
 
-static inline void ne10_radix_generic_butterfly_forward_float32_c (ne10_fft_cpx_float32_t *Fout,
+static inline void ne10_radix_generic_butterfly_float32_c (ne10_fft_cpx_float32_t *Fout,
         const ne10_fft_cpx_float32_t *Fin,
         const ne10_fft_cpx_float32_t *twiddles,
         const ne10_int32_t radix,
         const ne10_int32_t in_step,
-        const ne10_int32_t out_step)
+        const ne10_int32_t out_step,
+        const ne10_int32_t is_inverse)
 {
     ne10_int32_t q, q1;
     ne10_int32_t f_count = in_step;
@@ -501,6 +626,15 @@ static inline void ne10_radix_generic_butterfly_forward_float32_c (ne10_fft_cpx_
         for (q1 = 0; q1 < radix; q1++)
         {
             scratch[q1] = Fin[in_step * q1];
+            if (is_inverse)
+            {
+                scratch[q1].i = -scratch[q1].i;
+#ifdef NE10_DSP_CFFT_SCALING
+                const ne10_float32_t one_by_nfft = 1.0 / (radix * in_step);
+                scratch[q1].r *= one_by_nfft;
+                scratch[q1].i *= one_by_nfft;
+#endif
+            }
         } // q1
 
         // compute Fout[q1 * out_step] from definition
@@ -518,6 +652,10 @@ static inline void ne10_radix_generic_butterfly_forward_float32_c (ne10_fft_cpx_
                 NE10_CPX_MUL_F32 (tmp, scratch[q], twiddles[twidx]);
                 NE10_CPX_ADDTO (Fout[q1 * out_step], tmp);
             } // q
+            if (is_inverse)
+            {
+                Fout[q1 * out_step].i = -Fout[q1 * out_step].i;
+            }
         } // q1
 
         Fout += radix;
@@ -527,24 +665,13 @@ static inline void ne10_radix_generic_butterfly_forward_float32_c (ne10_fft_cpx_
     NE10_FREE (scratch);
 }
 
-void ne10_mixed_radix_generic_butterfly_float32_c (ne10_fft_cpx_float32_t *Fout,
+static inline void ne10_mixed_radix_generic_butterfly_float32_impl_c (ne10_fft_cpx_float32_t *Fout,
         const ne10_fft_cpx_float32_t *Fin,
         const ne10_int32_t *factors,
         const ne10_fft_cpx_float32_t *twiddles,
-        ne10_fft_cpx_float32_t *buffer)
+        ne10_fft_cpx_float32_t *buffer,
+        const ne10_int32_t is_inverse)
 {
-    PRINT_HIT;
-#ifdef NE10_VERBOSE
-    {
-        int i;
-        printf ("factors = \n");
-        for (i = 0; i < NE10_MAXFACTORS; i++)
-        {
-            printf ("[%d] = ( %d, %d )\n", i, factors[2 * i], factors[2 * i + 1]);
-        }
-    }
-#endif
-
     ne10_int32_t fstride, mstride, radix;
     ne10_int32_t stage_count;
     ne10_int32_t nfft;
@@ -567,20 +694,20 @@ void ne10_mixed_radix_generic_butterfly_float32_c (ne10_fft_cpx_float32_t *Fout,
     switch (radix)
     {
     case 2:
-        ne10_radix_2_butterfly_forward_float32_c (Fout, Fin, NULL, fstride, 1, nfft, 1);
+        ne10_radix_2_butterfly_float32_c (Fout, Fin, NULL, fstride, 1, nfft, 1, is_inverse);
         break;
     case 4:
-        ne10_radix_4_butterfly_forward_float32_c (Fout, Fin, NULL, fstride, 1, nfft, 1);
+        ne10_radix_4_butterfly_float32_c (Fout, Fin, NULL, fstride, 1, nfft, 1, is_inverse);
         break;
     case 3:
-        ne10_radix_3_butterfly_forward_float32_c (Fout, Fin, NULL, fstride, 1, nfft, 1);
+        ne10_radix_3_butterfly_float32_c (Fout, Fin, NULL, fstride, 1, nfft, 1, is_inverse);
         break;
     case 5:
-        ne10_radix_5_butterfly_forward_float32_c (Fout, Fin, NULL, fstride, 1, nfft, 1);
+        ne10_radix_5_butterfly_float32_c (Fout, Fin, NULL, fstride, 1, nfft, 1, is_inverse);
         break;
     default:
-        ne10_radix_generic_butterfly_forward_float32_c (Fout, Fin, twiddles, radix,
-                fstride, 1);
+        ne10_radix_generic_butterfly_float32_c (Fout, Fin, twiddles, radix,
+                fstride, 1, is_inverse);
         break;
     }
 
@@ -590,7 +717,10 @@ void ne10_mixed_radix_generic_butterfly_float32_c (ne10_fft_cpx_float32_t *Fout,
         return;
     }
 
-    twiddles += radix;
+    if (radix % 2)
+    {
+        twiddles += radix;
+    }
 
     // other stges
     while (stage_count > 0)
@@ -607,20 +737,20 @@ void ne10_mixed_radix_generic_butterfly_float32_c (ne10_fft_cpx_float32_t *Fout,
         switch (radix)
         {
         case 2:
-            ne10_radix_2_butterfly_forward_float32_c (Fout, buffer, twiddles, fstride,
-                    mstride, nfft, 0);
+            ne10_radix_2_butterfly_float32_c (Fout, buffer, twiddles, fstride,
+                    mstride, nfft, 0, is_inverse);
             break;
         case 3:
-            ne10_radix_3_butterfly_forward_float32_c (Fout, buffer, twiddles, fstride,
-                    mstride, nfft, 0);
+            ne10_radix_3_butterfly_float32_c (Fout, buffer, twiddles, fstride,
+                    mstride, nfft, 0, is_inverse);
             break;
         case 4:
-            ne10_radix_4_butterfly_forward_float32_c (Fout, buffer, twiddles, fstride,
-                    mstride, nfft, 0);
+            ne10_radix_4_butterfly_float32_c (Fout, buffer, twiddles, fstride,
+                    mstride, nfft, 0, is_inverse);
             break;
         case 5:
-            ne10_radix_5_butterfly_forward_float32_c (Fout, buffer, twiddles, fstride,
-                    mstride, nfft, 0);
+            ne10_radix_5_butterfly_float32_c (Fout, buffer, twiddles, fstride,
+                    mstride, nfft, 0, is_inverse);
             break;
         } // switch (radix)
 
@@ -630,11 +760,20 @@ void ne10_mixed_radix_generic_butterfly_float32_c (ne10_fft_cpx_float32_t *Fout,
     } // while (stage_count)
 }
 
-void ne10_mixed_radix_generic_butterfly_inverse_float32_c (ne10_fft_cpx_float32_t *Fout,
+void ne10_mixed_radix_generic_butterfly_float32_c (ne10_fft_cpx_float32_t *Fout,
         const ne10_fft_cpx_float32_t *Fin,
         const ne10_int32_t *factors,
         const ne10_fft_cpx_float32_t *twiddles,
         ne10_fft_cpx_float32_t *buffer)
 {
+    ne10_mixed_radix_generic_butterfly_float32_impl_c (Fout, Fin, factors, twiddles, buffer, 0);
+}
 
+void ne10_mixed_radix_generic_butterfly_inverse_float32_c (ne10_fft_cpx_float32_t *Fout,
+        const ne10_fft_cpx_float32_t *Fin,
+        const ne10_int32_t *factors,
+        const ne10_fft_cpx_float32_t *twiddles,
+        ne10_fft_cpx_float32_t *buffer)
+{
+    ne10_mixed_radix_generic_butterfly_float32_impl_c (Fout, Fin, factors, twiddles, buffer, 1);
 }