From d3dd7dba4fe5be091502c7e049937fc4c6bba2d5 Mon Sep 17 00:00:00 2001
From: "Phil.Wang"
Date: Thu, 19 Mar 2015 16:26:57 +0800
Subject: [PATCH] DSP: Fine tune fixed-point non-power-of-2 CFFT for GCC 4.9.0

Ne10 now provides inline assembly for the fixed-point non-power-of-2
complex FFT on AArch64. With GCC 4.9.0, users can define
NE10_INLINE_ASM_OPT to enable this optimization.

Below is performance data with and without this optimization.

Cortex-A53 AArch64 mode (1.69GHz)
GCC 4.9.0, with -O2
Android-21, AArch64
|     C2C FFT Time Cost|
|                 in ms|
|size|             Ne10|
|    |Without|     With|
|  60|   4.67|     2.91|
| 120|   5.79|     3.37|
| 240|   5.57|     3.30|
| 480|   6.76|     3.87|
| 970|   6.89|     4.00|

Change-Id: I4f74217b026d8ef6ab6af4e1fb178ce4f1398b50
---
 modules/dsp/NE10_fft_generic_int32.neonintrinsic.h | 36 ++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/modules/dsp/NE10_fft_generic_int32.neonintrinsic.h b/modules/dsp/NE10_fft_generic_int32.neonintrinsic.h
index 8b244cb..c28265d 100644
--- a/modules/dsp/NE10_fft_generic_int32.neonintrinsic.h
+++ b/modules/dsp/NE10_fft_generic_int32.neonintrinsic.h
@@ -59,11 +59,47 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 typedef int32x4x2_t CPLX;
 typedef int32x4_t REAL;
 #define NE10_REAL_DUP_NEON_S32 vdupq_n_s32
+
+#ifndef NE10_INLINE_ASM_OPT
 #define NE10_CPLX_LOAD(PTR) vld2q_s32 ((ne10_int32_t*) (PTR))
 #define NE10_CPLX_STORE(PTR,OUT) \
     do { \
         vst2q_s32 ((ne10_int32_t*) (PTR), OUT); \
     } while (0)
+#else // NE10_INLINE_ASM_OPT
+#ifndef __aarch64__
+#error Currently, inline assembly optimizations are only available on AArch64.
+#else // __aarch64__
+// LD2-load four interleaved 32-bit pairs from ptr via scratch v0/v1: val[0] gets the first of each pair, val[1] the second.
+template<class T>
+inline static int32x4x2_t NE10_CPLX_LOAD(T *ptr)
+{
+    int32x4x2_t result;
+    asm volatile (
+        "ld2 {v0.4s, v1.4s}, [%[pin]] \n\t"
+        "mov %[r].16b, v0.16b \n\t"
+        "mov %[i].16b, v1.16b \n\t"
+        : [r]"=w"(result.val[0]), [i]"=w"(result.val[1]) // write-only: result is uninitialized on entry
+        : [pin]"r"(ptr)
+        : "memory", "v0", "v1"); // v0/v1 are fixed scratch registers, so they are declared clobbered
+    return result;
+}
+
+// ST2-store counterpart: copies out.val[0]/out.val[1] into scratch v0/v1 and writes them interleaved to ptr.
+template<class T>
+inline static void NE10_CPLX_STORE(T *ptr, int32x4x2_t out)
+{
+    asm volatile (
+        "mov v0.16b, %[r].16b \n\t"
+        "mov v1.16b, %[i].16b \n\t"
+        "st2 {v0.4s, v1.4s}, [%[pout]] \n\t"
+        : // no register outputs: `out` is only read; the store itself is covered by the "memory" clobber
+        : [r]"w"(out.val[0]), [i]"w"(out.val[1]), [pout]"r"(ptr)
+        : "memory", "v0", "v1");
+}
+
+#endif // __aarch64__
+#endif // NE10_INLINE_ASM_OPT
 
 template<>
 inline CPLX NE10_CPX_LOAD_S (const CPLX *ptr)
-- 
2.7.4