From d5abdd83c9095a7746dc46fcc0e5307208a9e9b4 Mon Sep 17 00:00:00 2001 From: Wim Taymans Date: Wed, 17 Feb 2016 11:20:06 -0500 Subject: [PATCH] audio-resampler: add neon optimizations Unroll some more loops in the fallback code that seems to work fine for ARM. Add some simple ARM optimizations taken from speex. --- gst-libs/gst/audio/audio-resampler-neon.h | 253 ++++++++++++++++++++++++++++++ gst-libs/gst/audio/audio-resampler-x86.h | 12 +- gst-libs/gst/audio/audio-resampler.c | 67 +++++--- 3 files changed, 300 insertions(+), 32 deletions(-) create mode 100644 gst-libs/gst/audio/audio-resampler-neon.h diff --git a/gst-libs/gst/audio/audio-resampler-neon.h b/gst-libs/gst/audio/audio-resampler-neon.h new file mode 100644 index 0000000..905289c --- /dev/null +++ b/gst-libs/gst/audio/audio-resampler-neon.h @@ -0,0 +1,253 @@ +/* GStreamer + * Copyright (C) <2016> Wim Taymans + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ */ + +static inline void +inner_product_gint16_none_1_neon (gint16 * o, const gint16 * a, + const gint16 * b, gint len, const gint16 * icoeff) +{ /* Stores in *o the sum of a[i] * b[i] over len samples, rounded, shifted right by 15 and saturated to 16 bits. len must be a positive multiple of 4: the tail loop at label 4 steps by 4 and only exits on exactly 0. icoeff is unused. */ + uint32_t remainder = len % 16; /* tail samples left after the 16-wide main loop */ + len = len - remainder; + + asm volatile (" cmp %[len], #0\n" + " bne 1f\n" + " vld1.16 {d16}, [%[b]]!\n" + " vld1.16 {d20}, [%[a]]!\n" + " subs %[remainder], %[remainder], #4\n" + " vmull.s16 q0, d16, d20\n" + " beq 5f\n" + " b 4f\n" + "1:" + " vld1.16 {d16, d17, d18, d19}, [%[b]]!\n" + " vld1.16 {d20, d21, d22, d23}, [%[a]]!\n" + " subs %[len], %[len], #16\n" + " vmull.s16 q0, d16, d20\n" + " vmlal.s16 q0, d17, d21\n" + " vmlal.s16 q0, d18, d22\n" + " vmlal.s16 q0, d19, d23\n" + " beq 3f\n" + "2:" + " vld1.16 {d16, d17, d18, d19}, [%[b]]!\n" + " vld1.16 {d20, d21, d22, d23}, [%[a]]!\n" + " subs %[len], %[len], #16\n" + " vmlal.s16 q0, d16, d20\n" + " vmlal.s16 q0, d17, d21\n" + " vmlal.s16 q0, d18, d22\n" + " vmlal.s16 q0, d19, d23\n" + " bne 2b\n" + "3:" + " cmp %[remainder], #0\n" + " beq 5f\n" + "4:" + " vld1.16 {d16}, [%[b]]!\n" + " vld1.16 {d20}, [%[a]]!\n" + " subs %[remainder], %[remainder], #4\n" + " vmlal.s16 q0, d16, d20\n" + " bne 4b\n" + "5:" /* add the four 32-bit lane sums in 64 bit, then narrow with saturation and a rounding shift by 15 */ + " vaddl.s32 q0, d0, d1\n" + " vadd.s64 d0, d0, d1\n" + " vqmovn.s64 d0, q0\n" + " vqrshrn.s32 d0, q0, #15\n" + " vst1.s16 d0[0], [%[o]]\n" + : [a] "+r" (a), [b] "+r" (b), + [len] "+r" (len), [remainder] "+r" (remainder) + : [o] "r" (o) + : "cc", "memory", "q0", /* "memory": the asm loads through a and b and stores through o, none of which are memory operands */ + "d16", "d17", "d18", "d19", + "d20", "d21", "d22", "d23"); +} + +static inline void +inner_product_gint16_linear_1_neon (gint16 * o, const gint16 * a, + const gint16 * b, gint len, const gint16 * icoeff) +{ /* empty placeholder */ +} + +static inline void +inner_product_gint16_cubic_1_neon (gint16 * o, const gint16 * a, + const gint16 * b, gint len, const gint16 * icoeff) +{ /* empty placeholder */ +} + +static inline void +inner_product_gint32_none_1_neon (gint32 * o, const gint32 * a, + const gint32 * b, gint len, const gint32 * icoeff) +{ /* empty placeholder */ +} + +static inline void +inner_product_gint32_linear_1_neon (gint32 * o, const gint32 * 
a, + const gint32 * b, gint len, const gint32 * icoeff) +{ /* empty placeholder */ +} + +static inline void +inner_product_gint32_cubic_1_neon (gint32 * o, const gint32 * a, + const gint32 * b, gint len, const gint32 * icoeff) +{ /* empty placeholder */ +} + +static inline void +inner_product_gfloat_none_1_neon (gfloat * o, const gfloat * a, + const gfloat * b, gint len, const gfloat * icoeff) +{ /* Stores in *o the sum of a[i] * b[i] over len floats. len must be a positive multiple of 4: the tail loop at label 4 steps by 4 and only exits on exactly 0. icoeff is unused. */ + uint32_t remainder = len % 16; /* tail samples left after the 16-wide main loop */ + len = len - remainder; + + asm volatile (" cmp %[len], #0\n" + " bne 1f\n" + " vld1.32 {q4}, [%[b]]!\n" + " vld1.32 {q8}, [%[a]]!\n" + " subs %[remainder], %[remainder], #4\n" + " vmul.f32 q0, q4, q8\n" + " bne 4f\n" + " b 5f\n" + "1:" + " vld1.32 {q4, q5}, [%[b]]!\n" + " vld1.32 {q8, q9}, [%[a]]!\n" + " vld1.32 {q6, q7}, [%[b]]!\n" + " vld1.32 {q10, q11}, [%[a]]!\n" + " subs %[len], %[len], #16\n" + " vmul.f32 q0, q4, q8\n" + " vmul.f32 q1, q5, q9\n" + " vmul.f32 q2, q6, q10\n" + " vmul.f32 q3, q7, q11\n" + " beq 3f\n" + "2:" + " vld1.32 {q4, q5}, [%[b]]!\n" + " vld1.32 {q8, q9}, [%[a]]!\n" + " vld1.32 {q6, q7}, [%[b]]!\n" + " vld1.32 {q10, q11}, [%[a]]!\n" + " subs %[len], %[len], #16\n" + " vmla.f32 q0, q4, q8\n" + " vmla.f32 q1, q5, q9\n" + " vmla.f32 q2, q6, q10\n" + " vmla.f32 q3, q7, q11\n" + " bne 2b\n" + "3:" /* fold the four accumulators q0-q3 into q0 */ + " vadd.f32 q4, q0, q1\n" + " vadd.f32 q5, q2, q3\n" + " cmp %[remainder], #0\n" + " vadd.f32 q0, q4, q5\n" + " beq 5f\n" + "4:" + " vld1.32 {q6}, [%[b]]!\n" + " vld1.32 {q10}, [%[a]]!\n" + " subs %[remainder], %[remainder], #4\n" + " vmla.f32 q0, q6, q10\n" + " bne 4b\n" + "5:" /* horizontal add of the four lanes of q0, result in d0[0] */ + " vadd.f32 d0, d0, d1\n" + " vpadd.f32 d0, d0, d0\n" + " vst1.f32 d0[0], [%[o]]\n" + : [a] "+r" (a), [b] "+r" (b), + [len] "+r" (len), [remainder] "+r" (remainder) + : [o] "r" (o) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", /* "memory": the asm loads through a and b and stores through o, none of which are memory operands */ + "q9", "q10", "q11"); + +} + +static inline void +inner_product_gfloat_linear_1_neon (gfloat * o, const gfloat * a, + const gfloat * b, gint len, const gfloat * icoeff) +{ /* empty placeholder */ +} + +static inline void +inner_product_gfloat_cubic_1_neon (gfloat * o, const 
gfloat * a, + const gfloat * b, gint len, const gfloat * icoeff) +{ /* empty placeholder */ +} + +static inline void +inner_product_gdouble_none_1_neon (gdouble * o, const gdouble * a, + const gdouble * b, gint len, const gdouble * icoeff) +{ /* empty placeholder */ +} + +static inline void +inner_product_gdouble_linear_1_neon (gdouble * o, const gdouble * a, + const gdouble * b, gint len, const gdouble * icoeff) +{ /* empty placeholder */ +} + +static inline void +inner_product_gdouble_cubic_1_neon (gdouble * o, const gdouble * a, + const gdouble * b, gint len, const gdouble * icoeff) +{ /* empty placeholder */ +} + +static void +interpolate_gdouble_linear_neon (gdouble * o, const gdouble * a, + gint len, const gdouble * icoeff) +{ /* empty placeholder */ +} + +static void +interpolate_gdouble_cubic_neon (gdouble * o, const gdouble * a, + gint len, const gdouble * icoeff) +{ /* empty placeholder */ +} + +MAKE_RESAMPLE_FUNC (gint16, none, 1, neon); /* presumably defines resample_gint16_none_1_neon and so on; the macro is not shown here, confirm in audio-resampler.c */ +MAKE_RESAMPLE_FUNC (gint16, linear, 1, neon); +MAKE_RESAMPLE_FUNC (gint16, cubic, 1, neon); + +MAKE_RESAMPLE_FUNC (gint32, none, 1, neon); +MAKE_RESAMPLE_FUNC (gint32, linear, 1, neon); +MAKE_RESAMPLE_FUNC (gint32, cubic, 1, neon); + +MAKE_RESAMPLE_FUNC (gfloat, none, 1, neon); +MAKE_RESAMPLE_FUNC (gfloat, linear, 1, neon); +MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, neon); + +MAKE_RESAMPLE_FUNC (gdouble, none, 1, neon); +MAKE_RESAMPLE_FUNC (gdouble, linear, 1, neon); +MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, neon); + +static void +audio_resampler_check_neon (const gchar *target_name, const gchar *option) +{ /* When target_name is "neon", points resample_gint16_none_1 and resample_gfloat_none_1 at their NEON versions; option is not examined. */ + if (!strcmp (target_name, "neon")) { + GST_DEBUG ("enable NEON optimisations"); + resample_gint16_none_1 = resample_gint16_none_1_neon; + + resample_gfloat_none_1 = resample_gfloat_none_1_neon; + + if (0) { /* disabled: the inner_product and interpolate NEON bodies of the same names in this file are empty */ + resample_gint16_linear_1 = resample_gint16_linear_1_neon; + resample_gint16_cubic_1 = resample_gint16_cubic_1_neon; + + resample_gint32_none_1 = resample_gint32_none_1_neon; + resample_gint32_linear_1 = resample_gint32_linear_1_neon; + resample_gint32_cubic_1 = resample_gint32_cubic_1_neon; + + resample_gfloat_linear_1 = resample_gfloat_linear_1_neon; + 
resample_gfloat_cubic_1 = resample_gfloat_cubic_1_neon; + + resample_gdouble_none_1 = resample_gdouble_none_1_neon; + resample_gdouble_linear_1 = resample_gdouble_linear_1_neon; + resample_gdouble_cubic_1 = resample_gdouble_cubic_1_neon; + + interpolate_gdouble_linear = interpolate_gdouble_linear_neon; + interpolate_gdouble_cubic = interpolate_gdouble_cubic_neon; + } + } +} diff --git a/gst-libs/gst/audio/audio-resampler-x86.h b/gst-libs/gst/audio/audio-resampler-x86.h index b5033ef..22349d6 100644 --- a/gst-libs/gst/audio/audio-resampler-x86.h +++ b/gst-libs/gst/audio/audio-resampler-x86.h @@ -546,9 +546,9 @@ MAKE_RESAMPLE_FUNC (gint32, cubic, 1, sse41); #endif static void -audio_resampler_check_x86 (const gchar *option) +audio_resampler_check_x86 (const gchar *target_name, const gchar *option) { - if (!strcmp (option, "sse")) { + if (!strcmp (target_name, "sse")) { #if defined (HAVE_XMMINTRIN_H) && defined(__SSE__) GST_DEBUG ("enable SSE optimisations"); resample_gfloat_none_1 = resample_gfloat_none_1_sse; @@ -559,23 +559,19 @@ audio_resampler_check_x86 (const gchar *option) #else GST_DEBUG ("SSE optimisations not enabled"); #endif - } else if (!strcmp (option, "sse2")) { + } + if (!strcmp (option, "sse2")) { #if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__) GST_DEBUG ("enable SSE2 optimisations"); resample_gint16_none_1 = resample_gint16_none_1_sse2; resample_gint16_linear_1 = resample_gint16_linear_1_sse2; resample_gint16_cubic_1 = resample_gint16_cubic_1_sse2; - resample_gfloat_none_1 = resample_gfloat_none_1_sse; - resample_gfloat_linear_1 = resample_gfloat_linear_1_sse; - resample_gfloat_cubic_1 = resample_gfloat_cubic_1_sse; - resample_gdouble_none_1 = resample_gdouble_none_1_sse2; resample_gdouble_linear_1 = resample_gdouble_linear_1_sse2; resample_gdouble_cubic_1 = resample_gdouble_cubic_1_sse2; resample_gint16_none_2 = resample_gint16_none_2_sse2; - resample_gfloat_none_2 = resample_gfloat_none_2_sse; resample_gdouble_none_2 = 
resample_gdouble_none_2_sse2; interpolate_gdouble_linear = interpolate_gdouble_linear_sse2; diff --git a/gst-libs/gst/audio/audio-resampler.c b/gst-libs/gst/audio/audio-resampler.c index 4f5c601..f9e0a40 100644 --- a/gst-libs/gst/audio/audio-resampler.c +++ b/gst-libs/gst/audio/audio-resampler.c @@ -637,14 +637,17 @@ inner_product_##type##_none_1_c (type * o, const type * a, \ const type * b, gint len, const type *ic) \ { \ gint i; \ - type2 res = 0; \ + type2 res[4] = { 0, 0, 0, 0 }; \ \ - for (i = 0; i < len; i += 2) { \ - res += (type2) a[2*i+0] * (type2) b[2*i+0]; \ - res += (type2) a[2*i+1] * (type2) b[2*i+1]; \ + for (i = 0; i < len; i += 4) { \ + res[0] += (type2) a[i + 0] * (type2) b[i + 0]; \ + res[1] += (type2) a[i + 1] * (type2) b[i + 1]; \ + res[2] += (type2) a[i + 2] * (type2) b[i + 2]; \ + res[3] += (type2) a[i + 3] * (type2) b[i + 3]; \ } \ - res = (res + ((type2)1 << ((prec) - 1))) >> (prec); \ - *o = CLAMP (res, -(limit), (limit) - 1); \ + res[0] = res[0] + res[1] + res[2] + res[3]; \ + res[0] = (res[0] + ((type2)1 << ((prec) - 1))) >> (prec); \ + *o = CLAMP (res[0], -(limit), (limit) - 1); \ } INNER_PRODUCT_INT_NONE_FUNC (gint16, gint32, PRECISION_S16, (gint32) 1 << 15); @@ -656,14 +659,18 @@ inner_product_##type##_linear_1_c (type * o, const type * a, \ const type * b, gint len, const type *ic) \ { \ gint i; \ - type2 res[2] = { 0, 0 }; \ + type2 res[4] = { 0, 0, 0, 0 }; \ \ - for (i = 0; i < len; i++) { \ - res[0] += (type2) a[i] * (type2) b[2 * i + 0]; \ - res[1] += (type2) a[i] * (type2) b[2 * i + 1]; \ + for (i = 0; i < len; i += 2) { \ + res[0] += (type2) a[i + 0] * (type2) b[2 * i + 0]; \ + res[1] += (type2) a[i + 0] * (type2) b[2 * i + 1]; \ + res[2] += (type2) a[i + 1] * (type2) b[2 * i + 2]; \ + res[3] += (type2) a[i + 1] * (type2) b[2 * i + 3]; \ } \ - res[0] = (type2)(type)(res[0] >> (prec)) * (type2) ic[0] + \ - (type2)(type)(res[1] >> (prec)) * (type2) ic[1]; \ + res[0] = (res[0] + res[2]) >> (prec); \ + res[1] = (res[1] + res[3]) >> 
(prec); \ + res[0] = (type2)(type)res[0] * (type2) ic[0] + \ + (type2)(type)res[1] * (type2) ic[1]; \ res[0] = (res[0] + ((type2)1 << ((prec) - 1))) >> (prec); \ *o = CLAMP (res[0], -(limit), (limit) - 1); \ } @@ -702,13 +709,15 @@ inner_product_##type##_none_1_c (type * o, const type * a, \ const type * b, gint len, const type *ic) \ { \ gint i; \ - type res = 0.0; \ + type res[4] = { 0.0, 0.0, 0.0, 0.0 }; /* four independent partial sums */ \ \ - for (i = 0; i < len; i += 2) { \ - res += a[2 * i + 0] * b[2 * i + 0]; \ - res += a[2 * i + 1] * b[2 * i + 1]; \ + for (i = 0; i < len; i += 4) { /* reads a[i..i+3] and b[i..i+3] on every pass */ \ + res[0] += a[i + 0] * b[i + 0]; \ + res[1] += a[i + 1] * b[i + 1]; \ + res[2] += a[i + 2] * b[i + 2]; \ + res[3] += a[i + 3] * b[i + 3]; \ } \ - *o = res; \ + *o = res[0] + res[1] + res[2] + res[3]; \ } INNER_PRODUCT_FLOAT_NONE_FUNC (gfloat); @@ -720,13 +729,16 @@ inner_product_##type##_linear_1_c (type * o, const type * a, \ const type * b, gint len, const type *ic) \ { \ gint i; \ - type res[2] = { 0.0, 0.0 }; \ + type res[4] = { 0.0, 0.0, 0.0, 0.0 }; /* res[0], res[1]: sample i; res[2], res[3]: sample i + 1 */ \ \ - for (i = 0; i < len; i++) { \ + for (i = 0; i < len; i += 2) { /* two samples per pass, each against its own pair of b entries */ \ res[0] += a[i] * b[2 * i + 0]; \ res[1] += a[i] * b[2 * i + 1]; \ + res[2] += a[i + 1] * b[2 * i + 2]; /* b[2 * (i + 1) + 0] goes with a[i + 1], as in the integer version */ \ + res[3] += a[i + 1] * b[2 * i + 3]; \ } \ - *o = res[0] * ic[0] + res[1] * ic[1]; \ + *o = (res[0] + res[2]) * ic[0] + \ + (res[1] + res[3]) * ic[1]; \ } INNER_PRODUCT_FLOAT_LINEAR_FUNC (gfloat); INNER_PRODUCT_FLOAT_LINEAR_FUNC (gdouble); @@ -856,6 +868,10 @@ static ResampleFunc resample_funcs[] = { #define resample_gdouble_cubic_1 resample_funcs[19] #if defined HAVE_ORC && !defined DISABLE_ORC +# if defined (__ARM_NEON__) +# define CHECK_NEON +# include "audio-resampler-neon.h" +# endif # if defined (__i386__) || defined (__x86_64__) # define CHECK_X86 # include "audio-resampler-x86.h" @@ -880,17 +896,20 @@ audio_resampler_init (void) if (target) { unsigned int flags = orc_target_get_default_flags (target); - const gchar *name; + const gchar *tname, *name; - name = orc_target_get_name (target); 
- GST_DEBUG ("target %s, default flags %08x", name, flags); + tname = orc_target_get_name (target); + GST_DEBUG ("target %s, default flags %08x", tname, flags); for (i = 0; i < 32; ++i) { if (flags & (1U << i)) { name = orc_target_get_flag_name (target, i); GST_DEBUG ("target flag %s", name); #ifdef CHECK_X86 - audio_resampler_check_x86 (name); + audio_resampler_check_x86 (tname, name); +#endif +#ifdef CHECK_NEON + audio_resampler_check_neon (tname, name); #endif } } -- 2.7.4