From d5abdd83c9095a7746dc46fcc0e5307208a9e9b4 Mon Sep 17 00:00:00 2001
From: Wim Taymans <wtaymans@redhat.com>
Date: Wed, 17 Feb 2016 11:20:06 -0500
Subject: [PATCH] audio-resampler: add neon optimizations

Unroll some more loops in the fallback code that seems to work fine
for ARM.
Add some simple ARM optimizations taken from speex.
---
 gst-libs/gst/audio/audio-resampler-neon.h | 253 ++++++++++++++++++++++++++++++
 gst-libs/gst/audio/audio-resampler-x86.h  |  12 +-
 gst-libs/gst/audio/audio-resampler.c      |  67 +++++---
 3 files changed, 300 insertions(+), 32 deletions(-)
 create mode 100644 gst-libs/gst/audio/audio-resampler-neon.h

diff --git a/gst-libs/gst/audio/audio-resampler-neon.h b/gst-libs/gst/audio/audio-resampler-neon.h
new file mode 100644
index 0000000..905289c
--- /dev/null
+++ b/gst-libs/gst/audio/audio-resampler-neon.h
@@ -0,0 +1,253 @@
+/* GStreamer
+ * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+static inline void
+inner_product_gint16_none_1_neon (gint16 * o, const gint16 * a,
+    const gint16 * b, gint len, const gint16 * icoeff)
+{
+    /* *o = saturated ((sum of a[i] * b[i]) >> 15, rounded); icoeff is unused. */
+    uint32_t remainder = len % 16;      /* tail: 4 samples per pass, so len % 4 must be 0 */
+    len = len - remainder;              /* bulk: 16 samples per pass */
+    asm volatile ("      cmp %[len], #0\n"      /* len < 16: one unconditional group */
+                  "      bne 1f\n"              /* of 4 follows, so len must be >= 4 */
+                  "      vld1.16 {d16}, [%[b]]!\n"
+                  "      vld1.16 {d20}, [%[a]]!\n"
+                  "      subs %[remainder], %[remainder], #4\n"
+                  "      vmull.s16 q0, d16, d20\n"
+                  "      beq 5f\n"
+                  "      b 4f\n"
+                  "1:"                  /* first block of 16: vmull initialises q0 */
+                  "      vld1.16 {d16, d17, d18, d19}, [%[b]]!\n"
+                  "      vld1.16 {d20, d21, d22, d23}, [%[a]]!\n"
+                  "      subs %[len], %[len], #16\n"
+                  "      vmull.s16 q0, d16, d20\n"
+                  "      vmlal.s16 q0, d17, d21\n"
+                  "      vmlal.s16 q0, d18, d22\n"
+                  "      vmlal.s16 q0, d19, d23\n"
+                  "      beq 3f\n"
+                  "2:"                  /* further blocks of 16: vmlal accumulates */
+                  "      vld1.16 {d16, d17, d18, d19}, [%[b]]!\n"
+                  "      vld1.16 {d20, d21, d22, d23}, [%[a]]!\n"
+                  "      subs %[len], %[len], #16\n"
+                  "      vmlal.s16 q0, d16, d20\n"
+                  "      vmlal.s16 q0, d17, d21\n"
+                  "      vmlal.s16 q0, d18, d22\n"
+                  "      vmlal.s16 q0, d19, d23\n"
+                  "      bne 2b\n"
+                  "3:"                  /* skip the tail loop when nothing is left */
+                  "      cmp %[remainder], #0\n"
+                  "      beq 5f\n"
+                  "4:"                  /* tail: 4 samples per pass until remainder is 0 */
+                  "      vld1.16 {d16}, [%[b]]!\n"
+                  "      vld1.16 {d20}, [%[a]]!\n"
+                  "      subs %[remainder], %[remainder], #4\n"
+                  "      vmlal.s16 q0, d16, d20\n"
+                  "      bne 4b\n"
+                  "5:"                  /* add lanes in 64 bit, saturate to 32, round >> 15 to 16 bit */
+                  "      vaddl.s32 q0, d0, d1\n"
+                  "      vadd.s64 d0, d0, d1\n"
+                  "      vqmovn.s64 d0, q0\n"
+                  "      vqrshrn.s32 d0, q0, #15\n"
+                  "      vst1.s16 d0[0], [%[o]]\n"
+                  : [a] "+r" (a), [b] "+r" (b),
+                    [len] "+r" (len), [remainder] "+r" (remainder)
+                  : [o] "r" (o)
+                  : "cc", "memory", "q0",       /* "memory": loads a[]/b[], stores *o */
+                    "d16", "d17", "d18", "d19",
+                    "d20", "d21", "d22", "d23");
+}
+
+static inline void
+inner_product_gint16_linear_1_neon (gint16 * o, const gint16 * a,
+    const gint16 * b, gint len, const gint16 * icoeff)
+{                               /* placeholder: empty body, never writes *o */
+}
+
+static inline void
+inner_product_gint16_cubic_1_neon (gint16 * o, const gint16 * a,
+    const gint16 * b, gint len, const gint16 * icoeff)
+{                               /* placeholder: empty body, never writes *o */
+}
+
+static inline void
+inner_product_gint32_none_1_neon (gint32 * o, const gint32 * a,
+    const gint32 * b, gint len, const gint32 * icoeff)
+{                               /* placeholder: empty body, never writes *o */
+}
+
+static inline void
+inner_product_gint32_linear_1_neon (gint32 * o, const gint32 * a,
+    const gint32 * b, gint len, const gint32 * icoeff)
+{                               /* placeholder: empty body, never writes *o */
+}
+
+static inline void
+inner_product_gint32_cubic_1_neon (gint32 * o, const gint32 * a,
+    const gint32 * b, gint len, const gint32 * icoeff)
+{                               /* placeholder: empty body, never writes *o */
+}
+
+static inline void
+inner_product_gfloat_none_1_neon (gfloat * o, const gfloat * a,
+    const gfloat * b, gint len, const gfloat * icoeff)
+{
+    /* *o = sum of a[i] * b[i] over len floats; icoeff is unused.
+     * "memory" is clobbered because the asm loads a[]/b[] and stores *o. */
+    uint32_t remainder = len % 16;      /* tail: 4 floats per pass, so len % 4 must be 0 */
+    len = len - remainder;              /* bulk: 16 floats per pass */
+    asm volatile ("      cmp %[len], #0\n"      /* len < 16: one unconditional group */
+                  "      bne 1f\n"              /* of 4 follows, so len must be >= 4 */
+                  "      vld1.32 {q4}, [%[b]]!\n"
+                  "      vld1.32 {q8}, [%[a]]!\n"
+                  "      subs %[remainder], %[remainder], #4\n"
+                  "      vmul.f32 q0, q4, q8\n"
+                  "      bne 4f\n"
+                  "      b 5f\n"
+                  "1:"                  /* first block of 16: vmul initialises q0-q3 */
+                  "      vld1.32 {q4, q5}, [%[b]]!\n"
+                  "      vld1.32 {q8, q9}, [%[a]]!\n"
+                  "      vld1.32 {q6, q7}, [%[b]]!\n"
+                  "      vld1.32 {q10, q11}, [%[a]]!\n"
+                  "      subs %[len], %[len], #16\n"
+                  "      vmul.f32 q0, q4, q8\n"
+                  "      vmul.f32 q1, q5, q9\n"
+                  "      vmul.f32 q2, q6, q10\n"
+                  "      vmul.f32 q3, q7, q11\n"
+                  "      beq 3f\n"
+                  "2:"                  /* further blocks of 16: vmla accumulates */
+                  "      vld1.32 {q4, q5}, [%[b]]!\n"
+                  "      vld1.32 {q8, q9}, [%[a]]!\n"
+                  "      vld1.32 {q6, q7}, [%[b]]!\n"
+                  "      vld1.32 {q10, q11}, [%[a]]!\n"
+                  "      subs %[len], %[len], #16\n"
+                  "      vmla.f32 q0, q4, q8\n"
+                  "      vmla.f32 q1, q5, q9\n"
+                  "      vmla.f32 q2, q6, q10\n"
+                  "      vmla.f32 q3, q7, q11\n"
+                  "      bne 2b\n"
+                  "3:"                  /* fold q0-q3 into q0, then test for a tail */
+                  "      vadd.f32 q4, q0, q1\n"
+                  "      vadd.f32 q5, q2, q3\n"
+                  "      cmp %[remainder], #0\n"
+                  "      vadd.f32 q0, q4, q5\n"
+                  "      beq 5f\n"
+                  "4:"                  /* tail: 4 floats per pass until remainder is 0 */
+                  "      vld1.32 {q6}, [%[b]]!\n"
+                  "      vld1.32 {q10}, [%[a]]!\n"
+                  "      subs %[remainder], %[remainder], #4\n"
+                  "      vmla.f32 q0, q6, q10\n"
+                  "      bne 4b\n"
+                  "5:"                  /* add the 4 lanes of q0, store one float */
+                  "      vadd.f32 d0, d0, d1\n"
+                  "      vpadd.f32 d0, d0, d0\n"
+                  "      vst1.f32 d0[0], [%[o]]\n"
+                  : [a] "+r" (a), [b] "+r" (b),
+                    [len] "+r" (len), [remainder] "+r" (remainder)
+                  : [o] "r" (o)
+                  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+                    "q8", "q9", "q10", "q11");
+}
+
+static inline void
+inner_product_gfloat_linear_1_neon (gfloat * o, const gfloat * a,
+    const gfloat * b, gint len, const gfloat * icoeff)
+{                               /* placeholder: empty body, never writes *o */
+}
+
+static inline void
+inner_product_gfloat_cubic_1_neon (gfloat * o, const gfloat * a,
+    const gfloat * b, gint len, const gfloat * icoeff)
+{                               /* placeholder: empty body, never writes *o */
+}
+
+static inline void
+inner_product_gdouble_none_1_neon (gdouble * o, const gdouble * a,
+    const gdouble * b, gint len, const gdouble * icoeff)
+{                               /* placeholder: empty body, never writes *o */
+}
+
+static inline void
+inner_product_gdouble_linear_1_neon (gdouble * o, const gdouble * a,
+    const gdouble * b, gint len, const gdouble * icoeff)
+{                               /* placeholder: empty body, never writes *o */
+}
+
+static inline void
+inner_product_gdouble_cubic_1_neon (gdouble * o, const gdouble * a,
+    const gdouble * b, gint len, const gdouble * icoeff)
+{                               /* placeholder: empty body, never writes *o */
+}
+
+static void
+interpolate_gdouble_linear_neon (gdouble * o, const gdouble * a,
+    gint len, const gdouble * icoeff)
+{                               /* placeholder: empty body, never writes *o */
+}
+
+static void
+interpolate_gdouble_cubic_neon (gdouble * o, const gdouble * a,
+    gint len, const gdouble * icoeff)
+{                               /* placeholder: empty body, never writes *o */
+}
+
+MAKE_RESAMPLE_FUNC (gint16, none, 1, neon);     /* asm kernel. NOTE(review): macro is defined outside this header; presumably emits resample_gint16_none_1_neon - confirm */
+MAKE_RESAMPLE_FUNC (gint16, linear, 1, neon);   /* inner product kernel is an empty stub */
+MAKE_RESAMPLE_FUNC (gint16, cubic, 1, neon);    /* inner product kernel is an empty stub */
+
+MAKE_RESAMPLE_FUNC (gint32, none, 1, neon);     /* inner product kernel is an empty stub */
+MAKE_RESAMPLE_FUNC (gint32, linear, 1, neon);   /* inner product kernel is an empty stub */
+MAKE_RESAMPLE_FUNC (gint32, cubic, 1, neon);    /* inner product kernel is an empty stub */
+
+MAKE_RESAMPLE_FUNC (gfloat, none, 1, neon);     /* asm kernel */
+MAKE_RESAMPLE_FUNC (gfloat, linear, 1, neon);   /* inner product kernel is an empty stub */
+MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, neon);    /* inner product kernel is an empty stub */
+
+MAKE_RESAMPLE_FUNC (gdouble, none, 1, neon);    /* inner product kernel is an empty stub */
+MAKE_RESAMPLE_FUNC (gdouble, linear, 1, neon);  /* inner product kernel is an empty stub */
+MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, neon);   /* inner product kernel is an empty stub */
+
+static void
+audio_resampler_check_neon (const gchar *target_name, const gchar *option)
+{
+  /* Installs the NEON kernels when the target is named "neon"; option is unused. */
+  if (strcmp (target_name, "neon") != 0)
+    return;
+  GST_DEBUG ("enable NEON optimisations");
+  resample_gint16_none_1 = resample_gint16_none_1_neon;
+  resample_gfloat_none_1 = resample_gfloat_none_1_neon;
+  /* Dead branch: the remaining *_neon kernels in this header are empty stubs. */
+  if (0) {
+    resample_gint16_linear_1 = resample_gint16_linear_1_neon;
+    resample_gint16_cubic_1 = resample_gint16_cubic_1_neon;
+
+    resample_gint32_none_1 = resample_gint32_none_1_neon;
+    resample_gint32_linear_1 = resample_gint32_linear_1_neon;
+    resample_gint32_cubic_1 = resample_gint32_cubic_1_neon;
+
+    resample_gfloat_linear_1 = resample_gfloat_linear_1_neon;
+    resample_gfloat_cubic_1 = resample_gfloat_cubic_1_neon;
+
+    resample_gdouble_none_1 = resample_gdouble_none_1_neon;
+    resample_gdouble_linear_1 = resample_gdouble_linear_1_neon;
+    resample_gdouble_cubic_1 = resample_gdouble_cubic_1_neon;
+
+    interpolate_gdouble_linear = interpolate_gdouble_linear_neon;
+    interpolate_gdouble_cubic = interpolate_gdouble_cubic_neon;
+  }
+}
diff --git a/gst-libs/gst/audio/audio-resampler-x86.h b/gst-libs/gst/audio/audio-resampler-x86.h
index b5033ef..22349d6 100644
--- a/gst-libs/gst/audio/audio-resampler-x86.h
+++ b/gst-libs/gst/audio/audio-resampler-x86.h
@@ -546,9 +546,9 @@ MAKE_RESAMPLE_FUNC (gint32, cubic, 1, sse41);
 #endif
 
 static void
-audio_resampler_check_x86 (const gchar *option)
+audio_resampler_check_x86 (const gchar *target_name, const gchar *option)
 {
-  if (!strcmp (option, "sse")) {
+  if (!strcmp (target_name, "sse")) {
 #if defined (HAVE_XMMINTRIN_H) && defined(__SSE__)
     GST_DEBUG ("enable SSE optimisations");
     resample_gfloat_none_1 = resample_gfloat_none_1_sse;
@@ -559,23 +559,19 @@ audio_resampler_check_x86 (const gchar *option)
 #else
     GST_DEBUG ("SSE optimisations not enabled");
 #endif
-  } else if (!strcmp (option, "sse2")) {
+  }
+  if (!strcmp (option, "sse2")) {
 #if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
     GST_DEBUG ("enable SSE2 optimisations");
     resample_gint16_none_1 = resample_gint16_none_1_sse2;
     resample_gint16_linear_1 = resample_gint16_linear_1_sse2;
     resample_gint16_cubic_1 = resample_gint16_cubic_1_sse2;
 
-    resample_gfloat_none_1 = resample_gfloat_none_1_sse;
-    resample_gfloat_linear_1 = resample_gfloat_linear_1_sse;
-    resample_gfloat_cubic_1 = resample_gfloat_cubic_1_sse;
-
     resample_gdouble_none_1 = resample_gdouble_none_1_sse2;
     resample_gdouble_linear_1 = resample_gdouble_linear_1_sse2;
     resample_gdouble_cubic_1 = resample_gdouble_cubic_1_sse2;
 
     resample_gint16_none_2 = resample_gint16_none_2_sse2;
-    resample_gfloat_none_2 = resample_gfloat_none_2_sse;
     resample_gdouble_none_2 = resample_gdouble_none_2_sse2;
 
     interpolate_gdouble_linear = interpolate_gdouble_linear_sse2;
diff --git a/gst-libs/gst/audio/audio-resampler.c b/gst-libs/gst/audio/audio-resampler.c
index 4f5c601..f9e0a40 100644
--- a/gst-libs/gst/audio/audio-resampler.c
+++ b/gst-libs/gst/audio/audio-resampler.c
@@ -637,14 +637,17 @@ inner_product_##type##_none_1_c (type * o, const type * a,      \
     const type * b, gint len, const type *ic)                   \
 {                                                               \
   gint i;                                                       \
-  type2 res = 0;                                                \
+  type2 res[4] = { 0, 0, 0, 0 };                                \
                                                                 \
-  for (i = 0; i < len; i += 2) {                                \
-    res += (type2) a[2*i+0] * (type2) b[2*i+0];                 \
-    res += (type2) a[2*i+1] * (type2) b[2*i+1];                 \
+  for (i = 0; i < len; i += 4) {                                \
+    res[0] += (type2) a[i + 0] * (type2) b[i + 0];              \
+    res[1] += (type2) a[i + 1] * (type2) b[i + 1];              \
+    res[2] += (type2) a[i + 2] * (type2) b[i + 2];              \
+    res[3] += (type2) a[i + 3] * (type2) b[i + 3];              \
   }                                                             \
-  res = (res + ((type2)1 << ((prec) - 1))) >> (prec);           \
-  *o = CLAMP (res, -(limit), (limit) - 1);                      \
+  res[0] = res[0] + res[1] + res[2] + res[3];                   \
+  res[0] = (res[0] + ((type2)1 << ((prec) - 1))) >> (prec);     \
+  *o = CLAMP (res[0], -(limit), (limit) - 1);                   \
 }
 
 INNER_PRODUCT_INT_NONE_FUNC (gint16, gint32, PRECISION_S16, (gint32) 1 << 15);
@@ -656,14 +659,18 @@ inner_product_##type##_linear_1_c (type * o, const type * a,    \
     const type * b, gint len, const type *ic)                   \
 {                                                               \
   gint i;                                                       \
-  type2 res[2] = { 0, 0 };                                      \
+  type2 res[4] = { 0, 0, 0, 0 };                                \
                                                                 \
-  for (i = 0; i < len; i++) {                                   \
-    res[0] += (type2) a[i] * (type2) b[2 * i + 0];              \
-    res[1] += (type2) a[i] * (type2) b[2 * i + 1];              \
+  for (i = 0; i < len; i += 2) {                                \
+    res[0] += (type2) a[i + 0] * (type2) b[2 * i + 0];          \
+    res[1] += (type2) a[i + 0] * (type2) b[2 * i + 1];          \
+    res[2] += (type2) a[i + 1] * (type2) b[2 * i + 2];          \
+    res[3] += (type2) a[i + 1] * (type2) b[2 * i + 3];          \
   }                                                             \
-  res[0] = (type2)(type)(res[0] >> (prec)) * (type2) ic[0] +    \
-           (type2)(type)(res[1] >> (prec)) * (type2) ic[1];     \
+  res[0] = (res[0] + res[2]) >> (prec);                         \
+  res[1] = (res[1] + res[3]) >> (prec);                         \
+  res[0] = (type2)(type)res[0] * (type2) ic[0] +                \
+           (type2)(type)res[1] * (type2) ic[1];                 \
   res[0] = (res[0] + ((type2)1 << ((prec) - 1))) >> (prec);     \
   *o = CLAMP (res[0], -(limit), (limit) - 1);                   \
 }
@@ -702,13 +709,15 @@ inner_product_##type##_none_1_c (type * o, const type * a,      \
     const type * b, gint len, const type *ic)                   \
 {                                                               \
   gint i;                                                       \
-  type res = 0.0;                                               \
+  type res[4] = { 0.0, 0.0, 0.0, 0.0 };                         \
                                                                 \
-  for (i = 0; i < len; i += 2) {                                \
-    res += a[2 * i + 0] * b[2 * i + 0];                         \
-    res += a[2 * i + 1] * b[2 * i + 1];                         \
+  for (i = 0; i < len; i += 4) {                                \
+    res[0] += a[i + 0] * b[i + 0];                              \
+    res[1] += a[i + 1] * b[i + 1];                              \
+    res[2] += a[i + 2] * b[i + 2];                              \
+    res[3] += a[i + 3] * b[i + 3];                              \
   }                                                             \
-  *o = res;                                                     \
+  *o = res[0] + res[1] + res[2] + res[3];                       \
 }
 
 INNER_PRODUCT_FLOAT_NONE_FUNC (gfloat);
@@ -720,13 +729,16 @@ inner_product_##type##_linear_1_c (type * o, const type * a,    \
     const type * b, gint len, const type *ic)                   \
 {                                                               \
   gint i;                                                       \
-  type res[2] = { 0.0, 0.0 };                                   \
+  type res[4] = { 0.0, 0.0, 0.0, 0.0 };                         \
                                                                 \
-  for (i = 0; i < len; i++) {                                   \
+  for (i = 0; i < len; i += 2) {  /* two samples per pass */    \
     res[0] += a[i] * b[2 * i + 0];                              \
     res[1] += a[i] * b[2 * i + 1];                              \
+    res[2] += a[i + 1] * b[2 * i + 2];                          \
+    res[3] += a[i + 1] * b[2 * i + 3];                          \
   }                                                             \
-  *o = res[0] * ic[0] + res[1] * ic[1];                         \
+  *o = (res[0] + res[2]) * ic[0] +                              \
+       (res[1] + res[3]) * ic[1];                               \
 }
 INNER_PRODUCT_FLOAT_LINEAR_FUNC (gfloat);
 INNER_PRODUCT_FLOAT_LINEAR_FUNC (gdouble);
@@ -856,6 +868,10 @@ static ResampleFunc resample_funcs[] = {
 #define resample_gdouble_cubic_1 resample_funcs[19]
 
 #if defined HAVE_ORC && !defined DISABLE_ORC
+# if defined (__ARM_NEON__)
+#  define CHECK_NEON
+#  include "audio-resampler-neon.h"
+# endif
 # if defined (__i386__) || defined (__x86_64__)
 #  define CHECK_X86
 #  include "audio-resampler-x86.h"
@@ -880,17 +896,20 @@ audio_resampler_init (void)
 
       if (target) {
         unsigned int flags = orc_target_get_default_flags (target);
-        const gchar *name;
+        const gchar *tname, *name;
 
-        name = orc_target_get_name (target);
-        GST_DEBUG ("target %s, default flags %08x", name, flags);
+        tname = orc_target_get_name (target);
+        GST_DEBUG ("target %s, default flags %08x", tname, flags);
 
         for (i = 0; i < 32; ++i) {
           if (flags & (1U << i)) {
             name = orc_target_get_flag_name (target, i);
             GST_DEBUG ("target flag %s", name);
 #ifdef CHECK_X86
-            audio_resampler_check_x86 (name);
+            audio_resampler_check_x86 (tname, name);
+#endif
+#ifdef CHECK_NEON
+            audio_resampler_check_neon (tname, name);
 #endif
           }
         }
-- 
2.7.4