From f6e0481ab5d5f9ba4ffa73a6f33b685bc1fa812d Mon Sep 17 00:00:00 2001
From: Wim Taymans <wtaymans@redhat.com>
Date: Thu, 11 Feb 2016 11:57:26 +0100
Subject: [PATCH] audio-resampler: Improve taps memory layout

Rearrange the oversampled taps in memory to make it easier to use
SIMD instructions on them. This simplifies some of the SSE code.
Add more optimizations: SSE2 linear and cubic kernels for gint16 and gdouble.
---
 gst-libs/gst/audio/audio-resampler-x86.h | 181 +++++++++++++++++++++++++++----
 gst-libs/gst/audio/audio-resampler.c     | 138 +++++++++++++----------
 2 files changed, 242 insertions(+), 77 deletions(-)

diff --git a/gst-libs/gst/audio/audio-resampler-x86.h b/gst-libs/gst/audio/audio-resampler-x86.h
index a82042d..56be8aa 100644
--- a/gst-libs/gst/audio/audio-resampler-x86.h
+++ b/gst-libs/gst/audio/audio-resampler-x86.h
@@ -45,23 +45,15 @@ inner_product_gfloat_linear_1_sse (gfloat * o, const gfloat * a,
     const gfloat * b, gint len, const gfloat * icoeff, gint oversample)
 {
   gint i = 0;
-  __m128 sum = _mm_setzero_ps (), t, b0;
+  __m128 sum = _mm_setzero_ps (), t;
   __m128 f = _mm_loadu_ps(icoeff);
 
   for (; i < len; i += 4) {
     t = _mm_loadu_ps (a + i);
-
-    b0 = _mm_loadh_pi (b0, (__m64 *) (b + (i+0)*oversample));
-    b0 = _mm_loadl_pi (b0, (__m64 *) (b + (i+1)*oversample));
-
-    sum =
-        _mm_add_ps (sum, _mm_mul_ps (_mm_unpacklo_ps (t, t), b0));
-
-    b0 = _mm_loadh_pi (b0, (__m64 *) (b + (i+2)*oversample));
-    b0 = _mm_loadl_pi (b0, (__m64 *) (b + (i+3)*oversample));
-
-    sum =
-        _mm_add_ps (sum, _mm_mul_ps (_mm_unpackhi_ps (t, t), b0));
+    sum = _mm_add_ps (sum, _mm_mul_ps (_mm_unpacklo_ps (t, t),
+          _mm_load_ps (b + 2 * (i + 0))));
+    sum = _mm_add_ps (sum, _mm_mul_ps (_mm_unpackhi_ps (t, t),
+          _mm_load_ps (b + 2 * (i + 2))));
   }
   sum = _mm_mul_ps (sum, f);
   sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum));
@@ -79,9 +71,9 @@ inner_product_gfloat_cubic_1_sse (gfloat * o, const gfloat * a,
 
   for (; i < len; i += 2) {
     sum = _mm_add_ps (sum, _mm_mul_ps (_mm_load1_ps (a + i + 0),
-          _mm_loadu_ps (b + (i + 0) * oversample)));
+          _mm_load_ps (b + 4 * (i + 0))));
     sum = _mm_add_ps (sum, _mm_mul_ps (_mm_load1_ps (a + i + 1),
-          _mm_loadu_ps (b + (i + 1) * oversample)));
+          _mm_load_ps (b + 4 * (i + 1))));
   }
   sum = _mm_mul_ps (sum, f);
   sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum));
@@ -118,9 +110,10 @@ inner_product_gfloat_none_2_sse (gfloat * o, const gfloat * a,
 }
 
 MAKE_RESAMPLE_FUNC (gfloat, none, 1, sse);
-MAKE_RESAMPLE_FUNC (gfloat, none, 2, sse);
 MAKE_RESAMPLE_FUNC (gfloat, linear, 1, sse);
 MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, sse);
+
+MAKE_RESAMPLE_FUNC (gfloat, none, 2, sse);
 #endif
 
 #if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
@@ -155,6 +148,94 @@ inner_product_gint16_none_1_sse2 (gint16 * o, const gint16 * a,
 }
 
 static inline void
+inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a,
+    const gint16 * b, gint len, const gint16 * icoeff, gint oversample)
+{                               /* S16 kernel, 2 taps per sample in b; oversample is unused */
+  gint i = 0;
+  __m128i sum, t, ta, tb, m1, m2;
+  __m128i f = _mm_loadl_epi64 ((const __m128i *) icoeff);
+  /* movq load of icoeff[0..3] (_mm_cvtsi64_si128 is x86-64 only), then widen */
+  sum = _mm_setzero_si128 ();
+  f = _mm_unpacklo_epi16 (f, sum);
+  /* 8 samples per pass: a[i..i+7] against the 16 taps b[2*i..2*i+15] */
+  for (; i < len; i += 8) {
+    t = _mm_loadu_si128 ((__m128i *) (a + i));
+
+    ta = _mm_unpacklo_epi16 (t, t);     /* a0 a0 a1 a1 a2 a2 a3 a3 */
+    tb = _mm_load_si128 ((__m128i *) (b + 2 * i + 0));  /* aligned load */
+
+    m1 = _mm_mulhi_epi16 (ta, tb);      /* high halves of the products */
+    m2 = _mm_mullo_epi16 (ta, tb);      /* low halves of the products */
+    /* re-interleave the halves into full 32-bit products and accumulate */
+    sum = _mm_add_epi32 (sum, _mm_unpacklo_epi16 (m2, m1));
+    sum = _mm_add_epi32 (sum, _mm_unpackhi_epi16 (m2, m1));
+
+    ta = _mm_unpackhi_epi16 (t, t);     /* a4 a4 a5 a5 a6 a6 a7 a7 */
+    tb = _mm_load_si128 ((__m128i *) (b + 2 * i + 8));
+
+    m1 = _mm_mulhi_epi16 (ta, tb);
+    m2 = _mm_mullo_epi16 (ta, tb);
+
+    sum = _mm_add_epi32 (sum, _mm_unpacklo_epi16 (m2, m1));
+    sum = _mm_add_epi32 (sum, _mm_unpackhi_epi16 (m2, m1));
+  }
+  sum = _mm_srai_epi32 (sum, PRECISION_S16);
+  sum = _mm_madd_epi16 (sum, f);        /* low 16 bits of lane k * icoeff[k] */
+  /* horizontal add: lane 0 becomes the sum of all four lanes */
+  sum =
+      _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2,
+              3)));
+  sum =
+      _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1,
+              1)));
+  /* round, drop the fraction bits and saturate to gint16 */
+  sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
+  sum = _mm_srai_epi32 (sum, PRECISION_S16);
+  sum = _mm_packs_epi32 (sum, sum);
+  *o = _mm_extract_epi16 (sum, 0);
+}
+
+static inline void
+inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
+    const gint16 * b, gint len, const gint16 * icoeff, gint oversample)
+{                               /* S16 kernel, 4 taps per sample in b; oversample is unused */
+  gint i = 0;
+  __m128i sum, ta, tb, m1, m2;
+  __m128i f = _mm_loadl_epi64 ((const __m128i *) icoeff);
+  /* movq load of icoeff[0..3] (_mm_cvtsi64_si128 is x86-64 only), then widen */
+  sum = _mm_setzero_si128 ();
+  f = _mm_unpacklo_epi16 (f, sum);
+  /* 2 samples per pass: a[i], a[i + 1] against the 8 taps b[4*i..4*i+7] */
+  for (; i < len; i += 2) {
+    ta = _mm_cvtsi32_si128 (*(gint32*)(a + i));
+    ta = _mm_unpacklo_epi16 (ta, ta);
+    ta = _mm_unpacklo_epi16 (ta, ta);   /* a0 a0 a0 a0 a1 a1 a1 a1 */
+
+    tb = _mm_load_si128 ((__m128i *) (b + 4 * i + 0));  /* aligned load */
+
+    m1 = _mm_mulhi_epi16 (ta, tb);      /* high halves of the products */
+    m2 = _mm_mullo_epi16 (ta, tb);      /* low halves of the products */
+    /* lane k accumulates the full 32-bit products for tap k (k = 0..3) */
+    sum = _mm_add_epi32 (sum, _mm_unpacklo_epi16 (m2, m1));
+    sum = _mm_add_epi32 (sum, _mm_unpackhi_epi16 (m2, m1));
+  }
+  sum = _mm_srai_epi32 (sum, PRECISION_S16);
+  sum = _mm_madd_epi16 (sum, f);        /* low 16 bits of lane k * icoeff[k] */
+  /* horizontal add: lane 0 becomes the sum of all four lanes */
+  sum =
+      _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2,
+              3)));
+  sum =
+      _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1,
+              1)));
+  /* round, drop the fraction bits and saturate to gint16 */
+  sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
+  sum = _mm_srai_epi32 (sum, PRECISION_S16);
+  sum = _mm_packs_epi32 (sum, sum);
+  *o = _mm_extract_epi16 (sum, 0);
+}
+
+static inline void
 inner_product_gdouble_none_1_sse2 (gdouble * o, const gdouble * a,
     const gdouble * b, gint len, const gdouble * icoeff, gint oversample)
 {
@@ -180,6 +261,51 @@ inner_product_gdouble_none_1_sse2 (gdouble * o, const gdouble * a,
 }
 
 static inline void
+inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a,
+    const gdouble * b, gint len, const gdouble * icoeff, gint oversample)
+{ /* *o = sum over i of a[i] * (b[2i] * icoeff[0] + b[2i+1] * icoeff[1]) */
+  gint i = 0;
+  __m128d sum = _mm_setzero_pd ();      /* one running sum per tap slot */
+  __m128d f = _mm_loadu_pd (icoeff);    /* icoeff[0], icoeff[1] */
+  /* 4 samples per pass; b is read with aligned loads; oversample is unused */
+  for (; i < len; i += 4) {
+    sum = _mm_add_pd (sum, _mm_mul_pd (_mm_load1_pd (a + i + 0), _mm_load_pd (b + 2 * i + 0)));
+    sum = _mm_add_pd (sum, _mm_mul_pd (_mm_load1_pd (a + i + 1), _mm_load_pd (b + 2 * i + 2)));
+    sum = _mm_add_pd (sum, _mm_mul_pd (_mm_load1_pd (a + i + 2), _mm_load_pd (b + 2 * i + 4)));
+    sum = _mm_add_pd (sum, _mm_mul_pd (_mm_load1_pd (a + i + 3), _mm_load_pd (b + 2 * i + 6)));
+  }
+  sum = _mm_mul_pd (sum, f);    /* scale each lane by its icoeff weight */
+  sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum));   /* lane 0 += lane 1 */
+  _mm_store_sd (o, sum);
+}
+
+static inline void
+inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a,
+    const gdouble * b, gint len, const gdouble * icoeff, gint oversample)
+{ /* *o = sum over i and k < 4 of a[i] * b[4i+k] * icoeff[k]; oversample unused */
+  gint i = 0;
+  __m128d sum1 = _mm_setzero_pd (), t;  /* running sums for k = 0, 1 */
+  __m128d sum2 = _mm_setzero_pd ();     /* running sums for k = 2, 3 */
+  __m128d f1 = _mm_loadu_pd (icoeff);
+  __m128d f2 = _mm_loadu_pd (icoeff+2);
+  /* 2 samples per pass, 4 taps each; b is read with aligned loads */
+  for (; i < len; i += 2) {
+    t = _mm_load1_pd (a + i + 0);       /* broadcast a[i] to both lanes */
+    sum1 = _mm_add_pd (sum1, _mm_mul_pd (t, _mm_load_pd (b + 4 * i + 0)));
+    sum2 = _mm_add_pd (sum2, _mm_mul_pd (t, _mm_load_pd (b + 4 * i + 2)));
+
+    t = _mm_load1_pd (a + i + 1);       /* broadcast a[i + 1] */
+    sum1 = _mm_add_pd (sum1, _mm_mul_pd (t, _mm_load_pd (b + 4 * i + 4)));
+    sum2 = _mm_add_pd (sum2, _mm_mul_pd (t, _mm_load_pd (b + 4 * i + 6)));
+  }
+  sum1 = _mm_mul_pd (sum1, f1); /* weight by icoeff[0], icoeff[1] */
+  sum2 = _mm_mul_pd (sum2, f2); /* weight by icoeff[2], icoeff[3] */
+  sum1 = _mm_add_pd (sum1, sum2);
+  sum1 = _mm_add_sd (sum1, _mm_unpackhi_pd (sum1, sum1));       /* lane 0 += lane 1 */
+  _mm_store_sd (o, sum1);
+}
+
+static inline void
 inner_product_gint16_none_2_sse2 (gint16 * o, const gint16 * a,
     const gint16 * b, gint len, const gint16 * icoeff, gint oversample)
 {
@@ -239,9 +365,16 @@ inner_product_gdouble_none_2_sse2 (gdouble * o, const gdouble * a,
 }
 
 MAKE_RESAMPLE_FUNC (gint16, none, 1, sse2);
+MAKE_RESAMPLE_FUNC (gint16, linear, 1, sse2);
+MAKE_RESAMPLE_FUNC (gint16, cubic, 1, sse2);
+
 MAKE_RESAMPLE_FUNC (gdouble, none, 1, sse2);
+MAKE_RESAMPLE_FUNC (gdouble, linear, 1, sse2);
+MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, sse2);
+
 MAKE_RESAMPLE_FUNC (gint16, none, 2, sse2);
 MAKE_RESAMPLE_FUNC (gdouble, none, 2, sse2);
+
 #endif
 
 #if defined (HAVE_SMMINTRIN_H) && defined(__SSE4_1__)
@@ -295,21 +428,29 @@ audio_resampler_check_x86 (const gchar *option)
 #if defined (HAVE_XMMINTRIN_H) && defined(__SSE__)
     GST_DEBUG ("enable SSE optimisations");
     resample_gfloat_none_1 = resample_gfloat_none_1_sse;
-    resample_gfloat_none_2 = resample_gfloat_none_2_sse;
     resample_gfloat_linear_1 = resample_gfloat_linear_1_sse;
     resample_gfloat_cubic_1 = resample_gfloat_cubic_1_sse;
+
+    resample_gfloat_none_2 = resample_gfloat_none_2_sse;
 #endif
   } else if (!strcmp (option, "sse2")) {
 #if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
     GST_DEBUG ("enable SSE2 optimisations");
     resample_gint16_none_1 = resample_gint16_none_1_sse2;
+    resample_gint16_linear_1 = resample_gint16_linear_1_sse2;
+    resample_gint16_cubic_1 = resample_gint16_cubic_1_sse2;
+
     resample_gfloat_none_1 = resample_gfloat_none_1_sse;
-    resample_gfloat_none_2 = resample_gfloat_none_2_sse;
+    resample_gfloat_linear_1 = resample_gfloat_linear_1_sse;
+    resample_gfloat_cubic_1 = resample_gfloat_cubic_1_sse;
+
     resample_gdouble_none_1 = resample_gdouble_none_1_sse2;
+    resample_gdouble_linear_1 = resample_gdouble_linear_1_sse2;
+    resample_gdouble_cubic_1 = resample_gdouble_cubic_1_sse2;
+
     resample_gint16_none_2 = resample_gint16_none_2_sse2;
+    resample_gfloat_none_2 = resample_gfloat_none_2_sse;
     resample_gdouble_none_2 = resample_gdouble_none_2_sse2;
-    resample_gfloat_linear_1 = resample_gfloat_linear_1_sse;
-    resample_gfloat_cubic_1 = resample_gfloat_cubic_1_sse;
 #endif
   } else if (!strcmp (option, "sse41")) {
 #if defined (HAVE_SMMINTRIN_H) && defined(__SSE4_1__)
diff --git a/gst-libs/gst/audio/audio-resampler.c b/gst-libs/gst/audio/audio-resampler.c
index 07096c4..bd03846 100644
--- a/gst-libs/gst/audio/audio-resampler.c
+++ b/gst-libs/gst/audio/audio-resampler.c
@@ -390,6 +390,27 @@ MAKE_CONVERT_TAPS_INT_FUNC (gint32, PRECISION_S32);
 MAKE_CONVERT_TAPS_FLOAT_FUNC (gfloat);
 MAKE_CONVERT_TAPS_FLOAT_FUNC (gdouble);
 
+#define MAKE_EXTRACT_TAPS_FUNC(type)                                    \
+static inline void                                                      \
+extract_taps_##type (GstAudioResampler * resampler, type *tmpcoeff,     \
+    gint n_taps, gint oversample, gint mult)                            \
+{ /* copy the taps of each phase into its own row of coeff */           \
+  gint phase, tap, m;                                                   \
+  for (phase = 0; phase < oversample; phase++) {                        \
+    type *row = (type *) ((gint8 *) resampler->coeff +                  \
+                phase * resampler->cstride);                            \
+    for (tap = 0; tap < n_taps; tap++) {                                \
+      const type *src = tmpcoeff + phase + tap * oversample;            \
+      for (m = 0; m < mult; m++)                                        \
+        row[tap * mult + m] = src[m];                                   \
+    }                                                                   \
+  }                                                                     \
+}
+MAKE_EXTRACT_TAPS_FUNC (gint16);
+MAKE_EXTRACT_TAPS_FUNC (gint32);
+MAKE_EXTRACT_TAPS_FUNC (gfloat);
+MAKE_EXTRACT_TAPS_FUNC (gdouble);
+
 #define GET_TAPS_NONE_FUNC(type)                                                \
 static inline gpointer                                                          \
 get_taps_##type##_none (GstAudioResampler * resampler,                          \
@@ -421,44 +442,32 @@ get_taps_##type##_none (GstAudioResampler * resampler,
   }                                                                             \
   return res;                                                                   \
 }
-
 GET_TAPS_NONE_FUNC (gint16);
 GET_TAPS_NONE_FUNC (gint32);
 GET_TAPS_NONE_FUNC (gfloat);
 GET_TAPS_NONE_FUNC (gdouble);
 
-#define MAKE_COEFF_LINEAR_FLOAT_FUNC(type)                              \
+#define MAKE_COEFF_LINEAR_INT_FUNC(type,type2,prec)                     \
 static inline void                                                      \
 make_coeff_##type##_linear (gint frac, gint out_rate, type *icoeff)     \
 {                                                                       \
-  type x = (type)frac / out_rate;                                       \
+  type x = ((type2)frac << prec) / out_rate; /* scaled by 2^prec */     \
   icoeff[0] = icoeff[2] = x;                                            \
-  icoeff[1] = icoeff[3] = 1.0 - x;                                      \
+  icoeff[1] = icoeff[3] = ((type2) 1 << prec) - x; /* shift in type2 */ \
 }
-#define MAKE_COEFF_LINEAR_INT_FUNC(type,type2,prec)                     \
+#define MAKE_COEFF_LINEAR_FLOAT_FUNC(type)                              \
 static inline void                                                      \
 make_coeff_##type##_linear (gint frac, gint out_rate, type *icoeff)     \
 {                                                                       \
-  type x = ((type2)frac << prec) / out_rate;                            \
+  type x = (type)frac / out_rate; /* division done in floating point */ \
   icoeff[0] = icoeff[2] = x;                                            \
-  icoeff[1] = icoeff[3] = (1 << prec) - x;                              \
+  icoeff[1] = icoeff[3] = 1.0 - x; /* complement of x */                \
 }
-
 MAKE_COEFF_LINEAR_INT_FUNC (gint16, gint32, PRECISION_S16);
 MAKE_COEFF_LINEAR_INT_FUNC (gint32, gint64, PRECISION_S32);
 MAKE_COEFF_LINEAR_FLOAT_FUNC (gfloat);
 MAKE_COEFF_LINEAR_FLOAT_FUNC (gdouble);
 
-#define MAKE_COEFF_CUBIC_FLOAT_FUNC(type)                               \
-static inline void                                                      \
-make_coeff_##type##_cubic (gint frac, gint out_rate, type *icoeff)      \
-{                                                                       \
-  type x = (type) frac / out_rate, x2 = x * x, x3 = x2 * x;             \
-  icoeff[0] = 0.16667f * (x3 - x);                                      \
-  icoeff[1] = x + 0.5f * (x2 - x3);                                     \
-  icoeff[3] = -0.33333f * x + 0.5f * x2 - 0.16667f * x3;                \
-  icoeff[2] = 1. - icoeff[0] - icoeff[1] - icoeff[3];                   \
-}
 #define MAKE_COEFF_CUBIC_INT_FUNC(type,type2,prec)                      \
 static inline void                                                      \
 make_coeff_##type##_cubic (gint frac, gint out_rate, type *icoeff)      \
@@ -473,7 +482,16 @@ make_coeff_##type##_cubic (gint frac, gint out_rate, type *icoeff)      \
             (x2 >> 1) - ((((type2) x3 << prec) / 6) >> prec);           \
   icoeff[2] = one - icoeff[0] - icoeff[1] - icoeff[3];                  \
 }
-
+#define MAKE_COEFF_CUBIC_FLOAT_FUNC(type)                               \
+static inline void                                                      \
+make_coeff_##type##_cubic (gint frac, gint out_rate, type *icoeff)      \
+{ /* four cubic interpolation weights for x = frac / out_rate */        \
+  type x = (type) frac / out_rate, x2 = x * x, x3 = x2 * x;             \
+  icoeff[0] = 0.16667f * (x3 - x); /* 0.16667 ~ 1/6 */                  \
+  icoeff[1] = x + 0.5f * (x2 - x3);                                     \
+  icoeff[3] = -0.33333f * x + 0.5f * x2 - 0.16667f * x3;                \
+  icoeff[2] = 1. - icoeff[0] - icoeff[1] - icoeff[3]; /* sums to 1 */   \
+}
 MAKE_COEFF_CUBIC_INT_FUNC (gint16, gint32, PRECISION_S16);
 MAKE_COEFF_CUBIC_INT_FUNC (gint32, gint64, PRECISION_S32);
 MAKE_COEFF_CUBIC_FLOAT_FUNC (gfloat);
@@ -488,12 +506,13 @@ get_taps_##type##_##inter (GstAudioResampler * resampler,       \
   gint out_rate = resampler->out_rate;                          \
   gint offset, frac, pos;                                       \
   gint oversample = resampler->oversample;                      \
+  gint cstride = resampler->cstride;                            \
                                                                 \
   pos = *samp_phase * oversample;                               \
   offset = (oversample - 1) - (pos / out_rate);                 \
   frac = pos % out_rate;                                        \
                                                                 \
-  res = (type *)resampler->coeff + offset;                      \
+  res = (gint8 *) resampler->coeff + offset * cstride;          \
   make_coeff_##type##_##inter (frac, out_rate, icoeff);         \
                                                                 \
   *samp_index += resampler->samp_inc;                           \
@@ -526,7 +545,7 @@ inner_product_##type##_none_1_c (type * o, const type * a,      \
   for (i = 0; i < len; i++)                                     \
     res += (type2) a[i] * (type2) b[i];                         \
                                                                 \
-  res = (res + (1 << ((prec) - 1))) >> (prec);                  \
+  res = (res + (1L << ((prec) - 1))) >> (prec);                 \
   *o = CLAMP (res, -(limit), (limit) - 1);                      \
 }
 
@@ -542,12 +561,12 @@ inner_product_##type##_linear_1_c (type * o, const type * a,    \
   type2 res[2] = { 0, 0 };                                      \
                                                                 \
   for (i = 0; i < len; i++) {                                   \
-    res[0] += (type2) a[i] * (type2) b[i * oversample + 0];     \
-    res[1] += (type2) a[i] * (type2) b[i * oversample + 1];     \
+    res[0] += (type2) a[i] * (type2) b[2 * i + 0];              \
+    res[1] += (type2) a[i] * (type2) b[2 * i + 1];              \
   }                                                             \
-  res[0] = (res[0] >> (prec)) * ic[0] +                         \
-           (res[1] >> (prec)) * ic[1];                          \
-  res[0] = (res[0] + (1 << ((prec) - 1))) >> (prec);            \
+  res[0] = (res[0] >> (prec)) * (type2) ic[0] +                 \
+           (res[1] >> (prec)) * (type2) ic[1];                  \
+  res[0] = (res[0] + (1L << ((prec) - 1))) >> (prec);           \
   *o = CLAMP (res[0], -(limit), (limit) - 1);                   \
 }
 
@@ -563,16 +582,16 @@ inner_product_##type##_cubic_1_c (type * o, const type * a,    \
   type2 res[4] = { 0, 0, 0, 0 };                                \
                                                                 \
   for (i = 0; i < len; i++) {                                   \
-    res[0] += (type2) a[i] * (type2) b[i * oversample + 0];     \
-    res[1] += (type2) a[i] * (type2) b[i * oversample + 1];     \
-    res[2] += (type2) a[i] * (type2) b[i * oversample + 2];     \
-    res[3] += (type2) a[i] * (type2) b[i * oversample + 3];     \
+    res[0] += (type2) a[i] * (type2) b[4 * i + 0];              \
+    res[1] += (type2) a[i] * (type2) b[4 * i + 1];              \
+    res[2] += (type2) a[i] * (type2) b[4 * i + 2];              \
+    res[3] += (type2) a[i] * (type2) b[4 * i + 3];              \
   }                                                             \
-  res[0] = (res[0] >> (prec)) * ic[0] +                         \
-           (res[1] >> (prec)) * ic[1] +                         \
-           (res[2] >> (prec)) * ic[2] +                         \
-           (res[3] >> (prec)) * ic[3];                          \
-  res[0] = (res[0] + (1 << ((prec) - 1))) >> (prec);            \
+  res[0] = (res[0] >> (prec)) * (type2) ic[0] +                 \
+           (res[1] >> (prec)) * (type2) ic[1] +                 \
+           (res[2] >> (prec)) * (type2) ic[2] +                 \
+           (res[3] >> (prec)) * (type2) ic[3];                  \
+  res[0] = (res[0] + (1L << ((prec) - 1))) >> (prec);           \
   *o = CLAMP (res[0], -(limit), (limit) - 1);                   \
 }
 
@@ -605,8 +624,8 @@ inner_product_##type##_linear_1_c (type * o, const type * a,    \
   type res[2] = { 0.0, 0.0 };                                   \
                                                                 \
   for (i = 0; i < len; i++) {                                   \
-    res[0] += a[i] * b[i * oversample + 0];                     \
-    res[1] += a[i] * b[i * oversample + 1];                     \
+    res[0] += a[i] * b[2 * i + 0];                              \
+    res[1] += a[i] * b[2 * i + 1];                              \
   }                                                             \
   *o = res[0] * ic[0] + res[1] * ic[1];                         \
 }
@@ -622,10 +641,10 @@ inner_product_##type##_cubic_1_c (type * o, const type * a,     \
   type res[4] = { 0.0, 0.0, 0.0, 0.0 };                         \
                                                                 \
   for (i = 0; i < len; i++) {                                   \
-    res[0] += a[i] * b[i * oversample + 0];                     \
-    res[1] += a[i] * b[i * oversample + 1];                     \
-    res[2] += a[i] * b[i * oversample + 2];                     \
-    res[3] += a[i] * b[i * oversample + 3];                     \
+    res[0] += a[i] * b[4 * i + 0];                              \
+    res[1] += a[i] * b[4 * i + 1];                              \
+    res[2] += a[i] * b[4 * i + 2];                              \
+    res[3] += a[i] * b[4 * i + 3];                              \
   }                                                             \
   *o = res[0] * ic[0] + res[1] * ic[1] +                        \
        res[2] * ic[2] + res[3] * ic[3];                         \
@@ -659,9 +678,10 @@ resample_ ##type## _ ##inter## _ ##channels## _ ##arch (GstAudioResampler * resa
                                                                                 \
       ipp = &ip[samp_index * channels];                                         \
                                                                                 \
-      taps = get_taps_ ##type##_##inter (resampler, &samp_index, &samp_phase, icoeff);                   \
-                                                                                \
-      inner_product_ ##type##_##inter##_##channels##_##arch (op, ipp, taps, n_taps, icoeff, oversample);     \
+      taps = get_taps_ ##type##_##inter                                         \
+              (resampler, &samp_index, &samp_phase, icoeff);                    \
+      inner_product_ ##type##_##inter##_##channels##_##arch                     \
+              (op, ipp, taps, n_taps, icoeff, oversample);                      \
       op += ostride;                                                            \
     }                                                                           \
     memmove (ip, &ip[samp_index * channels],                                    \
@@ -802,10 +822,10 @@ deinterleave_ ##type (GstAudioResampler * resampler, gpointer sbuf[],   \
   }                                                                     \
 }
 
-MAKE_DEINTERLEAVE_FUNC (gdouble);
-MAKE_DEINTERLEAVE_FUNC (gfloat);
-MAKE_DEINTERLEAVE_FUNC (gint32);
 MAKE_DEINTERLEAVE_FUNC (gint16);
+MAKE_DEINTERLEAVE_FUNC (gint32);
+MAKE_DEINTERLEAVE_FUNC (gfloat);
+MAKE_DEINTERLEAVE_FUNC (gdouble);
 
 static DeinterleaveFunc deinterleave_funcs[] = {
   deinterleave_gint16,
@@ -875,7 +895,7 @@ calculate_kaiser_params (GstAudioResampler * resampler)
 
 static void
 alloc_coeff_mem (GstAudioResampler * resampler, gint bps, gint n_taps,
-    gint n_phases)
+    gint n_phases, gint n_mult)
 {
   if (resampler->alloc_taps >= n_taps && resampler->alloc_phases >= n_phases)
     return;
@@ -883,7 +903,8 @@ alloc_coeff_mem (GstAudioResampler * resampler, gint bps, gint n_taps,
   resampler->tmpcoeff =
       g_realloc_n (resampler->tmpcoeff, n_taps, sizeof (gdouble));
 
-  resampler->cstride = GST_ROUND_UP_32 (bps * (n_taps + TAPS_OVERREAD));
+  resampler->cstride =
+      GST_ROUND_UP_32 (bps * (n_mult * n_taps + TAPS_OVERREAD));
   g_free (resampler->coeffmem);
   resampler->coeffmem = g_malloc0 (n_phases * resampler->cstride + ALIGN - 1);
   resampler->coeff = MEM_ALIGN (resampler->coeffmem, ALIGN);
@@ -983,7 +1004,7 @@ resampler_calculate_taps (GstAudioResampler * resampler)
   }
 
   if (interpolate) {
-    gint otaps;
+    gint otaps, mult;
     gpointer coeff;
     gdouble x, weight, *tmpcoeff;
     GstAudioResamplerFilterInterpolation filter_interpolation =
@@ -995,37 +1016,40 @@ resampler_calculate_taps (GstAudioResampler * resampler)
     else
       resampler->filter_interpolation = filter_interpolation;
 
-    otaps = oversample * n_taps;
     switch (resampler->filter_interpolation) {
       default:
       case GST_AUDIO_RESAMPLER_FILTER_INTERPOLATION_LINEAR:
-        otaps += 1;
+        mult = 2;
         break;
       case GST_AUDIO_RESAMPLER_FILTER_INTERPOLATION_CUBIC:
-        otaps += 3;
+        mult = 4;
         break;
     }
+    otaps = oversample * n_taps + mult - 1;
 
-    alloc_coeff_mem (resampler, bps, otaps, 1);
+    alloc_coeff_mem (resampler, bps, otaps, oversample, mult);
 
-    coeff = resampler->coeff;
-    tmpcoeff = resampler->tmpcoeff;
+    coeff = tmpcoeff = resampler->tmpcoeff;
     x = 1.0 - n_taps / 2;
     weight = fill_taps (resampler, tmpcoeff, x, otaps, oversample);
 
     switch (resampler->format) {
       case GST_AUDIO_FORMAT_S16:
         convert_taps_gint16 (tmpcoeff, coeff, weight / oversample, otaps);
+        extract_taps_gint16 (resampler, coeff, n_taps, oversample, mult);
         break;
       case GST_AUDIO_FORMAT_S32:
         convert_taps_gint32 (tmpcoeff, coeff, weight / oversample, otaps);
+        extract_taps_gint32 (resampler, coeff, n_taps, oversample, mult);
         break;
       case GST_AUDIO_FORMAT_F32:
         convert_taps_gfloat (tmpcoeff, coeff, weight / oversample, otaps);
+        extract_taps_gfloat (resampler, coeff, n_taps, oversample, mult);
         break;
       default:
       case GST_AUDIO_FORMAT_F64:
         convert_taps_gdouble (tmpcoeff, coeff, weight / oversample, otaps);
+        extract_taps_gdouble (resampler, coeff, n_taps, oversample, mult);
         break;
     }
   } else {
@@ -1033,7 +1057,7 @@ resampler_calculate_taps (GstAudioResampler * resampler)
         GST_AUDIO_RESAMPLER_FILTER_INTERPOLATION_NONE;
     resampler->taps = g_realloc_n (resampler->taps, out_rate, sizeof (Tap));
     memset (resampler->taps, 0, sizeof (Tap) * out_rate);
-    alloc_coeff_mem (resampler, bps, n_taps, out_rate);
+    alloc_coeff_mem (resampler, bps, n_taps, out_rate, 1);
   }
 
   resampler->samp_inc = in_rate / out_rate;
-- 
2.7.4