From 71871c50483df00cd46dcfcb99148a4cd0097dfc Mon Sep 17 00:00:00 2001 From: Wim Taymans Date: Wed, 13 Jan 2016 17:44:39 +0100 Subject: [PATCH] audio-resampler: optimizations Improve int16 resampling by using pmaddwd Use intrinsics to scale and pack int16 samples Align the coefficients so that we can use aligned loads Add padding to taps and samples so that we don't have to use partial loads for the remainder of the loops. Remove copy_n, we can reuse the plain copy function with some new parameters. Align and pad the sample array. --- gst-libs/gst/audio/audio-resampler-x86.h | 176 +++++++++++-------------------- gst-libs/gst/audio/audio-resampler.c | 99 ++++++++++------- 2 files changed, 118 insertions(+), 157 deletions(-) diff --git a/gst-libs/gst/audio/audio-resampler-x86.h b/gst-libs/gst/audio/audio-resampler-x86.h index 65a5194..284e7c9 100644 --- a/gst-libs/gst/audio/audio-resampler-x86.h +++ b/gst-libs/gst/audio/audio-resampler-x86.h @@ -19,171 +19,118 @@ #ifdef HAVE_EMMINTRIN_H #include -#endif -#ifdef HAVE_EMMINTRIN_H static inline void -inner_product_gint16_1_sse (gint16 * o, const gint16 * a, const gint16 * b, gint len) +inner_product_gint16_1_sse2 (gint16 * o, const gint16 * a, const gint16 * b, gint len) { gint i = 0; - gint32 res = 0; - __m128i sum[2], ta, tb; - __m128i t1[2]; + __m128i sum, ta, tb; - sum[0] = _mm_setzero_si128 (); - sum[1] = _mm_setzero_si128 (); + sum = _mm_setzero_si128 (); - for (; i < len - 7; i += 8) { + for (; i < len; i += 8) { ta = _mm_loadu_si128 ((__m128i *) (a + i)); - tb = _mm_loadu_si128 ((__m128i *) (b + i)); + tb = _mm_load_si128 ((__m128i *) (b + i)); - t1[0] = _mm_mullo_epi16 (ta, tb); - t1[1] = _mm_mulhi_epi16 (ta, tb); - - sum[0] = _mm_add_epi32 (sum[0], _mm_unpacklo_epi16 (t1[0], t1[1])); - sum[1] = _mm_add_epi32 (sum[1], _mm_unpackhi_epi16 (t1[0], t1[1])); + sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, tb)); } - sum[0] = _mm_add_epi32 (sum[0], sum[1]); - sum[0] = - _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, + sum = + _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3))); - sum[0] = - _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, + sum = + _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1))); - res = _mm_cvtsi128_si32 (sum[0]); - - for (; i < len; i++) - res += (gint32) a[i] * (gint32) b[i]; - res = (res + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16; - *o = CLAMP (res, -(1L << 15), (1L << 15) - 1); + sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); + sum = _mm_srai_epi32 (sum, PRECISION_S16); + sum = _mm_packs_epi32 (sum, sum); + *o = _mm_extract_epi16 (sum, 0); } -#endif -#ifdef HAVE_EMMINTRIN_H static inline void inner_product_gfloat_1_sse (gfloat * o, const gfloat * a, const gfloat * b, gint len) { gint i = 0; - gfloat res; __m128 sum = _mm_setzero_ps (); - for (; i < len - 7; i += 8) { + for (; i < len; i += 8) { sum = _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 0), - _mm_loadu_ps (b + i + 0))); + _mm_load_ps (b + i + 0))); sum = _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 4), - _mm_loadu_ps (b + i + 4))); + _mm_load_ps (b + i + 4))); } sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum)); sum = _mm_add_ss (sum, _mm_shuffle_ps (sum, sum, 0x55)); - _mm_store_ss (&res, sum); - - for (; i < len; i++) - res += a[i] * b[i]; - - *o = res; + _mm_store_ss (o, sum); } -#endif -#ifdef HAVE_EMMINTRIN_H static inline void -inner_product_gdouble_1_sse (gdouble * o, const gdouble * a, const gdouble * b, +inner_product_gdouble_1_sse2 (gdouble * o, const gdouble * a, const gdouble * b, gint len) { gint i = 0; - gdouble res; __m128d sum = _mm_setzero_pd (); - for (; i < len - 7; i += 8) { + for (; i < len; i += 8) { sum = _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0), - _mm_loadu_pd (b + i + 0))); + _mm_load_pd (b + i + 0))); sum = _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2), - _mm_loadu_pd (b + i + 2))); + _mm_load_pd (b + i + 2))); sum = _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4), - _mm_loadu_pd (b + i + 4))); + _mm_load_pd (b + i + 4))); sum = _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6), - _mm_loadu_pd (b + i + 6))); + _mm_load_pd (b + i + 6))); } sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum)); - _mm_store_sd (&res, sum); - - for (; i < len; i++) - res += a[i] * b[i]; - - *o = res; + _mm_store_sd (o, sum); } -#endif -#ifdef HAVE_EMMINTRIN_H static inline void -inner_product_gint16_2_sse (gint16 * o, const gint16 * a, const gint16 * b, gint len) +inner_product_gint16_2_sse2 (gint16 * o, const gint16 * a, const gint16 * b, gint len) { gint i = 0; - gint32 r[2]; - guint64 r64; - __m128i sum[2], ta, tb; - __m128i t1[2]; + __m128i sum, ta, tb, t1; - sum[0] = _mm_setzero_si128 (); - sum[1] = _mm_setzero_si128 (); + sum = _mm_setzero_si128 (); - for (; i < len - 7; i += 8) { - tb = _mm_loadu_si128 ((__m128i *) (b + i)); - - t1[1] = _mm_unpacklo_epi16 (tb, tb); + for (; i < len; i += 8) { + tb = _mm_load_si128 ((__m128i *) (b + i)); + t1 = _mm_unpacklo_epi16 (tb, tb); ta = _mm_loadu_si128 ((__m128i *) (a + 2 * i)); - t1[0] = _mm_mullo_epi16 (ta, t1[1]); - t1[1] = _mm_mulhi_epi16 (ta, t1[1]); - - sum[0] = _mm_add_epi32 (sum[0], _mm_unpacklo_epi16 (t1[0], t1[1])); - sum[1] = _mm_add_epi32 (sum[1], _mm_unpackhi_epi16 (t1[0], t1[1])); - t1[1] = _mm_unpackhi_epi16 (tb, tb); + sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, t1)); + t1 = _mm_unpackhi_epi16 (tb, tb); ta = _mm_loadu_si128 ((__m128i *) (a + 2 * i + 8)); - t1[0] = _mm_mullo_epi16 (ta, t1[1]); - t1[1] = _mm_mulhi_epi16 (ta, t1[1]); - sum[0] = _mm_add_epi32 (sum[0], _mm_unpacklo_epi16 (t1[0], t1[1])); - sum[1] = _mm_add_epi32 (sum[1], _mm_unpackhi_epi16 (t1[0], t1[1])); + sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, t1)); } - sum[0] = _mm_add_epi32 (sum[0], sum[1]); - sum[0] = - _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, + sum = + _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3))); - r64 = _mm_cvtsi128_si64 (sum[0]); - r[0] = r64 >> 32; - r[1] = r64 & 0xffffffff; - for (; i < len; i++) { - r[0] += (gint32) a[2 * i] * (gint32) b[i]; - r[1] += (gint32) a[2 * i + 1] * (gint32) b[i]; - } - r[0] = (r[0] + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16; - r[1] = (r[1] + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16; - o[0] = CLAMP (r[0], -(1L << 15), (1L << 15) - 1); - o[1] = CLAMP (r[1], -(1L << 15), (1L << 15) - 1); + sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); + sum = _mm_srai_epi32 (sum, PRECISION_S16); + sum = _mm_packs_epi32 (sum, sum); + *(gint32*)o = _mm_cvtsi128_si32 (sum); } -#endif -#ifdef HAVE_EMMINTRIN_H static inline void -inner_product_gdouble_2_sse (gdouble * o, const gdouble * a, const gdouble * b, +inner_product_gdouble_2_sse2 (gdouble * o, const gdouble * a, const gdouble * b, gint len) { gint i = 0; - gdouble r[2]; __m128d sum = _mm_setzero_pd (), t; - for (; i < len - 3; i += 4) { - t = _mm_loadu_pd (b + i); + for (; i < len; i += 4) { + t = _mm_load_pd (b + i); sum = _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i), _mm_unpacklo_pd (t, t))); @@ -191,7 +138,7 @@ inner_product_gdouble_2_sse (gdouble * o, const gdouble * a, const gdouble * b, _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 2), _mm_unpackhi_pd (t, t))); - t = _mm_loadu_pd (b + i + 2); + t = _mm_load_pd (b + i + 2); sum = _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 4), _mm_unpacklo_pd (t, t))); @@ -199,34 +146,29 @@ inner_product_gdouble_2_sse (gdouble * o, const gdouble * a, const gdouble * b, _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 6), _mm_unpackhi_pd (t, t))); } - _mm_store_pd (r, sum); - - for (; i < len; i++) { - r[0] += a[2 * i] * b[i]; - r[1] += a[2 * i + 1] * b[i]; - } - o[0] = r[0]; - o[1] = r[1]; + _mm_store_pd (o, sum); } -#endif -#ifdef HAVE_EMMINTRIN_H -MAKE_RESAMPLE_FUNC (gint16, 1, sse); +MAKE_RESAMPLE_FUNC (gint16, 1, sse2); MAKE_RESAMPLE_FUNC (gfloat, 1, sse); -MAKE_RESAMPLE_FUNC (gdouble, 1, sse); -MAKE_RESAMPLE_FUNC (gint16, 2, sse); -MAKE_RESAMPLE_FUNC (gdouble, 2, sse); +MAKE_RESAMPLE_FUNC (gdouble, 1, sse2); +MAKE_RESAMPLE_FUNC (gint16, 2, sse2); +MAKE_RESAMPLE_FUNC (gdouble, 2, sse2); #endif static void audio_resampler_check_x86 (const gchar *option) { - if (!strcmp (option, "sse2")) { - GST_DEBUG ("enable SSE2 optimisations"); - resample_gint16_1 = resample_gint16_1_sse; +#ifdef HAVE_EMMINTRIN_H + if (!strcmp (option, "sse")) { + GST_DEBUG ("enable SSE optimisations"); resample_gfloat_1 = resample_gfloat_1_sse; - resample_gdouble_1 = resample_gdouble_1_sse; - resample_gint16_2 = resample_gint16_2_sse; - resample_gdouble_2 = resample_gdouble_2_sse; + } else if (!strcmp (option, "sse2")) { + GST_DEBUG ("enable SSE2 optimisations"); + resample_gint16_1 = resample_gint16_1_sse2; + resample_gdouble_1 = resample_gdouble_1_sse2; + resample_gint16_2 = resample_gint16_2_sse2; + resample_gdouble_2 = resample_gdouble_2_sse2; } +#endif } diff --git a/gst-libs/gst/audio/audio-resampler.c b/gst-libs/gst/audio/audio-resampler.c index e07f8d7..26c44d0 100644 --- a/gst-libs/gst/audio/audio-resampler.c +++ b/gst-libs/gst/audio/audio-resampler.c @@ -48,6 +48,8 @@ typedef void (*DeinterleaveFunc) (GstAudioResampler * resampler, gpointer * sbuf, gpointer in[], gsize in_frames); typedef void (*MirrorFunc) (GstAudioResampler * resampler, gpointer * sbuf); +#define MEM_ALIGN(m,a) ((gint8 *)((guintptr)((gint8 *)(m) + ((a)-1)) & ~((a)-1))) + struct _GstAudioResampler { GstAudioResamplerMethod method; @@ -68,6 +70,8 @@ struct _GstAudioResampler guint n_taps; Tap *taps; gpointer coeff; + gpointer coeffmem; + gsize cstride; gpointer tmpcoeff; DeinterleaveFunc deinterleave; @@ -75,6 +79,7 @@ struct _GstAudioResampler ResampleFunc resample; guint blocks; + guint inc; gboolean filling; gint samp_inc; gint samp_frac; @@ -256,7 +261,7 @@ get_kaiser_tap (GstAudioResampler * resampler, gdouble x) #define CONVERT_TAPS(type, precision) \ G_STMT_START { \ - type *taps = t->taps = (type *) resampler->coeff + j * n_taps; \ + type *taps = t->taps = (type *) ((gint8*)resampler->coeff + j * resampler->cstride); \ gdouble multiplier = (1 << precision); \ gint i, j; \ gdouble offset, l_offset, h_offset; \ @@ -338,14 +343,16 @@ make_taps (GstAudioResampler * resampler, Tap * t, gint j) switch (resampler->format) { case GST_AUDIO_FORMAT_F64: { - gdouble *taps = t->taps = (gdouble *) resampler->coeff + j * n_taps; + gdouble *taps = t->taps = + (gdouble *) ((gint8 *) resampler->coeff + j * resampler->cstride); for (l = 0; l < n_taps; l++) taps[l] = tmpcoeff[l] / weight; break; } case GST_AUDIO_FORMAT_F32: { - gfloat *taps = t->taps = (gfloat *) resampler->coeff + j * n_taps; + gfloat *taps = t->taps = + (gfloat *) ((gint8 *) resampler->coeff + j * resampler->cstride); for (l = 0; l < n_taps; l++) taps[l] = tmpcoeff[l] / weight; break; @@ -593,28 +600,18 @@ static void deinterleave_copy (GstAudioResampler * resampler, gpointer sbuf[], gpointer in[], gsize in_frames) { - gsize samples_avail = resampler->samples_avail; - gint bpf = resampler->bpf; + guint c, blocks = resampler->blocks; + gsize bytes_avail, in_bytes, bpf; - if (in == NULL) - memset ((guint8 *) sbuf[0] + samples_avail * bpf, 0, in_frames * bpf); - else - memcpy ((guint8 *) sbuf[0] + samples_avail * bpf, in[0], in_frames * bpf); -} + bpf = resampler->bps * resampler->inc; + bytes_avail = resampler->samples_avail * bpf; + in_bytes = in_frames * bpf; -static void -deinterleave_copy_n (GstAudioResampler * resampler, gpointer sbuf[], - gpointer in[], gsize in_frames) -{ - guint c, channels = resampler->channels; - gsize samples_avail = resampler->samples_avail; - gint bps = resampler->bps; - - for (c = 0; c < channels; c++) { + for (c = 0; c < blocks; c++) { if (in == NULL) - memset ((guint8 *) sbuf[c] + samples_avail * bps, 0, in_frames * bps); + memset ((guint8 *) sbuf[c] + bytes_avail, 0, in_bytes); else - memcpy ((guint8 *) sbuf[c] + samples_avail * bps, in[c], in_frames * bps); + memcpy ((guint8 *) sbuf[c] + bytes_avail, in[c], in_bytes); } } @@ -623,10 +620,10 @@ deinterleave_copy_n (GstAudioResampler * resampler, gpointer sbuf[], static void \ mirror_ ##type (GstAudioResampler * resampler, gpointer sbuf[]) \ { \ - guint i, c, channels = resampler->channels; \ + guint i, c, blocks = resampler->blocks; \ gint si = resampler->n_taps / 2; \ gint n_taps = resampler->n_taps; \ - for (c = 0; c < channels; c++) { \ + for (c = 0; c < blocks; c++) { \ type *s = sbuf[c]; \ for (i = 0; i < si; i++) \ s[i] = -s[n_taps - i]; \ @@ -686,6 +683,7 @@ resampler_calculate_taps (GstAudioResampler * resampler) gint n_taps; gint out_rate; gint in_rate; + gboolean non_interleaved; switch (resampler->method) { case GST_AUDIO_RESAMPLER_METHOD_NEAREST: @@ -729,7 +727,12 @@ resampler_calculate_taps (GstAudioResampler * resampler) GST_LOG ("using n_taps %d cutoff %f", n_taps, resampler->cutoff); resampler->taps = g_realloc_n (resampler->taps, out_rate, sizeof (Tap)); - resampler->coeff = g_realloc_n (resampler->coeff, out_rate, bps * n_taps); + + resampler->cstride = GST_ROUND_UP_32 (bps * (n_taps + 16)); + g_free (resampler->coeffmem); + resampler->coeffmem = g_malloc0 (out_rate * resampler->cstride + 31); + resampler->coeff = MEM_ALIGN (resampler->coeffmem, 32); + resampler->tmpcoeff = g_realloc_n (resampler->tmpcoeff, n_taps, sizeof (gdouble)); @@ -743,13 +746,22 @@ resampler_calculate_taps (GstAudioResampler * resampler) t->next_phase = (j + in_rate) % out_rate; } + non_interleaved = + (resampler->flags & GST_AUDIO_RESAMPLER_FLAG_NON_INTERLEAVED); + + resampler->ostride = non_interleaved ? 1 : resampler->channels; + + /* we resample each channel separately */ resampler->blocks = resampler->channels; + resampler->inc = 1; + switch (resampler->format) { case GST_AUDIO_FORMAT_F64: - if (resampler->channels == 2 && n_taps >= 4) { + if (!non_interleaved && resampler->channels == 2 && n_taps >= 4) { resampler->resample = resample_gdouble_2; resampler->deinterleave = deinterleave_copy; resampler->blocks = 1; + resampler->inc = resampler->channels;; } else { resampler->resample = resample_gdouble_1; resampler->deinterleave = deinterleave_gdouble; @@ -767,10 +779,11 @@ resampler_calculate_taps (GstAudioResampler * resampler) resampler->mirror = mirror_gint32; break; case GST_AUDIO_FORMAT_S16: - if (resampler->channels == 2 && n_taps >= 4) { + if (!non_interleaved && resampler->channels == 2 && n_taps >= 4) { resampler->resample = resample_gint16_2; resampler->deinterleave = deinterleave_copy; resampler->blocks = 1; + resampler->inc = resampler->channels;; } else { resampler->resample = resample_gint16_1; resampler->deinterleave = deinterleave_gint16; @@ -780,12 +793,6 @@ resampler_calculate_taps (GstAudioResampler * resampler) default: break; } - if (resampler->flags & GST_AUDIO_RESAMPLER_FLAG_NON_INTERLEAVED) { - resampler->deinterleave = deinterleave_copy_n; - resampler->ostride = 1; - } else { - resampler->ostride = resampler->channels; - } } #define PRINT_TAPS(type,print) \ @@ -1017,7 +1024,7 @@ gst_audio_resampler_free (GstAudioResampler * resampler) g_return_if_fail (resampler != NULL); g_free (resampler->taps); - g_free (resampler->coeff); + g_free (resampler->coeffmem); g_free (resampler->tmpcoeff); g_free (resampler->samples); g_free (resampler->sbuf); @@ -1119,16 +1126,28 @@ static inline gpointer * get_sample_bufs (GstAudioResampler * resampler, gsize need) { if (resampler->samples_len < need) { - guint c, channels = resampler->channels; + guint c, blocks = resampler->blocks; + gsize bytes, bpf; + gint8 *ptr; + GST_LOG ("realloc %d -> %d", (gint) resampler->samples_len, (gint) need); + + bpf = resampler->bps * resampler->inc; + + bytes = (need + 8) * bpf; + bytes = GST_ROUND_UP_32 (bytes); + /* FIXME, move history */ - resampler->samples = g_realloc (resampler->samples, need * resampler->bpf); - resampler->samples_len = need; + resampler->samples = + g_realloc (resampler->samples, resampler->blocks * bytes + 31); + resampler->samples_len = bytes / bpf; + + ptr = MEM_ALIGN (resampler->samples, 32); + /* set up new pointers */ - for (c = 0; c < channels; c++) - resampler->sbuf[c] = - (gint8 *) resampler->samples + - (c * resampler->samples_len * resampler->bps); + for (c = 0; c < blocks; c++) { + resampler->sbuf[c] = ptr + (c * bytes); + } } return resampler->sbuf; } -- 2.7.4