From f55a67ca7cbded75b383791c4d0ca52993524f55 Mon Sep 17 00:00:00 2001 From: Wim Taymans Date: Tue, 12 Jan 2016 18:55:19 +0100 Subject: [PATCH] audio-resampler: make pluggable optimized functions Add support for x86 specialized functions and select them at runtime. --- ...udio-resampler-core.h => audio-resampler-x86.h} | 216 ++++++++++----------- gst-libs/gst/audio/audio-resampler.c | 206 +++++++++++++++++--- 2 files changed, 278 insertions(+), 144 deletions(-) rename gst-libs/gst/audio/{audio-resampler-core.h => audio-resampler-x86.h} (84%) diff --git a/gst-libs/gst/audio/audio-resampler-core.h b/gst-libs/gst/audio/audio-resampler-x86.h similarity index 84% rename from gst-libs/gst/audio/audio-resampler-core.h rename to gst-libs/gst/audio/audio-resampler-x86.h index 1f69d1f..65a5194 100644 --- a/gst-libs/gst/audio/audio-resampler-core.h +++ b/gst-libs/gst/audio/audio-resampler-x86.h @@ -17,55 +17,55 @@ * Boston, MA 02110-1301, USA. */ - -#define PRECISION_S16 15 -#define PRECISION_S32 30 - #ifdef HAVE_EMMINTRIN_H #include <emmintrin.h> #endif +#ifdef HAVE_EMMINTRIN_H static inline void -inner_product_gdouble_1 (gdouble * o, const gdouble * a, const gdouble * b, - gint len) +inner_product_gint16_1_sse (gint16 * o, const gint16 * a, const gint16 * b, gint len) { gint i = 0; - gdouble res; -#ifdef HAVE_EMMINTRIN_H - __m128d sum = _mm_setzero_pd (); + gint32 res = 0; + __m128i sum[2], ta, tb; + __m128i t1[2]; + + sum[0] = _mm_setzero_si128 (); + sum[1] = _mm_setzero_si128 (); for (; i < len - 7; i += 8) { - sum = - _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0), - _mm_loadu_pd (b + i + 0))); - sum = - _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2), - _mm_loadu_pd (b + i + 2))); - sum = - _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4), - _mm_loadu_pd (b + i + 4))); - sum = - _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6), - _mm_loadu_pd (b + i + 6))); + ta = _mm_loadu_si128 ((__m128i *) (a + i)); + tb = _mm_loadu_si128 ((__m128i *) (b + i)); + + t1[0] = 
_mm_mullo_epi16 (ta, tb); + t1[1] = _mm_mulhi_epi16 (ta, tb); + + sum[0] = _mm_add_epi32 (sum[0], _mm_unpacklo_epi16 (t1[0], t1[1])); + sum[1] = _mm_add_epi32 (sum[1], _mm_unpackhi_epi16 (t1[0], t1[1])); } - sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum)); - _mm_store_sd (&res, sum); -#else - res = 0.0; -#endif + sum[0] = _mm_add_epi32 (sum[0], sum[1]); + sum[0] = + _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, + 3))); + sum[0] = + _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, + 1))); + res = _mm_cvtsi128_si32 (sum[0]); for (; i < len; i++) - res += a[i] * b[i]; + res += (gint32) a[i] * (gint32) b[i]; - *o = res; + res = (res + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16; + *o = CLAMP (res, -(1L << 15), (1L << 15) - 1); } +#endif +#ifdef HAVE_EMMINTRIN_H static inline void -inner_product_gfloat_1 (gfloat * o, const gfloat * a, const gfloat * b, gint len) +inner_product_gfloat_1_sse (gfloat * o, const gfloat * a, const gfloat * b, gint len) { gint i = 0; gfloat res; -#ifdef HAVE_EMMINTRIN_H __m128 sum = _mm_setzero_ps (); for (; i < len - 7; i += 8) { @@ -79,35 +79,54 @@ inner_product_gfloat_1 (gfloat * o, const gfloat * a, const gfloat * b, gint len sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum)); sum = _mm_add_ss (sum, _mm_shuffle_ps (sum, sum, 0x55)); _mm_store_ss (&res, sum); -#else - res = 0.0; -#endif for (; i < len; i++) res += a[i] * b[i]; *o = res; } +#endif +#ifdef HAVE_EMMINTRIN_H static inline void -inner_product_gint32_1 (gint32 * o, const gint32 * a, const gint32 * b, gint len) +inner_product_gdouble_1_sse (gdouble * o, const gdouble * a, const gdouble * b, + gint len) { gint i = 0; - gint64 res = 0; + gdouble res; + __m128d sum = _mm_setzero_pd (); + + for (; i < len - 7; i += 8) { + sum = + _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0), + _mm_loadu_pd (b + i + 0))); + sum = + _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2), + _mm_loadu_pd (b + i + 2))); + sum = + _mm_add_pd (sum, 
_mm_mul_pd (_mm_loadu_pd (a + i + 4), + _mm_loadu_pd (b + i + 4))); + sum = + _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6), + _mm_loadu_pd (b + i + 6))); + } + sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum)); + _mm_store_sd (&res, sum); for (; i < len; i++) - res += (gint64) a[i] * (gint64) b[i]; + res += a[i] * b[i]; - res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32; - *o = CLAMP (res, -(1L << 31), (1L << 31) - 1); + *o = res; } +#endif +#ifdef HAVE_EMMINTRIN_H static inline void -inner_product_gint16_1 (gint16 * o, const gint16 * a, const gint16 * b, gint len) +inner_product_gint16_2_sse (gint16 * o, const gint16 * a, const gint16 * b, gint len) { gint i = 0; - gint32 res = 0; -#ifdef HAVE_EMMINTRIN_H + gint32 r[2]; + guint64 r64; __m128i sum[2], ta, tb; __m128i t1[2]; @@ -115,11 +134,22 @@ inner_product_gint16_1 (gint16 * o, const gint16 * a, const gint16 * b, gint len sum[1] = _mm_setzero_si128 (); for (; i < len - 7; i += 8) { - ta = _mm_loadu_si128 ((__m128i *) (a + i)); tb = _mm_loadu_si128 ((__m128i *) (b + i)); - t1[0] = _mm_mullo_epi16 (ta, tb); - t1[1] = _mm_mulhi_epi16 (ta, tb); + t1[1] = _mm_unpacklo_epi16 (tb, tb); + + ta = _mm_loadu_si128 ((__m128i *) (a + 2 * i)); + t1[0] = _mm_mullo_epi16 (ta, t1[1]); + t1[1] = _mm_mulhi_epi16 (ta, t1[1]); + + sum[0] = _mm_add_epi32 (sum[0], _mm_unpacklo_epi16 (t1[0], t1[1])); + sum[1] = _mm_add_epi32 (sum[1], _mm_unpackhi_epi16 (t1[0], t1[1])); + + t1[1] = _mm_unpackhi_epi16 (tb, tb); + + ta = _mm_loadu_si128 ((__m128i *) (a + 2 * i + 8)); + t1[0] = _mm_mullo_epi16 (ta, t1[1]); + t1[1] = _mm_mulhi_epi16 (ta, t1[1]); sum[0] = _mm_add_epi32 (sum[0], _mm_unpacklo_epi16 (t1[0], t1[1])); sum[1] = _mm_add_epi32 (sum[1], _mm_unpackhi_epi16 (t1[0], t1[1])); @@ -128,28 +158,28 @@ inner_product_gint16_1 (gint16 * o, const gint16 * a, const gint16 * b, gint len sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, 3))); - sum[0] = - _mm_add_epi32 (sum[0], _mm_shuffle_epi32 
(sum[0], _MM_SHUFFLE (1, 1, 1, - 1))); - res = _mm_cvtsi128_si32 (sum[0]); -#else - res = 0; -#endif - - for (; i < len; i++) - res += (gint32) a[i] * (gint32) b[i]; + r64 = _mm_cvtsi128_si64 (sum[0]); + r[0] = r64 >> 32; + r[1] = r64 & 0xffffffff; - res = (res + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16; - *o = CLAMP (res, -(1L << 15), (1L << 15) - 1); + for (; i < len; i++) { + r[0] += (gint32) a[2 * i] * (gint32) b[i]; + r[1] += (gint32) a[2 * i + 1] * (gint32) b[i]; + } + r[0] = (r[0] + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16; + r[1] = (r[1] + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16; + o[0] = CLAMP (r[0], -(1L << 15), (1L << 15) - 1); + o[1] = CLAMP (r[1], -(1L << 15), (1L << 15) - 1); } +#endif +#ifdef HAVE_EMMINTRIN_H static inline void -inner_product_gdouble_2 (gdouble * o, const gdouble * a, const gdouble * b, +inner_product_gdouble_2_sse (gdouble * o, const gdouble * a, const gdouble * b, gint len) { gint i = 0; gdouble r[2]; -#ifdef HAVE_EMMINTRIN_H __m128d sum = _mm_setzero_pd (), t; for (; i < len - 3; i += 4) { @@ -170,10 +200,6 @@ inner_product_gdouble_2 (gdouble * o, const gdouble * a, const gdouble * b, _mm_unpackhi_pd (t, t))); } _mm_store_pd (r, sum); -#else - r[0] = 0.0; - r[1] = 0.0; -#endif for (; i < len; i++) { r[0] += a[2 * i] * b[i]; @@ -182,59 +208,25 @@ inner_product_gdouble_2 (gdouble * o, const gdouble * a, const gdouble * b, o[0] = r[0]; o[1] = r[1]; } +#endif -static inline void -inner_product_gint16_2 (gint16 * o, const gint16 * a, const gint16 * b, gint len) -{ - gint i = 0; - gint32 r[2]; #ifdef HAVE_EMMINTRIN_H - guint64 r64; - __m128i sum[2], ta, tb; - __m128i t1[2]; - - sum[0] = _mm_setzero_si128 (); - sum[1] = _mm_setzero_si128 (); - - for (; i < len - 7; i += 8) { - tb = _mm_loadu_si128 ((__m128i *) (b + i)); - - t1[1] = _mm_unpacklo_epi16 (tb, tb); - - ta = _mm_loadu_si128 ((__m128i *) (a + 2 * i)); - t1[0] = _mm_mullo_epi16 (ta, t1[1]); - t1[1] = _mm_mulhi_epi16 (ta, t1[1]); - - sum[0] = _mm_add_epi32 (sum[0], 
_mm_unpacklo_epi16 (t1[0], t1[1])); - sum[1] = _mm_add_epi32 (sum[1], _mm_unpackhi_epi16 (t1[0], t1[1])); - - t1[1] = _mm_unpackhi_epi16 (tb, tb); - - ta = _mm_loadu_si128 ((__m128i *) (a + 2 * i + 8)); - t1[0] = _mm_mullo_epi16 (ta, t1[1]); - t1[1] = _mm_mulhi_epi16 (ta, t1[1]); - - sum[0] = _mm_add_epi32 (sum[0], _mm_unpacklo_epi16 (t1[0], t1[1])); - sum[1] = _mm_add_epi32 (sum[1], _mm_unpackhi_epi16 (t1[0], t1[1])); - } - sum[0] = _mm_add_epi32 (sum[0], sum[1]); - sum[0] = - _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, - 3))); - r64 = _mm_cvtsi128_si64 (sum[0]); - r[0] = r64 >> 32; - r[1] = r64 & 0xffffffff; -#else - r[0] = 0; - r[1] = 0; +MAKE_RESAMPLE_FUNC (gint16, 1, sse); +MAKE_RESAMPLE_FUNC (gfloat, 1, sse); +MAKE_RESAMPLE_FUNC (gdouble, 1, sse); +MAKE_RESAMPLE_FUNC (gint16, 2, sse); +MAKE_RESAMPLE_FUNC (gdouble, 2, sse); #endif - for (; i < len; i++) { - r[0] += (gint32) a[2 * i] * (gint32) b[i]; - r[1] += (gint32) a[2 * i + 1] * (gint32) b[i]; +static void +audio_resampler_check_x86 (const gchar *option) +{ + if (!strcmp (option, "sse2")) { + GST_DEBUG ("enable SSE2 optimisations"); + resample_gint16_1 = resample_gint16_1_sse; + resample_gfloat_1 = resample_gfloat_1_sse; + resample_gdouble_1 = resample_gdouble_1_sse; + resample_gint16_2 = resample_gint16_2_sse; + resample_gdouble_2 = resample_gdouble_2_sse; } - r[0] = (r[0] + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16; - r[1] = (r[1] + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16; - o[0] = CLAMP (r[0], -(1L << 15), (1L << 15) - 1); - o[1] = CLAMP (r[1], -(1L << 15), (1L << 15) - 1); } diff --git a/gst-libs/gst/audio/audio-resampler.c b/gst-libs/gst/audio/audio-resampler.c index f62f43c..e07f8d7 100644 --- a/gst-libs/gst/audio/audio-resampler.c +++ b/gst-libs/gst/audio/audio-resampler.c @@ -25,6 +25,10 @@ #include #include +#ifdef HAVE_ORC +#include <orc/orc.h> +#endif + #include "audio-resampler.h" typedef struct _Tap @@ -84,27 +88,8 @@ struct _GstAudioResampler gpointer *sbuf; }; 
-#ifndef GST_DISABLE_GST_DEBUG -#define GST_CAT_DEFAULT ensure_debug_category() -static GstDebugCategory * -ensure_debug_category (void) -{ - static gsize cat_gonce = 0; - - if (g_once_init_enter (&cat_gonce)) { - gsize cat_done; - - cat_done = (gsize) _gst_debug_category_new ("audio-resampler", 0, - "audio-resampler object"); - - g_once_init_leave (&cat_gonce, cat_done); - } - - return (GstDebugCategory *) cat_gonce; -} -#else -#define ensure_debug_category() /* NOOP */ -#endif /* GST_DISABLE_GST_DEBUG */ +GST_DEBUG_CATEGORY_STATIC (audio_resampler_debug); +#define GST_CAT_DEFAULT audio_resampler_debug /** * SECTION:gstaudioresampler @@ -305,7 +290,8 @@ G_STMT_START { \ GST_WARNING ("can't find exact taps"); \ } G_STMT_END -#include "audio-resampler-core.h" +#define PRECISION_S16 15 +#define PRECISION_S32 30 static void make_taps (GstAudioResampler * resampler, Tap * t, gint j) @@ -375,10 +361,98 @@ make_taps (GstAudioResampler * resampler, Tap * t, gint j) } } -#define MAKE_RESAMPLE_FUNC(type,channels) \ +static inline void +inner_product_gint16_1_c (gint16 * o, const gint16 * a, const gint16 * b, + gint len) +{ + gint i; + gint32 res = 0; + + for (i = 0; i < len; i++) + res += (gint32) a[i] * (gint32) b[i]; + + res = (res + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16; + *o = CLAMP (res, -(1L << 15), (1L << 15) - 1); +} + +static inline void +inner_product_gint16_2_c (gint16 * o, const gint16 * a, const gint16 * b, + gint len) +{ + gint i; + gint32 r[2] = { 0, 0 }; + + for (i = 0; i < len; i++) { + r[0] += (gint32) a[2 * i] * (gint32) b[i]; + r[1] += (gint32) a[2 * i + 1] * (gint32) b[i]; + } + r[0] = (r[0] + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16; + r[1] = (r[1] + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16; + o[0] = CLAMP (r[0], -(1L << 15), (1L << 15) - 1); + o[1] = CLAMP (r[1], -(1L << 15), (1L << 15) - 1); +} + + +static inline void +inner_product_gint32_1_c (gint32 * o, const gint32 * a, const gint32 * b, + gint len) +{ + gint i; + gint64 res = 0; + 
+ for (i = 0; i < len; i++) + res += (gint64) a[i] * (gint64) b[i]; + + res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32; + *o = CLAMP (res, -(1L << 31), (1L << 31) - 1); +} + +static inline void +inner_product_gfloat_1_c (gfloat * o, const gfloat * a, const gfloat * b, + gint len) +{ + gint i; + gfloat res = 0.0; + + for (i = 0; i < len; i++) + res += a[i] * b[i]; + + *o = res; +} + +static inline void +inner_product_gdouble_1_c (gdouble * o, const gdouble * a, const gdouble * b, + gint len) +{ + gint i; + gdouble res = 0.0; + + for (i = 0; i < len; i++) + res += a[i] * b[i]; + + *o = res; +} + +static inline void +inner_product_gdouble_2_c (gdouble * o, const gdouble * a, const gdouble * b, + gint len) +{ + gint i; + gdouble r[2] = { 0.0, 0.0 }; + + for (i = 0; i < len; i++) { + r[0] += a[2 * i] * b[i]; + r[1] += a[2 * i + 1] * b[i]; + } + o[0] = r[0]; + o[1] = r[1]; +} + +#define MAKE_RESAMPLE_FUNC(type,channels,arch) \ static void \ -resample_ ##type## _ ##channels (GstAudioResampler * resampler, gpointer in[], gsize in_len, \ - gpointer out[], gsize out_len, gsize * consumed, gboolean move) \ +resample_ ##type## _ ##channels## _ ##arch (GstAudioResampler * resampler, \ + gpointer in[], gsize in_len, gpointer out[], gsize out_len, \ + gsize * consumed, gboolean move) \ { \ gint c, di = 0; \ gint n_taps = resampler->n_taps; \ @@ -401,7 +475,7 @@ resample_ ##type## _ ##channels (GstAudioResampler * resampler, gpointer in[], g if (t->taps == NULL) \ make_taps (resampler, t, samp_phase); \ \ - inner_product_ ##type## _##channels (op, ipp, t->taps, n_taps); \ + inner_product_ ##type## _##channels##_##arch (op, ipp, t->taps, n_taps); \ op += ostride; \ \ samp_phase = t->next_phase; \ @@ -417,12 +491,78 @@ resample_ ##type## _ ##channels (GstAudioResampler * resampler, gpointer in[], g resampler->samp_phase = samp_phase; \ } -MAKE_RESAMPLE_FUNC (gdouble, 1); -MAKE_RESAMPLE_FUNC (gfloat, 1); -MAKE_RESAMPLE_FUNC (gint32, 1); -MAKE_RESAMPLE_FUNC (gint16, 1); 
-MAKE_RESAMPLE_FUNC (gdouble, 2); -MAKE_RESAMPLE_FUNC (gint16, 2); +MAKE_RESAMPLE_FUNC (gint16, 1, c); +MAKE_RESAMPLE_FUNC (gint32, 1, c); +MAKE_RESAMPLE_FUNC (gfloat, 1, c); +MAKE_RESAMPLE_FUNC (gdouble, 1, c); +MAKE_RESAMPLE_FUNC (gint16, 2, c); +MAKE_RESAMPLE_FUNC (gdouble, 2, c); + +typedef void (*ResampleFunc) (GstAudioResampler * resampler, + gpointer in[], gsize in_len, gpointer out[], gsize out_len, + gsize * consumed, gboolean move); + +static ResampleFunc resample_funcs[] = { + resample_gint16_1_c, + resample_gint32_1_c, + resample_gfloat_1_c, + resample_gdouble_1_c, + resample_gint16_2_c, + resample_gdouble_2_c, +}; + +#define resample_gint16_1 resample_funcs[0] +#define resample_gint32_1 resample_funcs[1] +#define resample_gfloat_1 resample_funcs[2] +#define resample_gdouble_1 resample_funcs[3] +#define resample_gint16_2 resample_funcs[4] +#define resample_gdouble_2 resample_funcs[5] + +#if defined HAVE_ORC && !defined DISABLE_ORC +# if defined (__i386__) || defined (__x86_64__) +# define CHECK_X86 +# include "audio-resampler-x86.h" +# endif +#endif + +static void +audio_resampler_init (void) +{ + static gsize init_gonce = 0; + + if (g_once_init_enter (&init_gonce)) { + + GST_DEBUG_CATEGORY_INIT (audio_resampler_debug, "audio-resampler", 0, + "audio-resampler object"); + +#if defined HAVE_ORC && !defined DISABLE_ORC + orc_init (); + { + OrcTarget *target = orc_target_get_default (); + gint i; + + if (target) { + unsigned int flags = orc_target_get_default_flags (target); + const gchar *name; + + name = orc_target_get_name (target); + GST_DEBUG ("target %s, default flags %08x", name, flags); + + for (i = 0; i < 32; ++i) { + if (flags & (1U << i)) { + name = orc_target_get_flag_name (target, i); + GST_DEBUG ("target flag %s", name); +#ifdef CHECK_X86 + audio_resampler_check_x86 (name); +#endif + } + } + } + } +#endif + g_once_init_leave (&init_gonce, 1); + } +} #define MAKE_DEINTERLEAVE_FUNC(type) \ static void \ @@ -790,6 +930,8 @@ 
gst_audio_resampler_new (GstAudioResamplerMethod method, g_return_val_if_fail (in_rate != 0, FALSE); g_return_val_if_fail (out_rate != 0, FALSE); + audio_resampler_init (); + resampler = g_slice_new0 (GstAudioResampler); resampler->method = method; resampler->flags = flags; -- 2.7.4