#ifdef HAVE_EMMINTRIN_H
#include <emmintrin.h>
-#endif
-#ifdef HAVE_EMMINTRIN_H
static inline void
-inner_product_gint16_1_sse (gint16 * o, const gint16 * a, const gint16 * b, gint len)
+inner_product_gint16_1_sse2 (gint16 * o, const gint16 * a, const gint16 * b, gint len)
{
gint i = 0;
- gint32 res = 0;
- __m128i sum[2], ta, tb;
- __m128i t1[2];
+ __m128i sum, ta, tb;
- sum[0] = _mm_setzero_si128 ();
- sum[1] = _mm_setzero_si128 ();
+ sum = _mm_setzero_si128 ();
- for (; i < len - 7; i += 8) {
+ for (; i < len; i += 8) {
ta = _mm_loadu_si128 ((__m128i *) (a + i));
- tb = _mm_loadu_si128 ((__m128i *) (b + i));
+ tb = _mm_load_si128 ((__m128i *) (b + i));
- t1[0] = _mm_mullo_epi16 (ta, tb);
- t1[1] = _mm_mulhi_epi16 (ta, tb);
-
- sum[0] = _mm_add_epi32 (sum[0], _mm_unpacklo_epi16 (t1[0], t1[1]));
- sum[1] = _mm_add_epi32 (sum[1], _mm_unpackhi_epi16 (t1[0], t1[1]));
+ sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, tb));
}
- sum[0] = _mm_add_epi32 (sum[0], sum[1]);
- sum[0] =
- _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
+ sum =
+ _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2,
3)));
- sum[0] =
- _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
+ sum =
+ _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1,
1)));
- res = _mm_cvtsi128_si32 (sum[0]);
-
- for (; i < len; i++)
- res += (gint32) a[i] * (gint32) b[i];
- res = (res + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16;
- *o = CLAMP (res, -(1L << 15), (1L << 15) - 1);
+ sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
+ sum = _mm_srai_epi32 (sum, PRECISION_S16);
+ sum = _mm_packs_epi32 (sum, sum);
+ *o = _mm_extract_epi16 (sum, 0);
}
-#endif
-#ifdef HAVE_EMMINTRIN_H
static inline void
inner_product_gfloat_1_sse (gfloat * o, const gfloat * a, const gfloat * b, gint len)
{
gint i = 0;
- gfloat res;
__m128 sum = _mm_setzero_ps ();
- for (; i < len - 7; i += 8) {
+ for (; i < len; i += 8) {
sum =
_mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 0),
- _mm_loadu_ps (b + i + 0)));
+ _mm_load_ps (b + i + 0)));
sum =
_mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 4),
- _mm_loadu_ps (b + i + 4)));
+ _mm_load_ps (b + i + 4)));
}
sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum));
sum = _mm_add_ss (sum, _mm_shuffle_ps (sum, sum, 0x55));
- _mm_store_ss (&res, sum);
-
- for (; i < len; i++)
- res += a[i] * b[i];
-
- *o = res;
+ _mm_store_ss (o, sum);
}
-#endif
-#ifdef HAVE_EMMINTRIN_H
static inline void
-inner_product_gdouble_1_sse (gdouble * o, const gdouble * a, const gdouble * b,
+inner_product_gdouble_1_sse2 (gdouble * o, const gdouble * a, const gdouble * b,
gint len)
{
gint i = 0;
- gdouble res;
__m128d sum = _mm_setzero_pd ();
- for (; i < len - 7; i += 8) {
+ for (; i < len; i += 8) {
sum =
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0),
- _mm_loadu_pd (b + i + 0)));
+ _mm_load_pd (b + i + 0)));
sum =
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2),
- _mm_loadu_pd (b + i + 2)));
+ _mm_load_pd (b + i + 2)));
sum =
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4),
- _mm_loadu_pd (b + i + 4)));
+ _mm_load_pd (b + i + 4)));
sum =
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6),
- _mm_loadu_pd (b + i + 6)));
+ _mm_load_pd (b + i + 6)));
}
sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum));
- _mm_store_sd (&res, sum);
-
- for (; i < len; i++)
- res += a[i] * b[i];
-
- *o = res;
+ _mm_store_sd (o, sum);
}
-#endif
-#ifdef HAVE_EMMINTRIN_H
static inline void
-inner_product_gint16_2_sse (gint16 * o, const gint16 * a, const gint16 * b, gint len)
+inner_product_gint16_2_sse2 (gint16 * o, const gint16 * a, const gint16 * b, gint len)
{
gint i = 0;
- gint32 r[2];
- guint64 r64;
- __m128i sum[2], ta, tb;
- __m128i t1[2];
+ __m128i sum, ta, tb, t1;
- sum[0] = _mm_setzero_si128 ();
- sum[1] = _mm_setzero_si128 ();
+ sum = _mm_setzero_si128 ();
- for (; i < len - 7; i += 8) {
- tb = _mm_loadu_si128 ((__m128i *) (b + i));
-
- t1[1] = _mm_unpacklo_epi16 (tb, tb);
+ for (; i < len; i += 8) {
+ tb = _mm_load_si128 ((__m128i *) (b + i));
+ t1 = _mm_unpacklo_epi16 (tb, tb);
ta = _mm_loadu_si128 ((__m128i *) (a + 2 * i));
- t1[0] = _mm_mullo_epi16 (ta, t1[1]);
- t1[1] = _mm_mulhi_epi16 (ta, t1[1]);
-
- sum[0] = _mm_add_epi32 (sum[0], _mm_unpacklo_epi16 (t1[0], t1[1]));
- sum[1] = _mm_add_epi32 (sum[1], _mm_unpackhi_epi16 (t1[0], t1[1]));
- t1[1] = _mm_unpackhi_epi16 (tb, tb);
+ sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, t1));
+ t1 = _mm_unpackhi_epi16 (tb, tb);
ta = _mm_loadu_si128 ((__m128i *) (a + 2 * i + 8));
- t1[0] = _mm_mullo_epi16 (ta, t1[1]);
- t1[1] = _mm_mulhi_epi16 (ta, t1[1]);
- sum[0] = _mm_add_epi32 (sum[0], _mm_unpacklo_epi16 (t1[0], t1[1]));
- sum[1] = _mm_add_epi32 (sum[1], _mm_unpackhi_epi16 (t1[0], t1[1]));
+ sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, t1));
}
- sum[0] = _mm_add_epi32 (sum[0], sum[1]);
- sum[0] =
- _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
+ sum =
+ _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2,
3)));
- r64 = _mm_cvtsi128_si64 (sum[0]);
- r[0] = r64 >> 32;
- r[1] = r64 & 0xffffffff;
- for (; i < len; i++) {
- r[0] += (gint32) a[2 * i] * (gint32) b[i];
- r[1] += (gint32) a[2 * i + 1] * (gint32) b[i];
- }
- r[0] = (r[0] + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16;
- r[1] = (r[1] + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16;
- o[0] = CLAMP (r[0], -(1L << 15), (1L << 15) - 1);
- o[1] = CLAMP (r[1], -(1L << 15), (1L << 15) - 1);
+ sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
+ sum = _mm_srai_epi32 (sum, PRECISION_S16);
+ sum = _mm_packs_epi32 (sum, sum);
+ *(gint32*)o = _mm_cvtsi128_si32 (sum);
}
-#endif
-#ifdef HAVE_EMMINTRIN_H
static inline void
-inner_product_gdouble_2_sse (gdouble * o, const gdouble * a, const gdouble * b,
+inner_product_gdouble_2_sse2 (gdouble * o, const gdouble * a, const gdouble * b,
gint len)
{
gint i = 0;
- gdouble r[2];
__m128d sum = _mm_setzero_pd (), t;
- for (; i < len - 3; i += 4) {
- t = _mm_loadu_pd (b + i);
+ for (; i < len; i += 4) {
+ t = _mm_load_pd (b + i);
sum =
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i),
_mm_unpacklo_pd (t, t)));
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 2),
_mm_unpackhi_pd (t, t)));
- t = _mm_loadu_pd (b + i + 2);
+ t = _mm_load_pd (b + i + 2);
sum =
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 4),
_mm_unpacklo_pd (t, t)));
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 6),
_mm_unpackhi_pd (t, t)));
}
- _mm_store_pd (r, sum);
-
- for (; i < len; i++) {
- r[0] += a[2 * i] * b[i];
- r[1] += a[2 * i + 1] * b[i];
- }
- o[0] = r[0];
- o[1] = r[1];
+ _mm_store_pd (o, sum);
}
-#endif
-#ifdef HAVE_EMMINTRIN_H
-MAKE_RESAMPLE_FUNC (gint16, 1, sse);
+MAKE_RESAMPLE_FUNC (gint16, 1, sse2);
MAKE_RESAMPLE_FUNC (gfloat, 1, sse);
-MAKE_RESAMPLE_FUNC (gdouble, 1, sse);
-MAKE_RESAMPLE_FUNC (gint16, 2, sse);
-MAKE_RESAMPLE_FUNC (gdouble, 2, sse);
+MAKE_RESAMPLE_FUNC (gdouble, 1, sse2);
+MAKE_RESAMPLE_FUNC (gint16, 2, sse2);
+MAKE_RESAMPLE_FUNC (gdouble, 2, sse2);
#endif
static void
audio_resampler_check_x86 (const gchar *option)
{
- if (!strcmp (option, "sse2")) {
- GST_DEBUG ("enable SSE2 optimisations");
- resample_gint16_1 = resample_gint16_1_sse;
+#ifdef HAVE_EMMINTRIN_H
+ if (!strcmp (option, "sse")) {
+ GST_DEBUG ("enable SSE optimisations");
resample_gfloat_1 = resample_gfloat_1_sse;
- resample_gdouble_1 = resample_gdouble_1_sse;
- resample_gint16_2 = resample_gint16_2_sse;
- resample_gdouble_2 = resample_gdouble_2_sse;
+ } else if (!strcmp (option, "sse2")) {
+ GST_DEBUG ("enable SSE2 optimisations");
+ resample_gint16_1 = resample_gint16_1_sse2;
+ resample_gdouble_1 = resample_gdouble_1_sse2;
+ resample_gint16_2 = resample_gint16_2_sse2;
+ resample_gdouble_2 = resample_gdouble_2_sse2;
}
+#endif
}
gpointer * sbuf, gpointer in[], gsize in_frames);
typedef void (*MirrorFunc) (GstAudioResampler * resampler, gpointer * sbuf);
+#define MEM_ALIGN(m,a) ((gint8 *)((guintptr)((gint8 *)(m) + ((a)-1)) & ~((a)-1)))
+
struct _GstAudioResampler
{
GstAudioResamplerMethod method;
guint n_taps;
Tap *taps;
gpointer coeff;
+ gpointer coeffmem;
+ gsize cstride;
gpointer tmpcoeff;
DeinterleaveFunc deinterleave;
ResampleFunc resample;
guint blocks;
+ guint inc;
gboolean filling;
gint samp_inc;
gint samp_frac;
#define CONVERT_TAPS(type, precision) \
G_STMT_START { \
- type *taps = t->taps = (type *) resampler->coeff + j * n_taps; \
+ type *taps = t->taps = (type *) ((gint8*)resampler->coeff + j * resampler->cstride); \
gdouble multiplier = (1 << precision); \
gint i, j; \
gdouble offset, l_offset, h_offset; \
switch (resampler->format) {
case GST_AUDIO_FORMAT_F64:
{
- gdouble *taps = t->taps = (gdouble *) resampler->coeff + j * n_taps;
+ gdouble *taps = t->taps =
+ (gdouble *) ((gint8 *) resampler->coeff + j * resampler->cstride);
for (l = 0; l < n_taps; l++)
taps[l] = tmpcoeff[l] / weight;
break;
}
case GST_AUDIO_FORMAT_F32:
{
- gfloat *taps = t->taps = (gfloat *) resampler->coeff + j * n_taps;
+ gfloat *taps = t->taps =
+ (gfloat *) ((gint8 *) resampler->coeff + j * resampler->cstride);
for (l = 0; l < n_taps; l++)
taps[l] = tmpcoeff[l] / weight;
break;
deinterleave_copy (GstAudioResampler * resampler, gpointer sbuf[],
gpointer in[], gsize in_frames)
{
- gsize samples_avail = resampler->samples_avail;
- gint bpf = resampler->bpf;
+ guint c, blocks = resampler->blocks;
+ gsize bytes_avail, in_bytes, bpf;
- if (in == NULL)
- memset ((guint8 *) sbuf[0] + samples_avail * bpf, 0, in_frames * bpf);
- else
- memcpy ((guint8 *) sbuf[0] + samples_avail * bpf, in[0], in_frames * bpf);
-}
+ bpf = resampler->bps * resampler->inc;
+ bytes_avail = resampler->samples_avail * bpf;
+ in_bytes = in_frames * bpf;
-static void
-deinterleave_copy_n (GstAudioResampler * resampler, gpointer sbuf[],
- gpointer in[], gsize in_frames)
-{
- guint c, channels = resampler->channels;
- gsize samples_avail = resampler->samples_avail;
- gint bps = resampler->bps;
-
- for (c = 0; c < channels; c++) {
+ for (c = 0; c < blocks; c++) {
if (in == NULL)
- memset ((guint8 *) sbuf[c] + samples_avail * bps, 0, in_frames * bps);
+ memset ((guint8 *) sbuf[c] + bytes_avail, 0, in_bytes);
else
- memcpy ((guint8 *) sbuf[c] + samples_avail * bps, in[c], in_frames * bps);
+ memcpy ((guint8 *) sbuf[c] + bytes_avail, in[c], in_bytes);
}
}
static void \
mirror_ ##type (GstAudioResampler * resampler, gpointer sbuf[]) \
{ \
- guint i, c, channels = resampler->channels; \
+ guint i, c, blocks = resampler->blocks; \
gint si = resampler->n_taps / 2; \
gint n_taps = resampler->n_taps; \
- for (c = 0; c < channels; c++) { \
+ for (c = 0; c < blocks; c++) { \
type *s = sbuf[c]; \
for (i = 0; i < si; i++) \
s[i] = -s[n_taps - i]; \
gint n_taps;
gint out_rate;
gint in_rate;
+ gboolean non_interleaved;
switch (resampler->method) {
case GST_AUDIO_RESAMPLER_METHOD_NEAREST:
GST_LOG ("using n_taps %d cutoff %f", n_taps, resampler->cutoff);
resampler->taps = g_realloc_n (resampler->taps, out_rate, sizeof (Tap));
- resampler->coeff = g_realloc_n (resampler->coeff, out_rate, bps * n_taps);
+
+ resampler->cstride = GST_ROUND_UP_32 (bps * (n_taps + 16));
+ g_free (resampler->coeffmem);
+ resampler->coeffmem = g_malloc0 (out_rate * resampler->cstride + 31);
+ resampler->coeff = MEM_ALIGN (resampler->coeffmem, 32);
+
resampler->tmpcoeff =
g_realloc_n (resampler->tmpcoeff, n_taps, sizeof (gdouble));
t->next_phase = (j + in_rate) % out_rate;
}
+ non_interleaved =
+ (resampler->flags & GST_AUDIO_RESAMPLER_FLAG_NON_INTERLEAVED);
+
+ resampler->ostride = non_interleaved ? 1 : resampler->channels;
+
+ /* we resample each channel separately */
resampler->blocks = resampler->channels;
+ resampler->inc = 1;
+
switch (resampler->format) {
case GST_AUDIO_FORMAT_F64:
- if (resampler->channels == 2 && n_taps >= 4) {
+ if (!non_interleaved && resampler->channels == 2 && n_taps >= 4) {
resampler->resample = resample_gdouble_2;
resampler->deinterleave = deinterleave_copy;
resampler->blocks = 1;
+ resampler->inc = resampler->channels;;
} else {
resampler->resample = resample_gdouble_1;
resampler->deinterleave = deinterleave_gdouble;
resampler->mirror = mirror_gint32;
break;
case GST_AUDIO_FORMAT_S16:
- if (resampler->channels == 2 && n_taps >= 4) {
+ if (!non_interleaved && resampler->channels == 2 && n_taps >= 4) {
resampler->resample = resample_gint16_2;
resampler->deinterleave = deinterleave_copy;
resampler->blocks = 1;
+ resampler->inc = resampler->channels;;
} else {
resampler->resample = resample_gint16_1;
resampler->deinterleave = deinterleave_gint16;
default:
break;
}
- if (resampler->flags & GST_AUDIO_RESAMPLER_FLAG_NON_INTERLEAVED) {
- resampler->deinterleave = deinterleave_copy_n;
- resampler->ostride = 1;
- } else {
- resampler->ostride = resampler->channels;
- }
}
#define PRINT_TAPS(type,print) \
g_return_if_fail (resampler != NULL);
g_free (resampler->taps);
- g_free (resampler->coeff);
+ g_free (resampler->coeffmem);
g_free (resampler->tmpcoeff);
g_free (resampler->samples);
g_free (resampler->sbuf);
get_sample_bufs (GstAudioResampler * resampler, gsize need)
{
if (resampler->samples_len < need) {
- guint c, channels = resampler->channels;
+ guint c, blocks = resampler->blocks;
+ gsize bytes, bpf;
+ gint8 *ptr;
+
GST_LOG ("realloc %d -> %d", (gint) resampler->samples_len, (gint) need);
+
+ bpf = resampler->bps * resampler->inc;
+
+ bytes = (need + 8) * bpf;
+ bytes = GST_ROUND_UP_32 (bytes);
+
/* FIXME, move history */
- resampler->samples = g_realloc (resampler->samples, need * resampler->bpf);
- resampler->samples_len = need;
+ resampler->samples =
+ g_realloc (resampler->samples, resampler->blocks * bytes + 31);
+ resampler->samples_len = bytes / bpf;
+
+ ptr = MEM_ALIGN (resampler->samples, 32);
+
/* set up new pointers */
- for (c = 0; c < channels; c++)
- resampler->sbuf[c] =
- (gint8 *) resampler->samples +
- (c * resampler->samples_len * resampler->bps);
+ for (c = 0; c < blocks; c++) {
+ resampler->sbuf[c] = ptr + (c * bytes);
+ }
}
return resampler->sbuf;
}