/* GStreamer
 * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */

#if defined (HAVE_XMMINTRIN_H) && defined(__SSE__)
#include <xmmintrin.h>
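
/* Single precision (gfloat) inner products and coefficient interpolation
 * using SSE. Throughout this file the sample history `a` may be unaligned
 * (_mm_loadu_*) while the coefficient rows `b`/`c[]` are assumed to be
 * 16-byte aligned (_mm_load_*). */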

static inline void
inner_product_gfloat_full_1_sse (gfloat * o, const gfloat * a,
    const gfloat * b, gint len, const gfloat * icoeff, gint bstride)
{
  gint i = 0;
  __m128 sum = _mm_setzero_ps ();

  for (; i < len; i += 8) {
    sum = _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 0),
            _mm_load_ps (b + i + 0)));
    sum = _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 4),
            _mm_load_ps (b + i + 4)));
  }
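  /* Horizontal sum: movehl folds the upper float pair onto the lower pair,
   * the 0x55 shuffle then brings element 1 down to element 0, and add_ss
   * leaves the total in the lowest lane. */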
  sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum));
  sum = _mm_add_ss (sum, _mm_shuffle_ps (sum, sum, 0x55));
  _mm_store_ss (o, sum);
}
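
/* Scalar semantics of the function above, for illustration:
 *   gfloat sum = 0.0f;
 *   for (i = 0; i < len; i++)
 *     sum += a[i] * b[i];
 *   *o = sum;
 */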

static inline void
inner_product_gfloat_linear_1_sse (gfloat * o, const gfloat * a,
    const gfloat * b, gint len, const gfloat * icoeff, gint bstride)
{
  gint i = 0;
  __m128 sum[2], t;
  const gfloat *c[2] = {(gfloat*)((gint8*)b + 0*bstride),
                        (gfloat*)((gint8*)b + 1*bstride)};

  sum[0] = sum[1] = _mm_setzero_ps ();

  for (; i < len; i += 8) {
    t = _mm_loadu_ps (a + i + 0);
    sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i + 0)));
    sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i + 0)));
    t = _mm_loadu_ps (a + i + 4);
    sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i + 4)));
    sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i + 4)));
  }
  sum[0] = _mm_mul_ps (_mm_sub_ps (sum[0], sum[1]), _mm_load1_ps (icoeff));
  sum[0] = _mm_add_ps (sum[0], sum[1]);
  sum[0] = _mm_add_ps (sum[0], _mm_movehl_ps (sum[0], sum[0]));
  sum[0] = _mm_add_ss (sum[0], _mm_shuffle_ps (sum[0], sum[0], 0x55));
  _mm_store_ss (o, sum[0]);
}
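
/* Scalar semantics, with x = icoeff[0] and dot(u,v) the plain inner
 * product over len taps:
 *   *o = x * dot (a, c[0]) + (1 - x) * dot (a, c[1]);
 */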

static inline void
inner_product_gfloat_cubic_1_sse (gfloat * o, const gfloat * a,
    const gfloat * b, gint len, const gfloat * icoeff, gint bstride)
{
  gint i = 0;
  __m128 sum[4];
  __m128 t, f = _mm_loadu_ps (icoeff);
  const gfloat *c[4] = {(gfloat*)((gint8*)b + 0*bstride),
                        (gfloat*)((gint8*)b + 1*bstride),
                        (gfloat*)((gint8*)b + 2*bstride),
                        (gfloat*)((gint8*)b + 3*bstride)};

  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_ps ();

  for (; i < len; i += 4) {
    t = _mm_loadu_ps (a + i);
    sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i)));
    sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i)));
    sum[2] = _mm_add_ps (sum[2], _mm_mul_ps (t, _mm_load_ps (c[2] + i)));
    sum[3] = _mm_add_ps (sum[3], _mm_mul_ps (t, _mm_load_ps (c[3] + i)));
  }
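  /* Weight each phase accumulator with its interpolation coefficient,
   * broadcast from f = (icoeff[0], icoeff[1], icoeff[2], icoeff[3]). */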
  sum[0] = _mm_mul_ps (sum[0], _mm_shuffle_ps (f, f, 0x00));
  sum[1] = _mm_mul_ps (sum[1], _mm_shuffle_ps (f, f, 0x55));
  sum[2] = _mm_mul_ps (sum[2], _mm_shuffle_ps (f, f, 0xaa));
  sum[3] = _mm_mul_ps (sum[3], _mm_shuffle_ps (f, f, 0xff));
  sum[0] = _mm_add_ps (sum[0], sum[1]);
  sum[2] = _mm_add_ps (sum[2], sum[3]);
  sum[0] = _mm_add_ps (sum[0], sum[2]);
  sum[0] = _mm_add_ps (sum[0], _mm_movehl_ps (sum[0], sum[0]));
  sum[0] = _mm_add_ss (sum[0], _mm_shuffle_ps (sum[0], sum[0], 0x55));
  _mm_store_ss (o, sum[0]);
}
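
/* Scalar semantics:
 *   *o = icoeff[0] * dot (a, c[0]) + icoeff[1] * dot (a, c[1])
 *      + icoeff[2] * dot (a, c[2]) + icoeff[3] * dot (a, c[3]);
 */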

MAKE_RESAMPLE_FUNC (gfloat, full, 1, sse);
MAKE_RESAMPLE_FUNC (gfloat, linear, 1, sse);
MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, sse);
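
/* MAKE_RESAMPLE_FUNC is supplied by the file that includes this one; it is
 * expected to wrap the 1-channel inner products above into the
 * resample_<type>_<method>_<channels>_<arch> entry points that
 * audio_resampler_check_x86() installs below.
 *
 * The interpolate_<type>_<method>_<arch> functions that follow blend `len`
 * taps from neighbouring coefficient phases (rows of `ap`, astride bytes
 * apart) into one output row, using the weights passed in `icp`. */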

static inline void
interpolate_gfloat_linear_sse (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i;
  gfloat *o = op, *a = ap, *ic = icp;
  __m128 f[2], t1, t2;
  const gfloat *c[2] = {(gfloat*)((gint8*)a + 0*astride),
                        (gfloat*)((gint8*)a + 1*astride)};

  f[0] = _mm_load1_ps (ic+0);
  f[1] = _mm_load1_ps (ic+1);

  for (i = 0; i < len; i += 8) {
    t1 = _mm_mul_ps (_mm_load_ps (c[0] + i + 0), f[0]);
    t2 = _mm_mul_ps (_mm_load_ps (c[1] + i + 0), f[1]);
    _mm_store_ps (o + i + 0, _mm_add_ps (t1, t2));

    t1 = _mm_mul_ps (_mm_load_ps (c[0] + i + 4), f[0]);
    t2 = _mm_mul_ps (_mm_load_ps (c[1] + i + 4), f[1]);
    _mm_store_ps (o + i + 4, _mm_add_ps (t1, t2));
  }
}

static inline void
interpolate_gfloat_cubic_sse (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i;
  gfloat *o = op, *a = ap, *ic = icp;
  __m128 f[4], t[4];
  const gfloat *c[4] = {(gfloat*)((gint8*)a + 0*astride),
                        (gfloat*)((gint8*)a + 1*astride),
                        (gfloat*)((gint8*)a + 2*astride),
                        (gfloat*)((gint8*)a + 3*astride)};

  f[0] = _mm_load1_ps (ic+0);
  f[1] = _mm_load1_ps (ic+1);
  f[2] = _mm_load1_ps (ic+2);
  f[3] = _mm_load1_ps (ic+3);

  for (i = 0; i < len; i += 4) {
    t[0] = _mm_mul_ps (_mm_load_ps (c[0] + i + 0), f[0]);
    t[1] = _mm_mul_ps (_mm_load_ps (c[1] + i + 0), f[1]);
    t[2] = _mm_mul_ps (_mm_load_ps (c[2] + i + 0), f[2]);
    t[3] = _mm_mul_ps (_mm_load_ps (c[3] + i + 0), f[3]);
    t[0] = _mm_add_ps (t[0], t[1]);
    t[2] = _mm_add_ps (t[2], t[3]);
    _mm_store_ps (o + i + 0, _mm_add_ps (t[0], t[2]));
  }
}

#endif

#if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
#include <emmintrin.h>
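
/* SSE2: 16-bit integer (gint16) and double precision (gdouble) variants.
 * The gint16 paths accumulate 16x16->32-bit products and shift the result
 * back by PRECISION_S16, the fixed-point precision of the coefficients. */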

static inline void
inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i;
  __m128i sum, t;

  sum = _mm_setzero_si128 ();

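  /* _mm_madd_epi16 multiplies eight pairs of 16-bit samples and
   * coefficients and adds adjacent products, yielding four 32-bit partial
   * sums per instruction. */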
  for (i = 0; i < len; i += 16) {
    t = _mm_loadu_si128 ((__m128i *) (a + i));
    sum = _mm_add_epi32 (sum, _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (b + i + 0))));

    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
    sum = _mm_add_epi32 (sum, _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (b + i + 8))));
  }
  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3)));
  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1)));

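  /* Round to nearest: add half of one output LSB before the arithmetic
   * shift back by PRECISION_S16, then pack to 16 bits with signed
   * saturation. */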
  sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum = _mm_srai_epi32 (sum, PRECISION_S16);
  sum = _mm_packs_epi32 (sum, sum);
  *o = _mm_extract_epi16 (sum, 0);
}

static inline void
inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[2], t;
  __m128i f = _mm_set_epi64x (0, *((gint64*)icoeff));
  const gint16 *c[2] = {(gint16*)((gint8*)b + 0*bstride),
                        (gint16*)((gint8*)b + 1*bstride)};

  sum[0] = sum[1] = _mm_setzero_si128 ();
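  /* Widen the 16-bit interpolation coefficients to 32 bits by interleaving
   * them with zeros (sum[0] is still all-zero at this point). */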
  f = _mm_unpacklo_epi16 (f, sum[0]);

  for (; i < len; i += 16) {
    t = _mm_loadu_si128 ((__m128i *) (a + i + 0));
    sum[0] = _mm_add_epi32 (sum[0], _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[0] + i + 0))));
    sum[1] = _mm_add_epi32 (sum[1], _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[1] + i + 0))));

    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
    sum[0] = _mm_add_epi32 (sum[0], _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[0] + i + 8))));
    sum[1] = _mm_add_epi32 (sum[1], _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[1] + i + 8))));
  }
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[1] = _mm_srai_epi32 (sum[1], PRECISION_S16);

  sum[0] = _mm_madd_epi16 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
  sum[1] = _mm_madd_epi16 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
  sum[0] = _mm_add_epi32 (sum[0], sum[1]);

  sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, 3)));
  sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, 1)));

  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}

static inline void
inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[4], t[4];
  __m128i f = _mm_set_epi64x (0, *((gint64*)icoeff));
  const gint16 *c[4] = {(gint16*)((gint8*)b + 0*bstride),
                        (gint16*)((gint8*)b + 1*bstride),
                        (gint16*)((gint8*)b + 2*bstride),
                        (gint16*)((gint8*)b + 3*bstride)};

  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
  f = _mm_unpacklo_epi16 (f, sum[0]);

  for (; i < len; i += 8) {
    t[0] = _mm_loadu_si128 ((__m128i *) (a + i));
    sum[0] = _mm_add_epi32 (sum[0], _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[0] + i))));
    sum[1] = _mm_add_epi32 (sum[1], _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[1] + i))));
    sum[2] = _mm_add_epi32 (sum[2], _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[2] + i))));
    sum[3] = _mm_add_epi32 (sum[3], _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[3] + i))));
  }
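  /* Transpose the four accumulators with unpack operations so that each
   * 32-bit lane of sum[0] holds the complete partial sum of one filter
   * phase. */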
  t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]);
  t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]);
  t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]);
  t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]);

  sum[0] = _mm_add_epi32 (_mm_unpacklo_epi64 (t[0], t[1]), _mm_unpackhi_epi64 (t[0], t[1]));
  sum[2] = _mm_add_epi32 (_mm_unpacklo_epi64 (t[2], t[3]), _mm_unpackhi_epi64 (t[2], t[3]));
  sum[0] = _mm_add_epi32 (sum[0], sum[2]);

  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_madd_epi16 (sum[0], f);

  sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, 3)));
  sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, 1)));

  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}

static inline void
inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  gint i = 0;
  __m128d sum = _mm_setzero_pd ();

  for (; i < len; i += 8) {
    sum = _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0),
            _mm_load_pd (b + i + 0)));
    sum = _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2),
            _mm_load_pd (b + i + 2)));
    sum = _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4),
            _mm_load_pd (b + i + 4)));
    sum = _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6),
            _mm_load_pd (b + i + 6)));
  }
  sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum));
  _mm_store_sd (o, sum);
}

static inline void
inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  gint i = 0;
  __m128d sum[2], t;
  const gdouble *c[2] = {(gdouble*)((gint8*)b + 0*bstride),
                         (gdouble*)((gint8*)b + 1*bstride)};

  sum[0] = sum[1] = _mm_setzero_pd ();

  for (; i < len; i += 4) {
    t = _mm_loadu_pd (a + i + 0);
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 0)));
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 0)));
    t = _mm_loadu_pd (a + i + 2);
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 2)));
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 2)));
  }
  sum[0] = _mm_mul_pd (_mm_sub_pd (sum[0], sum[1]), _mm_load1_pd (icoeff));
  sum[0] = _mm_add_pd (sum[0], sum[1]);
  sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
  _mm_store_sd (o, sum[0]);
}

static inline void
inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  gint i;
  __m128d f[2], sum[4], t;
  const gdouble *c[4] = {(gdouble*)((gint8*)b + 0*bstride),
                         (gdouble*)((gint8*)b + 1*bstride),
                         (gdouble*)((gint8*)b + 2*bstride),
                         (gdouble*)((gint8*)b + 3*bstride)};

  f[0] = _mm_loadu_pd (icoeff + 0);
  f[1] = _mm_loadu_pd (icoeff + 2);
  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_pd ();

  for (i = 0; i < len; i += 2) {
    t = _mm_loadu_pd (a + i + 0);
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i)));
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i)));
    sum[2] = _mm_add_pd (sum[2], _mm_mul_pd (t, _mm_load_pd (c[2] + i)));
    sum[3] = _mm_add_pd (sum[3], _mm_mul_pd (t, _mm_load_pd (c[3] + i)));
  }
  sum[0] = _mm_mul_pd (sum[0], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (0, 0)));
  sum[1] = _mm_mul_pd (sum[1], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (1, 1)));
  sum[2] = _mm_mul_pd (sum[2], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (0, 0)));
  sum[3] = _mm_mul_pd (sum[3], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (1, 1)));
  sum[0] = _mm_add_pd (sum[0], sum[1]);
  sum[2] = _mm_add_pd (sum[2], sum[3]);
  sum[0] = _mm_add_pd (sum[0], sum[2]);
  sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
  _mm_store_sd (o, sum[0]);
}

MAKE_RESAMPLE_FUNC (gint16, full, 1, sse2);
MAKE_RESAMPLE_FUNC (gint16, linear, 1, sse2);
MAKE_RESAMPLE_FUNC (gint16, cubic, 1, sse2);

MAKE_RESAMPLE_FUNC (gdouble, full, 1, sse2);
MAKE_RESAMPLE_FUNC (gdouble, linear, 1, sse2);
MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, sse2);

static inline void
interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i = 0;
  gint16 *o = op, *a = ap, *ic = icp;
  __m128i ta, tb, t1, t2;
  __m128i f = _mm_set_epi64x (0, *((gint64*)ic));
  const gint16 *c[2] = {(gint16*)((gint8*)a + 0*astride),
                        (gint16*)((gint8*)a + 1*astride)};

  f = _mm_unpacklo_epi32 (f, f);
  f = _mm_unpacklo_epi64 (f, f);
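  /* f now holds the 16-bit pair (ic[0], ic[1]) replicated in every 32-bit
   * lane, so each madd below computes c0[j]*ic[0] + c1[j]*ic[1] for the
   * interleaved taps. */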
  for (; i < len; i += 8) {
    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
    tb = _mm_load_si128 ((__m128i *) (c[1] + i));

    t1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f);
    t2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f);

    t1 = _mm_add_epi32 (t1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    t2 = _mm_add_epi32 (t2, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));

    t1 = _mm_srai_epi32 (t1, PRECISION_S16);
    t2 = _mm_srai_epi32 (t2, PRECISION_S16);

    t1 = _mm_packs_epi32 (t1, t2);
    _mm_store_si128 ((__m128i *) (o + i), t1);
  }
}

static inline void
interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i = 0;
  gint16 *o = op, *a = ap, *ic = icp;
  __m128i ta, tb, tl1, tl2, th1, th2;
  __m128i f[2];
  const gint16 *c[4] = {(gint16*)((gint8*)a + 0*astride),
                        (gint16*)((gint8*)a + 1*astride),
                        (gint16*)((gint8*)a + 2*astride),
                        (gint16*)((gint8*)a + 3*astride)};

  f[0] = _mm_set_epi16 (ic[1], ic[0], ic[1], ic[0], ic[1], ic[0], ic[1], ic[0]);
  f[1] = _mm_set_epi16 (ic[3], ic[2], ic[3], ic[2], ic[3], ic[2], ic[3], ic[2]);

  for (; i < len; i += 8) {
    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
    tb = _mm_load_si128 ((__m128i *) (c[1] + i));

    tl1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[0]);
    th1 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[0]);

    ta = _mm_load_si128 ((__m128i *) (c[2] + i));
    tb = _mm_load_si128 ((__m128i *) (c[3] + i));

    tl2 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[1]);
    th2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[1]);

    tl1 = _mm_add_epi32 (tl1, tl2);
    th1 = _mm_add_epi32 (th1, th2);

    tl1 = _mm_add_epi32 (tl1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    th1 = _mm_add_epi32 (th1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));

    tl1 = _mm_srai_epi32 (tl1, PRECISION_S16);
    th1 = _mm_srai_epi32 (th1, PRECISION_S16);

    tl1 = _mm_packs_epi32 (tl1, th1);
    _mm_store_si128 ((__m128i *) (o + i), tl1);
  }
}

static inline void
interpolate_gdouble_linear_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i;
  gdouble *o = op, *a = ap, *ic = icp;
  __m128d f[2], t1, t2;
  const gdouble *c[2] = {(gdouble*)((gint8*)a + 0*astride),
                         (gdouble*)((gint8*)a + 1*astride)};

  f[0] = _mm_load1_pd (ic+0);
  f[1] = _mm_load1_pd (ic+1);

  for (i = 0; i < len; i += 4) {
    t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]);
    t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]);
    _mm_store_pd (o + i + 0, _mm_add_pd (t1, t2));

    t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 2), f[0]);
    t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 2), f[1]);
    _mm_store_pd (o + i + 2, _mm_add_pd (t1, t2));
  }
}

static inline void
interpolate_gdouble_cubic_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i;
  gdouble *o = op, *a = ap, *ic = icp;
  __m128d f[4], t[4];
  const gdouble *c[4] = {(gdouble*)((gint8*)a + 0*astride),
                         (gdouble*)((gint8*)a + 1*astride),
                         (gdouble*)((gint8*)a + 2*astride),
                         (gdouble*)((gint8*)a + 3*astride)};

  f[0] = _mm_load1_pd (ic+0);
  f[1] = _mm_load1_pd (ic+1);
  f[2] = _mm_load1_pd (ic+2);
  f[3] = _mm_load1_pd (ic+3);

  for (i = 0; i < len; i += 2) {
    t[0] = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]);
    t[1] = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]);
    t[2] = _mm_mul_pd (_mm_load_pd (c[2] + i + 0), f[2]);
    t[3] = _mm_mul_pd (_mm_load_pd (c[3] + i + 0), f[3]);
    t[0] = _mm_add_pd (t[0], t[1]);
    t[2] = _mm_add_pd (t[2], t[3]);
    _mm_store_pd (o + i + 0, _mm_add_pd (t[0], t[2]));
  }
}

#endif
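
/* This pragma presumably makes GCC emit SSE4.1 instructions for the
 * functions below even when the compiler was not invoked with -msse4.1;
 * the runtime check in audio_resampler_check_x86() ensures they are only
 * selected on capable CPUs. */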
#pragma GCC target("sse4.1")

#if defined (HAVE_SMMINTRIN_H) && defined(__SSE4_1__)
#include <smmintrin.h>
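
/* SSE4.1: 32-bit integer (gint32) variants. Products are formed with
 * _mm_mul_epi32 (signed 32x32 -> 64 bit) and accumulated in 64 bits to
 * avoid overflow before shifting back by PRECISION_S32. */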

static inline void
inner_product_gint32_full_1_sse41 (gint32 * o, const gint32 * a,
    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
{
  gint i = 0;
  gint64 res;
  __m128i sum, ta, tb;

  sum = _mm_setzero_si128 ();

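  /* _mm_mul_epi32 multiplies the signed 32-bit value in the low half of
   * each 64-bit lane; duplicating the operands with unpacklo/unpackhi
   * positions elements 0,1 and then 2,3 there, so each pair of calls
   * yields four 64-bit products. */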
  for (; i < len; i += 8) {
    ta = _mm_loadu_si128 ((__m128i *) (a + i));
    tb = _mm_load_si128 ((__m128i *) (b + i));

    sum = _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum = _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));

    ta = _mm_loadu_si128 ((__m128i *) (a + i + 4));
    tb = _mm_load_si128 ((__m128i *) (b + i + 4));

    sum = _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum = _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));
  }
  sum = _mm_add_epi64 (sum, _mm_unpackhi_epi64 (sum, sum));
  res = _mm_cvtsi128_si64 (sum);

  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
  *o = CLAMP (res, -(1L << 31), (1L << 31) - 1);
}

static inline void
inner_product_gint32_linear_1_sse41 (gint32 * o, const gint32 * a,
    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
{
  gint i = 0;
  gint64 res;
  __m128i sum[2], ta, tb;
  __m128i f = _mm_loadu_si128 ((__m128i *) icoeff);
  const gint32 *c[2] = {(gint32*)((gint8*)b + 0*bstride),
                        (gint32*)((gint8*)b + 1*bstride)};

  sum[0] = sum[1] = _mm_setzero_si128 ();

  for (; i < len; i += 4) {
    ta = _mm_loadu_si128 ((__m128i *) (a + i));

    tb = _mm_load_si128 ((__m128i *) (c[0] + i));
    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));

    tb = _mm_load_si128 ((__m128i *) (c[1] + i));
    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));
  }
  sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32);
  sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32);
  sum[0] = _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
  sum[1] = _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
  sum[0] = _mm_add_epi64 (sum[0], sum[1]);
  sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0]));
  res = _mm_cvtsi128_si64 (sum[0]);

  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
  *o = CLAMP (res, -(1L << 31), (1L << 31) - 1);
}

static inline void
inner_product_gint32_cubic_1_sse41 (gint32 * o, const gint32 * a,
    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
{
  gint i = 0;
  gint64 res;
  __m128i sum[4], ta, tb;
  __m128i f = _mm_loadu_si128 ((__m128i *) icoeff);
  const gint32 *c[4] = {(gint32*)((gint8*)b + 0*bstride),
                        (gint32*)((gint8*)b + 1*bstride),
                        (gint32*)((gint8*)b + 2*bstride),
                        (gint32*)((gint8*)b + 3*bstride)};

  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();

  for (; i < len; i += 4) {
    ta = _mm_loadu_si128 ((__m128i *) (a + i));

    tb = _mm_load_si128 ((__m128i *) (c[0] + i));
    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));

    tb = _mm_load_si128 ((__m128i *) (c[1] + i));
    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));

    tb = _mm_load_si128 ((__m128i *) (c[2] + i));
    sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));

    tb = _mm_load_si128 ((__m128i *) (c[3] + i));
    sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));
  }
  sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32);
  sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32);
  sum[2] = _mm_srli_epi64 (sum[2], PRECISION_S32);
  sum[3] = _mm_srli_epi64 (sum[3], PRECISION_S32);
  sum[0] = _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
  sum[1] = _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
  sum[2] = _mm_mul_epi32 (sum[2], _mm_shuffle_epi32 (f, _MM_SHUFFLE (2, 2, 2, 2)));
  sum[3] = _mm_mul_epi32 (sum[3], _mm_shuffle_epi32 (f, _MM_SHUFFLE (3, 3, 3, 3)));
  sum[0] = _mm_add_epi64 (sum[0], sum[1]);
  sum[2] = _mm_add_epi64 (sum[2], sum[3]);
  sum[0] = _mm_add_epi64 (sum[0], sum[2]);
  sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0]));
  res = _mm_cvtsi128_si64 (sum[0]);

  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
  *o = CLAMP (res, -(1L << 31), (1L << 31) - 1);
}

MAKE_RESAMPLE_FUNC (gint32, full, 1, sse41);
MAKE_RESAMPLE_FUNC (gint32, linear, 1, sse41);
MAKE_RESAMPLE_FUNC (gint32, cubic, 1, sse41);

#endif
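
/* Runtime selection of the variants compiled in above. The caller is
 * expected to pass the name of a CPU feature it has detected ("sse",
 * "sse2" or "sse41"); when the matching code was not built in, the
 * generic C implementations remain installed. */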

static void
audio_resampler_check_x86 (const gchar *option)
{
  if (!strcmp (option, "sse")) {
#if defined (HAVE_XMMINTRIN_H) && defined(__SSE__)
    GST_DEBUG ("enable SSE optimisations");
    resample_gfloat_full_1 = resample_gfloat_full_1_sse;
    resample_gfloat_linear_1 = resample_gfloat_linear_1_sse;
    resample_gfloat_cubic_1 = resample_gfloat_cubic_1_sse;

    interpolate_gfloat_linear = interpolate_gfloat_linear_sse;
    interpolate_gfloat_cubic = interpolate_gfloat_cubic_sse;
#else
    GST_DEBUG ("SSE optimisations not enabled");
#endif
  } else if (!strcmp (option, "sse2")) {
#if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
    GST_DEBUG ("enable SSE2 optimisations");
    resample_gint16_full_1 = resample_gint16_full_1_sse2;
    resample_gint16_linear_1 = resample_gint16_linear_1_sse2;
    resample_gint16_cubic_1 = resample_gint16_cubic_1_sse2;

    interpolate_gint16_linear = interpolate_gint16_linear_sse2;
    interpolate_gint16_cubic = interpolate_gint16_cubic_sse2;

    resample_gdouble_full_1 = resample_gdouble_full_1_sse2;
    resample_gdouble_linear_1 = resample_gdouble_linear_1_sse2;
    resample_gdouble_cubic_1 = resample_gdouble_cubic_1_sse2;

    interpolate_gdouble_linear = interpolate_gdouble_linear_sse2;
    interpolate_gdouble_cubic = interpolate_gdouble_cubic_sse2;
#else
    GST_DEBUG ("SSE2 optimisations not enabled");
#endif
  } else if (!strcmp (option, "sse41")) {
#if defined (HAVE_SMMINTRIN_H) && defined(__SSE4_1__)
    GST_DEBUG ("enable SSE41 optimisations");
    resample_gint32_full_1 = resample_gint32_full_1_sse41;
    resample_gint32_linear_1 = resample_gint32_linear_1_sse41;
    resample_gint32_cubic_1 = resample_gint32_cubic_1_sse41;
#else
    GST_DEBUG ("SSE41 optimisations not enabled");
#endif
  }
}