gst-libs/gst/audio/audio-resampler-x86.h

   1 /* GStreamer
   2  * Copyright (C) <2015> Wim Taymans <wim.taymans@gmail.com>
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Library General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2 of the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Library General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Library General Public
  15  * License along with this library; if not, write to the
  16  * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
  17  * Boston, MA 02110-1301, USA.
  18  */
  19
  20 #if defined (HAVE_XMMINTRIN_H) && defined(__SSE__)
  21 #include <xmmintrin.h>
  22
  23 static inline void
  24 inner_product_gfloat_none_1_sse (gfloat * o, const gfloat * a,
  25     const gfloat * b, gint len, const gfloat * icoeff, gint oversample)
  26 {
  27   gint i = 0;
  28   __m128 sum = _mm_setzero_ps ();
  29
  30   for (; i < len; i += 8) {
  31     sum =
  32         _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 0),
  33             _mm_load_ps (b + i + 0)));
  34     sum =
  35         _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 4),
  36             _mm_load_ps (b + i + 4)));
  37   }
  38   sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum));
  39   sum = _mm_add_ss (sum, _mm_shuffle_ps (sum, sum, 0x55));
  40   _mm_store_ss (o, sum);
  41 }
  42
  43 static inline void
  44 inner_product_gfloat_linear_1_sse (gfloat * o, const gfloat * a,
  45     const gfloat * b, gint len, const gfloat * icoeff, gint oversample)
  46 {
  47   gint i = 0;
  48   __m128 sum = _mm_setzero_ps (), t, b0;
  49   __m128 f = _mm_loadu_ps(icoeff);
  50
  51   for (; i < len; i += 4) {
  52     t = _mm_loadu_ps (a + i);
  53
  54     b0 = _mm_loadh_pi (b0, (__m64 *) (b + (i+0)*oversample));
  55     b0 = _mm_loadl_pi (b0, (__m64 *) (b + (i+1)*oversample));
  56
  57     sum =
  58         _mm_add_ps (sum, _mm_mul_ps (_mm_unpacklo_ps (t, t), b0));
  59
  60     b0 = _mm_loadh_pi (b0, (__m64 *) (b + (i+2)*oversample));
  61     b0 = _mm_loadl_pi (b0, (__m64 *) (b + (i+3)*oversample));
  62
  63     sum =
  64         _mm_add_ps (sum, _mm_mul_ps (_mm_unpackhi_ps (t, t), b0));
  65   }
  66   sum = _mm_mul_ps (sum, f);
  67   sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum));
  68   sum = _mm_add_ss (sum, _mm_shuffle_ps (sum, sum, 0x55));
  69   _mm_store_ss (o, sum);
  70 }
  71
  72 static inline void
  73 inner_product_gfloat_cubic_1_sse (gfloat * o, const gfloat * a,
  74     const gfloat * b, gint len, const gfloat * icoeff, gint oversample)
  75 {
  76   gint i = 0;
  77   __m128 sum = _mm_setzero_ps ();
  78   __m128 f = _mm_loadu_ps(icoeff);
  79
  80   for (; i < len; i += 2) {
  81     sum = _mm_add_ps (sum, _mm_mul_ps (_mm_load1_ps (a + i + 0),
  82           _mm_loadu_ps (b + (i + 0) * oversample)));
  83     sum = _mm_add_ps (sum, _mm_mul_ps (_mm_load1_ps (a + i + 1),
  84           _mm_loadu_ps (b + (i + 1) * oversample)));
  85   }
  86   sum = _mm_mul_ps (sum, f);
  87   sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum));
  88   sum = _mm_add_ss (sum, _mm_shuffle_ps (sum, sum, 0x55));
  89   _mm_store_ss (o, sum);
  90 }
  91
  92 static inline void
  93 inner_product_gfloat_none_2_sse (gfloat * o, const gfloat * a,
  94     const gfloat * b, gint len, const gfloat * icoeff, gint oversample)
  95 {
  96   gint i = 0;
  97   __m128 sum = _mm_setzero_ps (), t;
  98
  99   for (; i < len; i += 8) {
 100     t = _mm_load_ps (b + i);
 101     sum =
 102         _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + 2 * i + 0),
 103             _mm_unpacklo_ps (t, t)));
 104     sum =
 105         _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + 2 * i + 4),
 106             _mm_unpackhi_ps (t, t)));
 107
 108     t = _mm_load_ps (b + i + 4);
 109     sum =
 110         _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + 2 * i + 8),
 111             _mm_unpacklo_ps (t, t)));
 112     sum =
 113         _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + 2 * i + 12),
 114             _mm_unpackhi_ps (t, t)));
 115   }
 116   sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum));
 117   *(gint64*)o = _mm_cvtsi128_si64 ((__m128i)sum);
 118 }
 119
 120 MAKE_RESAMPLE_FUNC (gfloat, none, 1, sse);
 121 MAKE_RESAMPLE_FUNC (gfloat, none, 2, sse);
 122 MAKE_RESAMPLE_FUNC (gfloat, linear, 1, sse);
 123 MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, sse);
 124 #endif
 125
 126 #if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
 127 #include <emmintrin.h>
 128
 129 static inline void
 130 inner_product_gint16_none_1_sse2 (gint16 * o, const gint16 * a,
 131     const gint16 * b, gint len, const gint16 * icoeff, gint oversample)
 132 {
 133   gint i = 0;
 134   __m128i sum, ta, tb;
 135
 136   sum = _mm_setzero_si128 ();
 137
 138   for (; i < len; i += 8) {
 139     ta = _mm_loadu_si128 ((__m128i *) (a + i));
 140     tb = _mm_load_si128 ((__m128i *) (b + i));
 141
 142     sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, tb));
 143   }
 144   sum =
 145       _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2,
 146               3)));
 147   sum =
 148       _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1,
 149               1)));
 150
 151   sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
 152   sum = _mm_srai_epi32 (sum, PRECISION_S16);
 153   sum = _mm_packs_epi32 (sum, sum);
 154   *o = _mm_extract_epi16 (sum, 0);
 155 }
 156
 157 static inline void
 158 inner_product_gdouble_none_1_sse2 (gdouble * o, const gdouble * a,
 159     const gdouble * b, gint len, const gdouble * icoeff, gint oversample)
 160 {
 161   gint i = 0;
 162   __m128d sum = _mm_setzero_pd ();
 163
 164   for (; i < len; i += 8) {
 165     sum =
 166         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0),
 167             _mm_load_pd (b + i + 0)));
 168     sum =
 169         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2),
 170             _mm_load_pd (b + i + 2)));
 171     sum =
 172         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4),
 173             _mm_load_pd (b + i + 4)));
 174     sum =
 175         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6),
 176             _mm_load_pd (b + i + 6)));
 177   }
 178   sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum));
 179   _mm_store_sd (o, sum);
 180 }
 181
 182 static inline void
 183 inner_product_gint16_none_2_sse2 (gint16 * o, const gint16 * a,
 184     const gint16 * b, gint len, const gint16 * icoeff, gint oversample)
 185 {
 186   gint i = 0;
 187   __m128i sum, ta, tb, t1;
 188
 189   sum = _mm_setzero_si128 ();
 190
 191   for (; i < len; i += 8) {
 192     tb = _mm_load_si128 ((__m128i *) (b + i));
 193
 194     t1 = _mm_unpacklo_epi16 (tb, tb);
 195     ta = _mm_loadu_si128 ((__m128i *) (a + 2 * i));
 196
 197     sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, t1));
 198
 199     t1 = _mm_unpackhi_epi16 (tb, tb);
 200     ta = _mm_loadu_si128 ((__m128i *) (a + 2 * i + 8));
 201
 202     sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, t1));
 203   }
 204   sum =
 205       _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2,
 206               3)));
 207
 208   sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
 209   sum = _mm_srai_epi32 (sum, PRECISION_S16);
 210   sum = _mm_packs_epi32 (sum, sum);
 211   *(gint32*)o = _mm_cvtsi128_si32 (sum);
 212 }
 213
 214 static inline void
 215 inner_product_gdouble_none_2_sse2 (gdouble * o, const gdouble * a,
 216     const gdouble * b, gint len, const gdouble * icoeff, gint oversample)
 217 {
 218   gint i = 0;
 219   __m128d sum = _mm_setzero_pd (), t;
 220
 221   for (; i < len; i += 4) {
 222     t = _mm_load_pd (b + i);
 223     sum =
 224         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i),
 225             _mm_unpacklo_pd (t, t)));
 226     sum =
 227         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 2),
 228             _mm_unpackhi_pd (t, t)));
 229
 230     t = _mm_load_pd (b + i + 2);
 231     sum =
 232         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 4),
 233             _mm_unpacklo_pd (t, t)));
 234     sum =
 235         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 6),
 236             _mm_unpackhi_pd (t, t)));
 237   }
 238   _mm_store_pd (o, sum);
 239 }
 240
 241 MAKE_RESAMPLE_FUNC (gint16, none, 1, sse2);
 242 MAKE_RESAMPLE_FUNC (gdouble, none, 1, sse2);
 243 MAKE_RESAMPLE_FUNC (gint16, none, 2, sse2);
 244 MAKE_RESAMPLE_FUNC (gdouble, none, 2, sse2);
 245 #endif
 246
 247 #if defined (HAVE_SMMINTRIN_H) && defined(__SSE4_1__)
 248 #include <smmintrin.h>
 249
 250 static inline void
 251 inner_product_gint32_none_1_sse41 (gint32 * o, const gint32 * a,
 252     const gint32 * b, gint len, const gint32 * icoeff, gint oversample)
 253 {
 254   gint i = 0;
 255   __m128i sum, ta, tb;
 256   gint64 res;
 257
 258   sum = _mm_setzero_si128 ();
 259
 260   for (; i < len; i += 8) {
 261     ta = _mm_loadu_si128 ((__m128i *) (a + i));
 262     tb = _mm_load_si128 ((__m128i *) (b + i));
 263
 264     sum =
 265         _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
 266             _mm_unpacklo_epi32 (tb, tb)));
 267     sum =
 268         _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
 269             _mm_unpackhi_epi32 (tb, tb)));
 270
 271     ta = _mm_loadu_si128 ((__m128i *) (a + i + 4));
 272     tb = _mm_load_si128 ((__m128i *) (b + i + 4));
 273
 274     sum =
 275         _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
 276             _mm_unpacklo_epi32 (tb, tb)));
 277     sum =
 278         _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
 279             _mm_unpackhi_epi32 (tb, tb)));
 280   }
 281   sum = _mm_add_epi64 (sum, _mm_unpackhi_epi64 (sum, sum));
 282   res = _mm_cvtsi128_si64 (sum);
 283
 284   res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
 285   *o = CLAMP (res, -(1L << 31), (1L << 31) - 1);
 286 }
 287
 288 MAKE_RESAMPLE_FUNC (gint32, none, 1, sse41);
 289 #endif
 290
 291 static void
 292 audio_resampler_check_x86 (const gchar *option)
 293 {
 294   if (!strcmp (option, "sse")) {
 295 #if defined (HAVE_XMMINTRIN_H) && defined(__SSE__)
 296     GST_DEBUG ("enable SSE optimisations");
 297     resample_gfloat_none_1 = resample_gfloat_none_1_sse;
 298     resample_gfloat_none_2 = resample_gfloat_none_2_sse;
 299     resample_gfloat_linear_1 = resample_gfloat_linear_1_sse;
 300     resample_gfloat_cubic_1 = resample_gfloat_cubic_1_sse;
 301 #endif
 302   } else if (!strcmp (option, "sse2")) {
 303 #if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
 304     GST_DEBUG ("enable SSE2 optimisations");
 305     resample_gint16_none_1 = resample_gint16_none_1_sse2;
 306     resample_gfloat_none_1 = resample_gfloat_none_1_sse;
 307     resample_gfloat_none_2 = resample_gfloat_none_2_sse;
 308     resample_gdouble_none_1 = resample_gdouble_none_1_sse2;
 309     resample_gint16_none_2 = resample_gint16_none_2_sse2;
 310     resample_gdouble_none_2 = resample_gdouble_none_2_sse2;
 311     resample_gfloat_linear_1 = resample_gfloat_linear_1_sse;
 312     resample_gfloat_cubic_1 = resample_gfloat_cubic_1_sse;
 313 #endif
 314   } else if (!strcmp (option, "sse41")) {
 315 #if defined (HAVE_SMMINTRIN_H) && defined(__SSE4_1__)
 316     GST_DEBUG ("enable SSE41 optimisations");
 317     resample_gint32_none_1 = resample_gint32_none_1_sse41;
 318 #endif
 319   }
 320 }