gst-libs/gst/audio/audio-resampler-x86.h

   1 /* GStreamer
   2  * Copyright (C) <2015> Wim Taymans <wim.taymans@gmail.com>
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Library General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2 of the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Library General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Library General Public
  15  * License along with this library; if not, write to the
  16  * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
  17  * Boston, MA 02110-1301, USA.
  18  */
  19
  20 #if defined (HAVE_XMMINTRIN_H) && defined(__SSE__)
  21 #include <xmmintrin.h>
  22
  23 static inline void
  24 inner_product_gfloat_none_1_sse (gfloat * o, const gfloat * a,
  25     const gfloat * b, gint len, const gfloat * icoeff, gint oversample)
  26 {
  27   gint i = 0;
  28   __m128 sum = _mm_setzero_ps ();
  29
  30   for (; i < len; i += 8) {
  31     sum =
  32         _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 0),
  33             _mm_load_ps (b + i + 0)));
  34     sum =
  35         _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 4),
  36             _mm_load_ps (b + i + 4)));
  37   }
  38   sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum));
  39   sum = _mm_add_ss (sum, _mm_shuffle_ps (sum, sum, 0x55));
  40   _mm_store_ss (o, sum);
  41 }
  42
  43 static inline void
  44 inner_product_gfloat_linear_1_sse (gfloat * o, const gfloat * a,
  45     const gfloat * b, gint len, const gfloat * icoeff, gint oversample)
  46 {
  47   gint i = 0;
  48   __m128 sum = _mm_setzero_ps (), t, b0;
  49   __m128 f = _mm_loadu_ps(icoeff);
  50
  51   for (; i < len; i += 4) {
  52     t = _mm_loadu_ps (a + i);
  53
  54     b0 = _mm_loadh_pi (b0, (__m64 *) (b + (i+0)*oversample));
  55     b0 = _mm_loadl_pi (b0, (__m64 *) (b + (i+1)*oversample));
  56
  57     sum =
  58         _mm_add_ps (sum, _mm_mul_ps (_mm_unpacklo_ps (t, t), b0));
  59
  60     b0 = _mm_loadh_pi (b0, (__m64 *) (b + (i+2)*oversample));
  61     b0 = _mm_loadl_pi (b0, (__m64 *) (b + (i+3)*oversample));
  62
  63     sum =
  64         _mm_add_ps (sum, _mm_mul_ps (_mm_unpackhi_ps (t, t), b0));
  65   }
  66   sum = _mm_mul_ps (sum, f);
  67   sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum));
  68   sum = _mm_add_ss (sum, _mm_shuffle_ps (sum, sum, 0x55));
  69   _mm_store_ss (o, sum);
  70 }
  71
  72 static inline void
  73 inner_product_gfloat_none_2_sse (gfloat * o, const gfloat * a,
  74     const gfloat * b, gint len, const gfloat * icoeff, gint oversample)
  75 {
  76   gint i = 0;
  77   __m128 sum = _mm_setzero_ps (), t;
  78
  79   for (; i < len; i += 8) {
  80     t = _mm_load_ps (b + i);
  81     sum =
  82         _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + 2 * i + 0),
  83             _mm_unpacklo_ps (t, t)));
  84     sum =
  85         _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + 2 * i + 4),
  86             _mm_unpackhi_ps (t, t)));
  87
  88     t = _mm_load_ps (b + i + 4);
  89     sum =
  90         _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + 2 * i + 8),
  91             _mm_unpacklo_ps (t, t)));
  92     sum =
  93         _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + 2 * i + 12),
  94             _mm_unpackhi_ps (t, t)));
  95   }
  96   sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum));
  97   *(gint64*)o = _mm_cvtsi128_si64 ((__m128i)sum);
  98 }
  99
     /* MAKE_RESAMPLE_FUNC is defined outside this file.  Presumably each
      * invocation expands to a resample_<type>_<mode>_<channels>_sse function
      * built on the matching inner_product_*_sse kernel above (those
      * resample_*_sse names are assigned in audio_resampler_check_x86) --
      * confirm against the macro definition. */
 100 MAKE_RESAMPLE_FUNC (gfloat, none, 1, sse);
 101 MAKE_RESAMPLE_FUNC (gfloat, none, 2, sse);
 102 MAKE_RESAMPLE_FUNC (gfloat, linear, 1, sse);
 103 #endif
 104
 105 #if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
 106 #include <emmintrin.h>
 107
 108 static inline void
 109 inner_product_gint16_none_1_sse2 (gint16 * o, const gint16 * a,
 110     const gint16 * b, gint len, const gint16 * icoeff, gint oversample)
 111 {
 112   gint i = 0;
 113   __m128i sum, ta, tb;
 114
 115   sum = _mm_setzero_si128 ();
 116
 117   for (; i < len; i += 8) {
 118     ta = _mm_loadu_si128 ((__m128i *) (a + i));
 119     tb = _mm_load_si128 ((__m128i *) (b + i));
 120
 121     sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, tb));
 122   }
 123   sum =
 124       _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2,
 125               3)));
 126   sum =
 127       _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1,
 128               1)));
 129
 130   sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
 131   sum = _mm_srai_epi32 (sum, PRECISION_S16);
 132   sum = _mm_packs_epi32 (sum, sum);
 133   *o = _mm_extract_epi16 (sum, 0);
 134 }
 135
 136 static inline void
 137 inner_product_gdouble_none_1_sse2 (gdouble * o, const gdouble * a,
 138     const gdouble * b, gint len, const gdouble * icoeff, gint oversample)
 139 {
 140   gint i = 0;
 141   __m128d sum = _mm_setzero_pd ();
 142
 143   for (; i < len; i += 8) {
 144     sum =
 145         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0),
 146             _mm_load_pd (b + i + 0)));
 147     sum =
 148         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2),
 149             _mm_load_pd (b + i + 2)));
 150     sum =
 151         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4),
 152             _mm_load_pd (b + i + 4)));
 153     sum =
 154         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6),
 155             _mm_load_pd (b + i + 6)));
 156   }
 157   sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum));
 158   _mm_store_sd (o, sum);
 159 }
 160
 161 static inline void
 162 inner_product_gint16_none_2_sse2 (gint16 * o, const gint16 * a,
 163     const gint16 * b, gint len, const gint16 * icoeff, gint oversample)
 164 {
 165   gint i = 0;
 166   __m128i sum, ta, tb, t1;
 167
 168   sum = _mm_setzero_si128 ();
 169
 170   for (; i < len; i += 8) {
 171     tb = _mm_load_si128 ((__m128i *) (b + i));
 172
 173     t1 = _mm_unpacklo_epi16 (tb, tb);
 174     ta = _mm_loadu_si128 ((__m128i *) (a + 2 * i));
 175
 176     sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, t1));
 177
 178     t1 = _mm_unpackhi_epi16 (tb, tb);
 179     ta = _mm_loadu_si128 ((__m128i *) (a + 2 * i + 8));
 180
 181     sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, t1));
 182   }
 183   sum =
 184       _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2,
 185               3)));
 186
 187   sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
 188   sum = _mm_srai_epi32 (sum, PRECISION_S16);
 189   sum = _mm_packs_epi32 (sum, sum);
 190   *(gint32*)o = _mm_cvtsi128_si32 (sum);
 191 }
 192
 193 static inline void
 194 inner_product_gdouble_none_2_sse2 (gdouble * o, const gdouble * a,
 195     const gdouble * b, gint len, const gdouble * icoeff, gint oversample)
 196 {
 197   gint i = 0;
 198   __m128d sum = _mm_setzero_pd (), t;
 199
 200   for (; i < len; i += 4) {
 201     t = _mm_load_pd (b + i);
 202     sum =
 203         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i),
 204             _mm_unpacklo_pd (t, t)));
 205     sum =
 206         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 2),
 207             _mm_unpackhi_pd (t, t)));
 208
 209     t = _mm_load_pd (b + i + 2);
 210     sum =
 211         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 4),
 212             _mm_unpacklo_pd (t, t)));
 213     sum =
 214         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 6),
 215             _mm_unpackhi_pd (t, t)));
 216   }
 217   _mm_store_pd (o, sum);
 218 }
 219
     /* MAKE_RESAMPLE_FUNC is defined outside this file.  Presumably each
      * invocation expands to a resample_<type>_<mode>_<channels>_sse2
      * function built on the matching inner_product_*_sse2 kernel above
      * (those resample_*_sse2 names are assigned in
      * audio_resampler_check_x86) -- confirm against the macro definition. */
 220 MAKE_RESAMPLE_FUNC (gint16, none, 1, sse2);
 221 MAKE_RESAMPLE_FUNC (gdouble, none, 1, sse2);
 222 MAKE_RESAMPLE_FUNC (gint16, none, 2, sse2);
 223 MAKE_RESAMPLE_FUNC (gdouble, none, 2, sse2);
 224 #endif
 225
 226 #if defined (HAVE_SMMINTRIN_H) && defined(__SSE4_1__)
 227 #include <smmintrin.h>
 228
 229 static inline void
 230 inner_product_gint32_none_1_sse41 (gint32 * o, const gint32 * a,
 231     const gint32 * b, gint len, const gint32 * icoeff, gint oversample)
 232 {
 233   gint i = 0;
 234   __m128i sum, ta, tb;
 235   gint64 res;
 236
 237   sum = _mm_setzero_si128 ();
 238
 239   for (; i < len; i += 8) {
 240     ta = _mm_loadu_si128 ((__m128i *) (a + i));
 241     tb = _mm_load_si128 ((__m128i *) (b + i));
 242
 243     sum =
 244         _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
 245             _mm_unpacklo_epi32 (tb, tb)));
 246     sum =
 247         _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
 248             _mm_unpackhi_epi32 (tb, tb)));
 249
 250     ta = _mm_loadu_si128 ((__m128i *) (a + i + 4));
 251     tb = _mm_load_si128 ((__m128i *) (b + i + 4));
 252
 253     sum =
 254         _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
 255             _mm_unpacklo_epi32 (tb, tb)));
 256     sum =
 257         _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
 258             _mm_unpackhi_epi32 (tb, tb)));
 259   }
 260   sum = _mm_add_epi64 (sum, _mm_unpackhi_epi64 (sum, sum));
 261   res = _mm_cvtsi128_si64 (sum);
 262
 263   res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
 264   *o = CLAMP (res, -(1L << 31), (1L << 31) - 1);
 265 }
 266
     /* MAKE_RESAMPLE_FUNC is defined outside this file.  Presumably this
      * expands to resample_gint32_none_1_sse41 built on the kernel above
      * (that name is assigned in audio_resampler_check_x86) -- confirm
      * against the macro definition. */
 267 MAKE_RESAMPLE_FUNC (gint32, none, 1, sse41);
 268 #endif
 269
 270 static void
 271 audio_resampler_check_x86 (const gchar *option)
 272 {
 273   if (!strcmp (option, "sse")) {
 274 #if defined (HAVE_XMMINTRIN_H) && defined(__SSE__)
 275     GST_DEBUG ("enable SSE optimisations");
 276     resample_gfloat_none_1 = resample_gfloat_none_1_sse;
 277     resample_gfloat_none_2 = resample_gfloat_none_2_sse;
 278     resample_gfloat_linear_1 = resample_gfloat_linear_1_sse;
 279 #endif
 280   } else if (!strcmp (option, "sse2")) {
 281 #if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
 282     GST_DEBUG ("enable SSE2 optimisations");
 283     resample_gint16_none_1 = resample_gint16_none_1_sse2;
 284     resample_gfloat_none_1 = resample_gfloat_none_1_sse;
 285     resample_gfloat_none_2 = resample_gfloat_none_2_sse;
 286     resample_gdouble_none_1 = resample_gdouble_none_1_sse2;
 287     resample_gint16_none_2 = resample_gint16_none_2_sse2;
 288     resample_gdouble_none_2 = resample_gdouble_none_2_sse2;
 289     resample_gfloat_linear_1 = resample_gfloat_linear_1_sse;
 290 #endif
 291   } else if (!strcmp (option, "sse41")) {
 292 #if defined (HAVE_SMMINTRIN_H) && defined(__SSE4_1__)
 293     GST_DEBUG ("enable SSE41 optimisations");
 294     resample_gint32_none_1 = resample_gint32_none_1_sse41;
 295 #endif
 296   }
 297 }