gst-libs/gst/audio/audio-resampler-x86.h

   1 /* GStreamer
   2  * Copyright (C) <2015> Wim Taymans <wim.taymans@gmail.com>
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Library General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2 of the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Library General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Library General Public
  15  * License along with this library; if not, write to the
  16  * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
  17  * Boston, MA 02110-1301, USA.
  18  */
  19
  20 #ifdef HAVE_EMMINTRIN_H
  21 #include <emmintrin.h>
  22
  23 static inline void
  24 inner_product_gint16_1_sse2 (gint16 * o, const gint16 * a, const gint16 * b, gint len)
  25 {
  26   gint i = 0;
  27   __m128i sum, ta, tb;
  28
  29   sum = _mm_setzero_si128 ();
  30
  31   for (; i < len; i += 8) {
  32     ta = _mm_loadu_si128 ((__m128i *) (a + i));
  33     tb = _mm_load_si128 ((__m128i *) (b + i));
  34
  35     sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, tb));
  36   }
  37   sum =
  38       _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2,
  39               3)));
  40   sum =
  41       _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1,
  42               1)));
  43
  44   sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  45   sum = _mm_srai_epi32 (sum, PRECISION_S16);
  46   sum = _mm_packs_epi32 (sum, sum);
  47   *o = _mm_extract_epi16 (sum, 0);
  48 }
  49
  50 static inline void
  51 inner_product_gfloat_1_sse (gfloat * o, const gfloat * a, const gfloat * b, gint len)
  52 {
  53   gint i = 0;
  54   __m128 sum = _mm_setzero_ps ();
  55
  56   for (; i < len; i += 8) {
  57     sum =
  58         _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 0),
  59             _mm_load_ps (b + i + 0)));
  60     sum =
  61         _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 4),
  62             _mm_load_ps (b + i + 4)));
  63   }
  64   sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum));
  65   sum = _mm_add_ss (sum, _mm_shuffle_ps (sum, sum, 0x55));
  66   _mm_store_ss (o, sum);
  67 }
  68
  69 static inline void
  70 inner_product_gdouble_1_sse2 (gdouble * o, const gdouble * a, const gdouble * b,
  71     gint len)
  72 {
  73   gint i = 0;
  74   __m128d sum = _mm_setzero_pd ();
  75
  76   for (; i < len; i += 8) {
  77     sum =
  78         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0),
  79             _mm_load_pd (b + i + 0)));
  80     sum =
  81         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2),
  82             _mm_load_pd (b + i + 2)));
  83     sum =
  84         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4),
  85             _mm_load_pd (b + i + 4)));
  86     sum =
  87         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6),
  88             _mm_load_pd (b + i + 6)));
  89   }
  90   sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum));
  91   _mm_store_sd (o, sum);
  92 }
  93
  94 static inline void
  95 inner_product_gint16_2_sse2 (gint16 * o, const gint16 * a, const gint16 * b, gint len)
  96 {
  97   gint i = 0;
  98   __m128i sum, ta, tb, t1;
  99
 100   sum = _mm_setzero_si128 ();
 101
 102   for (; i < len; i += 8) {
 103     tb = _mm_load_si128 ((__m128i *) (b + i));
 104
 105     t1 = _mm_unpacklo_epi16 (tb, tb);
 106     ta = _mm_loadu_si128 ((__m128i *) (a + 2 * i));
 107
 108     sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, t1));
 109
 110     t1 = _mm_unpackhi_epi16 (tb, tb);
 111     ta = _mm_loadu_si128 ((__m128i *) (a + 2 * i + 8));
 112
 113     sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, t1));
 114   }
 115   sum =
 116       _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2,
 117               3)));
 118
 119   sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
 120   sum = _mm_srai_epi32 (sum, PRECISION_S16);
 121   sum = _mm_packs_epi32 (sum, sum);
 122   *(gint32*)o = _mm_cvtsi128_si32 (sum);
 123 }
 124
 125 static inline void
 126 inner_product_gdouble_2_sse2 (gdouble * o, const gdouble * a, const gdouble * b,
 127     gint len)
 128 {
 129   gint i = 0;
 130   __m128d sum = _mm_setzero_pd (), t;
 131
 132   for (; i < len; i += 4) {
 133     t = _mm_load_pd (b + i);
 134     sum =
 135         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i),
 136             _mm_unpacklo_pd (t, t)));
 137     sum =
 138         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 2),
 139             _mm_unpackhi_pd (t, t)));
 140
 141     t = _mm_load_pd (b + i + 2);
 142     sum =
 143         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 4),
 144             _mm_unpacklo_pd (t, t)));
 145     sum =
 146         _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 6),
 147             _mm_unpackhi_pd (t, t)));
 148   }
 149   _mm_store_pd (o, sum);
 150 }
 151
 152 MAKE_RESAMPLE_FUNC (gint16, 1, sse2);
 153 MAKE_RESAMPLE_FUNC (gfloat, 1, sse);
 154 MAKE_RESAMPLE_FUNC (gdouble, 1, sse2);
 155 MAKE_RESAMPLE_FUNC (gint16, 2, sse2);
 156 MAKE_RESAMPLE_FUNC (gdouble, 2, sse2);
 157 #endif
 158
 159 static void
 160 audio_resampler_check_x86 (const gchar *option)
 161 {
 162 #ifdef HAVE_EMMINTRIN_H
 163   if (!strcmp (option, "sse")) {
 164     GST_DEBUG ("enable SSE optimisations");
 165     resample_gfloat_1 = resample_gfloat_1_sse;
 166   } else if (!strcmp (option, "sse2")) {
 167     GST_DEBUG ("enable SSE2 optimisations");
 168     resample_gint16_1 = resample_gint16_1_sse2;
 169     resample_gfloat_1 = resample_gfloat_1_sse;
 170     resample_gdouble_1 = resample_gdouble_1_sse2;
 171     resample_gint16_2 = resample_gint16_2_sse2;
 172     resample_gdouble_2 = resample_gdouble_2_sse2;
 173   }
 174 #endif
 175 }