tizen 2.0 init

[framework/multimedia/gst-plugins-base0.10.git] / gst / audioresample / resample.c
diff --git a/gst/audioresample/resample.c b/gst/audioresample/resample.c

index 7d42f0e..490eebc 100644 (file)
--- a/gst/audioresample/resample.c
+++ b/gst/audioresample/resample.c
@@ -64,10 +64,30 @@
  #ifdef OUTSIDE_SPEEX
  #include <stdlib.h>
  
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+
  #include <glib.h>
  
+#ifdef HAVE_ORC
+#include <orc/orc.h>
+#endif
+
  #define EXPORT G_GNUC_INTERNAL
  
+#ifdef _USE_SSE
+#ifndef HAVE_XMMINTRIN_H
+#undef _USE_SSE
+#endif
+#endif
+
+#ifdef _USE_SSE2
+#ifndef HAVE_EMMINTRIN_H
+#undef _USE_SSE2
+#endif
+#endif
+
  static inline void *
  speex_alloc (int size)
  {
@@ -97,10 +117,6 @@ speex_free (void *ptr)
  
  #include <math.h>
  
-#ifndef M_PI
-#define M_PI 3.14159263
-#endif
-
  #ifdef FIXED_POINT
  #define WORD2INT(x) ((x) < -32767 ? -32768 : ((x) > 32766 ? 32767 : (x)))
  #else
@@ -114,7 +130,7 @@ speex_free (void *ptr)
  #define NULL 0
  #endif
  
-#ifdef _USE_SSE
+#if defined _USE_SSE || defined _USE_SSE2
  #include "resample_sse.h"
  #endif
  
@@ -125,6 +141,28 @@ speex_free (void *ptr)
  #define FIXED_STACK_ALLOC 1024
  #endif
  
+/* Allow selecting SSE or not when compiled with SSE support.
+ *
+ * The three macros bracket two alternative code paths inside one function
+ * body and need a variable named st in scope:
+ *
+ *   SSE_FALLBACK (X)        generic C code follows; it is skipped with a
+ *                           goto when st->use_sse is non-zero
+ *   SSE_IMPLEMENTATION (X)  ends the generic code and starts the SSE code
+ *   SSE_END (X)             ends the SSE code; both paths continue from here
+ *
+ * X is only pasted into the label names, so each use within one function
+ * needs its own X.  The brace opened by one macro is closed by the next, so
+ * the three must be used together and in this order.
+ *
+ * Without _USE_SSE only SSE_FALLBACK is defined (as nothing), which leaves
+ * the generic code unconditional; SSE_IMPLEMENTATION and SSE_END must then
+ * stay hidden from the preprocessor by the caller's own #ifdef.
+ *
+ * The SSE2_* macros below work the same way, keyed on _USE_SSE2 and
+ * st->use_sse2.
+ */
+#ifdef _USE_SSE
+#define SSE_FALLBACK(macro) \
+  if (st->use_sse) goto sse_##macro##_sse; {
+#define SSE_IMPLEMENTATION(macro) \
+  goto sse_##macro##_end; } sse_##macro##_sse: {
+#define SSE_END(macro) sse_##macro##_end:; }
+#else
+#define SSE_FALLBACK(macro)
+#endif
+
+#ifdef _USE_SSE2
+#define SSE2_FALLBACK(macro) \
+  if (st->use_sse2) goto sse2_##macro##_sse2; {
+#define SSE2_IMPLEMENTATION(macro) \
+  goto sse2_##macro##_end; } sse2_##macro##_sse2: {
+#define SSE2_END(macro) sse2_##macro##_end:; }
+#else
+#define SSE2_FALLBACK(macro)
+#endif
+
+
  typedef int (*resampler_basic_func) (SpeexResamplerState *, spx_uint32_t,
      const spx_word16_t *, spx_uint32_t *, spx_word16_t *, spx_uint32_t *);
  
@@ -159,6 +197,9 @@ struct SpeexResamplerState_
  
    int in_stride;
    int out_stride;
+
+  int use_sse:1;
+  int use_sse2:1;
  };
  
  static double kaiser12_table[68] = {
@@ -323,7 +364,7 @@ sinc (float cutoff, float x, int N, struct FuncDef *window_func)
    else if (fabs (x) > .5f * N)
      return 0;
    /*FIXME: Can it really be any slower than this? */
-  return WORD2INT (32768. * cutoff * sin (M_PI * xx) / (M_PI * xx) *
+  return WORD2INT (32768. * cutoff * sin (G_PI * xx) / (G_PI * xx) *
        compute_func (fabs (2. * x / N), window_func));
  }
  #else
@@ -346,7 +387,7 @@ sinc (float cutoff, float x, int N, struct FuncDef *window_func)
    else if (fabs (x) > .5 * N)
      return 0;
    /*FIXME: Can it really be any slower than this? */
-  return cutoff * sin (M_PI * xx) / (M_PI * xx) * compute_func (fabs (2. * x /
+  return cutoff * sin (G_PI * xx) / (G_PI * xx) * compute_func (fabs (2. * x /
            N), window_func);
  }
  #endif
@@ -414,13 +455,13 @@ resampler_basic_direct_single (SpeexResamplerState * st,
      const spx_word16_t *sinc = &sinc_table[samp_frac_num * N];
      const spx_word16_t *iptr = &in[last_sample];
  
-#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
+    SSE_FALLBACK (INNER_PRODUCT_SINGLE)
      sum = 0;
      for (j = 0; j < N; j++)
        sum += MULT16_16 (sinc[j], iptr[j]);
  
  /*    This code is slower on most DSPs which have only 2 accumulators.
-      Plus this this forces truncation to 32 bits and you lose the HW guard bits.
+      Plus this forces truncation to 32 bits and you lose the HW guard bits.
        I think we can trust the compiler and let it vectorize and/or unroll itself.
        spx_word32_t accum[4] = {0,0,0,0};
        for(j=0;j<N;j+=4) {
@@ -431,8 +472,10 @@ resampler_basic_direct_single (SpeexResamplerState * st,
        }
        sum = accum[0] + accum[1] + accum[2] + accum[3];
  */
-#else
+#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
+    SSE_IMPLEMENTATION (INNER_PRODUCT_SINGLE)
      sum = inner_product_single (sinc, iptr, N);
+    SSE_END(INNER_PRODUCT_SINGLE)
  #endif
  
      out[out_stride * out_sample++] = SATURATE32 (PSHR32 (sum, 15), 32767);
@@ -475,7 +518,7 @@ resampler_basic_direct_double (SpeexResamplerState * st,
      const spx_word16_t *sinc = &sinc_table[samp_frac_num * N];
      const spx_word16_t *iptr = &in[last_sample];
  
-#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
+    SSE2_FALLBACK (INNER_PRODUCT_DOUBLE)
      double accum[4] = { 0, 0, 0, 0 };
  
      for (j = 0; j < N; j += 4) {
@@ -485,8 +528,10 @@ resampler_basic_direct_double (SpeexResamplerState * st,
        accum[3] += sinc[j + 3] * iptr[j + 3];
      }
      sum = accum[0] + accum[1] + accum[2] + accum[3];
-#else
+#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
+    SSE2_IMPLEMENTATION (INNER_PRODUCT_DOUBLE)
      sum = inner_product_double (sinc, iptr, N);
+    SSE2_END (INNER_PRODUCT_DOUBLE)
  #endif
  
      out[out_stride * out_sample++] = PSHR32 (sum, 15);
@@ -538,7 +583,7 @@ resampler_basic_interpolate_single (SpeexResamplerState * st,
      spx_word16_t interp[4];
  
  
-#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
+    SSE_FALLBACK (INTERPOLATE_PRODUCT_SINGLE)
      spx_word32_t accum[4] = { 0, 0, 0, 0 };
  
      for (j = 0; j < N; j++) {
@@ -563,12 +608,14 @@ resampler_basic_interpolate_single (SpeexResamplerState * st,
              1)) + MULT16_32_Q15 (interp[1], SHR32 (accum[1],
              1)) + MULT16_32_Q15 (interp[2], SHR32 (accum[2],
              1)) + MULT16_32_Q15 (interp[3], SHR32 (accum[3], 1));
-#else
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
+    SSE_IMPLEMENTATION (INTERPOLATE_PRODUCT_SINGLE)
      cubic_coef (frac, interp);
      sum =
          interpolate_product_single (iptr,
          st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample,
          interp);
+    SSE_END (INTERPOLATE_PRODUCT_SINGLE)
  #endif
  
      out[out_stride * out_sample++] = SATURATE32 (PSHR32 (sum, 14), 32767);
@@ -628,7 +675,7 @@ resampler_basic_interpolate_double (SpeexResamplerState * st,
      spx_word16_t interp[4];
  
  
-#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
+    SSE2_FALLBACK (INTERPOLATE_PRODUCT_DOUBLE)
      double accum[4] = { 0, 0, 0, 0 };
  
      for (j = 0; j < N; j++) {
@@ -652,12 +699,14 @@ resampler_basic_interpolate_double (SpeexResamplerState * st,
          MULT16_32_Q15 (interp[0], accum[0]) + MULT16_32_Q15 (interp[1],
          accum[1]) + MULT16_32_Q15 (interp[2],
          accum[2]) + MULT16_32_Q15 (interp[3], accum[3]);
-#else
+#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
+    SSE2_IMPLEMENTATION (INTERPOLATE_PRODUCT_DOUBLE)
      cubic_coef (frac, interp);
      sum =
          interpolate_product_double (iptr,
          st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample,
          interp);
+    SSE2_END (INTERPOLATE_PRODUCT_DOUBLE)
  #endif
  
      out[out_stride * out_sample++] = PSHR32 (sum, 15);
@@ -879,6 +928,19 @@ speex_resampler_init (spx_uint32_t nb_channels, spx_uint32_t in_rate,
        out_rate, quality, err);
  }
  
+#if defined HAVE_ORC && !defined DISABLE_ORC
+/* Enable the SIMD code paths of @st that match the instruction set @name:
+ * "sse" sets use_sse, "sse2" sets both use_sse and use_sse2.  A NULL or
+ * unrecognised @name leaves @st unchanged; flags are never cleared here. */
+static void
+check_insn_set (SpeexResamplerState * st, const char *name)
+{
+  if (name == NULL)
+    return;
+  if (strcmp (name, "sse2") == 0) {
+    /* SSE2 implies SSE. */
+    st->use_sse2 = 1;
+    st->use_sse = 1;
+  } else if (strcmp (name, "sse") == 0) {
+    st->use_sse = 1;
+  }
+}
+#endif
+
  EXPORT SpeexResamplerState *
  speex_resampler_init_frac (spx_uint32_t nb_channels, spx_uint32_t ratio_num,
      spx_uint32_t ratio_den, spx_uint32_t in_rate, spx_uint32_t out_rate,
@@ -916,6 +978,23 @@ speex_resampler_init_frac (spx_uint32_t nb_channels, spx_uint32_t ratio_num,
    st->buffer_size = 160;
  #endif
  
+  /* Start with the generic C code paths; the probe below may switch on the
+   * SSE/SSE2 ones. */
+  st->use_sse = st->use_sse2 = 0;
+#if defined HAVE_ORC && !defined DISABLE_ORC
+  orc_init ();
+  {
+    /* Ask ORC for its default target, then hand the target name and the
+     * name of every flag set in its default flags to check_insn_set (). */
+    OrcTarget *target = orc_target_get_default ();
+    if (target) {
+      unsigned int flags = orc_target_get_default_flags (target);
+      check_insn_set (st, orc_target_get_name (target));
+      for (i = 0; i < 32; ++i) {
+        /* Unsigned constant: 1 << 31 is undefined for a signed int. */
+        if (flags & (1u << i)) {
+          check_insn_set (st, orc_target_get_flag_name (target, i));
+        }
+      }
+    }
+  }
+#endif
+
    /* Per channel data */
    st->last_sample = (spx_int32_t *) speex_alloc (nb_channels * sizeof (int));
    st->magic_samples = (spx_uint32_t *) speex_alloc (nb_channels * sizeof (int));