3dnow2 implementation of imdct.

author Loren Merritt <lorenm@u.washington.edu>

Tue, 8 Aug 2006 04:01:04 +0000 (04:01 +0000)

committer Loren Merritt <lorenm@u.washington.edu>

Tue, 8 Aug 2006 04:01:04 +0000 (04:01 +0000)
author Loren Merritt <lorenm@u.washington.edu>
Tue, 8 Aug 2006 04:01:04 +0000 (04:01 +0000)
committer Loren Merritt <lorenm@u.washington.edu>
Tue, 8 Aug 2006 04:01:04 +0000 (04:01 +0000)
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h

index a608350294e2b154cd2ecfd0f04c46ea15f948f5..1083c39c71ac282769880db9928418326523dfec 100644 (file)
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -594,6 +594,8 @@ void get_psnr(uint8_t *orig_image[3], uint8_t *coded_image[3],
     FFTSample type */
  typedef float FFTSample;
  
+struct MDCTContext;
+
  typedef struct FFTComplex {
      FFTSample re, im;
  } FFTComplex;
@@ -605,6 +607,8 @@ typedef struct FFTContext {
      FFTComplex *exptab;
      FFTComplex *exptab1; /* only used by SSE code */
      void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
+    void (*imdct_calc)(struct MDCTContext *s, FFTSample *output,
+                       const FFTSample *input, FFTSample *tmp);
  } FFTContext;
  
  int ff_fft_init(FFTContext *s, int nbits, int inverse);
@@ -635,6 +639,8 @@ typedef struct MDCTContext {
  int ff_mdct_init(MDCTContext *s, int nbits, int inverse);
  void ff_imdct_calc(MDCTContext *s, FFTSample *output,
                  const FFTSample *input, FFTSample *tmp);
+void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
+                        const FFTSample *input, FFTSample *tmp);
  void ff_mdct_calc(MDCTContext *s, FFTSample *out,
                 const FFTSample *input, FFTSample *tmp);
  void ff_mdct_end(MDCTContext *s);
diff --git a/libavcodec/fft.c b/libavcodec/fft.c

index 1c63f6889ff3e8ec782168fedc142c7955b4181c..3d5c221ebff0f3698d78aa9a3fa2e0070771bb64 100644 (file)
--- a/libavcodec/fft.c
+++ b/libavcodec/fft.c
@@ -54,6 +54,7 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
          s->exptab[i].im = s1;
      }
      s->fft_calc = ff_fft_calc_c;
+    s->imdct_calc = ff_imdct_calc;
      s->exptab1 = NULL;
  
      /* compute constant table for HAVE_SSE version */
@@ -62,11 +63,7 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
          int has_vectors = 0;
  
  #if defined(HAVE_MMX)
-#ifdef HAVE_MM3DNOW
          has_vectors = mm_support() & (MM_3DNOW | MM_3DNOWEXT | MM_SSE | MM_SSE2);
-#else
-        has_vectors = mm_support() & (MM_SSE | MM_SSE2);
-#endif
  #endif
  #if defined(HAVE_ALTIVEC) && !defined(ALTIVEC_USE_REFERENCE_C_CODE)
          has_vectors = mm_support() & MM_ALTIVEC;
@@ -98,6 +95,8 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
              } while (nblocks != 0);
              av_freep(&s->exptab);
  #if defined(HAVE_MMX)
+            if (has_vectors & MM_3DNOWEXT)
+                s->imdct_calc = ff_imdct_calc_3dn2;
  #ifdef HAVE_MM3DNOW
              if (has_vectors & MM_3DNOWEXT)
                  /* 3DNowEx for Athlon(XP) */
diff --git a/libavcodec/i386/fft_3dn2.c b/libavcodec/i386/fft_3dn2.c

index aa8f0aee2e903b1d47a134c4bed6cc7f926565cc..40ec9d8eb1bc3893ea9bd8e123f45807bd3cffc8 100644 (file)
--- a/libavcodec/i386/fft_3dn2.c
+++ b/libavcodec/i386/fft_3dn2.c
@@ -1,6 +1,6 @@
  /*
   * FFT/MDCT transform with Extended 3DNow! optimizations
- * Copyright (c) 2006 Zuxy MENG Jie.
+ * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt
   * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
   *
   * This library is free software; you can redistribute it and/or
@@ -134,3 +134,103 @@ void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
  }
  
  #endif
+
+/**
+ * Inverse MDCT using Extended 3DNow! inline assembly.
+ *
+ * Runs three passes: a pre-rotation that scatters the rotated input into
+ * tmp at the positions given by s->fft.revtab, an in-place FFT of tmp via
+ * ff_fft_calc(), and a post-rotation followed by reordering into output.
+ *
+ * @param s      MDCT context; nbits, tcos, tsin and fft are read
+ * @param output receives n = 1 << s->nbits samples
+ * @param input  n/2 coefficients, read only
+ * @param tmp    scratch buffer, accessed as an array of FFTComplex
+ */
+void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
+                        const FFTSample *input, FFTSample *tmp)
+{
+    int k, n8, n4, n2, n;
+    const uint16_t *revtab = s->fft.revtab;
+    const FFTSample *tcos = s->tcos;
+    const FFTSample *tsin = s->tsin;
+    const FFTSample *in1, *in2;
+    FFTComplex *z = (FFTComplex *)tmp;
+
+    n = 1 << s->nbits;
+    n2 = n >> 1;
+    n4 = n >> 2;
+    n8 = n >> 3;
+
+    /* pre rotation:
+     * z[revtab[k]] = (in2[-2k] + i*in1[2k]) * (tcos[k] + i*tsin[k]) */
+    in1 = input;
+    in2 = input + n2 - 1;
+    for(k = 0; k < n4; k++) {
+        asm volatile(
+            "movd       %1, %%mm0 \n\t"
+            "movd       %3, %%mm1 \n\t"
+            "punpckldq  %2, %%mm0 \n\t" // mm0 = { in2[-2k], in1[2k] }
+            "punpckldq  %4, %%mm1 \n\t" // mm1 = { tcos[k], tsin[k] }
+            "movq    %%mm0, %%mm2 \n\t"
+            "pfmul   %%mm1, %%mm0 \n\t"
+            "pswapd  %%mm1, %%mm1 \n\t"
+            "pfmul   %%mm1, %%mm2 \n\t"
+            "pfpnacc %%mm2, %%mm0 \n\t" // mm0 = { re, im } of the product
+            "movq    %%mm0, %0    \n\t"
+            :"=m"(z[revtab[k]])
+            :"m"(in2[-2*k]), "m"(in1[2*k]),
+             "m"(tcos[k]), "m"(tsin[k])
+        );
+    }
+
+    ff_fft_calc(&s->fft, z);
+
+    /* post rotation + reordering */
+    /* rotation, in place: z[k] = z[k] * (tcos[k] + i*tsin[k]) */
+    for(k = 0; k < n4; k++) {
+        asm volatile(
+            "movq       %0, %%mm0 \n\t"
+            "movd       %1, %%mm1 \n\t"
+            "punpckldq  %2, %%mm1 \n\t" // mm1 = { tcos[k], tsin[k] }
+            "movq    %%mm0, %%mm2 \n\t"
+            "pfmul   %%mm1, %%mm0 \n\t"
+            "pswapd  %%mm1, %%mm1 \n\t"
+            "pfmul   %%mm1, %%mm2 \n\t"
+            "pfpnacc %%mm2, %%mm0 \n\t"
+            "movq    %%mm0, %0    \n\t"
+            :"+m"(z[k])
+            :"m"(tcos[k]), "m"(tsin[k])
+        );
+    }
+
+    /* mm7 = { 0x80000000, 0 }, a sign-bit mask for the low float of a pair.
+     * The constant is unsigned: 1<<31 would overflow a 32-bit signed int.
+     * NOTE(review): mm7 has to survive across the asm statements of the loop
+     * below, yet no constraint tells the compiler so. */
+    asm volatile("movd %0, %%mm7" ::"r"(1U<<31));
+    for(k = 0; k < n8; k++) {
+        asm volatile(
+            "movq         %4, %%mm0 \n\t"
+            "pswapd       %5, %%mm1 \n\t"
+            "movq      %%mm0, %%mm2 \n\t"
+            "pxor      %%mm7, %%mm2 \n\t"
+            "punpckldq %%mm1, %%mm2 \n\t"
+            "pswapd    %%mm2, %%mm3 \n\t"
+            "punpckhdq %%mm1, %%mm0 \n\t"
+            "pswapd    %%mm0, %%mm4 \n\t"
+            "pxor      %%mm7, %%mm0 \n\t"
+            "pxor      %%mm7, %%mm4 \n\t"
+            "movq      %%mm0, %0    \n\t" // { -z[n8+k].im, z[n8-1-k].re }
+            "movq      %%mm4, %1    \n\t" // { -z[n8-1-k].re, z[n8+k].im }
+            "movq      %%mm2, %2    \n\t" // { -z[n8+k].re, z[n8-1-k].im }
+            "movq      %%mm3, %3    \n\t" // { z[n8-1-k].im, -z[n8+k].re }
+            :"=m"(output[2*k]), "=m"(output[n2-2-2*k]),
+             "=m"(output[n2+2*k]), "=m"(output[n-2-2*k])
+            :"m"(z[n8+k]), "m"(z[n8-1-k])
+            :"memory" // each movq stores two floats, the "=m" operands name one
+        );
+    }
+    /* clear the MMX state before returning to code that may use the x87 FPU */
+    asm volatile("emms");
+}
diff --git a/libavcodec/vorbis.c b/libavcodec/vorbis.c

index 0bd318d6b12fe477487e8828a5214443cdd84e9e..bf2cb358bf1fd0e2901c1ecb69f7872b52e286b4 100644 (file)
--- a/libavcodec/vorbis.c
+++ b/libavcodec/vorbis.c
@@ -1598,7 +1598,7 @@ static int vorbis_parse_audio_packet(vorbis_context *vc) {
  
          saved_start=vc->saved_start;
  
-        ff_imdct_calc(vc->modes[mode_number].blockflag ? &vc->mdct1 : &vc->mdct0, buf, ch_floor_ptr, buf_tmp);
+        vc->mdct0.fft.imdct_calc(vc->modes[mode_number].blockflag ? &vc->mdct1 : &vc->mdct0, buf, ch_floor_ptr, buf_tmp);
  
          if (vc->modes[mode_number].blockflag) {
              // -- overlap/add
diff --git a/libavcodec/wmadec.c b/libavcodec/wmadec.c

index 227c9695ba8a53bda94d2aefaf04c02f3bd4a58a..6f33d2a8f85b08fadbacfcc1818f6a0f0b44c7bf 100644 (file)
--- a/libavcodec/wmadec.c
+++ b/libavcodec/wmadec.c
@@ -1113,7 +1113,7 @@ static int wma_decode_block(WMADecodeContext *s)
  
              n = s->block_len;
              n4 = s->block_len / 2;
-            ff_imdct_calc(&s->mdct_ctx[bsize],
+            s->mdct_ctx[bsize].fft.imdct_calc(&s->mdct_ctx[bsize],
                            output, s->coefs[ch], s->mdct_tmp);
  
              /* XXX: optimize all that by build the window and
author	Loren Merritt <lorenm@u.washington.edu>
	Tue, 8 Aug 2006 04:01:04 +0000 (04:01 +0000)
committer	Loren Merritt <lorenm@u.washington.edu>
	Tue, 8 Aug 2006 04:01:04 +0000 (04:01 +0000)
libavcodec/dsputil.h		patch \| blob \| history
libavcodec/fft.c		patch \| blob \| history
libavcodec/i386/fft_3dn2.c		patch \| blob \| history
libavcodec/vorbis.c		patch \| blob \| history
libavcodec/wmadec.c		patch \| blob \| history