gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registe...

author Loren Merritt <lorenm@u.washington.edu>

Wed, 9 Aug 2006 06:33:49 +0000 (06:33 +0000)

committer Loren Merritt <lorenm@u.washington.edu>

Wed, 9 Aug 2006 06:33:49 +0000 (06:33 +0000)
author Loren Merritt <lorenm@u.washington.edu>
Wed, 9 Aug 2006 06:33:49 +0000 (06:33 +0000)
committer Loren Merritt <lorenm@u.washington.edu>
Wed, 9 Aug 2006 06:33:49 +0000 (06:33 +0000)
diff --git a/libavcodec/i386/fft_3dn2.c b/libavcodec/i386/fft_3dn2.c

index 80dece700d328021d25118fa2c94c1cc3c9882f4..24d7799d5828ee33d2cd667c5a8d3bec745d309f 100644 (file)
--- a/libavcodec/i386/fft_3dn2.c
+++ b/libavcodec/i386/fft_3dn2.c
@@ -154,20 +154,23 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
      in1 = input;
      in2 = input + n2 - 1;
      for(k = 0; k < n4; k++) {
+        // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it
          asm volatile(
-            "movd       %1, %%mm0 \n\t"
-            "movd       %3, %%mm1 \n\t"
-            "punpckldq  %2, %%mm0 \n\t"
-            "punpckldq  %4, %%mm1 \n\t"
+            "movd       %0, %%mm0 \n\t"
+            "movd       %2, %%mm1 \n\t"
+            "punpckldq  %1, %%mm0 \n\t"
+            "punpckldq  %3, %%mm1 \n\t"
              "movq    %%mm0, %%mm2 \n\t"
              "pfmul   %%mm1, %%mm0 \n\t"
              "pswapd  %%mm1, %%mm1 \n\t"
              "pfmul   %%mm1, %%mm2 \n\t"
              "pfpnacc %%mm2, %%mm0 \n\t"
+            ::"m"(in2[-2*k]), "m"(in1[2*k]),
+              "m"(tcos[k]), "m"(tsin[k])
+        );
+        asm volatile(
              "movq    %%mm0, %0    \n\t"
              :"=m"(z[revtab[k]])
-            :"m"(in2[-2*k]), "m"(in1[2*k]),
-             "m"(tcos[k]), "m"(tsin[k])
          );
      }
  
@@ -190,11 +193,15 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
          );
      }
  
+    z += n8;
      asm volatile("movd %0, %%mm7" ::"r"(1<<31));
      for(k = 0; k < n8; k++) {
          asm volatile(
-            "movq         %4, %%mm0 \n\t"
-            "pswapd       %5, %%mm1 \n\t"
+            "movq         %0, %%mm0 \n\t"
+            "pswapd       %1, %%mm1 \n\t"
+            ::"m"(z[k]), "m"(z[-1-k])
+        );
+        asm volatile(
              "movq      %%mm0, %%mm2 \n\t"
              "pxor      %%mm7, %%mm2 \n\t"
              "punpckldq %%mm1, %%mm2 \n\t"
@@ -209,8 +216,7 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
              "movq      %%mm3, %3    \n\t" // { z[n8-1-k].im, -z[n8+k].re }
              :"=m"(output[2*k]), "=m"(output[n2-2-2*k]),
               "=m"(output[n2+2*k]), "=m"(output[n-2-2*k])
-            :"m"(z[n8+k]), "m"(z[n8-1-k])
-            :"memory"
+            ::"memory"
          );
      }
      asm volatile("emms");
author	Loren Merritt <lorenm@u.washington.edu>
	Wed, 9 Aug 2006 06:33:49 +0000 (06:33 +0000)
committer	Loren Merritt <lorenm@u.washington.edu>
	Wed, 9 Aug 2006 06:33:49 +0000 (06:33 +0000)