x86: fft: replace call to memcpy with a loop

author Christophe Gisquet <christophe.gisquet@gmail.com>

Tue, 26 Jun 2012 14:10:33 +0000 (16:10 +0200)

committer Mans Rullgard <mans@mansr.com>

Wed, 27 Jun 2012 11:49:33 +0000 (12:49 +0100)
author Christophe Gisquet <christophe.gisquet@gmail.com>
Tue, 26 Jun 2012 14:10:33 +0000 (16:10 +0200)
committer Mans Rullgard <mans@mansr.com>
Wed, 27 Jun 2012 11:49:33 +0000 (12:49 +0100)
diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm

index 007f5caf7778cf7db0e178152887444b0ae8a704..1a430b9c2c8038eae46d2f6ee282c25c992d17e7 100644 (file)
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@@ -615,8 +615,6 @@ cglobal fft_calc, 2,5,8
  .end:
      REP_RET
  
-cextern_naked memcpy
-
  cglobal fft_permute, 2,7,1
      mov     r4,  [r0 + FFTContext.revtab]
      mov     r5,  [r0 + FFTContext.tmpbuf]
@@ -637,29 +635,18 @@ cglobal fft_permute, 2,7,1
      cmp     r0, r2
      jl      .loop
      shl     r2, 3
-%if ARCH_X86_64
-    mov     r0, r1
-    mov     r1, r5
-%endif
-%if WIN64
-    sub     rsp, 8
-    call    memcpy
-    add     rsp, 8
-    RET
-%elif ARCH_X86_64
-%ifdef PIC
-    jmp     memcpy wrt ..plt
-%else
-    jmp     memcpy
-%endif
-%else
-    push    r2
-    push    r5
-    push    r1
-    call    memcpy
-    add     esp, 12
-    RET
-%endif
+    add     r1, r2
+    add     r5, r2
+    neg     r2
+; nbits >= 2 (FFT4) and sizeof(FFTComplex) = 8 => at least 32 bytes to copy, so the first 32-byte iteration below never overruns
+.loopcopy:
+    movaps  xmm0, [r5 + r2]
+    movaps  xmm1, [r5 + r2 + 16]
+    movaps  [r1 + r2], xmm0
+    movaps  [r1 + r2 + 16], xmm1
+    add     r2, 32
+    jl      .loopcopy
+    REP_RET
  
  cglobal imdct_calc, 3,5,3
      mov     r3d, [r0 + FFTContext.mdctsize]
author	Christophe Gisquet <christophe.gisquet@gmail.com>
	Tue, 26 Jun 2012 14:10:33 +0000 (16:10 +0200)
committer	Mans Rullgard <mans@mansr.com>
	Wed, 27 Jun 2012 11:49:33 +0000 (12:49 +0100)