fmtconvert: port int32_to_float_fmul_scalar() x86 inline asm to yasm

author Justin Ruggles <justin.ruggles@gmail.com>

Mon, 10 Oct 2011 03:52:03 +0000 (23:52 -0400)

committer Justin Ruggles <justin.ruggles@gmail.com>

Fri, 21 Oct 2011 14:13:05 +0000 (10:13 -0400)
author Justin Ruggles <justin.ruggles@gmail.com>
Mon, 10 Oct 2011 03:52:03 +0000 (23:52 -0400)
committer Justin Ruggles <justin.ruggles@gmail.com>
Fri, 21 Oct 2011 14:13:05 +0000 (10:13 -0400)
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm

index 2a21084..fe96d8b 100644 (file)
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -1055,14 +1055,6 @@ emu_edge mmx
  ;                           int32_t max, unsigned int len)
  ;-----------------------------------------------------------------------------
  
-%macro SPLATD_MMX 1
-    punpckldq  %1, %1
-%endmacro
-
-%macro SPLATD_SSE2 1
-    pshufd  %1, %1, 0
-%endmacro
-
  %macro VECTOR_CLIP_INT32 4
  cglobal vector_clip_int32_%1, 5,5,%2, dst, src, min, max, len
  %ifidn %1, sse2
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm

index d314a4e..e3eb5d2 100644 (file)
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -24,6 +24,65 @@
  
  SECTION_TEXT
  
+;---------------------------------------------------------------------------------
+; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
+;
+; Convert len int32 samples from src to float, scale each by mul and store
+; them to dst. The loop runs at least once and handles 8 samples (32 bytes)
+; per iteration using mova stores, so len must be a positive multiple of 8.
+;
+; %1 = instruction set suffix (sse or sse2), %2 = number of XMM registers used
+;---------------------------------------------------------------------------------
+%macro INT32_TO_FLOAT_FMUL_SCALAR 2
+; mul arrives in xmm0 (64-bit SysV), in xmm2 (Win64, third argument slot)
+; or on the stack (x86-32); in every case it ends up in m0.
+%ifdef ARCH_X86_64
+%ifdef WIN64
+cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
+    SWAP 0, 2                   ; rename registers so that m0 is xmm2 (mul)
+%else
+cglobal int32_to_float_fmul_scalar_%1, 3,3,%2, dst, src, len
+%endif
+%else
+cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
+    movss   m0, mulm
+%endif
+    SPLATD  m0
+    shl     lend, 2             ; len is a 32-bit int: the dword shift zero-extends into lenq
+    add     srcq, lenq          ; point src/dst at the end of the buffers ...
+    add     dstq, lenq
+    neg     lenq                ; ... and count a negative byte offset up towards zero
+.loop:
+%ifidn %1, sse2
+    cvtdq2ps  m1, [srcq+lenq   ]
+    cvtdq2ps  m2, [srcq+lenq+16]
+%else
+    cvtpi2ps  m1, [srcq+lenq   ]
+    cvtpi2ps  m3, [srcq+lenq+ 8]
+    cvtpi2ps  m2, [srcq+lenq+16]
+    cvtpi2ps  m4, [srcq+lenq+24]
+    movlhps   m1, m3            ; cvtpi2ps converts two ints at a time: merge the pairs
+    movlhps   m2, m4
+%endif
+    mulps     m1, m0
+    mulps     m2, m0
+    mova  [dstq+lenq   ], m1
+    mova  [dstq+lenq+16], m2
+    add     lenq, 32
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM
+%define SPLATD SPLATD_SSE              ; SSE build: broadcast mul with shufps
+%define movdqa movaps                  ; NOTE(review): presumably mova expands to movdqa under INIT_XMM, so it is remapped to the SSE1 movaps here - confirm against x86inc.asm
+INT32_TO_FLOAT_FMUL_SCALAR sse, 5      ; SSE variant, uses m0-m4
+%undef movdqa
+%define SPLATD SPLATD_SSE2             ; SSE2 build: broadcast mul with pshufd
+INT32_TO_FLOAT_FMUL_SCALAR sse2, 3     ; SSE2 variant, uses m0-m2
+%undef SPLATD
+
+
  ;------------------------------------------------------------------------------
  ; void ff_float_to_int16(int16_t *dst, const float *src, long len);
  ;------------------------------------------------------------------------------
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c

index 6e43280..86957b4 100644 (file)
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -26,52 +26,11 @@
  #include "libavutil/x86_cpu.h"
  #include "libavcodec/fmtconvert.h"
  
-static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
-{
-    x86_reg i = -4*len;
-    __asm__ volatile(
-        "movss  %3, %%xmm4 \n"
-        "shufps $0, %%xmm4, %%xmm4 \n"
-        "1: \n"
-        "cvtpi2ps   (%2,%0), %%xmm0 \n"
-        "cvtpi2ps  8(%2,%0), %%xmm1 \n"
-        "cvtpi2ps 16(%2,%0), %%xmm2 \n"
-        "cvtpi2ps 24(%2,%0), %%xmm3 \n"
-        "movlhps  %%xmm1,    %%xmm0 \n"
-        "movlhps  %%xmm3,    %%xmm2 \n"
-        "mulps    %%xmm4,    %%xmm0 \n"
-        "mulps    %%xmm4,    %%xmm2 \n"
-        "movaps   %%xmm0,   (%1,%0) \n"
-        "movaps   %%xmm2, 16(%1,%0) \n"
-        "add $32, %0 \n"
-        "jl 1b \n"
-        :"+r"(i)
-        :"r"(dst+len), "r"(src+len), "m"(mul)
-    );
-}
-
-static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
-{
-    x86_reg i = -4*len;
-    __asm__ volatile(
-        "movss  %3, %%xmm4 \n"
-        "shufps $0, %%xmm4, %%xmm4 \n"
-        "1: \n"
-        "cvtdq2ps   (%2,%0), %%xmm0 \n"
-        "cvtdq2ps 16(%2,%0), %%xmm1 \n"
-        "mulps    %%xmm4,    %%xmm0 \n"
-        "mulps    %%xmm4,    %%xmm1 \n"
-        "movaps   %%xmm0,   (%1,%0) \n"
-        "movaps   %%xmm1, 16(%1,%0) \n"
-        "add $32, %0 \n"
-        "jl 1b \n"
-        :"+r"(i)
-        :"r"(dst+len), "r"(src+len), "m"(mul)
-    );
-}
-
  #if HAVE_YASM
  
+void ff_int32_to_float_fmul_scalar_sse (float *dst, const int *src, float mul, int len); /* SSE version; NOTE(review): presumably the yasm symbol from fmtconvert.asm - confirm the ff_ prefix */
+void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len); /* SSE2 version, same contract as the SSE one */
+
  void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
  void ff_float_to_int16_sse  (int16_t *dst, const float *src, long len);
  void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
@@ -204,8 +163,8 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
  {
      int mm_flags = av_get_cpu_flags();
  
-    if (mm_flags & AV_CPU_FLAG_MMX) {
  #if HAVE_YASM
+    if (mm_flags & AV_CPU_FLAG_MMX) {
          c->float_interleave = float_interleave_mmx;
  
          if (HAVE_AMD3DNOW && mm_flags & AV_CPU_FLAG_3DNOW) {
@@ -219,21 +178,17 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
                  c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
              }
          }
-#endif
          if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE) {
-            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
-#if HAVE_YASM
+            c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
              c->float_to_int16 = ff_float_to_int16_sse;
              c->float_to_int16_interleave = float_to_int16_interleave_sse;
              c->float_interleave = float_interleave_sse;
-#endif
          }
          if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE2) {
-            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
-#if HAVE_YASM
+            c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
              c->float_to_int16 = ff_float_to_int16_sse2;
              c->float_to_int16_interleave = float_to_int16_interleave_sse2;
-#endif
          }
      }
+#endif
  }
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm

index 7e16c15..874443a 100644 (file)
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -536,6 +536,18 @@
  %endif
  %endmacro
  
+%macro SPLATD_MMX 1 ;(reg) replicate the low dword of a 64-bit MMX register
+    punpckldq  %1, %1   ; interleaving the register with itself yields {low, low}
+%endmacro
+
+%macro SPLATD_SSE 1 ;(reg) replicate the low dword of an XMM register using SSE1 only
+    shufps  %1, %1, 0   ; selector 0 picks element 0 for all four lanes
+%endmacro
+
+%macro SPLATD_SSE2 1 ;(reg) replicate the low dword of an XMM register (SSE2 integer shuffle)
+    pshufd  %1, %1, 0   ; selector 0 picks element 0 for all four lanes
+%endmacro
+
  %macro CLIPW 3 ;(dst, min, max)
      pmaxsw %1, %2
      pminsw %1, %3
author	Justin Ruggles <justin.ruggles@gmail.com>
	Mon, 10 Oct 2011 03:52:03 +0000 (23:52 -0400)
committer	Justin Ruggles <justin.ruggles@gmail.com>
	Fri, 21 Oct 2011 14:13:05 +0000 (10:13 -0400)
libavcodec/x86/dsputil_yasm.asm		patch \| blob \| history
libavcodec/x86/fmtconvert.asm		patch \| blob \| history
libavcodec/x86/fmtconvert_mmx.c		patch \| blob \| history
libavutil/x86/x86util.asm		patch \| blob \| history