Implement an sse version of scalarproduct_float().

author Alex Converse <alex.converse@gmail.com>

Fri, 22 Jan 2010 23:07:58 +0000 (23:07 +0000)

committer Alex Converse <alex.converse@gmail.com>

Fri, 22 Jan 2010 23:07:58 +0000 (23:07 +0000)
author Alex Converse <alex.converse@gmail.com>
Fri, 22 Jan 2010 23:07:58 +0000 (23:07 +0000)
committer Alex Converse <alex.converse@gmail.com>
Fri, 22 Jan 2010 23:07:58 +0000 (23:07 +0000)
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c

index 0283fb4..8263ac5 100644 (file)
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2510,6 +2510,8 @@ void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, ui
                                    int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
  
  
+float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); /* SSE dot product of v1 and v2; body is in dsputil_yasm.asm */
+
  void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
  {
      mm_flags = mm_support();
@@ -2965,6 +2967,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
              c->vector_clipf = vector_clipf_sse;
              c->float_to_int16 = float_to_int16_sse;
              c->float_to_int16_interleave = float_to_int16_interleave_sse;
+#if HAVE_YASM
+            c->scalarproduct_float = ff_scalarproduct_float_sse;
+#endif
          }
          if(mm_flags & FF_MM_3DNOW)
              c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm

index 023fc4d..e2478a4 100644 (file)
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -397,3 +397,27 @@ cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
  .unaligned:
      ADD_HFYU_LEFT_LOOP 0
  
+
+; float ff_scalarproduct_float_sse(const float *v1, const float *v2, int len)
+cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
+    shl offsetd, 2                    ; len*4 = byte count; the 32-bit op zero-extends, so stale upper bits of the int argument are discarded
+    neg offsetq                       ; offset = -(len*4)
+    sub v1q, offsetq                  ; v1 += len*4, i.e. point one past the last element
+    sub v2q, offsetq                  ; same for v2; [vNq+offsetq] now addresses element 0
+    xorps xmm0, xmm0                  ; clear the four partial sums
+    .loop:                            ; body runs at least once and consumes 4 floats per pass
+        movaps   xmm1, [v1q+offsetq]  ; aligned load: v1 has to be 16-byte aligned
+        mulps    xmm1, [v2q+offsetq]  ; SSE memory operand: v2 has to be 16-byte aligned too
+        addps    xmm0, xmm1           ; accumulate the four products lane by lane
+        add      offsetq, 16          ; advance by 4 floats
+        js       .loop                ; keep going while the offset is still negative
+    movhlps xmm1, xmm0                ; low half of xmm1 = upper two partial sums
+    addps   xmm0, xmm1                ; lane0 = s0+s2, lane1 = s1+s3
+    movss   xmm1, xmm0                ; keep lane0 in xmm1
+    shufps  xmm0, xmm0, 1             ; bring lane1 down into lane0
+    addss   xmm0, xmm1                ; lane0 = total sum
+%ifndef ARCH_X86_64
+    movd    r0m,  xmm0                ; spill the result into the first argument's stack slot
+    fld     dword r0m                 ; and reload it onto the x87 stack, where 32-bit callers read a float return
+%endif
+    RET
author	Alex Converse <alex.converse@gmail.com>
	Fri, 22 Jan 2010 23:07:58 +0000 (23:07 +0000)
committer	Alex Converse <alex.converse@gmail.com>
	Fri, 22 Jan 2010 23:07:58 +0000 (23:07 +0000)
libavcodec/x86/dsputil_mmx.c		patch \| blob \| history
libavcodec/x86/dsputil_yasm.asm		patch \| blob \| history