From 8d147f1f60f8912fa21b09c60973b9758e953295 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 24 Dec 2010 17:23:22 +0000 Subject: [PATCH] For rounding in chroma MC SSSE3, use 16-byte pw_3/4 instead of reading 8 bytes and then using movlhps to dup it into the higher half of the register. Originally committed as revision 26086 to svn://svn.ffmpeg.org/ffmpeg/trunk --- libavcodec/x86/dsputil_mmx.c | 2 +- libavcodec/x86/dsputil_mmx.h | 2 +- libavcodec/x86/h264_chromamc.asm | 6 ++---- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index db1e6e3..909ec41 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -41,7 +41,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL; DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] = {0x8000000080000000ULL, 0x8000000080000000ULL}; -DECLARE_ALIGNED(8, const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL}; diff --git a/libavcodec/x86/dsputil_mmx.h b/libavcodec/x86/dsputil_mmx.h index 2bd05ce..d9c2f44 100644 --- a/libavcodec/x86/dsputil_mmx.h +++ b/libavcodec/x86/dsputil_mmx.h @@ -32,7 +32,7 @@ extern const uint64_t ff_wtwo; extern const uint64_t ff_pdw_80000000[2]; -extern const uint64_t ff_pw_3; +extern const xmm_reg ff_pw_3; extern const xmm_reg ff_pw_4; extern const xmm_reg ff_pw_5; extern const xmm_reg ff_pw_8; diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm index 6df82cc..3bb5ed4 100644 --- a/libavcodec/x86/h264_chromamc.asm +++ b/libavcodec/x86/h264_chromamc.asm @@ -530,9 +530,8 @@ cglobal
%1_%2_chroma_mc8_%3, 6, 7, 8 add r4, 8 sub r4, r5 ; 255*x+8 = x<<8 | (8-x) movd m7, r4d - movq m6, [rnd_1d_%2] + movdqa m6, [rnd_1d_%2] pshuflw m7, m7, 0 - movlhps m6, m6 movlhps m7, m7 .next2xrows @@ -568,9 +567,8 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 add r5, 8 sub r5, r4 ; 255*y+8 = y<<8 | (8-y) movd m7, r5d - movq m6, [rnd_1d_%2] + movdqa m6, [rnd_1d_%2] pshuflw m7, m7, 0 - movlhps m6, m6 movlhps m7, m7 .next2yrows -- 2.7.4