From 71155d7b4157fee44c0d3d0fc1b660ebfb9ccf46 Mon Sep 17 00:00:00 2001 From: Daniel Kang Date: Sun, 27 Jan 2013 03:45:43 +0000 Subject: [PATCH] dsputil: x86: Convert mpeg4 qpel and dsputil avg to yasm Signed-off-by: Luca Barbato --- libavcodec/x86/Makefile | 2 + libavcodec/x86/dsputil_avg_template.c | 789 ++---------------------------- libavcodec/x86/dsputil_mmx.c | 874 +++++++++++++--------------------- libavcodec/x86/hpeldsp.asm | 465 ++++++++++++++++++ libavcodec/x86/mpeg4qpel.asm | 558 ++++++++++++++++++++++ libavcodec/x86/vc1dsp_mmx.c | 4 + 6 files changed, 1380 insertions(+), 1312 deletions(-) create mode 100644 libavcodec/x86/hpeldsp.asm create mode 100644 libavcodec/x86/mpeg4qpel.asm diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 9b8b653..1feb060 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -71,3 +71,5 @@ YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o YASM-OBJS += x86/dsputil.o \ x86/deinterlace.o \ x86/fmtconvert.o \ + x86/hpeldsp.o \ + x86/mpeg4qpel.o \ diff --git a/libavcodec/x86/dsputil_avg_template.c b/libavcodec/x86/dsputil_avg_template.c index 4fc188c..90e4074 100644 --- a/libavcodec/x86/dsputil_avg_template.c +++ b/libavcodec/x86/dsputil_avg_template.c @@ -24,781 +24,54 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -/* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm - clobber bug - now it will work with 2.95.2 and also with -fPIC - */ -static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - PAVGB" 1(%1), %%mm0 \n\t" - PAVGB" 1(%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - PAVGB" 1(%1), %%mm0 \n\t" - PAVGB" 1(%1, %3), %%mm1 \n\t" - "add %%"REG_a", %1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} - -#ifndef SKIP_FOR_3DNOW -static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $8, %2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "movq %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" 16(%2), %%mm0 \n\t" - PAVGB" 24(%2), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "movq %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" -#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -//the following should be used, though better not with gcc ... 
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) - :"r"(src1Stride), "r"(dstStride) - :"memory");*/ -} - -static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "pcmpeqb %%mm6, %%mm6 \n\t" - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $8, %2 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - "movq (%2), %%mm2 \n\t" - "movq 8(%2), %%mm3 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "pxor %%mm6, %%mm2 \n\t" - "pxor %%mm6, %%mm3 \n\t" - PAVGB" %%mm2, %%mm0 \n\t" - PAVGB" %%mm3, %%mm1 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "movq %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - "movq 16(%2), %%mm2 \n\t" - "movq 24(%2), %%mm3 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "pxor %%mm6, %%mm2 \n\t" - "pxor %%mm6, %%mm3 \n\t" - PAVGB" %%mm2, %%mm0 \n\t" - PAVGB" %%mm3, %%mm1 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "movq %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" -#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -//the following should be used, though better not with gcc ... -/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) - :"r"(src1Stride), "r"(dstStride) - :"memory");*/ -} - -static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $8, %2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" (%3), %%mm0 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - PAVGB" (%3), %%mm0 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - PAVGB" (%3), %%mm1 \n\t" - "movq %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" 16(%2), %%mm0 \n\t" - PAVGB" 24(%2), %%mm1 \n\t" - PAVGB" (%3), %%mm0 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - PAVGB" (%3), %%mm1 \n\t" - "movq %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" -#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -//the following should be used, though better not with gcc ... 
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) - :"r"(src1Stride), "r"(dstStride) - :"memory");*/ -} -#endif /* SKIP_FOR_3DNOW */ - -static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq 8(%1), %%mm2 \n\t" - "movq 8(%1, %3), %%mm3 \n\t" - PAVGB" 1(%1), %%mm0 \n\t" - PAVGB" 1(%1, %3), %%mm1 \n\t" - PAVGB" 9(%1), %%mm2 \n\t" - PAVGB" 9(%1, %3), %%mm3 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq %%mm2, 8(%2) \n\t" - "movq %%mm3, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq 8(%1), %%mm2 \n\t" - "movq 8(%1, %3), %%mm3 \n\t" - PAVGB" 1(%1), %%mm0 \n\t" - PAVGB" 1(%1, %3), %%mm1 \n\t" - PAVGB" 9(%1), %%mm2 \n\t" - PAVGB" 9(%1, %3), %%mm3 \n\t" - "add %%"REG_a", %1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq %%mm2, 8(%2) \n\t" - "movq %%mm3, 8(%2, %3) \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} - -#ifndef SKIP_FOR_3DNOW -static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $16, %2 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" 16(%2), %%mm0 \n\t" - PAVGB" 24(%2), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $2, %0 \n\t" - "jnz 1b \n\t" -#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -//the following should be used, though better not with gcc ... 
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) - :"r"(src1Stride), "r"(dstStride) - :"memory");*/ -} - -static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $16, %2 \n\t" - PAVGB" (%3), %%mm0 \n\t" - PAVGB" 8(%3), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - PAVGB" (%3), %%mm0 \n\t" - PAVGB" 8(%3), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" 16(%2), %%mm0 \n\t" - PAVGB" 24(%2), %%mm1 \n\t" - PAVGB" (%3), %%mm0 \n\t" - PAVGB" 8(%3), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $2, %0 \n\t" - "jnz 1b \n\t" -#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -//the following should be used, though better not with gcc ... -/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) - :"r"(src1Stride), "r"(dstStride) - :"memory");*/ -} - -static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "pcmpeqb %%mm6, %%mm6 \n\t" - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "movq (%2), %%mm2 \n\t" - "movq 8(%2), %%mm3 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "pxor %%mm6, %%mm2 \n\t" - "pxor %%mm6, %%mm3 \n\t" - PAVGB" %%mm2, %%mm0 \n\t" - PAVGB" %%mm3, %%mm1 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "add %4, %1 \n\t" - "add $16, %2 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "add %4, %1 \n\t" - "movq (%2), %%mm2 \n\t" - "movq 8(%2), %%mm3 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "pxor %%mm6, %%mm2 \n\t" - "pxor %%mm6, %%mm3 \n\t" - PAVGB" %%mm2, %%mm0 \n\t" - PAVGB" %%mm3, %%mm1 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "add %4, %1 \n\t" - "movq 16(%2), %%mm2 \n\t" - "movq 24(%2), %%mm3 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "pxor %%mm6, %%mm2 \n\t" - "pxor %%mm6, %%mm3 \n\t" - PAVGB" %%mm2, %%mm0 \n\t" - PAVGB" %%mm3, %%mm1 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $2, %0 \n\t" - "jnz 1b \n\t" -#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -//the following should be used, though better not with gcc ... 
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) - :"r"(src1Stride), "r"(dstStride) - :"memory");*/ -} -#endif /* SKIP_FOR_3DNOW */ - -/* GL: this function does incorrect rounding if overflow */ -static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BONE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - "add %%"REG_a", %1 \n\t" - "psubusb %%mm6, %%mm0 \n\t" - "psubusb %%mm6, %%mm2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm2, (%2, %3) \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - "add %%"REG_a", %2 \n\t" - "add %%"REG_a", %1 \n\t" - "psubusb %%mm6, %%mm0 \n\t" - "psubusb %%mm6, %%mm2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm2, (%2, %3) \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} - -static void DEF(put_no_rnd_pixels8_x2_exact)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile ( - "pcmpeqb %%mm6, %%mm6 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm2 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "pxor %%mm6, %%mm3 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm2 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm2, (%2, %3) \n\t" - "movq (%1, %3,2), %%mm0 \n\t" - "movq 1(%1, %3,2), %%mm1 \n\t" - "movq (%1, %4), %%mm2 \n\t" - "movq 1(%1, %4), %%mm3 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "pxor %%mm6, %%mm2 \n\t" - "pxor %%mm6, %%mm3 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm2 \n\t" - "movq %%mm0, (%2, %3,2) \n\t" - "movq %%mm2, (%2, %4) \n\t" - "lea (%1, %3,4), %1 \n\t" - "lea (%2, %3,4), %2 \n\t" - "subl $4, %0 \n\t" - "jg 1b \n\t" - : "+g"(h), "+r"(pixels), "+r"(block) - : "r" ((x86_reg)line_size), "r"((x86_reg)3*line_size) - : "memory" - ); -} - -static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +//FIXME the following could be optimized too ... 
+static void DEF(ff_put_no_rnd_pixels16_x2)(uint8_t *block, + const uint8_t *pixels, + int line_size, int h) { - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "movq (%1), %%mm0 \n\t" - "sub %3, %2 \n\t" - "1: \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm2 \n\t" - "add %%"REG_a", %1 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm2, %%mm1 \n\t" - "movq %%mm0, (%2, %3) \n\t" - "movq %%mm1, (%2, %%"REG_a") \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "add %%"REG_a", %2 \n\t" - "add %%"REG_a", %1 \n\t" - PAVGB" %%mm1, %%mm2 \n\t" - PAVGB" %%mm0, %%mm1 \n\t" - "movq %%mm2, (%2, %3) \n\t" - "movq %%mm1, (%2, %%"REG_a") \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D" (block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); + DEF(ff_put_no_rnd_pixels8_x2)(block, pixels, line_size, h); + DEF(ff_put_no_rnd_pixels8_x2)(block + 8, pixels + 8, line_size, h); } -/* GL: this function does incorrect rounding if overflow */ -static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +static void DEF(ff_put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, + int line_size, int h) { - MOVQ_BONE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "movq (%1), %%mm0 \n\t" - "sub %3, %2 \n\t" - "1: \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm2 \n\t" - "add %%"REG_a", %1 \n\t" - "psubusb %%mm6, %%mm1 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm2, %%mm1 \n\t" - "movq %%mm0, (%2, %3) \n\t" - "movq %%mm1, (%2, %%"REG_a") \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "add %%"REG_a", %2 \n\t" - "add %%"REG_a", %1 \n\t" - "psubusb %%mm6, %%mm1 \n\t" - PAVGB" %%mm1, %%mm2 \n\t" - PAVGB" %%mm0, %%mm1 \n\t" - "movq %%mm2, (%2, %3) \n\t" - "movq %%mm1, (%2, %%"REG_a") \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D" (block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); + DEF(ff_put_pixels8_y2)(block, pixels, line_size, h); + DEF(ff_put_pixels8_y2)(block + 8, pixels + 8, line_size, h); } -static void DEF(put_no_rnd_pixels8_y2_exact)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +static void DEF(ff_put_no_rnd_pixels16_y2)(uint8_t *block, + const uint8_t *pixels, + int line_size, int h) { - __asm__ volatile ( - "movq (%1), %%mm0 \n\t" - "pcmpeqb %%mm6, %%mm6 \n\t" - "add %3, %1 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "1: \n\t" - "movq (%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "pxor %%mm6, %%mm2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm2, %%mm1 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq (%1, %3,2), %%mm1 \n\t" - "movq (%1, %4), %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "pxor %%mm6, %%mm0 \n\t" - PAVGB" %%mm1, %%mm2 \n\t" - PAVGB" %%mm0, %%mm1 \n\t" - "pxor %%mm6, %%mm2 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "movq %%mm2, (%2, %3,2) \n\t" - "movq %%mm1, (%2, %4) \n\t" - "lea (%1, %3,4), %1 \n\t" - "lea (%2, %3,4), %2 \n\t" - "subl $4, %0 \n\t" - "jg 1b \n\t" - :"+g"(h), "+r"(pixels), "+r" (block) - :"r" ((x86_reg)line_size), "r"((x86_reg)3*line_size) - :"memory" - ); + DEF(ff_put_no_rnd_pixels8_y2)(block, pixels, line_size, h); + DEF(ff_put_no_rnd_pixels8_y2)(block + 8, pixels + 8, line_size, h); } -static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +static void DEF(ff_avg_pixels16)(uint8_t 
*block, const uint8_t *pixels, + int line_size, int h) { - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "1: \n\t" - "movq (%2), %%mm0 \n\t" - "movq (%2, %3), %%mm1 \n\t" - PAVGB" (%1), %%mm0 \n\t" - PAVGB" (%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%2), %%mm0 \n\t" - "movq (%2, %3), %%mm1 \n\t" - PAVGB" (%1), %%mm0 \n\t" - PAVGB" (%1, %3), %%mm1 \n\t" - "add %%"REG_a", %1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); + DEF(ff_avg_pixels8)(block, pixels, line_size, h); + DEF(ff_avg_pixels8)(block + 8, pixels + 8, line_size, h); } -static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +static void DEF(ff_avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, + int line_size, int h) { - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm2 \n\t" - PAVGB" 1(%1), %%mm0 \n\t" - PAVGB" 1(%1, %3), %%mm2 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" (%2, %3), %%mm2 \n\t" - "add %%"REG_a", %1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm2, (%2, %3) \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm2 \n\t" - PAVGB" 1(%1), %%mm0 \n\t" - PAVGB" 1(%1, %3), %%mm2 \n\t" - "add %%"REG_a", %2 \n\t" - "add %%"REG_a", %1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" (%2, %3), %%mm2 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm2, (%2, %3) \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); + DEF(ff_avg_pixels8_x2)(block, pixels, line_size, h); + DEF(ff_avg_pixels8_x2)(block + 8, pixels + 8, line_size, h); } -static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +static void DEF(ff_avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, + int line_size, int h) { - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "movq (%1), %%mm0 \n\t" - "sub %3, %2 \n\t" - "1: \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm2 \n\t" - "add %%"REG_a", %1 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm2, %%mm1 \n\t" - "movq (%2, %3), %%mm3 \n\t" - "movq (%2, %%"REG_a"), %%mm4 \n\t" - PAVGB" %%mm3, %%mm0 \n\t" - PAVGB" %%mm4, %%mm1 \n\t" - "movq %%mm0, (%2, %3) \n\t" - "movq %%mm1, (%2, %%"REG_a") \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - PAVGB" %%mm1, %%mm2 \n\t" - PAVGB" %%mm0, %%mm1 \n\t" - "add %%"REG_a", %2 \n\t" - "add %%"REG_a", %1 \n\t" - "movq (%2, %3), %%mm3 \n\t" - "movq (%2, %%"REG_a"), %%mm4 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - PAVGB" %%mm4, %%mm1 \n\t" - "movq %%mm2, (%2, %3) \n\t" - "movq %%mm1, (%2, %%"REG_a") \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); + DEF(ff_avg_pixels8_y2)(block, pixels, line_size, h); + DEF(ff_avg_pixels8_y2)(block + 8, pixels + 8, line_size, h); } -/* Note this is not correctly rounded, but this function is only - * used for B-frames so it does not matter. 
*/ -static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +static void DEF(ff_avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, + int line_size, int h) { - MOVQ_BONE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "movq (%1), %%mm0 \n\t" - PAVGB" 1(%1), %%mm0 \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm2 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "psubusb %%mm6, %%mm2 \n\t" - PAVGB" 1(%1, %3), %%mm1 \n\t" - PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t" - "add %%"REG_a", %1 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm2, %%mm1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" (%2, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - PAVGB" 1(%1, %3), %%mm1 \n\t" - PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t" - "add %%"REG_a", %2 \n\t" - "add %%"REG_a", %1 \n\t" - PAVGB" %%mm1, %%mm2 \n\t" - PAVGB" %%mm0, %%mm1 \n\t" - PAVGB" (%2), %%mm2 \n\t" - PAVGB" (%2, %3), %%mm1 \n\t" - "movq %%mm2, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} - -//FIXME the following could be optimized too ... -static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h); - DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); -} -static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(put_pixels8_y2)(block , pixels , line_size, h); - DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h); -} -static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h); - DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h); -} -static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(avg_pixels8)(block , pixels , line_size, h); - DEF(avg_pixels8)(block+8, pixels+8, line_size, h); -} -static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(avg_pixels8_x2)(block , pixels , line_size, h); - DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h); -} -static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(avg_pixels8_y2)(block , pixels , line_size, h); - DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h); -} -static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(avg_pixels8_xy2)(block , pixels , line_size, h); - DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h); + DEF(ff_avg_pixels8_xy2)(block, pixels, line_size, h); + DEF(ff_avg_pixels8_xy2)(block + 8, pixels + 8, line_size, h); } diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index f72500e..743a7c1 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -80,6 +80,107 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEF DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 }; DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; + +void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, + int line_size, int h); +void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels, + int line_size, int h); +void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t 
*src1, uint8_t *src2, + int dstStride, int src1Stride, int h); +void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, + uint8_t *src2, int dstStride, + int src1Stride, int h); +void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, + int dstStride, int src1Stride, int h); +void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels, + int line_size, int h); +void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels, + int line_size, int h); +void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, + int dstStride, int src1Stride, int h); +void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, + int dstStride, int src1Stride, int h); +void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, + int dstStride, int src1Stride, int h); +void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, + int line_size, int h); +void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels, + int line_size, int h); +void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block, + const uint8_t *pixels, + int line_size, int h); +void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block, + const uint8_t *pixels, + int line_size, int h); +void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, + int line_size, int h); +void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels, + int line_size, int h); +void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, + int line_size, int h); +void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels, + int line_size, int h); +void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block, + const uint8_t *pixels, + int line_size, int h); +void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block, + const uint8_t *pixels, + int line_size, int h); +void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, + int line_size, int h); +void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels, + int line_size, int h); +void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, + int line_size, int h); +void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels, + int line_size, int h); +void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, + int line_size, int h); +void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels, + int line_size, int h); +void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels, + int line_size, int h); +void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels, + int line_size, int h); + +void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h); +static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels, + int line_size, int h) +{ + ff_put_pixels8_mmxext(block, pixels, line_size, h); + ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h); +} + +void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, + int dstStride, int srcStride, int h); +void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, + int dstStride, int srcStride, int h); +void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, + int dstStride, int srcStride, + int h); +void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, + int dstStride, int srcStride, int h); +void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, + int dstStride, int srcStride, int h); +void 
ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, + int dstStride, int srcStride, + int h); +void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, + int dstStride, int srcStride); +void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, + int dstStride, int srcStride); +void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, + int dstStride, int srcStride); +void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, + int dstStride, int srcStride); +void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, + int dstStride, int srcStride); +void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, + int dstStride, int srcStride); +#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext +#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext + + #if HAVE_INLINE_ASM #define JUMPALIGN() __asm__ volatile (".p2align 3"::) @@ -190,32 +291,34 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; #undef PAVGB #undef OP_AVG +#endif /* HAVE_INLINE_ASM */ + + +#if HAVE_YASM +#define ff_put_pixels8_mmx ff_put_pixels8_mmxext + /***********************************/ /* 3Dnow specific */ #define DEF(x) x ## _3dnow -#define PAVGB "pavgusb" -#define SKIP_FOR_3DNOW #include "dsputil_avg_template.c" #undef DEF -#undef PAVGB -#undef SKIP_FOR_3DNOW /***********************************/ /* MMXEXT specific */ #define DEF(x) x ## _mmxext -/* Introduced only in MMXEXT set */ -#define PAVGB "pavgb" - #include "dsputil_avg_template.c" #undef DEF -#undef PAVGB +#endif /* HAVE_YASM */ + + +#if HAVE_INLINE_ASM #define put_no_rnd_pixels16_mmx put_pixels16_mmx #define put_no_rnd_pixels8_mmx put_pixels8_mmx #define put_pixels16_mmxext put_pixels16_mmx @@ -815,382 +918,15 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, } } } +#endif /* HAVE_INLINE_ASM */ -#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd, \ - in0, in1, in2, in7, out, OP) \ - "paddw "#m4", "#m3" \n\t" /* x1 */ \ - "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */ \ - "pmullw "#m3", %%mm4 \n\t" /* 20x1 */ \ - "movq "#in7", "#m3" \n\t" /* d */ \ - "movq "#in0", %%mm5 \n\t" /* D */ \ - "paddw "#m3", %%mm5 \n\t" /* x4 */ \ - "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */ \ - "movq "#in1", %%mm5 \n\t" /* C */ \ - "movq "#in2", %%mm6 \n\t" /* B */ \ - "paddw "#m6", %%mm5 \n\t" /* x3 */ \ - "paddw "#m5", %%mm6 \n\t" /* x2 */ \ - "paddw %%mm6, %%mm6 \n\t" /* 2x2 */ \ - "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */ \ - "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */ \ - "paddw "#rnd", %%mm4 \n\t" /* x2 */ \ - "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \ - "psraw $5, %%mm5 \n\t" \ - "packuswb %%mm5, %%mm5 \n\t" \ - OP(%%mm5, out, %%mm7, d) - -#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMXEXT) \ -static void OPNAME ## mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, \ - uint8_t *src, \ - int dstStride, \ - int srcStride, \ - int h) \ -{ \ - uint64_t temp; \ - \ - __asm__ volatile ( \ - "pxor %%mm7, %%mm7 \n\t" \ - "1: \n\t" \ - "movq (%0), %%mm0 \n\t" /* ABCDEFGH */ \ - "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */ \ - "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */ \ - "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */ \ - "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */ \ - "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */ \ - "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */ \ - "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */ \ - "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */ \ - "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */ \ - "psllq $16, %%mm3 \n\t" 
/* 00ABCDEF */ \ - "psllq $24, %%mm4 \n\t" /* 000ABCDE */ \ - "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */ \ - "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */ \ - "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */ \ - "paddw %%mm3, %%mm5 \n\t" /* b */ \ - "paddw %%mm2, %%mm6 \n\t" /* c */ \ - "paddw %%mm5, %%mm5 \n\t" /* 2b */ \ - "psubw %%mm5, %%mm6 \n\t" /* c - 2b */ \ - "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */ \ - "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */ \ - "paddw %%mm4, %%mm0 \n\t" /* a */ \ - "paddw %%mm1, %%mm5 \n\t" /* d */ \ - "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */ \ - "psubw %%mm5, %%mm0 \n\t" /* 20a - d */ \ - "paddw %6, %%mm6 \n\t" \ - "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \ - "psraw $5, %%mm0 \n\t" \ - "movq %%mm0, %5 \n\t" \ - /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */ \ - \ - "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */ \ - "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */ \ - "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */ \ - "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */ \ - "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */ \ - "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */ \ - "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */ \ - "paddw %%mm0, %%mm2 \n\t" /* b */ \ - "paddw %%mm5, %%mm3 \n\t" /* c */ \ - "paddw %%mm2, %%mm2 \n\t" /* 2b */ \ - "psubw %%mm2, %%mm3 \n\t" /* c - 2b */ \ - "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */ \ - "psrlq $24, %%mm6 \n\t" /* IJKLM000 */ \ - "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */ \ - "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */ \ - "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */ \ - "paddw %%mm2, %%mm1 \n\t" /* a */ \ - "paddw %%mm6, %%mm4 \n\t" /* d */ \ - "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */ \ - "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */ \ - "paddw %6, %%mm1 \n\t" \ - "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */ \ - "psraw $5, %%mm3 \n\t" \ - "movq %5, %%mm1 \n\t" \ - "packuswb %%mm3, %%mm1 \n\t" \ - OP_MMXEXT(%%mm1, (%1), %%mm4, q) \ - /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */ \ - \ - "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */ \ - "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */ \ - "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */ \ - "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */ \ - "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */ \ - "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */ \ - "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */ \ - "paddw %%mm1, %%mm5 \n\t" /* b */ \ - "paddw %%mm4, %%mm0 \n\t" /* c */ \ - "paddw %%mm5, %%mm5 \n\t" /* 2b */ \ - "psubw %%mm5, %%mm0 \n\t" /* c - 2b */ \ - "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */ \ - "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */ \ - "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */ \ - "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */ \ - "paddw %%mm3, %%mm2 \n\t" /* d */ \ - "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */ \ - "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */ \ - "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */ \ - "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */ \ - "paddw %%mm2, %%mm6 \n\t" /* a */ \ - "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */ \ - "paddw %6, %%mm0 \n\t" \ - "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \ - "psraw $5, %%mm0 \n\t" \ - /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */ \ - /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */ \ - \ - "paddw %%mm5, %%mm3 \n\t" /* a */ \ - "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */ \ - "paddw %%mm4, %%mm6 \n\t" /* b */ \ - "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */ \ - "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */ \ - "paddw %%mm1, %%mm4 \n\t" /* c */ \ - "paddw %%mm2, %%mm5 \n\t" /* d */ \ - "paddw %%mm6, %%mm6 \n\t" /* 2b */ 
\ - "psubw %%mm6, %%mm4 \n\t" /* c - 2b */ \ - "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */ \ - "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */ \ - "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */ \ - "paddw %6, %%mm4 \n\t" \ - "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */ \ - "psraw $5, %%mm4 \n\t" \ - "packuswb %%mm4, %%mm0 \n\t" \ - OP_MMXEXT(%%mm0, 8(%1), %%mm4, q) \ - \ - "add %3, %0 \n\t" \ - "add %4, %1 \n\t" \ - "decl %2 \n\t" \ - "jnz 1b \n\t" \ - : "+a"(src), "+c"(dst), "+D"(h) \ - : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), \ - /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER) \ - : "memory" \ - ); \ -} \ - \ -static void OPNAME ## mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, \ - uint8_t *src, \ - int dstStride, \ - int srcStride, \ - int h) \ -{ \ - __asm__ volatile ( \ - "pxor %%mm7, %%mm7 \n\t" \ - "1: \n\t" \ - "movq (%0), %%mm0 \n\t" /* ABCDEFGH */ \ - "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */ \ - "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */ \ - "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */ \ - "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */ \ - "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */ \ - "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */ \ - "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */ \ - "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */ \ - "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */ \ - "psllq $16, %%mm3 \n\t" /* 00ABCDEF */ \ - "psllq $24, %%mm4 \n\t" /* 000ABCDE */ \ - "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */ \ - "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */ \ - "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */ \ - "paddw %%mm3, %%mm5 \n\t" /* b */ \ - "paddw %%mm2, %%mm6 \n\t" /* c */ \ - "paddw %%mm5, %%mm5 \n\t" /* 2b */ \ - "psubw %%mm5, %%mm6 \n\t" /* c - 2b */ \ - "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */ \ - "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */ \ - "paddw %%mm4, %%mm0 \n\t" /* a */ \ - "paddw %%mm1, %%mm5 \n\t" /* d */ \ - "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */ \ - "psubw %%mm5, %%mm0 \n\t" /* 20a - d */ \ - "paddw %5, %%mm6 \n\t" \ - "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \ - "psraw $5, %%mm0 \n\t" \ - /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */ \ - \ - "movd 5(%0), %%mm5 \n\t" /* FGHI */ \ - "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */ \ - "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */ \ - "paddw %%mm5, %%mm1 \n\t" /* a */ \ - "paddw %%mm6, %%mm2 \n\t" /* b */ \ - "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */ \ - "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */ \ - "paddw %%mm6, %%mm3 \n\t" /* c */ \ - "paddw %%mm5, %%mm4 \n\t" /* d */ \ - "paddw %%mm2, %%mm2 \n\t" /* 2b */ \ - "psubw %%mm2, %%mm3 \n\t" /* c - 2b */ \ - "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */ \ - "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */ \ - "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */ \ - "paddw %5, %%mm1 \n\t" \ - "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */ \ - "psraw $5, %%mm3 \n\t" \ - "packuswb %%mm3, %%mm0 \n\t" \ - OP_MMXEXT(%%mm0, (%1), %%mm4, q) \ - \ - "add %3, %0 \n\t" \ - "add %4, %1 \n\t" \ - "decl %2 \n\t" \ - "jnz 1b \n\t" \ - : "+a"(src), "+c"(dst), "+d"(h) \ - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), \ - /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER) \ - : "memory" \ - ); \ -} +#if HAVE_YASM #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) \ -static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, \ - uint8_t *src, \ - int dstStride, \ - int srcStride) \ -{ \ - uint64_t temp[17 * 4]; \ - uint64_t *temp_ptr = temp; \ - int count = 17; \ - \ - /* FIXME unroll */ \ - 
__asm__ volatile ( \ - "pxor %%mm7, %%mm7 \n\t" \ - "1: \n\t" \ - "movq (%0), %%mm0 \n\t" \ - "movq (%0), %%mm1 \n\t" \ - "movq 8(%0), %%mm2 \n\t" \ - "movq 8(%0), %%mm3 \n\t" \ - "punpcklbw %%mm7, %%mm0 \n\t" \ - "punpckhbw %%mm7, %%mm1 \n\t" \ - "punpcklbw %%mm7, %%mm2 \n\t" \ - "punpckhbw %%mm7, %%mm3 \n\t" \ - "movq %%mm0, (%1) \n\t" \ - "movq %%mm1, 17 * 8(%1) \n\t" \ - "movq %%mm2, 2 * 17 * 8(%1) \n\t" \ - "movq %%mm3, 3 * 17 * 8(%1) \n\t" \ - "add $8, %1 \n\t" \ - "add %3, %0 \n\t" \ - "decl %2 \n\t" \ - "jnz 1b \n\t" \ - : "+r"(src), "+r"(temp_ptr), "+r"(count) \ - : "r"((x86_reg)srcStride) \ - : "memory" \ - ); \ - \ - temp_ptr = temp; \ - count = 4; \ - \ - /* FIXME reorder for speed */ \ - __asm__ volatile ( \ - /* "pxor %%mm7, %%mm7 \n\t" */ \ - "1: \n\t" \ - "movq (%0), %%mm0 \n\t" \ - "movq 8(%0), %%mm1 \n\t" \ - "movq 16(%0), %%mm2 \n\t" \ - "movq 24(%0), %%mm3 \n\t" \ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \ - "add %4, %1 \n\t" \ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \ - \ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \ - "add %4, %1 \n\t" \ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP) \ - "add %4, %1 \n\t" \ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP) \ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP) \ - "add %4, %1 \n\t" \ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP) \ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \ - "add %4, %1 \n\t" \ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1), OP) \ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \ - "add %4, %1 \n\t" \ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1), OP) \ - \ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \ - "add %4, %1 \n\t" \ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0), 104(%0), 120(%0), (%1), OP) \ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \ - \ - "add $136, %0 \n\t" \ - "add %6, %1 \n\t" \ - "decl %2 \n\t" \ - "jnz 1b \n\t" \ - \ - : "+r"(temp_ptr), "+r"(dst), "+g"(count) \ - : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \ - /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), \ - "g"(4 - 14 * (x86_reg)dstStride) \ - : "memory" \ - ); \ -} \ - \ -static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, \ - uint8_t *src, \ - int dstStride, \ - int srcStride) \ -{ \ - uint64_t temp[9 * 2]; \ - uint64_t *temp_ptr = temp; \ - int count = 9; \ - \ - /* FIXME unroll */ \ - __asm__ volatile ( \ - "pxor %%mm7, %%mm7 \n\t" \ - "1: \n\t" \ - "movq (%0), %%mm0 \n\t" \ - "movq (%0), %%mm1 \n\t" \ - "punpcklbw %%mm7, %%mm0 \n\t" \ - "punpckhbw %%mm7, %%mm1 \n\t" \ - "movq %%mm0, (%1) \n\t" \ - "movq %%mm1, 9*8(%1) \n\t" \ - "add $8, %1 \n\t" \ - "add %3, %0 \n\t" \ - "decl %2 \n\t" \ - "jnz 1b \n\t" \ - : "+r"(src), "+r"(temp_ptr), 
"+r"(count) \ - : "r"((x86_reg)srcStride) \ - : "memory" \ - ); \ - \ - temp_ptr = temp; \ - count = 2; \ - \ - /* FIXME reorder for speed */ \ - __asm__ volatile ( \ - /* "pxor %%mm7, %%mm7 \n\t" */ \ - "1: \n\t" \ - "movq (%0), %%mm0 \n\t" \ - "movq 8(%0), %%mm1 \n\t" \ - "movq 16(%0), %%mm2 \n\t" \ - "movq 24(%0), %%mm3 \n\t" \ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \ - "add %4, %1 \n\t" \ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \ - \ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \ - "add %4, %1 \n\t" \ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \ - \ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \ - "add %4, %1 \n\t" \ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP) \ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \ - \ - "add $72, %0 \n\t" \ - "add %6, %1 \n\t" \ - "decl %2 \n\t" \ - "jnz 1b \n\t" \ - \ - : "+r"(temp_ptr), "+r"(dst), "+g"(count) \ - : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \ - /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), \ - "g"(4 - 6 * (x86_reg)dstStride) \ - : "memory" \ - ); \ -} \ - \ static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \ int stride) \ { \ - OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \ + ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \ } \ \ static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1198,16 +934,17 @@ static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \ { \ uint64_t temp[8]; \ uint8_t * const half = (uint8_t*)temp; \ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \ - stride, 8); \ - OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \ + ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \ + stride, 8); \ + ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \ + stride, stride, 8); \ } \ \ static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \ int stride) \ { \ - OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \ - stride, 8); \ + ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \ + stride, 8); \ } \ \ static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1215,10 +952,10 @@ static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \ { \ uint64_t temp[8]; \ uint8_t * const half = (uint8_t*)temp; \ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \ - stride, 8); \ - OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \ - stride, 8); \ + ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \ + stride, 8); \ + ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \ + stride, 8); \ } \ \ static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1226,14 +963,17 @@ static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \ { \ uint64_t temp[8]; \ uint8_t * const half = (uint8_t*)temp; \ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \ - OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \ + ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \ + 8, stride); \ + ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, 
src, half, \ + stride, stride, 8); \ } \ \ static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \ int stride) \ { \ - OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride); \ + ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \ + stride, stride); \ } \ \ static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1241,9 +981,10 @@ static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \ { \ uint64_t temp[8]; \ uint8_t * const half = (uint8_t*)temp; \ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \ - OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride, \ - stride, 8); \ + ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \ + 8, stride); \ + ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\ + stride, 8); \ } \ \ static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1252,11 +993,13 @@ static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \ uint64_t half[8 + 9]; \ uint8_t * const halfH = ((uint8_t*)half) + 64; \ uint8_t * const halfHV = ((uint8_t*)half); \ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \ - OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \ + ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ + stride, 9); \ + ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \ + stride, 9); \ + ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ + ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \ + stride, 8, 8); \ } \ \ static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1265,12 +1008,13 @@ static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \ uint64_t half[8 + 9]; \ uint8_t * const halfH = ((uint8_t*)half) + 64; \ uint8_t * const halfHV = ((uint8_t*)half); \ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \ - stride, 9); \ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \ - OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \ + ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ + stride, 9); \ + ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \ + stride, 9); \ + ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ + ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \ + stride, 8, 8); \ } \ \ static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1279,11 +1023,13 @@ static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \ uint64_t half[8 + 9]; \ uint8_t * const halfH = ((uint8_t*)half) + 64; \ uint8_t * const halfHV = ((uint8_t*)half); \ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \ - OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \ + ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ + stride, 9); \ + ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \ + stride, 9); \ + ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ + ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \ + stride, 8, 8); \ } \ \ static void OPNAME ## 
qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1292,12 +1038,13 @@ static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \ uint64_t half[8 + 9]; \ uint8_t * const halfH = ((uint8_t*)half) + 64; \ uint8_t * const halfHV = ((uint8_t*)half); \ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \ - stride, 9); \ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \ - OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \ + ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ + stride, 9); \ + ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \ + stride, 9); \ + ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ + ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \ + stride, 8, 8); \ } \ \ static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1306,10 +1053,11 @@ static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \ uint64_t half[8 + 9]; \ uint8_t * const halfH = ((uint8_t*)half) + 64; \ uint8_t * const halfHV = ((uint8_t*)half); \ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \ - OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \ + ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ + stride, 9); \ + ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ + ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \ + stride, 8, 8); \ } \ \ static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1318,10 +1066,11 @@ static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \ uint64_t half[8 + 9]; \ uint8_t * const halfH = ((uint8_t*)half) + 64; \ uint8_t * const halfHV = ((uint8_t*)half); \ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \ - OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \ + ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ + stride, 9); \ + ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ + ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \ + stride, 8, 8); \ } \ \ static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1329,10 +1078,12 @@ static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \ { \ uint64_t half[8 + 9]; \ uint8_t * const halfH = ((uint8_t*)half); \ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \ - OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \ + ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ + stride, 9); \ + ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \ + 8, stride, 9); \ + ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \ + stride, 8); \ } \ \ static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1340,11 +1091,12 @@ static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \ { \ uint64_t half[8 + 9]; \ uint8_t * const halfH = ((uint8_t*)half); \ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \ - stride, 9); \ - OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \ + ff_put ## 
RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ + stride, 9); \ + ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \ + stride, 9); \ + ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \ + stride, 8); \ } \ \ static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1352,15 +1104,16 @@ static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \ { \ uint64_t half[9]; \ uint8_t * const halfH = ((uint8_t*)half); \ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ - stride, 9); \ - OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \ + ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ + stride, 9); \ + ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \ + stride, 8); \ } \ \ static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \ int stride) \ { \ - OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \ + ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \ } \ \ static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1368,16 +1121,17 @@ static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \ { \ uint64_t temp[32]; \ uint8_t * const half = (uint8_t*)temp; \ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \ - stride, 16); \ - OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \ + ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \ + stride, 16); \ + ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \ + stride, 16); \ } \ \ static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \ int stride) \ { \ - OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \ - stride, stride, 16); \ + ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \ + stride, stride, 16);\ } \ \ static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1385,10 +1139,10 @@ static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \ { \ uint64_t temp[32]; \ uint8_t * const half = (uint8_t*)temp; \ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \ - stride, 16); \ - OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \ - stride, stride, 16); \ + ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \ + stride, 16); \ + ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \ + stride, stride, 16); \ } \ \ static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1396,15 +1150,17 @@ static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \ { \ uint64_t temp[32]; \ uint8_t * const half = (uint8_t*)temp; \ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \ - stride); \ - OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \ + ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \ + stride); \ + ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \ + stride, 16); \ } \ \ static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \ int stride) \ { \ - OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \ + ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \ + stride, stride); \ } \ \ static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1412,10 +1168,10 @@ static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \ { \ uint64_t temp[32]; \ uint8_t * const half = (uint8_t*)temp; \ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \ - stride); \ - OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, 
\ - stride, stride, 16); \ + ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \ + stride); \ + ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \ + stride, stride, 16); \ } \ \ static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1424,13 +1180,14 @@ static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \ uint64_t half[16 * 2 + 17 * 2]; \ uint8_t * const halfH = ((uint8_t*)half) + 256; \ uint8_t * const halfHV = ((uint8_t*)half); \ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \ - stride, 17); \ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ - 16, 16); \ - OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \ + ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ + stride, 17); \ + ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \ + stride, 17); \ + ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ + 16, 16); \ + ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \ + stride, 16, 16); \ } \ \ static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1439,13 +1196,14 @@ static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \ uint64_t half[16 * 2 + 17 * 2]; \ uint8_t * const halfH = ((uint8_t*)half) + 256; \ uint8_t * const halfHV = ((uint8_t*)half); \ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \ - stride, 17); \ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ - 16, 16); \ - OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \ + ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ + stride, 17); \ + ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \ + stride, 17); \ + ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ + 16, 16); \ + ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \ + stride, 16, 16); \ } \ \ static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1454,14 +1212,14 @@ static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \ uint64_t half[16 * 2 + 17 * 2]; \ uint8_t * const halfH = ((uint8_t*)half) + 256; \ uint8_t * const halfHV = ((uint8_t*)half); \ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \ - stride, 17); \ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ - 16, 16); \ - OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \ - 16, 16); \ + ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ + stride, 17); \ + ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \ + stride, 17); \ + ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ + 16, 16); \ + ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \ + stride, 16, 16); \ } \ \ static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1470,14 +1228,14 @@ static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \ uint64_t half[16 * 2 + 17 * 2]; \ uint8_t * const halfH = ((uint8_t*)half) + 256; \ uint8_t * const halfHV = ((uint8_t*)half); \ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \ - stride, 17); \ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## 
MMX(halfHV, halfH, \ - 16, 16); \ - OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \ - 16, 16); \ + ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ + stride, 17); \ + ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \ + stride, 17); \ + ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ + 16, 16); \ + ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \ + stride, 16, 16); \ } \ \ static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1486,11 +1244,12 @@ static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \ uint64_t half[16 * 2 + 17 * 2]; \ uint8_t * const halfH = ((uint8_t*)half) + 256; \ uint8_t * const halfHV = ((uint8_t*)half); \ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ - 16, 16); \ - OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \ + ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ + stride, 17); \ + ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ + 16, 16); \ + ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \ + stride, 16, 16); \ } \ \ static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1499,12 +1258,12 @@ static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \ uint64_t half[16 * 2 + 17 * 2]; \ uint8_t * const halfH = ((uint8_t*)half) + 256; \ uint8_t * const halfHV = ((uint8_t*)half); \ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ - 16, 16); \ - OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \ - 16, 16); \ + ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ + stride, 17); \ + ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ + 16, 16); \ + ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \ + stride, 16, 16); \ } \ \ static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1512,11 +1271,12 @@ static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \ { \ uint64_t half[17 * 2]; \ uint8_t * const halfH = ((uint8_t*)half); \ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \ - stride, 17); \ - OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \ + ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ + stride, 17); \ + ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \ + stride, 17); \ + ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \ + stride, 16); \ } \ \ static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \ @@ -1524,11 +1284,12 @@ static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \ { \ uint64_t half[17 * 2]; \ uint8_t * const halfH = ((uint8_t*)half); \ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \ - stride, 17); \ - OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \ + ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ + stride, 17); \ + ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \ + stride, 17); \ + ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \ + stride, 16); \ } \ \ static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t 
*src, \ @@ -1536,9 +1297,10 @@ static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \ { \ uint64_t half[17 * 2]; \ uint8_t * const halfH = ((uint8_t*)half); \ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ - stride, 17); \ - OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \ + ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ + stride, 17); \ + ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \ + stride, 16); \ } #define PUT_OP(a, b, temp, size) \ @@ -1549,13 +1311,13 @@ static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \ "pavgb "#temp", "#a" \n\t" \ "mov"#size" "#a", "#b" \n\t" -QPEL_BASE(put_, ff_pw_16, _, PUT_OP) -QPEL_BASE(avg_, ff_pw_16, _, AVG_MMXEXT_OP) -QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP) QPEL_OP(put_, ff_pw_16, _, PUT_OP, mmxext) QPEL_OP(avg_, ff_pw_16, _, AVG_MMXEXT_OP, mmxext) QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmxext) +#endif /* HAVE_YASM */ + +#if HAVE_INLINE_ASM void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride) { put_pixels8_xy2_mmx(dst, src, stride, 8); @@ -1760,20 +1522,24 @@ void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride) { avg_pixels16_mmx(dst, src, stride, 16); } +#endif /* HAVE_INLINE_ASM */ +#if HAVE_YASM /* VC-1-specific */ void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { - put_pixels8_mmx(dst, src, stride, 8); + ff_put_pixels8_mmx(dst, src, stride, 8); } void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src, int stride, int rnd) { - avg_pixels8_mmxext(dst, src, stride, 8); + ff_avg_pixels8_mmxext(dst, src, stride, 8); } +#endif /* HAVE_YASM */ +#if HAVE_INLINE_ASM static void vector_clipf_sse(float *dst, const float *src, float min, float max, int len) { @@ -1950,7 +1716,7 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx, const int bit_depth = avctx->bits_per_raw_sample; const int high_bit_depth = bit_depth > 8; -#if HAVE_INLINE_ASM +#if HAVE_YASM SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, ); SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, ); @@ -1960,47 +1726,49 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx, SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, ); if (!high_bit_depth) { - c->put_pixels_tab[0][1] = put_pixels16_x2_mmxext; - c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext; + c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext; + c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext; - c->avg_pixels_tab[0][0] = avg_pixels16_mmxext; - c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext; - c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext; + c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext; + c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext; + c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext; - c->put_pixels_tab[1][1] = put_pixels8_x2_mmxext; - c->put_pixels_tab[1][2] = put_pixels8_y2_mmxext; + c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext; + c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext; - c->avg_pixels_tab[1][0] = avg_pixels8_mmxext; - c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmxext; - c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmxext; + c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext; + c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext; + c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext; } if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { if (!high_bit_depth) { - c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext; - c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext; 
- c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmxext; - c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmxext; + c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext; + c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext; + c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext; + c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext; - c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext; - c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmxext; + c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext; + c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext; } } +#endif /* HAVE_YASM */ +#if HAVE_INLINE_ASM if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) { c->idct_put = ff_idct_xvid_mmxext_put; c->idct_add = ff_idct_xvid_mmxext_add; c->idct = ff_idct_xvid_mmxext; } +#endif /* HAVE_INLINE_ASM */ +#if HAVE_MMXEXT_EXTERNAL if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 || avctx->codec_id == AV_CODEC_ID_THEORA)) { - c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmxext; - c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmxext; + c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext; + c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext; } -#endif /* HAVE_INLINE_ASM */ -#if HAVE_MMXEXT_EXTERNAL if (!high_bit_depth && CONFIG_H264CHROMA) { c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext; c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext; @@ -2034,41 +1802,39 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx, { const int high_bit_depth = avctx->bits_per_raw_sample > 8; -#if HAVE_INLINE_ASM +#if HAVE_YASM if (!high_bit_depth) { - c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; - c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow; + c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow; + c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow; - c->avg_pixels_tab[0][0] = avg_pixels16_3dnow; - c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow; - c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow; + c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow; + c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow; + c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow; - c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow; - c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow; + c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow; + c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow; - c->avg_pixels_tab[1][0] = avg_pixels8_3dnow; - c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow; - c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow; + c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow; + c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow; + c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow; if (!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow; - c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow; - c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow; - c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow; + c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow; + c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow; + c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow; + c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow; - c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; - c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow; + c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow; + c->avg_pixels_tab[1][3] = 
ff_avg_pixels8_xy2_3dnow; } } if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 || avctx->codec_id == AV_CODEC_ID_THEORA)) { - c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow; - c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow; + c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow; + c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow; } -#endif /* HAVE_INLINE_ASM */ -#if HAVE_YASM if (!high_bit_depth && CONFIG_H264CHROMA) { c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow; c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow; diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm new file mode 100644 index 0000000..8afd955 --- /dev/null +++ b/libavcodec/x86/hpeldsp.asm @@ -0,0 +1,465 @@ +;****************************************************************************** +;* MMX optimized hpel functions +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA +cextern pb_1 + +SECTION_TEXT + +; put_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h) +%macro PUT_PIXELS8_X2 0 +cglobal put_pixels8_x2, 4,5 + movsxdifnidn r2, r2d + lea r4, [r2*2] +.loop: + mova m0, [r1] + mova m1, [r1+r2] + PAVGB m0, [r1+1] + PAVGB m1, [r1+r2+1] + mova [r0], m0 + mova [r0+r2], m1 + add r1, r4 + add r0, r4 + mova m0, [r1] + mova m1, [r1+r2] + PAVGB m0, [r1+1] + PAVGB m1, [r1+r2+1] + add r1, r4 + mova [r0], m0 + mova [r0+r2], m1 + add r0, r4 + sub r3d, 4 + jne .loop + REP_RET +%endmacro + +INIT_MMX mmxext +PUT_PIXELS8_X2 +INIT_MMX 3dnow +PUT_PIXELS8_X2 + + +; put_pixels16_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h) +%macro PUT_PIXELS_16 0 +cglobal put_pixels16_x2, 4,5 + movsxdifnidn r2, r2d + lea r4, [r2*2] +.loop: + mova m0, [r1] + mova m1, [r1+r2] + mova m2, [r1+8] + mova m3, [r1+r2+8] + PAVGB m0, [r1+1] + PAVGB m1, [r1+r2+1] + PAVGB m2, [r1+9] + PAVGB m3, [r1+r2+9] + mova [r0], m0 + mova [r0+r2], m1 + mova [r0+8], m2 + mova [r0+r2+8], m3 + add r1, r4 + add r0, r4 + mova m0, [r1] + mova m1, [r1+r2] + mova m2, [r1+8] + mova m3, [r1+r2+8] + PAVGB m0, [r1+1] + PAVGB m1, [r1+r2+1] + PAVGB m2, [r1+9] + PAVGB m3, [r1+r2+9] + add r1, r4 + mova [r0], m0 + mova [r0+r2], m1 + mova [r0+8], m2 + mova [r0+r2+8], m3 + add r0, r4 + sub r3d, 4 + jne .loop + REP_RET +%endmacro + +INIT_MMX mmxext +PUT_PIXELS_16 +INIT_MMX 3dnow +PUT_PIXELS_16 + + +; put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h) +%macro PUT_NO_RND_PIXELS8_X2 0 +cglobal put_no_rnd_pixels8_x2, 4,5 + mova m6, [pb_1] + movsxdifnidn r2, r2d + lea r4, [r2*2] +.loop: + mova m0, [r1] + mova m2, [r1+r2] + mova m1, [r1+1] + mova m3, 
[r1+r2+1] + add r1, r4 + psubusb m0, m6 + psubusb m2, m6 + PAVGB m0, m1 + PAVGB m2, m3 + mova [r0], m0 + mova [r0+r2], m2 + mova m0, [r1] + mova m1, [r1+1] + mova m2, [r1+r2] + mova m3, [r1+r2+1] + add r0, r4 + add r1, r4 + psubusb m0, m6 + psubusb m2, m6 + PAVGB m0, m1 + PAVGB m2, m3 + mova [r0], m0 + mova [r0+r2], m2 + add r0, r4 + sub r3d, 4 + jne .loop + REP_RET +%endmacro + +INIT_MMX mmxext +PUT_NO_RND_PIXELS8_X2 +INIT_MMX 3dnow +PUT_NO_RND_PIXELS8_X2 + + +; put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, int line_size, int h) +%macro PUT_NO_RND_PIXELS8_X2_EXACT 0 +cglobal put_no_rnd_pixels8_x2_exact, 4,5 + movsxdifnidn r2, r2d + lea r4, [r2*3] + pcmpeqb m6, m6 +.loop: + mova m0, [r1] + mova m2, [r1+r2] + mova m1, [r1+1] + mova m3, [r1+r2+1] + pxor m0, m6 + pxor m2, m6 + pxor m1, m6 + pxor m3, m6 + PAVGB m0, m1 + PAVGB m2, m3 + pxor m0, m6 + pxor m2, m6 + mova [r0], m0 + mova [r0+r2], m2 + mova m0, [r1+r2*2] + mova m1, [r1+r2*2+1] + mova m2, [r1+r4] + mova m3, [r1+r4+1] + pxor m0, m6 + pxor m1, m6 + pxor m2, m6 + pxor m3, m6 + PAVGB m0, m1 + PAVGB m2, m3 + pxor m0, m6 + pxor m2, m6 + mova [r0+r2*2], m0 + mova [r0+r4], m2 + lea r1, [r1+r2*4] + lea r0, [r0+r2*4] + sub r3d, 4 + jg .loop + REP_RET +%endmacro + +INIT_MMX mmxext +PUT_NO_RND_PIXELS8_X2_EXACT +INIT_MMX 3dnow +PUT_NO_RND_PIXELS8_X2_EXACT + + +; put_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h) +%macro PUT_PIXELS8_Y2 0 +cglobal put_pixels8_y2, 4,5 + movsxdifnidn r2, r2d + lea r4, [r2*2] + mova m0, [r1] + sub r0, r2 +.loop: + mova m1, [r1+r2] + mova m2, [r1+r4] + add r1, r4 + PAVGB m0, m1 + PAVGB m1, m2 + mova [r0+r2], m0 + mova [r0+r4], m1 + mova m1, [r1+r2] + mova m0, [r1+r4] + add r0, r4 + add r1, r4 + PAVGB m2, m1 + PAVGB m1, m0 + mova [r0+r2], m2 + mova [r0+r4], m1 + add r0, r4 + sub r3d, 4 + jne .loop + REP_RET +%endmacro + +INIT_MMX mmxext +PUT_PIXELS8_Y2 +INIT_MMX 3dnow +PUT_PIXELS8_Y2 + + +; put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h) +%macro PUT_NO_RND_PIXELS8_Y2 0 +cglobal put_no_rnd_pixels8_y2, 4,5 + mova m6, [pb_1] + movsxdifnidn r2, r2d + lea r4, [r2+r2] + mova m0, [r1] + sub r0, r2 +.loop: + mova m1, [r1+r2] + mova m2, [r1+r4] + add r1, r4 + psubusb m1, m6 + PAVGB m0, m1 + PAVGB m1, m2 + mova [r0+r2], m0 + mova [r0+r4], m1 + mova m1, [r1+r2] + mova m0, [r1+r4] + add r0, r4 + add r1, r4 + psubusb m1, m6 + PAVGB m2, m1 + PAVGB m1, m0 + mova [r0+r2], m2 + mova [r0+r4], m1 + add r0, r4 + sub r3d, 4 + jne .loop + REP_RET +%endmacro + +INIT_MMX mmxext +PUT_NO_RND_PIXELS8_Y2 +INIT_MMX 3dnow +PUT_NO_RND_PIXELS8_Y2 + + +; put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, int line_size, int h) +%macro PUT_NO_RND_PIXELS8_Y2_EXACT 0 +cglobal put_no_rnd_pixels8_y2_exact, 4,5 + movsxdifnidn r2, r2d + lea r4, [r2*3] + mova m0, [r1] + pcmpeqb m6, m6 + add r1, r2 + pxor m0, m6 +.loop: + mova m1, [r1] + mova m2, [r1+r2] + pxor m1, m6 + pxor m2, m6 + PAVGB m0, m1 + PAVGB m1, m2 + pxor m0, m6 + pxor m1, m6 + mova [r0], m0 + mova [r0+r2], m1 + mova m1, [r1+r2*2] + mova m0, [r1+r4] + pxor m1, m6 + pxor m0, m6 + PAVGB m2, m1 + PAVGB m1, m0 + pxor m2, m6 + pxor m1, m6 + mova [r0+r2*2], m2 + mova [r0+r4], m1 + lea r1, [r1+r2*4] + lea r0, [r0+r2*4] + sub r3d, 4 + jg .loop + REP_RET +%endmacro + +INIT_MMX mmxext +PUT_NO_RND_PIXELS8_Y2_EXACT +INIT_MMX 3dnow +PUT_NO_RND_PIXELS8_Y2_EXACT + + +; avg_pixels8(uint8_t *block, const uint8_t *pixels, int line_size, int h) +%macro AVG_PIXELS8 0 +cglobal avg_pixels8, 4,5 + movsxdifnidn r2, edx + lea 
r4, [r2+r2] +.loop: + mova m0, [r0] + mova m1, [r0+r2] + PAVGB m0, [r1] + PAVGB m1, [r1+r2] + mova [r0], m0 + mova [r0+r2], m1 + add r1, r4 + add r0, r4 + mova m0, [r0] + mova m1, [r0+r2] + PAVGB m0, [r1] + PAVGB m1, [r1+r2] + add r1, r4 + mova [r0], m0 + mova [r0+r2], m1 + add r0, r4 + sub r3d, 4 + jne .loop + REP_RET +%endmacro + +INIT_MMX 3dnow +AVG_PIXELS8 + + +; avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h) +%macro AVG_PIXELS8_X2 0 +cglobal avg_pixels8_x2, 4,5 + movsxdifnidn r2, edx + lea r4, [r2*2] +.loop: + mova m0, [r1] + mova m2, [r1+r2] + PAVGB m0, [r1+1] + PAVGB m2, [r1+r2+1] + PAVGB m0, [r0] + PAVGB m2, [r0+r2] + add r1, r4 + mova [r0], m0 + mova [r0+r2], m2 + mova m0, [r1] + mova m2, [r1+r2] + PAVGB m0, [r1+1] + PAVGB m2, [r1+r2+1] + add r0, r4 + add r1, r4 + PAVGB m0, [r0] + PAVGB m2, [r0+r2] + mova [r0], m0 + mova [r0+r2], m2 + add r0, r4 + sub r3d, 4 + jne .loop + REP_RET +%endmacro + +INIT_MMX mmxext +AVG_PIXELS8_X2 +INIT_MMX 3dnow +AVG_PIXELS8_X2 + + +; avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h) +%macro AVG_PIXELS8_Y2 0 +cglobal avg_pixels8_y2, 4,5 + movsxdifnidn r2, r2d + lea r4, [r2*2] + mova m0, [r1] + sub r0, r2 +.loop: + mova m1, [r1+r2] + mova m2, [r1+r4] + add r1, r4 + PAVGB m0, m1 + PAVGB m1, m2 + mova m3, [r0+r2] + mova m4, [r0+r4] + PAVGB m0, m3 + PAVGB m1, m4 + mova [r0+r2], m0 + mova [r0+r4], m1 + mova m1, [r1+r2] + mova m0, [r1+r4] + PAVGB m2, m1 + PAVGB m1, m0 + add r0, r4 + add r1, r4 + mova m3, [r0+r2] + mova m4, [r0+r4] + PAVGB m2, m3 + PAVGB m1, m4 + mova [r0+r2], m2 + mova [r0+r4], m1 + add r0, r4 + sub r3d, 4 + jne .loop + REP_RET +%endmacro + +INIT_MMX mmxext +AVG_PIXELS8_Y2 +INIT_MMX 3dnow +AVG_PIXELS8_Y2 + + +; avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h) +%macro AVG_PIXELS8_XY2 0 +cglobal avg_pixels8_xy2, 4,5 + mova m6, [pb_1] + movsxdifnidn r2, r2d + lea r4, [r2*2] + mova m0, [r1] + pavgb m0, [r1+1] +.loop: + mova m2, [r1+r4] + mova m1, [r1+r2] + psubusb m2, m6 + pavgb m1, [r1+r2+1] + pavgb m2, [r1+r4+1] + add r1, r4 + pavgb m0, m1 + pavgb m1, m2 + pavgb m0, [r0] + pavgb m1, [r0+r2] + mova [r0], m0 + mova [r0+r2], m1 + mova m1, [r1+r2] + mova m0, [r1+r4] + pavgb m1, [r1+r2+1] + pavgb m0, [r1+r4+1] + add r0, r4 + add r1, r4 + pavgb m2, m1 + pavgb m1, m0 + pavgb m2, [r0] + pavgb m1, [r0+r2] + mova [r0], m2 + mova [r0+r2], m2 + add r0, r4 + sub r3d, 4 + jne .loop + REP_RET +%endmacro + +INIT_MMX mmxext +AVG_PIXELS8_XY2 +INIT_MMX 3dnow +AVG_PIXELS8_XY2 diff --git a/libavcodec/x86/mpeg4qpel.asm b/libavcodec/x86/mpeg4qpel.asm new file mode 100644 index 0000000..39c9fc8 --- /dev/null +++ b/libavcodec/x86/mpeg4qpel.asm @@ -0,0 +1,558 @@ +;****************************************************************************** +;* mpeg4 qpel +;* Copyright (c) 2008 Loren Merritt +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA +cextern pb_1 +cextern pw_3 +cextern pw_15 +cextern pw_16 +cextern pw_20 + + +SECTION_TEXT + +; put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +%macro PUT_NO_RND_PIXELS8_L2 0 +cglobal put_no_rnd_pixels8_l2, 6,6 + movsxdifnidn r4, r4d + movsxdifnidn r3, r3d + pcmpeqb m6, m6 + test r5d, 1 + je .loop + mova m0, [r1] + mova m1, [r2] + add r1, r4 + add r2, 8 + pxor m0, m6 + pxor m1, m6 + PAVGB m0, m1 + pxor m0, m6 + mova [r0], m0 + add r0, r3 + dec r5d +.loop: + mova m0, [r1] + add r1, r4 + mova m1, [r1] + add r1, r4 + mova m2, [r2] + mova m3, [r2+8] + pxor m0, m6 + pxor m1, m6 + pxor m2, m6 + pxor m3, m6 + PAVGB m0, m2 + PAVGB m1, m3 + pxor m0, m6 + pxor m1, m6 + mova [r0], m0 + add r0, r3 + mova [r0], m1 + add r0, r3 + mova m0, [r1] + add r1, r4 + mova m1, [r1] + add r1, r4 + mova m2, [r2+16] + mova m3, [r2+24] + pxor m0, m6 + pxor m1, m6 + pxor m2, m6 + pxor m3, m6 + PAVGB m0, m2 + PAVGB m1, m3 + pxor m0, m6 + pxor m1, m6 + mova [r0], m0 + add r0, r3 + mova [r0], m1 + add r0, r3 + add r2, 32 + sub r5d, 4 + jne .loop + REP_RET +%endmacro + +INIT_MMX mmxext +PUT_NO_RND_PIXELS8_L2 + + +; put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +%macro PUT_NO_RND_PIXELS16_l2 0 +cglobal put_no_rnd_pixels16_l2, 5,5 + movsxdifnidn r3, r3 + movsxdifnidn r4, r4d + pcmpeqb m6, m6 + test r5d, 1 + je .loop + mova m0, [r1] + mova m1, [r1+8] + mova m2, [r2] + mova m3, [r2+8] + pxor m0, m6 + pxor m1, m6 + pxor m2, m6 + pxor m3, m6 + PAVGB m0, m2 + PAVGB m1, m3 + pxor m0, m6 + pxor m1, m6 + add r1, r4 + add r2, 16 + mova [r0], m0 + mova [r0+8], m1 + add r0, r3 + dec r5d +.loop: + mova m0, [r1] + mova m1, [r1+8] + add r1, r4 + mova m2, [r2] + mova m3, [r2+8] + pxor m0, m6 + pxor m1, m6 + pxor m2, m6 + pxor m3, m6 + PAVGB m0, m2 + PAVGB m1, m3 + pxor m0, m6 + pxor m1, m6 + mova [r0], m0 + mova [r0+8], m1 + add r0, r3 + mova m0, [r1] + mova m1, [r1+8] + add r1, r4 + mova m2, [r2+16] + mova m3, [r2+24] + pxor m0, m6 + pxor m1, m6 + pxor m2, m6 + pxor m3, m6 + PAVGB m0, m2 + PAVGB m1, m3 + pxor m0, m6 + pxor m1, m6 + mova [r0], m0 + mova [r0+8], m1 + add r0, r3 + add r2, 32 + sub r5d, 2 + jne .loop + REP_RET +%endmacro + +INIT_MMX mmxext +PUT_NO_RND_PIXELS16_l2 +INIT_MMX 3dnow +PUT_NO_RND_PIXELS16_l2 + +%macro MPEG4_QPEL16_H_LOWPASS 1 +cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 8 + movsxdifnidn r2, r2d + movsxdifnidn r3, r3d + pxor m7, m7 +.loop: + mova m0, [r1] + mova m1, m0 + mova m2, m0 + punpcklbw m0, m7 + punpckhbw m1, m7 + pshufw m5, m0, 0x90 + pshufw m6, m0, 0x41 + mova m3, m2 + mova m4, m2 + psllq m2, 8 + psllq m3, 16 + psllq m4, 24 + punpckhbw m2, m7 + punpckhbw m3, m7 + punpckhbw m4, m7 + paddw m5, m3 + paddw m6, m2 + paddw m5, m5 + psubw m6, m5 + pshufw m5, m0, 6 + pmullw m6, [pw_3] + paddw m0, m4 + paddw m5, m1 + pmullw m0, [pw_20] + psubw m0, m5 + paddw m6, [PW_ROUND] + paddw m0, m6 + psraw m0, 5 + mova [rsp-8], m0 + mova m0, [r1+5] + mova m5, m0 + mova m6, m0 + psrlq m0, 8 + psrlq m5, 16 + punpcklbw m0, m7 + punpcklbw m5, m7 + paddw m2, m0 + paddw m3, m5 + paddw m2, m2 + psubw m3, m2 + mova m2, m6 + psrlq m6, 24 + punpcklbw m2, m7 + punpcklbw 
m6, m7 + pmullw m3, [pw_3] + paddw m1, m2 + paddw m4, m6 + pmullw m1, [pw_20] + psubw m3, m4 + paddw m1, [PW_ROUND] + paddw m3, m1 + psraw m3, 5 + mova m1, [rsp-8] + packuswb m1, m3 + OP_MOV [r0], m1, m4 + mova m1, [r1+9] + mova m4, m1 + mova m3, m1 + psrlq m1, 8 + psrlq m4, 16 + punpcklbw m1, m7 + punpcklbw m4, m7 + paddw m5, m1 + paddw m0, m4 + paddw m5, m5 + psubw m0, m5 + mova m5, m3 + psrlq m3, 24 + pmullw m0, [pw_3] + punpcklbw m3, m7 + paddw m2, m3 + psubw m0, m2 + mova m2, m5 + punpcklbw m2, m7 + punpckhbw m5, m7 + paddw m6, m2 + pmullw m6, [pw_20] + paddw m0, [PW_ROUND] + paddw m0, m6 + psraw m0, 5 + paddw m3, m5 + pshufw m6, m5, 0xf9 + paddw m6, m4 + pshufw m4, m5, 0xbe + pshufw m5, m5, 0x6f + paddw m4, m1 + paddw m5, m2 + paddw m6, m6 + psubw m4, m6 + pmullw m3, [pw_20] + pmullw m4, [pw_3] + psubw m3, m5 + paddw m4, [PW_ROUND] + paddw m4, m3 + psraw m4, 5 + packuswb m0, m4 + OP_MOV [r0+8], m0, m4 + add r1, r3 + add r0, r2 + dec r4d + jne .loop + REP_RET +%endmacro + +%macro PUT_OP 2-3 + mova %1, %2 +%endmacro + +%macro AVG_OP 2-3 + mova %3, %1 + pavgb %2, %3 + mova %1, %2 +%endmacro + +INIT_MMX mmxext +%define PW_ROUND pw_16 +%define OP_MOV PUT_OP +MPEG4_QPEL16_H_LOWPASS put +%define PW_ROUND pw_16 +%define OP_MOV AVG_OP +MPEG4_QPEL16_H_LOWPASS avg +%define PW_ROUND pw_15 +%define OP_MOV PUT_OP +MPEG4_QPEL16_H_LOWPASS put_no_rnd + + + +%macro MPEG4_QPEL8_H_LOWPASS 1 +cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0, 8 + movsxdifnidn r2, r2d + movsxdifnidn r3, r3d + pxor m7, m7 +.loop: + mova m0, [r1] + mova m1, m0 + mova m2, m0 + punpcklbw m0, m7 + punpckhbw m1, m7 + pshufw m5, m0, 0x90 + pshufw m6, m0, 0x41 + mova m3, m2 + mova m4, m2 + psllq m2, 8 + psllq m3, 16 + psllq m4, 24 + punpckhbw m2, m7 + punpckhbw m3, m7 + punpckhbw m4, m7 + paddw m5, m3 + paddw m6, m2 + paddw m5, m5 + psubw m6, m5 + pshufw m5, m0, 0x6 + pmullw m6, [pw_3] + paddw m0, m4 + paddw m5, m1 + pmullw m0, [pw_20] + psubw m0, m5 + paddw m6, [PW_ROUND] + paddw m0, m6 + psraw m0, 5 + movh m5, [r1+5] + punpcklbw m5, m7 + pshufw m6, m5, 0xf9 + paddw m1, m5 + paddw m2, m6 + pshufw m6, m5, 0xbe + pshufw m5, m5, 0x6f + paddw m3, m6 + paddw m4, m5 + paddw m2, m2 + psubw m3, m2 + pmullw m1, [pw_20] + pmullw m3, [pw_3] + psubw m3, m4 + paddw m1, [PW_ROUND] + paddw m3, m1 + psraw m3, 5 + packuswb m0, m3 + OP_MOV [r0], m0, m4 + add r1, r3 + add r0, r2 + dec r4d + jne .loop + REP_RET +%endmacro + +INIT_MMX mmxext +%define PW_ROUND pw_16 +%define OP_MOV PUT_OP +MPEG4_QPEL8_H_LOWPASS put +%define PW_ROUND pw_16 +%define OP_MOV AVG_OP +MPEG4_QPEL8_H_LOWPASS avg +%define PW_ROUND pw_15 +%define OP_MOV PUT_OP +MPEG4_QPEL8_H_LOWPASS put_no_rnd + + + +%macro QPEL_V_LOW 5 + paddw m0, m1 + mova m4, [pw_20] + pmullw m4, m0 + mova m0, %4 + mova m5, %1 + paddw m5, m0 + psubw m4, m5 + mova m5, %2 + mova m6, %3 + paddw m5, m3 + paddw m6, m2 + paddw m6, m6 + psubw m5, m6 + pmullw m5, [pw_3] + paddw m4, [PW_ROUND] + paddw m5, m4 + psraw m5, 5 + packuswb m5, m5 + OP_MOV %5, m5, m7 + SWAP 0,1,2,3 +%endmacro + +%macro MPEG4_QPEL16_V_LOWPASS 1 +cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544 + movsxdifnidn r2, r2d + movsxdifnidn r3, r3d + + mov r4d, 17 + mov r5, rsp + pxor m7, m7 +.looph: + mova m0, [r1] + mova m1, [r1] + mova m2, [r1+8] + mova m3, [r1+8] + punpcklbw m0, m7 + punpckhbw m1, m7 + punpcklbw m2, m7 + punpckhbw m3, m7 + mova [r5], m0 + mova [r5+0x88], m1 + mova [r5+0x110], m2 + mova [r5+0x198], m3 + add r5, 8 + add r1, r3 + dec r4d + jne .looph + + + ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 14*dstStride + mov r4d, 4 + mov r1, 4 + neg r2 + 
lea r1, [r1+r2*8] + lea r1, [r1+r2*4] + lea r1, [r1+r2*2] + neg r2 + mov r5, rsp +.loopv: + pxor m7, m7 + mova m0, [r5+ 0x0] + mova m1, [r5+ 0x8] + mova m2, [r5+0x10] + mova m3, [r5+0x18] + QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0] + QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2] + lea r0, [r0+r2*2] + QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0] + QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2] + lea r0, [r0+r2*2] + QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0] + QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2] + lea r0, [r0+r2*2] + QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0] + QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2] + lea r0, [r0+r2*2] + QPEL_V_LOW [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0] + QPEL_V_LOW [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2] + lea r0, [r0+r2*2] + QPEL_V_LOW [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0] + QPEL_V_LOW [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2] + lea r0, [r0+r2*2] + QPEL_V_LOW [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0] + QPEL_V_LOW [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2] + lea r0, [r0+r2*2] + QPEL_V_LOW [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0] + QPEL_V_LOW [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2] + + add r5, 0x88 + add r0, r1 + dec r4d + jne .loopv + REP_RET +%endmacro + +%macro PUT_OPH 2-3 + movh %1, %2 +%endmacro + +%macro AVG_OPH 2-3 + movh %3, %1 + pavgb %2, %3 + movh %1, %2 +%endmacro + +INIT_MMX mmxext +%define PW_ROUND pw_16 +%define OP_MOV PUT_OPH +MPEG4_QPEL16_V_LOWPASS put +%define PW_ROUND pw_16 +%define OP_MOV AVG_OPH +MPEG4_QPEL16_V_LOWPASS avg +%define PW_ROUND pw_15 +%define OP_MOV PUT_OPH +MPEG4_QPEL16_V_LOWPASS put_no_rnd + + + +%macro MPEG4_QPEL8_V_LOWPASS 1 +cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 288 + movsxdifnidn r2, r2d + movsxdifnidn r3, r3d + + mov r4d, 9 + mov r5, rsp + pxor m7, m7 +.looph: + mova m0, [r1] + mova m1, [r1] + punpcklbw m0, m7 + punpckhbw m1, m7 + mova [r5], m0 + mova [r5+0x48], m1 + add r5, 8 + add r1, r3 + dec r4d + jne .looph + + + ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 6*dstStride + mov r4d, 2 + mov r1, 4 + neg r2 + lea r1, [r1+r2*4] + lea r1, [r1+r2*2] + neg r2 + mov r5, rsp +.loopv: + pxor m7, m7 + mova m0, [r5+ 0x0] + mova m1, [r5+ 0x8] + mova m2, [r5+0x10] + mova m3, [r5+0x18] + QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0] + QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2] + lea r0, [r0+r2*2] + QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0] + QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2] + lea r0, [r0+r2*2] + QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0] + QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2] + lea r0, [r0+r2*2] + QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0] + QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2] + + add r5, 0x48 + add r0, r1 + dec r4d + jne .loopv + REP_RET +%endmacro + +INIT_MMX mmxext +%define PW_ROUND pw_16 +%define OP_MOV PUT_OPH +MPEG4_QPEL8_V_LOWPASS put +%define PW_ROUND pw_16 +%define OP_MOV AVG_OPH +MPEG4_QPEL8_V_LOWPASS avg +%define PW_ROUND pw_15 +%define OP_MOV PUT_OPH +MPEG4_QPEL8_V_LOWPASS put_no_rnd diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c index a64ec41..5037aee 100644 --- a/libavcodec/x86/vc1dsp_mmx.c +++ b/libavcodec/x86/vc1dsp_mmx.c @@ -697,7 +697,9 @@ static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize, av_cold 
void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
 {
+#if HAVE_YASM
     dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx;
+#endif /* HAVE_YASM */
     dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
     dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
     dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;
@@ -720,7 +722,9 @@ av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
 av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
 {
+#if HAVE_YASM
     dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_vc1_mspel_mc00_mmxext;
+#endif /* HAVE_YASM */
     dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext;
     dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext;
     dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext;
-- 
2.7.4