From 110d0cdc9d1ec414a658f841a3fbefbf6f796d61 Mon Sep 17 00:00:00 2001
From: Christophe Gisquet
Date: Thu, 19 Apr 2012 22:36:17 +0200
Subject: [PATCH] rv40dsp x86: MMX/MMX2/3DNow/SSE2/SSSE3 implementations of MC

Code mostly inspired by vp8's MC, with two differences:
- its MMX2 horizontal filter is worse because it cannot take advantage
  of the coefficient redundancy
- that same coefficient redundancy allows better code for the non-SSSE3
  versions

Benchmark (rounded to the nearest ten):
        V8x8  H8x8  2D8x8  V16x16  H16x16  2D16x16
C        445   358    985    1785    1559     3280
MMX*     219   271    478     714     929     1443
SSE2     131   158    294     425     515      892
SSSE3    120   122    248     387     390      763

The end result is an overall speedup of around 15% for the SSSE3
version (measured on 6 sequences); the loop filter functions now take
around 55% of decoding time, while the luma MC DSP functions are around
6%, the chroma ones around 1.3% and biweight around 2.3%.
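For reference, each of these kernels computes the standard RV40 six-tap
filter. A scalar sketch of one row, modeled on the RV40_LOWPASS template
in rv40dsp.c (the function name and signature here are illustrative
only; av_clip_uint8() is from libavutil):

    /* One row of the 6-tap subpel filter.  (C1, C2, SHIFT) is
     * (52, 20, 6), (20, 20, 5) or (20, 52, 6) depending on the subpel
     * position; the outer taps 1 and -5 are shared by all three sets,
     * which is the coefficient redundancy mentioned above.  The asm
     * tables double the whole (1, -5, 20, 20) set to (2, -10, 40, 40)
     * so that every variant can use the same shift of 6. */
    static void rv40_qpel_row_ref(uint8_t *dst, const uint8_t *src,
                                  int w, int C1, int C2, int SHIFT)
    {
        int x;
        for (x = 0; x < w; x++) {
            int v = src[x - 2] + src[x + 3]
                  - 5 * (src[x - 1] + src[x + 2])
                  + C1 * src[x] + C2 * src[x + 1];
            dst[x] = av_clip_uint8((v + (1 << (SHIFT - 1))) >> SHIFT);
        }
    }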
Signed-off-by: Diego Biurrun
---
 libavcodec/x86/dsputil_mmx.c  |  16 +++
 libavcodec/x86/dsputil_mmx.h  |   5 +
 libavcodec/x86/rv40dsp.asm    | 316 +++++++++++++++++++++++++++++++++++++++++-
 libavcodec/x86/rv40dsp_init.c | 146 +++++++++++++++++++
 4 files changed, 480 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 3ef19c5..6377a73 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -1791,6 +1791,22 @@ QPEL_2TAP(avg_, 16, 3dnow)
 QPEL_2TAP(put_, 8, 3dnow)
 QPEL_2TAP(avg_, 8, 3dnow)
 
+void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
+{
+    put_pixels8_xy2_mmx(dst, src, stride, 8);
+}
+void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
+{
+    put_pixels16_xy2_mmx(dst, src, stride, 16);
+}
+void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
+{
+    avg_pixels8_xy2_mmx(dst, src, stride, 8);
+}
+void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
+{
+    avg_pixels16_xy2_mmx(dst, src, stride, 16);
+}
 
 #if HAVE_YASM
 typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
diff --git a/libavcodec/x86/dsputil_mmx.h b/libavcodec/x86/dsputil_mmx.h
index 097739c..37f4581 100644
--- a/libavcodec/x86/dsputil_mmx.h
+++ b/libavcodec/x86/dsputil_mmx.h
@@ -199,6 +199,11 @@ void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd);
 void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd);
 
+void ff_put_rv40_qpel8_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
+void ff_put_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
+void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
+void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
+
 void ff_mmx_idct(DCTELEM *block);
 void ff_mmxext_idct(DCTELEM *block);
 
diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm
index 721d3df..e0213f4 100644
--- a/libavcodec/x86/rv40dsp.asm
+++ b/libavcodec/x86/rv40dsp.asm
@@ -1,5 +1,7 @@
 ;******************************************************************************
 ;* MMX/SSE2-optimized functions for the RV40 decoder
+;* Copyright (c) 2010 Ronald S. Bultje
+;* Copyright (c) 2010 Jason Garrett-Glaser
 ;* Copyright (C) 2012 Christophe Gisquet
 ;*
 ;* This file is part of Libav.
@@ -25,11 +27,319 @@
 SECTION_RODATA
 
 align 16
-shift_round:   times 8 dw 1 << (16 - 6)
-cextern pw_16
+pw_1024:   times 8 dw 1 << (16 - 6) ; pw_1024
+
+sixtap_filter_hb_m:  times 8 db   1, -5
+                     times 8 db  52, 20
+                     ; multiplied by 2 to have the same shift
+                     times 8 db   2, -10
+                     times 8 db  40,  40
+                     ; back to normal
+                     times 8 db   1, -5
+                     times 8 db  20, 52
+
+sixtap_filter_v_m:   times 8 dw   1
+                     times 8 dw  -5
+                     times 8 dw  52
+                     times 8 dw  20
+                     ; multiplied by 2 to have the same shift
+                     times 8 dw   2
+                     times 8 dw -10
+                     times 8 dw  40
+                     times 8 dw  40
+                     ; back to normal
+                     times 8 dw   1
+                     times 8 dw  -5
+                     times 8 dw  20
+                     times 8 dw  52
+
+%ifdef PIC
+%define sixtap_filter_hw   picregq
+%define sixtap_filter_hb   picregq
+%define sixtap_filter_v    picregq
+%define npicregs 1
+%else
+%define sixtap_filter_hw   sixtap_filter_hw_m
+%define sixtap_filter_hb   sixtap_filter_hb_m
+%define sixtap_filter_v    sixtap_filter_v_m
+%define npicregs 0
+%endif
+
+filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11
+
+cextern pw_32
+cextern pw_16
+cextern pw_512
 
 SECTION .text
 
+;-----------------------------------------------------------------------------
+; subpel MC functions:
+;
+; void [put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
+;                                      uint8_t *src, int srcstride,
+;                                      int len, int m);
+;-----------------------------------------------------------------------------
+%macro LOAD 2
+%if WIN64
+    movsxd    %1q, %1d
+%endif
+%ifdef PIC
+    add       %1q, picregq
+%else
+    add       %1q, %2
+%endif
+%endmacro
+
+%macro STORE 3
+%ifidn %3, avg
+    movh      %2, [dstq]
+%endif
+    packuswb  %1, %1
+%ifidn %3, avg
+%if cpuflag(3dnow)
+    pavgusb   %1, %2
+%else
+    pavgb     %1, %2
+%endif
+%endif
+    movh  [dstq], %1
+%endmacro
+
+%macro FILTER_V 1
+cglobal %1_rv40_qpel_v, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, my, picreg
+%ifdef PIC
+    lea  picregq, [sixtap_filter_v_m]
+%endif
+    pxor      m7, m7
+    LOAD      my, sixtap_filter_v
+
+    ; read 5 lines
+    sub     srcq, srcstrideq
+    sub     srcq, srcstrideq
+    movh      m0, [srcq]
+    movh      m1, [srcq+srcstrideq]
+    movh      m2, [srcq+srcstrideq*2]
+    lea     srcq, [srcq+srcstrideq*2]
+    add     srcq, srcstrideq
+    movh      m3, [srcq]
+    movh      m4, [srcq+srcstrideq]
+    punpcklbw m0, m7
+    punpcklbw m1, m7
+    punpcklbw m2, m7
+    punpcklbw m3, m7
+    punpcklbw m4, m7
+
+%ifdef m8
+    mova      m8, [myq+ 0]
+    mova      m9, [myq+16]
+    mova     m10, [myq+32]
+    mova     m11, [myq+48]
+%define COEFF05  m8
+%define COEFF14  m9
+%define COEFF2   m10
+%define COEFF3   m11
+%else
+%define COEFF05  [myq+ 0]
+%define COEFF14  [myq+16]
+%define COEFF2   [myq+32]
+%define COEFF3   [myq+48]
+%endif
+.nextrow:
+    mova      m6, m1
+    movh      m5, [srcq+2*srcstrideq]      ; read new row
+    paddw     m6, m4
+    punpcklbw m5, m7
+    pmullw    m6, COEFF14
+    paddw     m0, m5
+    pmullw    m0, COEFF05
+    paddw     m6, m0
+    mova      m0, m1
+    paddw     m6, [pw_32]
+    mova      m1, m2
+    pmullw    m2, COEFF2
+    paddw     m6, m2
+    mova      m2, m3
+    pmullw    m3, COEFF3
+    paddw     m6, m3
+
+    ; round/clip/store
+    mova      m3, m4
+    psraw     m6, 6
+    mova      m4, m5
+    STORE     m6, m5, %1
+
+    ; go to next line
+    add     dstq, dststrideq
+    add     srcq, srcstrideq
+    dec  heightd                           ; next row
+    jg .nextrow
+    REP_RET
+%endmacro
+
+%macro FILTER_H 1
+cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
+%ifdef PIC
+    lea  picregq, [sixtap_filter_v_m]
+%endif
+    pxor      m7, m7
+    LOAD      mx, sixtap_filter_v
+    mova      m6, [pw_32]
+%ifdef m8
+    mova      m8, [mxq+ 0]
+    mova      m9, [mxq+16]
+    mova     m10, [mxq+32]
+    mova     m11, [mxq+48]
+%define COEFF05  m8
+%define COEFF14  m9
+%define COEFF2   m10
+%define COEFF3   m11
+%else
+%define COEFF05  [mxq+ 0]
+%define COEFF14  [mxq+16]
+%define COEFF2   [mxq+32]
+%define COEFF3   [mxq+48]
+%endif
+.nextrow:
+    movq      m0, [srcq-2]
+    movq      m5, [srcq+3]
+    movq      m1, [srcq-1]
+    movq      m4, [srcq+2]
+    punpcklbw m0, m7
+    punpcklbw m5, m7
+    punpcklbw m1, m7
+    punpcklbw m4, m7
+    movq      m2, [srcq-0]
+    movq      m3, [srcq+1]
+    paddw     m0, m5
+    paddw     m1, m4
+    punpcklbw m2, m7
+    punpcklbw m3, m7
+    pmullw    m0, COEFF05
+    pmullw    m1, COEFF14
+    pmullw    m2, COEFF2
+    pmullw    m3, COEFF3
+    paddw     m0, m6
+    paddw     m1, m2
+    paddw     m0, m3
+    paddw     m0, m1
+    psraw     m0, 6
+    STORE     m0, m1, %1
+
+    ; go to next line
+    add     dstq, dststrideq
+    add     srcq, srcstrideq
+    dec  heightd                           ; next row
+    jg .nextrow
+    REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+FILTER_V put
+FILTER_H put
+
+INIT_MMX mmx2
+FILTER_V avg
+FILTER_H avg
+
+INIT_MMX 3dnow
+FILTER_V avg
+FILTER_H avg
+%endif
+
+INIT_XMM sse2
+FILTER_H put
+FILTER_H avg
+FILTER_V put
+FILTER_V avg
+
+%macro FILTER_SSSE3 1
+cglobal %1_rv40_qpel_v, 6, 6+npicregs, 8, dst, dststride, src, srcstride, height, my, picreg
+%ifdef PIC
+    lea  picregq, [sixtap_filter_hb_m]
+%endif
+
+    ; read 5 lines
+    sub     srcq, srcstrideq
+    LOAD      my, sixtap_filter_hb
+    sub     srcq, srcstrideq
+    movh      m0, [srcq]
+    movh      m1, [srcq+srcstrideq]
+    movh      m2, [srcq+srcstrideq*2]
+    lea     srcq, [srcq+srcstrideq*2]
+    add     srcq, srcstrideq
+    mova      m5, [myq]
+    movh      m3, [srcq]
+    movh      m4, [srcq+srcstrideq]
+    lea     srcq, [srcq+2*srcstrideq]
+
+.nextrow:
+    mova      m6, m2
+    punpcklbw m0, m1
+    punpcklbw m6, m3
+    pmaddubsw m0, m5
+    pmaddubsw m6, [myq+16]
+    movh      m7, [srcq]                   ; read new row
+    paddw     m6, m0
+    mova      m0, m1
+    mova      m1, m2
+    mova      m2, m3
+    mova      m3, m4
+    mova      m4, m7
+    punpcklbw m7, m3
+    pmaddubsw m7, m5
+    paddw     m6, m7
+    pmulhrsw  m6, [pw_512]
+    STORE     m6, m7, %1
+
+    ; go to next line
+    add     dstq, dststrideq
+    add     srcq, srcstrideq
+    dec  heightd                           ; next row
+    jg .nextrow
+    REP_RET
+
+cglobal %1_rv40_qpel_h, 6, 6+npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
+%ifdef PIC
+    lea  picregq, [sixtap_filter_hb_m]
+%endif
+    mova      m3, [filter_h6_shuf2]
+    mova      m4, [filter_h6_shuf3]
+    LOAD      mx, sixtap_filter_hb
+    mova      m5, [mxq]                    ; set up 6tap filter in bytes
+    mova      m6, [mxq+16]
+    mova      m7, [filter_h6_shuf1]
+
+.nextrow:
+    movu      m0, [srcq-2]
+    mova      m1, m0
+    mova      m2, m0
+    pshufb    m0, m7
+    pshufb    m1, m3
+    pshufb    m2, m4
+    pmaddubsw m0, m5
+    pmaddubsw m1, m6
+    pmaddubsw m2, m5
+    paddw     m0, m1
+    paddw     m0, m2
+    pmulhrsw  m0, [pw_512]
+    STORE     m0, m1, %1
+
+    ; go to next line
+    add     dstq, dststrideq
+    add     srcq, srcstrideq
+    dec  heightd                           ; next row
+    jg .nextrow
+    REP_RET
+%endmacro
+
+INIT_XMM ssse3
+FILTER_SSSE3 put
+FILTER_SSSE3 avg
+
 ; %1=5bits weights?, %2=dst %3=src1 %4=src3 %5=stride if sse2
 %macro RV40_WCORE 4-5
     movh       m4, [%3 + r6 + 0]
@@ -143,7 +453,7 @@ SECTION .text
 %macro RV40_WEIGHT 3
 cglobal rv40_weight_func_%1_%2, 6, 7, 8
 %if cpuflag(ssse3)
-    mova       m1, [shift_round]
+    mova       m1, [pw_1024]
 %else
     mova       m1, [pw_16]
 %endif
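A note on the rounding constants used above: the SSSE3 paths fold the
final "add 32, shift right by 6" rounding into a single pmulhrsw
against pw_512, instead of the separate paddw/psraw pair the other
versions use. A scalar model of the instruction, per signed 16-bit
lane (illustrative only, not part of the patch):

    /* SSSE3 pmulhrsw: multiply two signed words, then shift the
     * product right by 15 with round-to-nearest. */
    static inline int16_t pmulhrsw_lane(int16_t a, int16_t b)
    {
        return (int16_t)((a * b + 0x4000) >> 15);
    }

With b = 512 = 1 << 9 this computes (a + 32) >> 6, exactly the filter's
rounding; likewise the weight functions' pw_1024 (1 << 10) yields
(a + 16) >> 5.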
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index df468aa..3f42363 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -22,8 +22,11 @@
 /**
  * @file
  * RV40 decoder motion compensation functions x86-optimised
+ * 2,0 and 0,2 have h264 equivalents.
+ * 3,3 is bugged in the rv40 format and maps to the _xy2 version.
  */
 
+#include "libavcodec/x86/dsputil_mmx.h"
 #include "libavcodec/rv34dsp.h"
 
 void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
@@ -53,6 +56,132 @@ DECLARE_WEIGHT(mmx)
 DECLARE_WEIGHT(sse2)
 DECLARE_WEIGHT(ssse3)
 
+/** @{ */
+/**
+ * Define one qpel function.
+ * LOOPSIZE must already be set to the number of pixels processed per
+ * iteration in the inner loop of the called functions.
+ * COFF(x) must already be defined to provide the offset into any
+ * array of coefficients used by the called function for qpel position x.
+ */
+#define QPEL_FUNC_DECL(OP, SIZE, PH, PV, OPT)                           \
+static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst,  \
+                                                         uint8_t *src,  \
+                                                         int stride)    \
+{                                                                       \
+    int i;                                                              \
+    if (PH && PV) {                                                     \
+        DECLARE_ALIGNED(16, uint8_t, tmp)[SIZE * (SIZE + 5)];           \
+        uint8_t *tmpptr = tmp + SIZE * 2;                               \
+        src -= stride * 2;                                              \
+                                                                        \
+        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
+            ff_put_rv40_qpel_h ##OPT(tmp + i, SIZE, src + i, stride,    \
+                                     SIZE + 5, HCOFF(PH));              \
+        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
+            ff_ ##OP ##rv40_qpel_v ##OPT(dst + i, stride, tmpptr + i,   \
+                                         SIZE, SIZE, VCOFF(PV));        \
+    } else if (PV) {                                                    \
+        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
+            ff_ ##OP ##rv40_qpel_v ## OPT(dst + i, stride, src + i,     \
+                                          stride, SIZE, VCOFF(PV));     \
+    } else {                                                            \
+        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
+            ff_ ##OP ##rv40_qpel_h ## OPT(dst + i, stride, src + i,     \
+                                          stride, SIZE, HCOFF(PH));     \
+    }                                                                   \
+}
+
+/** Declare functions for sizes 8 and 16 and given operations
+ *  and qpel position. */
+#define QPEL_FUNCS_DECL(OP, PH, PV, OPT) \
+    QPEL_FUNC_DECL(OP,  8, PH, PV, OPT)  \
+    QPEL_FUNC_DECL(OP, 16, PH, PV, OPT)
+
+/** Declare all functions for all sizes and qpel positions */
+#define QPEL_MC_DECL(OP, OPT)                                         \
+void ff_ ##OP ##rv40_qpel_h ##OPT(uint8_t *dst, ptrdiff_t dstStride,  \
+                                  const uint8_t *src,                 \
+                                  ptrdiff_t srcStride,                 \
+                                  int len, int m);                     \
+void ff_ ##OP ##rv40_qpel_v ##OPT(uint8_t *dst, ptrdiff_t dstStride,  \
+                                  const uint8_t *src,                 \
+                                  ptrdiff_t srcStride,                 \
+                                  int len, int m);                     \
+QPEL_FUNCS_DECL(OP, 0, 1, OPT)                                         \
+QPEL_FUNCS_DECL(OP, 0, 3, OPT)                                         \
+QPEL_FUNCS_DECL(OP, 1, 0, OPT)                                         \
+QPEL_FUNCS_DECL(OP, 1, 1, OPT)                                         \
+QPEL_FUNCS_DECL(OP, 1, 2, OPT)                                         \
+QPEL_FUNCS_DECL(OP, 1, 3, OPT)                                         \
+QPEL_FUNCS_DECL(OP, 2, 1, OPT)                                         \
+QPEL_FUNCS_DECL(OP, 2, 2, OPT)                                         \
+QPEL_FUNCS_DECL(OP, 2, 3, OPT)                                         \
+QPEL_FUNCS_DECL(OP, 3, 0, OPT)                                         \
+QPEL_FUNCS_DECL(OP, 3, 1, OPT)                                         \
+QPEL_FUNCS_DECL(OP, 3, 2, OPT)
+/** @} */
+
+#define LOOPSIZE  8
+#define HCOFF(x)  (32 * (x - 1))
+#define VCOFF(x)  (32 * (x - 1))
+QPEL_MC_DECL(put_, _ssse3)
+QPEL_MC_DECL(avg_, _ssse3)
+
+#undef LOOPSIZE
+#undef HCOFF
+#undef VCOFF
+#define LOOPSIZE  8
+#define HCOFF(x)  (64 * (x - 1))
+#define VCOFF(x)  (64 * (x - 1))
+QPEL_MC_DECL(put_, _sse2)
+QPEL_MC_DECL(avg_, _sse2)
+
+#if ARCH_X86_32
+#undef LOOPSIZE
+#undef HCOFF
+#undef VCOFF
+#define LOOPSIZE  4
+#define HCOFF(x)  (64 * (x - 1))
+#define VCOFF(x)  (64 * (x - 1))
+
+QPEL_MC_DECL(put_, _mmx)
+
+#define ff_put_rv40_qpel_h_mmx2  ff_put_rv40_qpel_h_mmx
+#define ff_put_rv40_qpel_v_mmx2  ff_put_rv40_qpel_v_mmx
+QPEL_MC_DECL(avg_, _mmx2)
+
+#define ff_put_rv40_qpel_h_3dnow ff_put_rv40_qpel_h_mmx
+#define ff_put_rv40_qpel_v_3dnow ff_put_rv40_qpel_v_mmx
+QPEL_MC_DECL(avg_, _3dnow)
+#endif
+
+/** @{ */
+/** Set one function */
+#define QPEL_FUNC_SET(OP, SIZE, PH, PV, OPT) \
+    c-> OP ## pixels_tab[2 - SIZE / 8][4 * PV + PH] = OP ## rv40_qpel ##SIZE ## _mc ##PH ##PV ##OPT;
+
+/** Set functions put and avg for sizes 8 and 16 and a given qpel
+ *  position */
+#define QPEL_FUNCS_SET(OP, PH, PV, OPT) \
+    QPEL_FUNC_SET(OP,  8, PH, PV, OPT)  \
+    QPEL_FUNC_SET(OP, 16, PH, PV, OPT)
+
+/** Set all functions for all sizes and qpel positions */
+#define QPEL_MC_SET(OP, OPT)   \
+QPEL_FUNCS_SET (OP, 0, 1, OPT) \
+QPEL_FUNCS_SET (OP, 0, 3, OPT) \
+QPEL_FUNCS_SET (OP, 1, 0, OPT) \
+QPEL_FUNCS_SET (OP, 1, 1, OPT) \
+QPEL_FUNCS_SET (OP, 1, 2, OPT) \
+QPEL_FUNCS_SET (OP, 1, 3, OPT) \
+QPEL_FUNCS_SET (OP, 2, 1, OPT) \
+QPEL_FUNCS_SET (OP, 2, 2, OPT) \
+QPEL_FUNCS_SET (OP, 2, 3, OPT) \
+QPEL_FUNCS_SET (OP, 3, 0, OPT) \
+QPEL_FUNCS_SET (OP, 3, 1, OPT) \
+QPEL_FUNCS_SET (OP, 3, 2, OPT)
+/** @} */
+
 void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
 {
 #if HAVE_YASM
@@ -65,25 +194,42 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
         c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmx;
         c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmx;
         c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmx;
+        c->put_pixels_tab[0][15]        = ff_put_rv40_qpel16_mc33_mmx;
+        c->put_pixels_tab[1][15]        = ff_put_rv40_qpel8_mc33_mmx;
+        c->avg_pixels_tab[0][15]        = ff_avg_rv40_qpel16_mc33_mmx;
+        c->avg_pixels_tab[1][15]        = ff_avg_rv40_qpel8_mc33_mmx;
+#if ARCH_X86_32
+        QPEL_MC_SET(put_, _mmx)
+#endif
     }
     if (mm_flags & AV_CPU_FLAG_MMX2) {
         c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmx2;
        c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmx2;
+#if ARCH_X86_32
+        QPEL_MC_SET(avg_, _mmx2)
+#endif
     } else if (mm_flags & AV_CPU_FLAG_3DNOW) {
         c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow;
         c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow;
+#if ARCH_X86_32
+        QPEL_MC_SET(avg_, _3dnow)
+#endif
     }
     if (mm_flags & AV_CPU_FLAG_SSE2) {
         c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
         c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
         c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
         c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2;
+        QPEL_MC_SET(put_, _sse2)
+        QPEL_MC_SET(avg_, _sse2)
     }
     if (mm_flags & AV_CPU_FLAG_SSSE3) {
         c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
         c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
         c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
         c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3;
+        QPEL_MC_SET(put_, _ssse3)
+        QPEL_MC_SET(avg_, _ssse3)
     }
 #endif
 }
-- 
2.7.4
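To make the macro machinery in rv40dsp_init.c concrete, here is roughly
what QPEL_FUNC_DECL(put_, 16, 1, 2, _ssse3) expands to, with
LOOPSIZE == 8 and HCOFF(x)/VCOFF(x) == 32 * (x - 1) already folded
(hand-expanded for illustration, not part of the patch):

    static void put_rv40_qpel16_mc12_ssse3(uint8_t *dst, uint8_t *src,
                                           int stride)
    {
        int i;
        /* 5 extra rows of horizontally filtered context for the
         * vertical pass */
        DECLARE_ALIGNED(16, uint8_t, tmp)[16 * (16 + 5)];
        uint8_t *tmpptr = tmp + 16 * 2;
        src -= stride * 2;

        for (i = 0; i < 16; i += 8)     /* horizontal pass, HCOFF(1) == 0 */
            ff_put_rv40_qpel_h_ssse3(tmp + i, 16, src + i, stride,
                                     16 + 5, 0);
        for (i = 0; i < 16; i += 8)     /* vertical pass, VCOFF(2) == 32 */
            ff_put_rv40_qpel_v_ssse3(dst + i, stride, tmpptr + i,
                                     16, 16, 32);
    }

Pure-horizontal and pure-vertical positions skip the temporary buffer
and filter straight from src to dst.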