From b1159ad92818cd8f0885d252b0800f5960fe7241 Mon Sep 17 00:00:00 2001
From: Loren Merritt
Date: Sat, 5 Dec 2009 15:09:10 +0000
Subject: [PATCH] refactor and optimize scalarproduct 29-105% faster
 apply_filter, 6-90% faster ape decoding on core2 (Any x86 other than core2
 probably gets much less, since this is mostly due to ssse3 cachesplit
 avoidance and I haven't written the full gamut of other cachesplit modes.)
 9-123% faster ape decoding on G4.

Originally committed as revision 20739 to svn://svn.ffmpeg.org/ffmpeg/trunk
---
Review note: the mmx2/sse2 scalarproduct_and_madd_int16 prologue is declared
as "cglobal ..., 4,4,8" (4 loaded arguments). The body uses orderq, which is
the 4th argument; declaring only 3 loaded arguments leaves orderq unset on
ABIs that pass arguments on the stack (x86_32).

 libavcodec/apedec.c             |  12 +--
 libavcodec/dsputil.c            |  25 +++---
 libavcodec/dsputil.h            |  18 ++---
 libavcodec/ppc/int_altivec.c    |  66 ++++++++--------
 libavcodec/x86/dsputil_mmx.c    |  19 +++--
 libavcodec/x86/dsputil_yasm.asm | 164 +++++++++++++++++++++++++++++++---------
 6 files changed, 193 insertions(+), 111 deletions(-)

diff --git a/libavcodec/apedec.c b/libavcodec/apedec.c
index b8d1e9e..c27d086 100644
--- a/libavcodec/apedec.c
+++ b/libavcodec/apedec.c
@@ -648,22 +648,16 @@ static void init_filter(APEContext * ctx, APEFilter *f, int16_t * buf, int order
     do_init_filter(&f[1], buf + order * 3 + HISTORY_SIZE, order);
 }
 
-static inline void do_apply_filter(APEContext * ctx, int version, APEFilter *f, int32_t *data, int count, int order, int fracbits)
+static void do_apply_filter(APEContext * ctx, int version, APEFilter *f, int32_t *data, int count, int order, int fracbits)
 {
     int res;
     int absres;
 
     while (count--) {
         /* round fixedpoint scalar product */
-        res = (ctx->dsp.scalarproduct_int16(f->delay - order, f->coeffs, order, 0) + (1 << (fracbits - 1))) >> fracbits;
-
-        if (*data < 0)
-            ctx->dsp.add_int16(f->coeffs, f->adaptcoeffs - order, order);
-        else if (*data > 0)
-            ctx->dsp.sub_int16(f->coeffs, f->adaptcoeffs - order, order);
-
+        res = ctx->dsp.scalarproduct_and_madd_int16(f->coeffs, f->delay - order, f->adaptcoeffs - order, order, APESIGN(*data));
+        res = (res + (1 << (fracbits - 1))) >> fracbits;
         res += *data;
-
         *data++ = res;
 
         /* Update the output history */
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index a04b8a4..ffa8cec 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -4298,18 +4298,6 @@ void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, i
     }
 }
 
-static void add_int16_c(int16_t * v1, int16_t * v2, int order)
-{
-    while (order--)
-        *v1++ += *v2++;
-}
-
-static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
-{
-    while (order--)
-        *v1++ -= *v2++;
-}
-
 static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
 {
     int res = 0;
@@ -4320,6 +4308,16 @@ static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int
     return res;
 }
 
+static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+{
+    int res = 0;
+    while (order--) {
+        res += *v1 * *v2++;
+        *v1++ += mul * *v3++;
+    }
+    return res;
+}
+
 #define W0 2048
 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
@@ -4848,9 +4846,8 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     c->vector_clipf = vector_clipf_c;
     c->float_to_int16 = ff_float_to_int16_c;
     c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
-    c->add_int16 = add_int16_c;
-    c->sub_int16 = sub_int16_c;
     c->scalarproduct_int16 = scalarproduct_int16_c;
+    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
     c->scalarproduct_float = scalarproduct_float_c;
     c->butterflies_float = butterflies_float_c;
     c->vector_fmul_scalar = vector_fmul_scalar_c;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 4079396..e483276 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -560,23 +560,19 @@ typedef struct DSPContext {
     void (*x8_setup_spatial_compensation)(uint8_t *src, uint8_t *dst, int linesize,
            int * range, int * sum, int edges);
 
-    /* ape functions */
-    /**
-     * Add contents of the second vector to the first one.
-     * @param len length of vectors, should be multiple of 16
-     */
-    void (*add_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len);
-    /**
-     * Add contents of the second vector to the first one.
-     * @param len length of vectors, should be multiple of 16
-     */
-    void (*sub_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len);
     /**
      * Calculate scalar product of two vectors.
      * @param len length of vectors, should be multiple of 16
      * @param shift number of bits to discard from product
      */
     int32_t (*scalarproduct_int16)(int16_t *v1, int16_t *v2/*align 16*/, int len, int shift);
+    /* ape functions */
+    /**
+     * Calculate scalar product of v1 and v2,
+     * and v1[i] += v3[i] * mul
+     * @param len length of vectors, should be multiple of 16
+     */
+    int32_t (*scalarproduct_and_madd_int16)(int16_t *v1/*align 16*/, int16_t *v2, int16_t *v3, int len, int mul);
 
     /* rv30 functions */
     qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
diff --git a/libavcodec/ppc/int_altivec.c b/libavcodec/ppc/int_altivec.c
index d76a220..4f7529f 100644
--- a/libavcodec/ppc/int_altivec.c
+++ b/libavcodec/ppc/int_altivec.c
@@ -79,34 +79,6 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
     return u.score[3];
 }
 
-static void add_int16_altivec(int16_t * v1, int16_t * v2, int order)
-{
-    int i;
-    register vec_s16 vec, *pv;
-
-    for(i = 0; i < order; i += 8){
-        pv = (vec_s16*)v2;
-        vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2));
-        vec_st(vec_add(vec_ld(0, v1), vec), 0, v1);
-        v1 += 8;
-        v2 += 8;
-    }
-}
-
-static void sub_int16_altivec(int16_t * v1, int16_t * v2, int order)
-{
-    int i;
-    register vec_s16 vec, *pv;
-
-    for(i = 0; i < order; i += 8){
-        pv = (vec_s16*)v2;
-        vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2));
-        vec_st(vec_sub(vec_ld(0, v1), vec), 0, v1);
-        v1 += 8;
-        v2 += 8;
-    }
-}
-
 static int32_t scalarproduct_int16_altivec(int16_t * v1, int16_t * v2, int order, const int shift)
 {
     int i;
@@ -137,10 +109,44 @@ static int32_t scalarproduct_int16_altivec(int16_t * v1, int16_t * v2, int order
     return ires;
 }
 
+static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+{
+    LOAD_ZERO;
+    vec_s16 *pv1 = (vec_s16*)v1;
+    vec_s16 *pv2 = (vec_s16*)v2;
+    vec_s16 *pv3 = (vec_s16*)v3;
+    register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul};
+    register vec_s16 t0, t1, i0, i1;
+    register vec_s16 i2 = pv2[0], i3 = pv3[0];
+    register vec_s32 res = zero_s32v;
+    register vec_u8 align = vec_lvsl(0, v2);
+    int32_t ires;
+    order >>= 4;
+    do {
+        t0 = vec_perm(i2, pv2[1], align);
+        i2 = pv2[2];
+        t1 = vec_perm(pv2[1], i2, align);
+        i0 = pv1[0];
+        i1 = pv1[1];
+        res = vec_msum(t0, i0, res);
+        res = vec_msum(t1, i1, res);
+        t0 = vec_perm(i3, pv3[1], align);
+        i3 = pv3[2];
+        t1 = vec_perm(pv3[1], i3, align);
+        pv1[0] = vec_mladd(t0, muls, i0);
+        pv1[1] = vec_mladd(t1, muls, i1);
+        pv1 += 2;
+        pv2 += 2;
+        pv3 += 2;
+    } while(--order);
+    res = vec_splat(vec_sums(res, zero_s32v), 3);
+    vec_ste(res, 0, &ires);
+    return ires;
+}
+
 void int_init_altivec(DSPContext* c, AVCodecContext *avctx)
 {
     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec;
-    c->add_int16 = add_int16_altivec;
-    c->sub_int16 = sub_int16_altivec;
     c->scalarproduct_int16 = scalarproduct_int16_altivec;
+    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
 }
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 93d4af5..66c3a00 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2384,12 +2384,11 @@ static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
-void ff_add_int16_mmx2(int16_t * v1, int16_t * v2, int order);
-void ff_add_int16_sse2(int16_t * v1, int16_t * v2, int order);
-void ff_sub_int16_mmx2(int16_t * v1, int16_t * v2, int order);
-void ff_sub_int16_sse2(int16_t * v1, int16_t * v2, int order);
-int32_t ff_scalarproduct_int16_mmx2(int16_t * v1, int16_t * v2, int order, int shift);
-int32_t ff_scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift);
+int32_t ff_scalarproduct_int16_mmx2(int16_t *v1, int16_t *v2, int order, int shift);
+int32_t ff_scalarproduct_int16_sse2(int16_t *v1, int16_t *v2, int order, int shift);
+int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
@@ -2951,9 +2950,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         }
         if(mm_flags & FF_MM_MMX2){
 #if HAVE_YASM
-            c->add_int16 = ff_add_int16_mmx2;
-            c->sub_int16 = ff_sub_int16_mmx2;
             c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
+            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
 #endif
         }
         if(mm_flags & FF_MM_SSE){
@@ -2975,11 +2973,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             c->float_to_int16 = float_to_int16_sse2;
             c->float_to_int16_interleave = float_to_int16_interleave_sse2;
 #if HAVE_YASM
-            c->add_int16 = ff_add_int16_sse2;
-            c->sub_int16 = ff_sub_int16_sse2;
             c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
+            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
 #endif
         }
+        if((mm_flags & FF_MM_SSSE3) && !(mm_flags & (FF_MM_SSE42|FF_MM_3DNOW)) && HAVE_YASM) // cachesplit
+            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
     }
 
     if (CONFIG_ENCODERS)
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index c8a4230..96080be 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -100,43 +100,7 @@ FLOAT_TO_INT16_INTERLEAVE6 3dn2
 
 
 %macro SCALARPRODUCT 1
-; void add_int16(int16_t * v1, int16_t * v2, int order)
-cglobal add_int16_%1, 3,3,2, v1, v2, order
-    shl orderq, 1
-    add v1q, orderq
-    add v2q, orderq
-    neg orderq
-.loop:
-    movu m0, [v2q + orderq]
-    movu m1, [v2q + orderq + mmsize]
-    paddw m0, [v1q + orderq]
-    paddw m1, [v1q + orderq + mmsize]
-    mova [v1q + orderq], m0
-    mova [v1q + orderq + mmsize], m1
-    add orderq, mmsize*2
-    jl .loop
-    REP_RET
-
-; void sub_int16(int16_t * v1, int16_t * v2, int order)
-cglobal sub_int16_%1, 3,3,4, v1, v2, order
-    shl orderq, 1
-    add v1q, orderq
-    add v2q, orderq
-    neg orderq
-.loop:
-    movu m2, [v2q + orderq]
-    movu m3, [v2q + orderq + mmsize]
-    mova m0, [v1q + orderq]
-    mova m1, [v1q + orderq + mmsize]
-    psubw m0, m2
-    psubw m1, m3
-    mova [v1q + orderq], m0
-    mova [v1q + orderq + mmsize], m1
-    add orderq, mmsize*2
-    jl .loop
-    REP_RET
-
-; int scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
+; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
 cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
     shl orderq, 1
     add v1q, orderq
@@ -165,6 +129,51 @@ cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
     paddd m2, m0
     movd eax, m2
     RET
+
+; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
+    shl orderq, 1
+    movd m7, mulm
+%if mmsize == 16
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+%else
+    pshufw m7, m7, 0
+%endif
+    pxor m6, m6
+    add v1q, orderq
+    add v2q, orderq
+    add v3q, orderq
+    neg orderq
+.loop:
+    movu m0, [v2q + orderq]
+    movu m1, [v2q + orderq + mmsize]
+    mova m4, [v1q + orderq]
+    mova m5, [v1q + orderq + mmsize]
+    movu m2, [v3q + orderq]
+    movu m3, [v3q + orderq + mmsize]
+    pmaddwd m0, m4
+    pmaddwd m1, m5
+    pmullw m2, m7
+    pmullw m3, m7
+    paddd m6, m0
+    paddd m6, m1
+    paddw m2, m4
+    paddw m3, m5
+    mova [v1q + orderq], m2
+    mova [v1q + orderq + mmsize], m3
+    add orderq, mmsize*2
+    jl .loop
+%if mmsize == 16
+    movhlps m0, m6
+    paddd m6, m0
+    pshuflw m0, m6, 0x4e
+%else
+    pshufw m0, m6, 0x4e
+%endif
+    paddd m6, m0
+    movd eax, m6
+    RET
 %endmacro
 
 INIT_MMX
@@ -172,6 +181,87 @@ SCALARPRODUCT mmx2
 INIT_XMM
 SCALARPRODUCT sse2
 
+%macro SCALARPRODUCT_LOOP 1
+align 16
+.loop%1:
+    sub orderq, mmsize*2
+%if %1
+    mova m1, m4
+    mova m4, [v2q + orderq]
+    mova m0, [v2q + orderq + mmsize]
+    palignr m1, m0, %1
+    palignr m0, m4, %1
+    mova m3, m5
+    mova m5, [v3q + orderq]
+    mova m2, [v3q + orderq + mmsize]
+    palignr m3, m2, %1
+    palignr m2, m5, %1
+%else
+    mova m0, [v2q + orderq]
+    mova m1, [v2q + orderq + mmsize]
+    mova m2, [v3q + orderq]
+    mova m3, [v3q + orderq + mmsize]
+%endif
+    pmaddwd m0, [v1q + orderq]
+    pmaddwd m1, [v1q + orderq + mmsize]
+    pmullw m2, m7
+    pmullw m3, m7
+    paddw m2, [v1q + orderq]
+    paddw m3, [v1q + orderq + mmsize]
+    paddd m6, m0
+    paddd m6, m1
+    mova [v1q + orderq], m2
+    mova [v1q + orderq + mmsize], m3
+    jg .loop%1
+%if %1
+    jmp .end
+%endif
+%endmacro
+
+; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+cglobal scalarproduct_and_madd_int16_ssse3, 4,5,8, v1, v2, v3, order, mul
+    shl orderq, 1
+    movd m7, mulm
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7
+    pxor m6, m6
+    mov r4d, v2d
+    and r4d, 15
+    and v2q, ~15
+    and v3q, ~15
+    mova m4, [v2q + orderq]
+    mova m5, [v3q + orderq]
+    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
+    cmp r4d, 0
+    je .loop0
+    cmp r4d, 2
+    je .loop2
+    cmp r4d, 4
+    je .loop4
+    cmp r4d, 6
+    je .loop6
+    cmp r4d, 8
+    je .loop8
+    cmp r4d, 10
+    je .loop10
+    cmp r4d, 12
+    je .loop12
+SCALARPRODUCT_LOOP 14
+SCALARPRODUCT_LOOP 12
+SCALARPRODUCT_LOOP 10
+SCALARPRODUCT_LOOP 8
+SCALARPRODUCT_LOOP 6
+SCALARPRODUCT_LOOP 4
+SCALARPRODUCT_LOOP 2
+SCALARPRODUCT_LOOP 0
+.end:
+    movhlps m0, m6
+    paddd m6, m0
+    pshuflw m0, m6, 0x4e
+    paddd m6, m0
+    movd eax, m6
+    RET
+
 
 
 ; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
-- 
2.7.4