From 5693c083560463156f01d8a7362e30bf2e0ea238 Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Fri, 30 Sep 2005 02:31:47 +0000 Subject: [PATCH] sse2 16x16 sum squared diff (306=>268 cycles on a K8) faster 8x8 mmx ssd (77=>70 cycles) Originally committed as revision 4623 to svn://svn.ffmpeg.org/ffmpeg/trunk --- libavcodec/i386/dsputil_mmx.c | 90 +++++++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 5 deletions(-) diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c index 51ed07cd1..12d9900f6 100644 --- a/libavcodec/i386/dsputil_mmx.c +++ b/libavcodec/i386/dsputil_mmx.c @@ -744,31 +744,49 @@ static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int int tmp; asm volatile ( "movl %4,%%ecx\n" + "shr $1,%%ecx\n" "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ "1:\n" - "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */ - "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */ + "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */ + "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */ + "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */ + "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */ + /* todo: mm1-mm2, mm3-mm4 */ + /* algo: substract mm1 from mm2 with saturation and vice versa */ + /* OR the results to get absolute difference */ "movq %%mm1,%%mm5\n" + "movq %%mm3,%%mm6\n" "psubusb %%mm2,%%mm1\n" + "psubusb %%mm4,%%mm3\n" "psubusb %%mm5,%%mm2\n" + "psubusb %%mm6,%%mm4\n" "por %%mm1,%%mm2\n" + "por %%mm3,%%mm4\n" + /* now convert to 16-bit vectors so we can square them */ "movq %%mm2,%%mm1\n" + "movq %%mm4,%%mm3\n" "punpckhbw %%mm0,%%mm2\n" + "punpckhbw %%mm0,%%mm4\n" "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ + "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ "pmaddwd %%mm2,%%mm2\n" + "pmaddwd %%mm4,%%mm4\n" "pmaddwd %%mm1,%%mm1\n" + "pmaddwd %%mm3,%%mm3\n" - "add %3,%0\n" - "add %3,%1\n" + "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */ + "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */ "paddd %%mm2,%%mm1\n" + "paddd %%mm4,%%mm3\n" "paddd %%mm1,%%mm7\n" + "paddd %%mm3,%%mm7\n" "decl %%ecx\n" "jnz 1b\n" @@ -843,6 +861,68 @@ static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int return tmp; } +static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { + int tmp; + asm volatile ( + "shr $1,%2\n" + "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */ + "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */ + "1:\n" + "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */ + "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */ + "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */ + "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */ + + /* todo: mm1-mm2, mm3-mm4 */ + /* algo: substract mm1 from mm2 with saturation and vice versa */ + /* OR the results to get absolute difference */ + "movdqa %%xmm1,%%xmm5\n" + "movdqa %%xmm3,%%xmm6\n" + "psubusb %%xmm2,%%xmm1\n" + "psubusb %%xmm4,%%xmm3\n" + "psubusb %%xmm5,%%xmm2\n" + "psubusb %%xmm6,%%xmm4\n" + + "por %%xmm1,%%xmm2\n" + "por %%xmm3,%%xmm4\n" + + /* now convert to 16-bit vectors so we can square them */ + "movdqa %%xmm2,%%xmm1\n" + "movdqa %%xmm4,%%xmm3\n" + + "punpckhbw %%xmm0,%%xmm2\n" + "punpckhbw %%xmm0,%%xmm4\n" + "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */ + "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */ + + "pmaddwd %%xmm2,%%xmm2\n" + "pmaddwd %%xmm4,%%xmm4\n" + "pmaddwd %%xmm1,%%xmm1\n" + "pmaddwd %%xmm3,%%xmm3\n" + + "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */ + "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */ + + "paddd %%xmm2,%%xmm1\n" + "paddd %%xmm4,%%xmm3\n" + "paddd %%xmm1,%%xmm7\n" + "paddd %%xmm3,%%xmm7\n" + + "decl %2\n" + "jnz 1b\n" + + "movdqa %%xmm7,%%xmm1\n" + "psrldq $8, %%xmm7\n" /* shift hi qword to lo */ + "paddd %%xmm1,%%xmm7\n" + "movdqa %%xmm7,%%xmm1\n" + "psrldq $4, %%xmm7\n" /* shift hi dword to lo */ + "paddd %%xmm1,%%xmm7\n" + "movd %%xmm7,%3\n" + : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp) + : "r" ((long)line_size)); + return tmp; +} + static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { int tmp; asm volatile ( @@ -2626,7 +2706,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->hadamard8_diff[1]= hadamard8_diff_mmx; c->pix_norm1 = pix_norm1_mmx; - c->sse[0] = sse16_mmx; + c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx; c->sse[1] = sse8_mmx; c->vsad[4]= vsad_intra16_mmx; -- 2.34.1