From: Fritz Koenig Date: Thu, 21 Oct 2010 17:53:15 +0000 (-0700) Subject: FDCT optimizations. X-Git-Tag: 1.0_branch~800 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=5f0e0617bad80d06b7231ec11814e52d6f3edba8;p=profile%2Fivi%2Flibvpx.git FDCT optimizations. Fixed up the fdct for mmx and 8x4 sse2 to match the most recent changes. Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719 --- diff --git a/vp8/encoder/x86/dct_mmx.asm b/vp8/encoder/x86/dct_mmx.asm index 5acaca8..f07b030 100644 --- a/vp8/encoder/x86/dct_mmx.asm +++ b/vp8/encoder/x86/dct_mmx.asm @@ -11,511 +11,231 @@ %include "vpx_ports/x86_abi_support.asm" -section .text - global sym(vp8_short_fdct4x4_mmx) - global sym(vp8_short_fdct8x4_wmt) - - -%define DCTCONSTANTSBITS (16) -%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1)) -%define x_c1 (60547) ; cos(pi /8) * (1<<15) -%define x_c2 (46341) ; cos(pi*2/8) * (1<<15) -%define x_c3 (25080) ; cos(pi*3/8) * (1<<15) - - ;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch) +global sym(vp8_short_fdct4x4_mmx) sym(vp8_short_fdct4x4_mmx): push rbp - mov rbp, rsp + mov rbp, rsp SHADOW_ARGS_TO_STACK 3 GET_GOT rbx - push rsi - push rdi + push rsi + push rdi ; end prolog - mov rsi, arg(0) ;input - mov rdi, arg(1) ;output - - lea rdx, [GLOBAL(dct_const_mmx)] - movsxd rax, dword ptr arg(2) ;pitch - - lea rcx, [rsi + rax*2] - ; read the input data - movq mm0, [rsi] - movq mm1, [rsi + rax ] - - movq mm2, [rcx] - movq mm3, [rcx + rax] - ; get the constants - ;shift to left by 1 for prescision - psllw mm0, 3 - psllw mm1, 3 - - psllw mm2, 3 - psllw mm3, 3 - - ; transpose for the second stage - movq mm4, mm0 ; 00 01 02 03 - movq mm5, mm2 ; 10 11 12 03 - - punpcklwd mm0, mm1 ; 00 10 01 11 - punpckhwd mm4, mm1 ; 02 12 03 13 - - punpcklwd mm2, mm3 ; 20 30 21 31 - punpckhwd mm5, mm3 ; 22 32 23 33 - - - movq mm1, mm0 ; 00 10 01 11 - punpckldq mm0, mm2 ; 00 10 20 30 - - punpckhdq mm1, mm2 ; 01 11 21 31 - - movq mm2, mm4 ; 02 12 03 13 - punpckldq mm2, mm5 ;
02 12 22 32 - - punpckhdq mm4, mm5 ; 03 13 23 33 - movq mm3, mm4 - - - ; first stage - movq mm5, mm0 - movq mm4, mm1 - - paddw mm0, mm3 ; a = 0 + 3 - paddw mm1, mm2 ; b = 1 + 2 - - psubw mm4, mm2 ; c = 1 - 2 - psubw mm5, mm3 ; d = 0 - 3 - - - ; output 0 and 2 - movq mm6, [rdx + 16] ; c2 - movq mm2, mm0 ; a - - paddw mm0, mm1 ; a + b - psubw mm2, mm1 ; a - b - - movq mm1, mm0 ; a + b - pmulhw mm0, mm6 ; 00 01 02 03 - - paddw mm0, mm1 ; output 00 01 02 03 - pmulhw mm6, mm2 ; 20 21 22 23 - - paddw mm2, mm6 ; output 20 21 22 23 - - ; output 1 and 3 - movq mm6, [rdx + 8] ; c1 - movq mm7, [rdx + 24] ; c3 - - movq mm1, mm4 ; c - movq mm3, mm5 ; d - - pmulhw mm1, mm7 ; c * c3 - pmulhw mm3, mm6 ; d * c1 - - paddw mm3, mm5 ; d * c1 rounded - paddw mm1, mm3 ; output 10 11 12 13 - - movq mm3, mm4 ; c - pmulhw mm5, mm7 ; d * c3 - - pmulhw mm4, mm6 ; c * c1 - paddw mm3, mm4 ; round c* c1 - - psubw mm5, mm3 ; output 30 31 32 33 - movq mm3, mm5 - - - ; done with vertical - ; transpose for the second stage - movq mm4, mm0 ; 00 01 02 03 - movq mm5, mm2 ; 10 11 12 03 - - punpcklwd mm0, mm1 ; 00 10 01 11 - punpckhwd mm4, mm1 ; 02 12 03 13 - - punpcklwd mm2, mm3 ; 20 30 21 31 - punpckhwd mm5, mm3 ; 22 32 23 33 - - - movq mm1, mm0 ; 00 10 01 11 - punpckldq mm0, mm2 ; 00 10 20 30 - - punpckhdq mm1, mm2 ; 01 11 21 31 - - movq mm2, mm4 ; 02 12 03 13 - punpckldq mm2, mm5 ; 02 12 22 32 - - punpckhdq mm4, mm5 ; 03 13 23 33 - movq mm3, mm4 - - - ; first stage - movq mm5, mm0 - movq mm4, mm1 - paddw mm0, mm3 ; a = 0 + 3 - paddw mm1, mm2 ; b = 1 + 2 + mov rsi, arg(0) ; input + mov rdi, arg(1) ; output - psubw mm4, mm2 ; c = 1 - 2 - psubw mm5, mm3 ; d = 0 - 3 + movsxd rax, dword ptr arg(2) ;pitch - - ; output 0 and 2 - movq mm6, [rdx + 16] ; c2 - movq mm2, mm0 ; a - paddw mm0, mm1 ; a + b - - psubw mm2, mm1 ; a - b - - movq mm1, mm0 ; a + b - pmulhw mm0, mm6 ; 00 01 02 03 - - paddw mm0, mm1 ; output 00 01 02 03 - pmulhw mm6, mm2 ; 20 21 22 23 - - paddw mm2, mm6 ; output 20 21 22 23 - - - ; output 
1 and 3 - movq mm6, [rdx + 8] ; c1 - movq mm7, [rdx + 24] ; c3 - - movq mm1, mm4 ; c - movq mm3, mm5 ; d - - pmulhw mm1, mm7 ; c * c3 - pmulhw mm3, mm6 ; d * c1 - - paddw mm3, mm5 ; d * c1 rounded - paddw mm1, mm3 ; output 10 11 12 13 - - movq mm3, mm4 ; c - pmulhw mm5, mm7 ; d * c3 - - pmulhw mm4, mm6 ; c * c1 - paddw mm3, mm4 ; round c* c1 - - psubw mm5, mm3 ; output 30 31 32 33 - movq mm3, mm5 - ; done with vertical - - pcmpeqw mm4, mm4 - pcmpeqw mm5, mm5 - psrlw mm4, 15 - psrlw mm5, 15 - - psllw mm4, 2 - psllw mm5, 2 - - paddw mm0, mm4 - paddw mm1, mm5 - paddw mm2, mm4 - paddw mm3, mm5 - - psraw mm0, 3 - psraw mm1, 3 - psraw mm2, 3 - psraw mm3, 3 - - movq [rdi ], mm0 - movq [rdi+ 8], mm1 - movq [rdi+16], mm2 - movq [rdi+24], mm3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch) -sym(vp8_short_fdct8x4_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - push rsi - push rdi - ; end prolog - mov rsi, arg(0) ;input - mov rdi, arg(1) ;output - - lea rdx, [GLOBAL(dct_const_xmm)] - movsxd rax, dword ptr arg(2) ;pitch - - lea rcx, [rsi + rax*2] + lea rcx, [rsi + rax*2] ; read the input data - movdqa xmm0, [rsi] - movdqa xmm2, [rsi + rax] - - movdqa xmm4, [rcx] - movdqa xmm3, [rcx + rax] - ; get the constants - ;shift to left by 1 for prescision - psllw xmm0, 3 - psllw xmm2, 3 - - psllw xmm4, 3 - psllw xmm3, 3 - - ; transpose for the second stage - movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07 - movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27 - - punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13 - punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17 + movq mm0, [rsi] + movq mm1, [rsi + rax] - punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33 - punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37 + movq mm2, [rcx] + movq mm4, [rcx + rax] - movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13 - punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31 + ; transpose for the first stage + 
movq mm3, mm0 ; 00 01 02 03 + movq mm5, mm2 ; 20 21 22 23 - punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33 + punpcklwd mm0, mm1 ; 00 10 01 11 + punpckhwd mm3, mm1 ; 02 12 03 13 + punpcklwd mm2, mm4 ; 20 30 21 31 + punpckhwd mm5, mm4 ; 22 32 23 33 - movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17 - punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35 + movq mm1, mm0 ; 00 10 01 11 + punpckldq mm0, mm2 ; 00 10 20 30 - punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37 - movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33 + punpckhdq mm1, mm2 ; 01 11 21 31 - punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37 - punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36 + movq mm2, mm3 ; 02 12 03 13 + punpckldq mm2, mm5 ; 02 12 22 32 - movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31 - punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34 + punpckhdq mm3, mm5 ; 03 13 23 33 - punpckhqdq xmm1, xmm4 ; 01 11 21 32 05 15 25 35 - - ; xmm0 0 - ; xmm1 1 - ; xmm2 2 - ; xmm3 3 + ; mm0 0 + ; mm1 1 + ; mm2 2 + ; mm3 3 ; first stage - movdqa xmm5, xmm0 - movdqa xmm4, xmm1 - - paddw xmm0, xmm3 ; a = 0 + 3 - paddw xmm1, xmm2 ; b = 1 + 2 - - psubw xmm4, xmm2 ; c = 1 - 2 - psubw xmm5, xmm3 ; d = 0 - 3 + movq mm5, mm0 + movq mm4, mm1 + paddw mm0, mm3 ; a1 = 0 + 3 + paddw mm1, mm2 ; b1 = 1 + 2 - ; output 0 and 2 - movdqa xmm6, [rdx + 32] ; c2 - movdqa xmm2, xmm0 ; a + psubw mm4, mm2 ; c1 = 1 - 2 + psubw mm5, mm3 ; d1 = 0 - 3 - paddw xmm0, xmm1 ; a + b - psubw xmm2, xmm1 ; a - b + psllw mm5, 3 + psllw mm4, 3 - movdqa xmm1, xmm0 ; a + b - pmulhw xmm0, xmm6 ; 00 01 02 03 + psllw mm0, 3 + psllw mm1, 3 - paddw xmm0, xmm1 ; output 00 01 02 03 - pmulhw xmm6, xmm2 ; 20 21 22 23 + ; output 0 and 2 + movq mm2, mm0 ; a1 - paddw xmm2, xmm6 ; output 20 21 22 23 + paddw mm0, mm1 ; op[0] = a1 + b1 + psubw mm2, mm1 ; op[2] = a1 - b1 ; output 1 and 3 - movdqa xmm6, [rdx + 16] ; c1 - movdqa xmm7, [rdx + 48] ; c3 - - movdqa xmm1, xmm4 ; c - movdqa xmm3, xmm5 ; d + ; interleave c1, d1 + movq mm1, mm5 ; d1 + punpcklwd mm1, mm4 ; c1 d1 + punpckhwd mm5, mm4 
; c1 d1 - pmulhw xmm1, xmm7 ; c * c3 - pmulhw xmm3, xmm6 ; d * c1 + movq mm3, mm1 + movq mm4, mm5 - paddw xmm3, xmm5 ; d * c1 rounded - paddw xmm1, xmm3 ; output 10 11 12 13 + pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - movdqa xmm3, xmm4 ; c - pmulhw xmm5, xmm7 ; d * c3 + pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - pmulhw xmm4, xmm6 ; c * c1 - paddw xmm3, xmm4 ; round c* c1 + paddd mm1, MMWORD PTR[GLOBAL(_14500)] + paddd mm4, MMWORD PTR[GLOBAL(_14500)] + paddd mm3, MMWORD PTR[GLOBAL(_7500)] + paddd mm5, MMWORD PTR[GLOBAL(_7500)] - psubw xmm5, xmm3 ; output 30 31 32 33 - movdqa xmm3, xmm5 + psrad mm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 + psrad mm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 + psrad mm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 + psrad mm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 + packssdw mm1, mm4 ; op[1] + packssdw mm3, mm5 ; op[3] ; done with vertical ; transpose for the second stage - movdqa xmm4, xmm2 ; 02 12 22 32 06 16 26 36 - movdqa xmm2, xmm1 ; 01 11 21 31 05 15 25 35 + movq mm4, mm0 ; 00 10 20 30 + movq mm5, mm2 ; 02 12 22 32 - movdqa xmm1, xmm0 ; 00 10 20 30 04 14 24 34 - movdqa xmm5, xmm4 ; 02 12 22 32 06 16 26 36 + punpcklwd mm0, mm1 ; 00 01 10 11 + punpckhwd mm4, mm1 ; 20 21 30 31 - punpcklwd xmm0, xmm2 ; 00 01 10 11 20 21 30 31 - punpckhwd xmm1, xmm2 ; 04 05 14 15 24 25 34 35 + punpcklwd mm2, mm3 ; 02 03 12 13 + punpckhwd mm5, mm3 ; 22 23 32 33 - punpcklwd xmm4, xmm3 ; 02 03 12 13 22 23 32 33 - punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37 + movq mm1, mm0 ; 00 01 10 11 + punpckldq mm0, mm2 ; 00 01 02 03 - movdqa xmm2, xmm0 ; 00 01 10 11 20 21 30 31 - punpckldq xmm0, xmm4 ; 00 01 02 03 10 11 12 13 + punpckhdq mm1, mm2 ; 10 11 12 13 - punpckhdq xmm2, xmm4 ; 20 21 22 23 30 31 32 33 + movq mm2, mm4 ; 20 21 30 31 + punpckldq mm2, mm5 ; 20 21 22 23 +
punpckhdq mm4, mm5 ; 30 31 32 33 - movdqa xmm4, xmm1 ; 04 05 14 15 24 25 34 35 - punpckldq xmm4, xmm5 ; 04 05 06 07 14 15 16 17 + ; mm0 0 + ; mm1 1 + ; mm2 2 + ; mm4 3 - punpckhdq xmm1, xmm5 ; 24 25 26 27 34 35 36 37 - movdqa xmm3, xmm2 ; 20 21 22 23 30 31 32 33 + movq mm5, mm0 + movq mm3, mm1 - punpckhqdq xmm3, xmm1 ; 30 31 32 33 34 35 36 37 - punpcklqdq xmm2, xmm1 ; 20 21 22 23 24 25 26 27 + paddw mm0, mm4 ; a1 = 0 + 3 + paddw mm1, mm2 ; b1 = 1 + 2 - movdqa xmm1, xmm0 ; 00 01 02 03 10 11 12 13 - punpcklqdq xmm0, xmm4 ; 00 01 02 03 04 05 06 07 + psubw mm3, mm2 ; c1 = 1 - 2 + psubw mm5, mm4 ; d1 = 0 - 3 - punpckhqdq xmm1, xmm4 ; 10 11 12 13 14 15 16 17 + pxor mm6, mm6 ; zero out for compare - ; first stage - movdqa xmm5, xmm0 - movdqa xmm4, xmm1 - - paddw xmm0, xmm3 ; a = 0 + 3 - paddw xmm1, xmm2 ; b = 1 + 2 - - psubw xmm4, xmm2 ; c = 1 - 2 - psubw xmm5, xmm3 ; d = 0 - 3 + pcmpeqw mm6, mm5 ; d1 != 0 + pandn mm6, MMWORD PTR[GLOBAL(_cmp_mask)] ; clear upper, + ; and keep bit 0 of lower ; output 0 and 2 - movdqa xmm6, [rdx + 32] ; c2 - movdqa xmm2, xmm0 ; a + movq mm2, mm0 ; a1 - paddw xmm0, xmm1 ; a + b - psubw xmm2, xmm1 ; a - b + paddw mm0, mm1 ; a1 + b1 + psubw mm2, mm1 ; a1 - b1 - movdqa xmm1, xmm0 ; a + b - pmulhw xmm0, xmm6 ; 00 01 02 03 + paddw mm0, MMWORD PTR[GLOBAL(_7w)] + paddw mm2, MMWORD PTR[GLOBAL(_7w)] - paddw xmm0, xmm1 ; output 00 01 02 03 - pmulhw xmm6, xmm2 ; 20 21 22 23 + psraw mm0, 4 ; op[0] = (a1 + b1 + 7)>>4 + psraw mm2, 4 ; op[8] = (a1 - b1 + 7)>>4 - paddw xmm2, xmm6 ; output 20 21 22 23 + movq MMWORD PTR[rdi + 0 ], mm0 + movq MMWORD PTR[rdi + 16], mm2 ; output 1 and 3 - movdqa xmm6, [rdx + 16] ; c1 - movdqa xmm7, [rdx + 48] ; c3 + ; interleave c1, d1 + movq mm1, mm5 ; d1 + punpcklwd mm1, mm3 ; c1 d1 + punpckhwd mm5, mm3 ; c1 d1 - movdqa xmm1, xmm4 ; c - movdqa xmm3, xmm5 ; d + movq mm3, mm1 + movq mm4, mm5 - pmulhw xmm1, xmm7 ; c * c3 - pmulhw xmm3, xmm6 ; d * c1 + pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + pmaddwd mm4,
MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - paddw xmm3, xmm5 ; d * c1 rounded - paddw xmm1, xmm3 ; output 10 11 12 13 + pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - movdqa xmm3, xmm4 ; c - pmulhw xmm5, xmm7 ; d * c3 + paddd mm1, MMWORD PTR[GLOBAL(_12000)] + paddd mm4, MMWORD PTR[GLOBAL(_12000)] + paddd mm3, MMWORD PTR[GLOBAL(_51000)] + paddd mm5, MMWORD PTR[GLOBAL(_51000)] - pmulhw xmm4, xmm6 ; c * c1 - paddw xmm3, xmm4 ; round c* c1 + psrad mm1, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16 + psrad mm4, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16 + psrad mm3, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16 + psrad mm5, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16 - psubw xmm5, xmm3 ; output 30 31 32 33 - movdqa xmm3, xmm5 - ; done with vertical + packssdw mm1, mm4 ; op[4] + packssdw mm3, mm5 ; op[12] + + paddw mm1, mm6 ; op[4] += (d1!=0) + movq MMWORD PTR[rdi + 8 ], mm1 + movq MMWORD PTR[rdi + 24], mm3 - pcmpeqw xmm4, xmm4 - pcmpeqw xmm5, xmm5; - psrlw xmm4, 15 - psrlw xmm5, 15 - - psllw xmm4, 2 - psllw xmm5, 2 - - paddw xmm0, xmm4 - paddw xmm1, xmm5 - paddw xmm2, xmm4 - paddw xmm3, xmm5 - - psraw xmm0, 3 - psraw xmm1, 3 - psraw xmm2, 3 - psraw xmm3, 3 - - movq QWORD PTR[rdi ], xmm0 - movq QWORD PTR[rdi+ 8], xmm1 - movq QWORD PTR[rdi+16], xmm2 - movq QWORD PTR[rdi+24], xmm3 - - psrldq xmm0, 8 - psrldq xmm1, 8 - psrldq xmm2, 8 - psrldq xmm3, 8 - - movq QWORD PTR[rdi+32], xmm0 - movq QWORD PTR[rdi+40], xmm1 - movq QWORD PTR[rdi+48], xmm2 - movq QWORD PTR[rdi+56], xmm3 - ; begin epilog - pop rdi - pop rsi + ; begin epilog + pop rdi + pop rsi RESTORE_GOT UNSHADOW_ARGS pop rbp ret - SECTION_RODATA -;static const unsigned int dct1st_stage_rounding_mmx[2] = -align 16 -dct1st_stage_rounding_mmx: - times 2 dd 8192 - - -;static const unsigned int dct2nd_stage_rounding_mmx[2] = -align 16 -dct2nd_stage_rounding_mmx: - times 2 dd 32768 - - -;static const short dct_matrix[4][4]= -align 16
-dct_matrix: - times 4 dw 23170 - - dw 30274 - dw 12540 - dw -12540 - dw -30274 - - dw 23170 - times 2 dw -23170 - dw 23170 - - dw 12540 - dw -30274 - dw 30274 - dw -12540 - - -;static const unsigned short dct_const_mmx[4 * 4]= -align 16 -dct_const_mmx: - times 4 dw 0 - times 4 dw 60547 - times 4 dw 46341 - times 4 dw 25080 - - -;static const unsigned short dct_const_xmm[8 * 4]= -align 16 -dct_const_xmm: - times 8 dw 0 - times 8 dw 60547 - times 8 dw 46341 - times 8 dw 25080 +align 8 +_5352_2217: + dw 5352 + dw 2217 + dw 5352 + dw 2217 +align 8 +_2217_neg5352: + dw 2217 + dw -5352 + dw 2217 + dw -5352 +align 8 +_cmp_mask: + times 4 dw 1 +align 8 +_7w: + times 4 dw 7 +align 8 +_14500: + times 2 dd 14500 +align 8 +_7500: + times 2 dd 7500 +align 8 +_12000: + times 2 dd 12000 +align 8 +_51000: + times 2 dd 51000 diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm index 723a78d..652dd98 100644 --- a/vp8/encoder/x86/dct_sse2.asm +++ b/vp8/encoder/x86/dct_sse2.asm @@ -11,32 +11,68 @@ %include "vpx_ports/x86_abi_support.asm" -;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch) -global sym(vp8_short_fdct4x4_sse2) -sym(vp8_short_fdct4x4_sse2): +%macro STACK_FRAME_CREATE 0 +%if ABI_IS_32BIT + %define input rsi + %define output rdi + %define pitch rax push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 -;; SAVE_XMM GET_GOT rbx push rsi push rdi ; end prolog mov rsi, arg(0) - movsxd rax, DWORD PTR arg(2) - lea rdi, [rsi + rax*2] + mov rdi, arg(1) + + movsxd rax, dword ptr arg(2) + lea rcx, [rsi + rax*2] +%else + %ifidn __OUTPUT_FORMAT__,x64 + %define input rcx + %define output rdx + %define pitch r8 + %else + %define input rdi + %define output rsi + %define pitch rdx + %endif +%endif +%endmacro + +%macro STACK_FRAME_DESTROY 0 + %define input + %define output + %define pitch + +%if ABI_IS_32BIT + pop rdi + pop rsi + RESTORE_GOT + pop rbp +%else + %ifidn __OUTPUT_FORMAT__,x64 + %endif +%endif + ret +%endmacro + +;void vp8_short_fdct4x4_sse2(short 
*input, short *output, int pitch) +global sym(vp8_short_fdct4x4_sse2) +sym(vp8_short_fdct4x4_sse2): - movq xmm0, MMWORD PTR[rsi ] ;03 02 01 00 - movq xmm2, MMWORD PTR[rsi + rax] ;13 12 11 10 - movq xmm1, MMWORD PTR[rsi + rax*2] ;23 22 21 20 - movq xmm3, MMWORD PTR[rdi + rax] ;33 32 31 30 + STACK_FRAME_CREATE + + movq xmm0, MMWORD PTR[input ] ;03 02 01 00 + movq xmm2, MMWORD PTR[input+ pitch] ;13 12 11 10 + lea input, [input+2*pitch] + movq xmm1, MMWORD PTR[input ] ;23 22 21 20 + movq xmm3, MMWORD PTR[input+ pitch] ;33 32 31 30 punpcklqdq xmm0, xmm2 ;13 12 11 10 03 02 01 00 punpcklqdq xmm1, xmm3 ;33 32 31 30 23 22 21 20 - mov rdi, arg(1) - movdqa xmm2, xmm0 punpckldq xmm0, xmm1 ;23 22 03 02 21 20 01 00 punpckhdq xmm2, xmm1 ;33 32 13 12 31 30 11 10 @@ -51,6 +87,7 @@ sym(vp8_short_fdct4x4_sse2): psubw xmm3, xmm1 ;c1 d1 c1 d1 c1 d1 c1 d1 psllw xmm0, 3 ;b1 <<= 3 a1 <<= 3 psllw xmm3, 3 ;c1 <<= 3 d1 <<= 3 + movdqa xmm1, xmm0 pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1 pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1 @@ -121,17 +158,216 @@ sym(vp8_short_fdct4x4_sse2): punpcklqdq xmm0, xmm3 ;op[4] op[0] punpckhqdq xmm1, xmm3 ;op[12] op[8] - movdqa XMMWORD PTR[rdi + 0], xmm0 - movdqa XMMWORD PTR[rdi + 16], xmm1 + movdqa XMMWORD PTR[output + 0], xmm0 + movdqa XMMWORD PTR[output + 16], xmm1 - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT -;; RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret + STACK_FRAME_DESTROY + +;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch) +global sym(vp8_short_fdct8x4_sse2) +sym(vp8_short_fdct8x4_sse2): + + STACK_FRAME_CREATE + + ; read the input data + movdqa xmm0, [input ] + movdqa xmm2, [input+ pitch] + lea input, [input+2*pitch] + movdqa xmm4, [input ] + movdqa xmm3, [input+ pitch] + + ; transpose for the first stage + movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07 + movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27 + + punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13 + punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17 + + 
punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33 + punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37 + + movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13 + punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31 + + punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33 + + movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17 + punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35 + + punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37 + movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33 + + punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37 + punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36 + + movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31 + punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34 + + punpckhqdq xmm1, xmm4 ; 01 11 21 31 05 15 25 35 + + ; xmm0 0 + ; xmm1 1 + ; xmm2 2 + ; xmm3 3 + + ; first stage + movdqa xmm5, xmm0 + movdqa xmm4, xmm1 + + paddw xmm0, xmm3 ; a1 = 0 + 3 + paddw xmm1, xmm2 ; b1 = 1 + 2 + + psubw xmm4, xmm2 ; c1 = 1 - 2 + psubw xmm5, xmm3 ; d1 = 0 - 3 + + psllw xmm5, 3 + psllw xmm4, 3 + + psllw xmm0, 3 + psllw xmm1, 3 + + ; output 0 and 2 + movdqa xmm2, xmm0 ; a1 + + paddw xmm0, xmm1 ; op[0] = a1 + b1 + psubw xmm2, xmm1 ; op[2] = a1 - b1 + + ; output 1 and 3 + ; interleave c1, d1 + movdqa xmm1, xmm5 ; d1 + punpcklwd xmm1, xmm4 ; c1 d1 + punpckhwd xmm5, xmm4 ; c1 d1 + + movdqa xmm3, xmm1 + movdqa xmm4, xmm5 + + pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + + pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + + paddd xmm1, XMMWORD PTR[GLOBAL(_14500)] + paddd xmm4, XMMWORD PTR[GLOBAL(_14500)] + paddd xmm3, XMMWORD PTR[GLOBAL(_7500)] + paddd xmm5, XMMWORD PTR[GLOBAL(_7500)] + + psrad xmm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 + psrad xmm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 + psrad xmm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 + psrad xmm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 + + packssdw xmm1, xmm4 ; op[1] +
packssdw xmm3, xmm5 ; op[3] + + ; done with vertical + ; transpose for the second stage + movdqa xmm4, xmm0 ; 00 10 20 30 04 14 24 34 + movdqa xmm5, xmm2 ; 02 12 22 32 06 16 26 36 + + punpcklwd xmm0, xmm1 ; 00 01 10 11 20 21 30 31 + punpckhwd xmm4, xmm1 ; 04 05 14 15 24 25 34 35 + + punpcklwd xmm2, xmm3 ; 02 03 12 13 22 23 32 33 + punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37 + + movdqa xmm1, xmm0 ; 00 01 10 11 20 21 30 31 + punpckldq xmm0, xmm2 ; 00 01 02 03 10 11 12 13 + + punpckhdq xmm1, xmm2 ; 20 21 22 23 30 31 32 33 + + movdqa xmm2, xmm4 ; 04 05 14 15 24 25 34 35 + punpckldq xmm2, xmm5 ; 04 05 06 07 14 15 16 17 + + punpckhdq xmm4, xmm5 ; 24 25 26 27 34 35 36 37 + movdqa xmm3, xmm1 ; 20 21 22 23 30 31 32 33 + + punpckhqdq xmm3, xmm4 ; 30 31 32 33 34 35 36 37 + punpcklqdq xmm1, xmm4 ; 20 21 22 23 24 25 26 27 + + movdqa xmm4, xmm0 ; 00 01 02 03 10 11 12 13 + punpcklqdq xmm0, xmm2 ; 00 01 02 03 04 05 06 07 + + punpckhqdq xmm4, xmm2 ; 10 11 12 13 14 15 16 17 + + ; xmm0 0 + ; xmm4 1 + ; xmm1 2 + ; xmm3 3 + + movdqa xmm5, xmm0 + movdqa xmm2, xmm1 + + paddw xmm0, xmm3 ; a1 = 0 + 3 + paddw xmm1, xmm4 ; b1 = 1 + 2 + + psubw xmm4, xmm2 ; c1 = 1 - 2 + psubw xmm5, xmm3 ; d1 = 0 - 3 + + pxor xmm6, xmm6 ; zero out for compare + + pcmpeqw xmm6, xmm5 ; d1 != 0 + + pandn xmm6, XMMWORD PTR[GLOBAL(_cmp_mask8x4)] ; clear upper, + ; and keep bit 0 of lower + + ; output 0 and 2 + movdqa xmm2, xmm0 ; a1 + + paddw xmm0, xmm1 ; a1 + b1 + psubw xmm2, xmm1 ; a1 - b1 + + paddw xmm0, XMMWORD PTR[GLOBAL(_7w)] + paddw xmm2, XMMWORD PTR[GLOBAL(_7w)] + + psraw xmm0, 4 ; op[0] = (a1 + b1 + 7)>>4 + psraw xmm2, 4 ; op[8] = (a1 - b1 + 7)>>4 + + ; output 1 and 3 + ; interleave c1, d1 + movdqa xmm1, xmm5 ; d1 + punpcklwd xmm1, xmm4 ; c1 d1 + punpckhwd xmm5, xmm4 ; c1 d1 + + movdqa xmm3, xmm1 + movdqa xmm4, xmm5 + + pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 + + pmaddwd xmm3, XMMWORD
PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 + + paddd xmm1, XMMWORD PTR[GLOBAL(_12000)] + paddd xmm4, XMMWORD PTR[GLOBAL(_12000)] + paddd xmm3, XMMWORD PTR[GLOBAL(_51000)] + paddd xmm5, XMMWORD PTR[GLOBAL(_51000)] + + psrad xmm1, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16 + psrad xmm4, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16 + psrad xmm3, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16 + psrad xmm5, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16 + + packssdw xmm1, xmm4 ; op[4] + packssdw xmm3, xmm5 ; op[12] + + paddw xmm1, xmm6 ; op[4] += (d1!=0) + + movdqa xmm4, xmm0 + movdqa xmm5, xmm2 + + punpcklqdq xmm0, xmm1 + punpckhqdq xmm4, xmm1 + + punpcklqdq xmm2, xmm3 + punpckhqdq xmm5, xmm3 + + movdqa XMMWORD PTR[output + 0 ], xmm0 + movdqa XMMWORD PTR[output + 16], xmm2 + movdqa XMMWORD PTR[output + 32], xmm4 + movdqa XMMWORD PTR[output + 48], xmm5 + + STACK_FRAME_DESTROY SECTION_RODATA align 16 @@ -161,7 +397,9 @@ align 16 _cmp_mask: times 4 dw 1 times 4 dw 0 - +align 16 +_cmp_mask8x4: + times 8 dw 1 align 16 _mult_sub: dw 1 @@ -176,6 +414,9 @@ align 16 _7: times 4 dd 7 align 16 +_7w: + times 8 dw 7 +align 16 _14500: times 4 dd 14500 align 16 diff --git a/vp8/encoder/x86/dct_x86.h b/vp8/encoder/x86/dct_x86.h index 05824c6..59a5cb1 100644 --- a/vp8/encoder/x86/dct_x86.h +++ b/vp8/encoder/x86/dct_x86.h @@ -24,33 +24,31 @@ extern prototype_fdct(vp8_short_fdct4x4_mmx); extern prototype_fdct(vp8_short_fdct8x4_mmx); #if !CONFIG_RUNTIME_CPU_DETECT -#if 0 + #undef vp8_fdct_short4x4 #define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx #undef vp8_fdct_short8x4 #define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx -#endif #endif + #endif #if HAVE_SSE2 -extern prototype_fdct(vp8_short_fdct8x4_wmt); +extern prototype_fdct(vp8_short_fdct8x4_sse2); extern prototype_fdct(vp8_short_walsh4x4_sse2); extern prototype_fdct(vp8_short_fdct4x4_sse2); #if !CONFIG_RUNTIME_CPU_DETECT -#if 1 -/* short SSE2 DCT currently disabled, does not match the
MMX version */ + #undef vp8_fdct_short4x4 #define vp8_fdct_short4x4 vp8_short_fdct4x4_sse2 #undef vp8_fdct_short8x4 #define vp8_fdct_short8x4 vp8_short_fdct8x4_sse2 -#endif #undef vp8_fdct_fast4x4 #define vp8_fdct_fast4x4 vp8_short_fdct4x4_sse2 @@ -58,7 +56,7 @@ extern prototype_fdct(vp8_short_fdct4x4_sse2); #undef vp8_fdct_fast8x4 #define vp8_fdct_fast8x4 vp8_short_fdct8x4_sse2 -#undef vp8_fdct_walsh_short4x4 +#undef vp8_fdct_walsh_short4x4 #define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_sse2 #endif diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index fb1b37c..7810798 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -18,11 +18,10 @@ #if HAVE_MMX void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch) { - vp8_short_fdct4x4_c(input, output, pitch); - vp8_short_fdct4x4_c(input + 4, output + 16, pitch); + vp8_short_fdct4x4_mmx(input, output, pitch); + vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch); } - int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr, short *qcoeff_ptr, short *dequant_ptr, short *scan_mask, short *round_ptr, @@ -82,12 +81,6 @@ void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) #endif #if HAVE_SSE2 -void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch) -{ - vp8_short_fdct4x4_sse2(input, output, pitch); - vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch); -} - int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr, short *qcoeff_ptr, short *dequant_ptr, short *scan_mask, short *round_ptr, @@ -249,18 +242,11 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.get8x8var = vp8_get8x8var_mmx; cpi->rtcd.variance.get16x16var = vp8_get16x16var_mmx; cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx; -#if 0 // new fdct + cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx; cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_mmx; cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_mmx; cpi->rtcd.fdct.fast8x4 = 
vp8_short_fdct8x4_mmx; -#else - cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c; - cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c; - cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_c; - cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_c; - -#endif cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;