From: Fritz Koenig Date: Fri, 20 Aug 2010 17:58:19 +0000 (-0700) Subject: Rework idct calling structure. X-Git-Tag: 1.0_branch~929 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=93c32a55c2444b8245e8cba9187e1ec654d1fbc6;p=profile%2Fivi%2Flibvpx.git Rework idct calling structure. Moving the eob structure allows for a non-struct based function to handle decoding an entire mb of idct/dequant/recon data. This allows for SIMD functions to idct/dequant/recon multiple blocks at once. SSE2 implementation gives 3% gain on Atom. Change-Id: I8a8f3efd546ea4e0535f517d94f347cfb737c9c2 --- diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h index de308ff..cb07e9e 100644 --- a/vp8/common/blockd.h +++ b/vp8/common/blockd.h @@ -218,6 +218,7 @@ typedef struct //not used DECLARE_ALIGNED(16, short, reference[384]); DECLARE_ALIGNED(16, short, qcoeff[400]); DECLARE_ALIGNED(16, short, dqcoeff[400]); + DECLARE_ALIGNED(16, char, eobs[25]); // 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. BLOCKD block[25]; diff --git a/vp8/common/x86/idctllm_sse2.asm b/vp8/common/x86/idctllm_sse2.asm new file mode 100644 index 0000000..058ed8a --- /dev/null +++ b/vp8/common/x86/idctllm_sse2.asm @@ -0,0 +1,708 @@ +; +; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + +; DC-only path for two side-by-side 4x4 blocks: multiplies the first coefficient +; of each block by dequant[0], computes (dc + 4) >> 3, adds it to four 8-pixel +; predictor rows and stores the saturated result to dst. +;void idct_dequant_0_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *pre - 2 +; unsigned char *dst - 3 +; int dst_stride - 4 +; int blk_stride - 5 +; ) + +global sym(idct_dequant_0_2x_sse2) +sym(idct_dequant_0_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + ; end prolog + + mov rdx, arg(1) ; dequant + mov rax, arg(0) ; qcoeff + + ; Zero out xmm7, for use unpacking + pxor xmm7, xmm7 + + movd xmm4, [rax] + movd xmm5, [rdx] + + pinsrw xmm4, [rax+32], 4 + pinsrw xmm5, [rdx], 4 + + pmullw xmm4, xmm5 + + ; clear coeffs + movd [rax], xmm7 + movd [rax+32], xmm7 + ; broadcast word 0 (block 0 dc) over the low 4 words and word 4 (block 1 dc) over the high 4 words + ; NOTE(review): the original ';pshufb' marker here presumably flags this pair as replaceable by one SSSE3 pshufb - confirm + pshuflw xmm4, xmm4, 00000000b + pshufhw xmm4, xmm4, 00000000b + + mov rax, arg(2) ; pre + paddw xmm4, [fours GLOBAL] + + movsxd rcx, dword ptr arg(5) ; blk_stride + psraw xmm4, 3 + + movq xmm0, [rax] + movq xmm1, [rax+rcx] + movq xmm2, [rax+2*rcx] + lea rcx, [3*rcx] + movq xmm3, [rax+rcx] + + punpcklbw xmm0, xmm7 + punpcklbw xmm1, xmm7 + punpcklbw xmm2, xmm7 + punpcklbw xmm3, xmm7 + + mov rax, arg(3) ; dst + movsxd rdx, dword ptr arg(4) ; dst_stride + + ; Add to predict buffer + paddw xmm0, xmm4 + paddw xmm1, xmm4 + paddw xmm2, xmm4 + paddw xmm3, xmm4 + + ; pack up before storing + packuswb xmm0, xmm7 + packuswb xmm1, xmm7 + packuswb xmm2, xmm7 + packuswb xmm3, xmm7 + + ; store blocks back out + movq [rax], xmm0 + movq [rax + rdx], xmm1 + + lea rax, [rax + 2*rdx] + + movq [rax], xmm2 + movq [rax + rdx], xmm3 + + ; begin epilog + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void idct_dequant_full_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *pre - 2 +; unsigned char *dst - 3 +; int dst_stride - 4 +; int blk_stride - 5 +; ) +global sym(idct_dequant_full_2x_sse2) +sym(idct_dequant_full_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ; general case: all 16 coeffs of each of the 2 blocks are loaded from qcoeff, + ; dequantized and run through the full two-pass idct + mov rax, arg(0) ; qcoeff + mov rsi, arg(2) ; pre + mov rdi, arg(3) ; dst + movsxd rcx, dword ptr arg(5) ; blk_stride + + ; Zero out xmm7, for use
unpacking + pxor xmm7, xmm7 + + mov rdx, arg(1) ; dequant + + ; note the transpose of xmm1 and xmm2, necessary for shuffle + ; to spit out sensible data + movdqa xmm0, [rax] + movdqa xmm2, [rax+16] + movdqa xmm1, [rax+32] + movdqa xmm3, [rax+48] + + ; Clear out coeffs + movdqa [rax], xmm7 + movdqa [rax+16], xmm7 + movdqa [rax+32], xmm7 + movdqa [rax+48], xmm7 + + ; dequantize qcoeff buffer + pmullw xmm0, [rdx] + pmullw xmm2, [rdx+16] + pmullw xmm1, [rdx] + pmullw xmm3, [rdx+16] + + ; repack so block 0 row x and block 1 row x are together + movdqa xmm4, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm4, xmm1 + + pshufd xmm0, xmm0, 11011000b + pshufd xmm1, xmm4, 11011000b + + movdqa xmm4, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm4, xmm3 + + pshufd xmm2, xmm2, 11011000b + pshufd xmm3, xmm4, 11011000b + + ; first pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 ; double 2, so adding b1 below gives 0+2 + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [x_s1sqr2 GLOBAL] + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [x_c1sqr2less1 GLOBAL] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [x_c1sqr2less1 GLOBAL] + paddw xmm5, xmm1 + + pmulhw xmm3, [x_s1sqr2 GLOBAL] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + + ; transpose for the second pass + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014
010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + ; second pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [x_s1sqr2 GLOBAL] + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [x_c1sqr2less1 GLOBAL] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [x_c1sqr2less1 GLOBAL] + paddw xmm5, xmm1 + + pmulhw xmm3, [x_s1sqr2 GLOBAL] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + paddw xmm0, [fours GLOBAL] + + paddw xmm2, [fours GLOBAL] + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + psraw xmm2, 3 + + psraw xmm0, 3 + psraw xmm4, 3 + + psraw xmm6, 3 + + ; transpose to save + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 
107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + pxor xmm7, xmm7 + + ; Load up predict blocks + movq xmm4, [rsi] + movq xmm5, [rsi+rcx] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm0, xmm4 + paddw xmm1, xmm5 + + movq xmm4, [rsi+2*rcx] + lea rcx, [3*rcx] + movq xmm5, [rsi+rcx] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm2, xmm4 + paddw xmm3, xmm5 + +.finish: + + ; pack up before storing + packuswb xmm0, xmm7 + packuswb xmm1, xmm7 + packuswb xmm2, xmm7 + packuswb xmm3, xmm7 + + ; Load destination stride before writing out, + ; doesn't need to persist + movsxd rdx, dword ptr arg(4) ; dst_stride + + ; store blocks back out + movq [rdi], xmm0 + movq [rdi + rdx], xmm1 + + lea rdi, [rdi + 2*rdx] + + movq [rdi], xmm2 + movq [rdi + rdx], xmm3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void idct_dequant_dc_0_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *pre - 2 +; unsigned char *dst - 3 +; int dst_stride - 4 +; short *dc - 5 +; ) +global sym(idct_dequant_dc_0_2x_sse2) +sym(idct_dequant_dc_0_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ; special case when 2 blocks have 0 or 1 coeffs + ; dc is set as first coeff, so no need to load qcoeff + mov rax, arg(0) ; qcoeff + mov rsi, arg(2) ; pre + mov rdi, arg(3) ; dst + mov rdx, arg(5) ; dc + + ; 
Zero out xmm7, for use unpacking + pxor xmm7, xmm7 + + ; load both dc words at once: 2 words * 16 bits = one doubleword + movd xmm4, [rdx] + + ; Load up predict blocks + movq xmm0, [rsi] + movq xmm1, [rsi+16] + movq xmm2, [rsi+32] + movq xmm3, [rsi+48] + + ; Duplicate and expand dc across + punpcklwd xmm4, xmm4 + punpckldq xmm4, xmm4 + + ; Rounding to dequant and downshift + paddw xmm4, [fours GLOBAL] + psraw xmm4, 3 + + ; Predict buffer needs to be expanded from bytes to words + punpcklbw xmm0, xmm7 + punpcklbw xmm1, xmm7 + punpcklbw xmm2, xmm7 + punpcklbw xmm3, xmm7 + + ; Add to predict buffer + paddw xmm0, xmm4 + paddw xmm1, xmm4 + paddw xmm2, xmm4 + paddw xmm3, xmm4 + + ; pack up before storing + packuswb xmm0, xmm7 + packuswb xmm1, xmm7 + packuswb xmm2, xmm7 + packuswb xmm3, xmm7 + + ; Load destination stride before writing out, + ; doesn't need to persist + movsxd rdx, dword ptr arg(4) ; dst_stride + + ; store blocks back out + movq [rdi], xmm0 + movq [rdi + rdx], xmm1 + + lea rdi, [rdi + 2*rdx] + + movq [rdi], xmm2 + movq [rdi + rdx], xmm3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void idct_dequant_dc_full_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *pre - 2 +; unsigned char *dst - 3 +; int dst_stride - 4 +; short *dc - 5 +; ) +global sym(idct_dequant_dc_full_2x_sse2) +sym(idct_dequant_dc_full_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ; general case: all 16 coeffs of each of the 2 blocks are loaded and dequantized; + ; the two dc terms are then replaced with the words read from the dc pointer (arg 5) + mov rax, arg(0) ; qcoeff + mov rsi, arg(2) ; pre + mov rdi, arg(3) ; dst + + ; Zero out xmm7, for use unpacking + pxor xmm7, xmm7 + + mov rdx, arg(1) ; dequant + + ; note the transpose of xmm1 and xmm2, necessary for shuffle + ; to spit out sensible data + movdqa xmm0, [rax] + movdqa xmm2, [rax+16] + movdqa xmm1, [rax+32] + movdqa xmm3, [rax+48] + + ; Clear out coeffs + movdqa [rax], xmm7 + movdqa [rax+16], xmm7 + movdqa [rax+32], xmm7 + movdqa [rax+48], xmm7 + + ; dequantize qcoeff buffer + pmullw xmm0, [rdx] + pmullw xmm2, [rdx+16] + pmullw xmm1, [rdx] + pmullw xmm3,
[rdx+16] + + ; DC component + mov rdx, arg(5) + + ; repack so block 0 row x and block 1 row x are together + movdqa xmm4, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm4, xmm1 + + pshufd xmm0, xmm0, 11011000b + pshufd xmm1, xmm4, 11011000b + + movdqa xmm4, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm4, xmm3 + + pshufd xmm2, xmm2, 11011000b + pshufd xmm3, xmm4, 11011000b + + ; insert DC component + pinsrw xmm0, [rdx], 0 + pinsrw xmm0, [rdx+2], 4 + + ; first pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 ; + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [x_s1sqr2 GLOBAL] + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [x_c1sqr2less1 GLOBAL] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [x_c1sqr2less1 GLOBAL] + paddw xmm5, xmm1 + + pmulhw xmm3, [x_s1sqr2 GLOBAL] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + + ; transpose for the second pass + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + 
movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + ; second pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [x_s1sqr2 GLOBAL] + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [x_c1sqr2less1 GLOBAL] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [x_c1sqr2less1 GLOBAL] + paddw xmm5, xmm1 + + pmulhw xmm3, [x_s1sqr2 GLOBAL] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + paddw xmm0, [fours GLOBAL] + + paddw xmm2, [fours GLOBAL] + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + psraw xmm2, 3 + + psraw xmm0, 3 + psraw xmm4, 3 + + psraw xmm6, 3 + + ; transpose to save + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 
014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + pxor xmm7, xmm7 + + ; Load up predict blocks + movq xmm4, [rsi] + movq xmm5, [rsi+16] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm0, xmm4 + paddw xmm1, xmm5 + + movq xmm4, [rsi+32] + movq xmm5, [rsi+48] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm2, xmm4 + paddw xmm3, xmm5 + +.finish: + + ; pack up before storing + packuswb xmm0, xmm7 + packuswb xmm1, xmm7 + packuswb xmm2, xmm7 + packuswb xmm3, xmm7 + + ; Load destination stride before writing out, + ; doesn't need to persist + movsxd rdx, dword ptr arg(4) ; dst_stride + + ; store blocks back out + movq [rdi], xmm0 + movq [rdi + rdx], xmm1 + + lea rdi, [rdi + 2*rdx] + + movq [rdi], xmm2 + movq [rdi + rdx], xmm3 + + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +fours: + times 8 dw 0x0004 +align 16 +x_s1sqr2: + times 8 dw 0x8A8C +align 16 +x_c1sqr2less1: + times 8 dw 0x4E7B diff --git a/vp8/decoder/arm/armv6/idct_blk_v6.c b/vp8/decoder/arm/armv6/idct_blk_v6.c new file mode 100644 index 0000000..96aca2b --- /dev/null +++ b/vp8/decoder/arm/armv6/idct_blk_v6.c @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "vpx_ports/config.h" +#include "idct.h" +#include "dequantize.h" + +void vp8_dequant_dc_idct_add_y_block_v6 + (short *q, short *dq, unsigned char *pre, + unsigned char *dst, int stride, char *eobs, short *dc) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (eobs[0] > 1) + vp8_dequant_dc_idct_add_v6 (q, dq, pre, dst, 16, stride, dc[0]); + else + vp8_dc_only_idct_add_v6 (dc[0], pre, dst, 16, stride); + + if (eobs[1] > 1) + vp8_dequant_dc_idct_add_v6 (q+16, dq, pre+4, dst+4, 16, stride, dc[1]); + else + vp8_dc_only_idct_add_v6 (dc[1], pre+4, dst+4, 16, stride); + + if (eobs[2] > 1) + vp8_dequant_dc_idct_add_v6 (q+32, dq, pre+8, dst+8, 16, stride, dc[2]); + else + vp8_dc_only_idct_add_v6 (dc[2], pre+8, dst+8, 16, stride); + + if (eobs[3] > 1) + vp8_dequant_dc_idct_add_v6 (q+48, dq, pre+12, dst+12, 16, stride, dc[3]); + else + vp8_dc_only_idct_add_v6 (dc[3], pre+12, dst+12, 16, stride); + + q += 64; + dc += 4; + pre += 64; + dst += 4*stride; + eobs += 4; + } +} + +void vp8_dequant_idct_add_y_block_v6 + (short *q, short *dq, unsigned char *pre, + unsigned char *dst, int stride, char *eobs) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_v6 (q, dq, pre, dst, 16, stride); + else + { + vp8_dc_only_idct_add_v6 (q[0]*dq[0], pre, dst, 16, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_v6 (q+16, dq, pre+4, dst+4, 16, stride); + else + { + vp8_dc_only_idct_add_v6 (q[16]*dq[0], pre+4, dst+4, 16, stride); + ((int *)(q+16))[0] = 0; + } + + if (eobs[2] > 1) + vp8_dequant_idct_add_v6 (q+32, dq, pre+8, dst+8, 16, stride); + else + { + vp8_dc_only_idct_add_v6 (q[32]*dq[0], pre+8, dst+8, 16, stride); + ((int *)(q+32))[0] = 0; + } + + if (eobs[3] > 1) + vp8_dequant_idct_add_v6 (q+48, dq, pre+12, dst+12, 16, stride); + else + { + vp8_dc_only_idct_add_v6 (q[48]*dq[0], pre+12, dst+12, 16, stride); + ((int *)(q+48))[0] = 0; + } + + q += 64; + pre += 64; + dst += 4*stride; + eobs += 4; + } +} + +void 
vp8_dequant_idct_add_uv_block_v6 + (short *q, short *dq, unsigned char *pre, + unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) +{ + int i; + + for (i = 0; i < 2; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_v6 (q, dq, pre, dstu, 8, stride); + else + { + vp8_dc_only_idct_add_v6 (q[0]*dq[0], pre, dstu, 8, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_v6 (q+16, dq, pre+4, dstu+4, 8, stride); + else + { + vp8_dc_only_idct_add_v6 (q[16]*dq[0], pre+4, dstu+4, 8, stride); + ((int *)(q+16))[0] = 0; + } + + q += 32; + pre += 32; + dstu += 4*stride; + eobs += 2; + } + + for (i = 0; i < 2; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_v6 (q, dq, pre, dstv, 8, stride); + else + { + vp8_dc_only_idct_add_v6 (q[0]*dq[0], pre, dstv, 8, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_v6 (q+16, dq, pre+4, dstv+4, 8, stride); + else + { + vp8_dc_only_idct_add_v6 (q[16]*dq[0], pre+4, dstv+4, 8, stride); + ((int *)(q+16))[0] = 0; + } + + q += 32; + pre += 32; + dstv += 4*stride; + eobs += 2; + } +} diff --git a/vp8/decoder/arm/dequantize_arm.h b/vp8/decoder/arm/dequantize_arm.h index 3a044f8..12e836a 100644 --- a/vp8/decoder/arm/dequantize_arm.h +++ b/vp8/decoder/arm/dequantize_arm.h @@ -16,6 +16,9 @@ extern prototype_dequant_block(vp8_dequantize_b_v6); extern prototype_dequant_idct_add(vp8_dequant_idct_add_v6); extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_v6); +extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_v6); +extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_v6); +extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6); #undef vp8_dequant_block #define vp8_dequant_block vp8_dequantize_b_v6 @@ -25,12 +28,24 @@ extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_v6); #undef vp8_dequant_dc_idct_add #define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_v6 + +#undef vp8_dequant_dc_idct_add_y_block 
+#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_v6 + +#undef vp8_dequant_idct_add_y_block +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_v6 + +#undef vp8_dequant_idct_add_uv_block +#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_v6 #endif #if HAVE_ARMV7 extern prototype_dequant_block(vp8_dequantize_b_neon); extern prototype_dequant_idct_add(vp8_dequant_idct_add_neon); extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_neon); +extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_neon); +extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon); +extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon); #undef vp8_dequant_block #define vp8_dequant_block vp8_dequantize_b_neon @@ -40,6 +55,15 @@ extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_neon); #undef vp8_dequant_dc_idct_add #define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_neon + +#undef vp8_dequant_dc_idct_add_y_block +#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_neon + +#undef vp8_dequant_idct_add_y_block +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon + +#undef vp8_dequant_idct_add_uv_block +#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon #endif #endif diff --git a/vp8/decoder/arm/neon/idct_blk_neon.c b/vp8/decoder/arm/neon/idct_blk_neon.c new file mode 100644 index 0000000..e190bc0 --- /dev/null +++ b/vp8/decoder/arm/neon/idct_blk_neon.c @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "vpx_ports/config.h" +#include "idct.h" +#include "dequantize.h" + +void vp8_dequant_dc_idct_add_y_block_neon + (short *q, short *dq, unsigned char *pre, + unsigned char *dst, int stride, char *eobs, short *dc) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (eobs[0] > 1) + vp8_dequant_dc_idct_add_neon (q, dq, pre, dst, 16, stride, dc[0]); + else + vp8_dc_only_idct_add_neon (dc[0], pre, dst, 16, stride); + + if (eobs[1] > 1) + vp8_dequant_dc_idct_add_neon (q+16, dq, pre+4, dst+4, 16, stride, dc[1]); + else + vp8_dc_only_idct_add_neon (dc[1], pre+4, dst+4, 16, stride); + + if (eobs[2] > 1) + vp8_dequant_dc_idct_add_neon (q+32, dq, pre+8, dst+8, 16, stride, dc[2]); + else + vp8_dc_only_idct_add_neon (dc[2], pre+8, dst+8, 16, stride); + + if (eobs[3] > 1) + vp8_dequant_dc_idct_add_neon (q+48, dq, pre+12, dst+12, 16, stride, dc[3]); + else + vp8_dc_only_idct_add_neon (dc[3], pre+12, dst+12, 16, stride); + + q += 64; + dc += 4; + pre += 64; + dst += 4*stride; + eobs += 4; + } +} + +void vp8_dequant_idct_add_y_block_neon + (short *q, short *dq, unsigned char *pre, + unsigned char *dst, int stride, char *eobs) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_neon (q, dq, pre, dst, 16, stride); + else + { + vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dst, 16, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_neon (q+16, dq, pre+4, dst+4, 16, stride); + else + { + vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dst+4, 16, stride); + ((int *)(q+16))[0] = 0; + } + + if (eobs[2] > 1) + vp8_dequant_idct_add_neon (q+32, dq, pre+8, dst+8, 16, stride); + else + { + vp8_dc_only_idct_add_neon (q[32]*dq[0], pre+8, dst+8, 16, stride); + ((int *)(q+32))[0] = 0; + } + + if (eobs[3] > 1) + vp8_dequant_idct_add_neon (q+48, dq, pre+12, dst+12, 16, stride); + else + { + vp8_dc_only_idct_add_neon (q[48]*dq[0], pre+12, dst+12, 16, stride); + ((int *)(q+48))[0] = 0; + } + + q += 64; + pre += 64; + dst += 
4*stride; + eobs += 4; + } +} + +void vp8_dequant_idct_add_uv_block_neon + (short *q, short *dq, unsigned char *pre, + unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) +{ + int i; + + for (i = 0; i < 2; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_neon (q, dq, pre, dstu, 8, stride); + else + { + vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dstu, 8, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_neon (q+16, dq, pre+4, dstu+4, 8, stride); + else + { + vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dstu+4, 8, stride); + ((int *)(q+16))[0] = 0; + } + + q += 32; + pre += 32; + dstu += 4*stride; + eobs += 2; + } + + for (i = 0; i < 2; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_neon (q, dq, pre, dstv, 8, stride); + else + { + vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dstv, 8, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_neon (q+16, dq, pre+4, dstv+4, 8, stride); + else + { + vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dstv+4, 8, stride); + ((int *)(q+16))[0] = 0; + } + + q += 32; + pre += 32; + dstv += 4*stride; + eobs += 2; + } +} diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c index 9942e0b..45d7ec3 100644 --- a/vp8/decoder/decodframe.c +++ b/vp8/decoder/decodframe.c @@ -237,7 +237,7 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd) DEQUANT_INVOKE(&pbi->dequant, block)(b); // do 2nd order transform on the dc block - if (b->eob > 1) + if (xd->eobs[24] > 1) { IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff); ((int *)b->qcoeff)[0] = 0; @@ -255,24 +255,10 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd) ((int *)b->qcoeff)[0] = 0; } - - for (i = 0; i < 16; i++) - { - - b = &xd->block[i]; - - if (b->eob > 1) - { - DEQUANT_INVOKE(&pbi->dequant, dc_idct_add) - (b->qcoeff, &b->dequant[0][0], b->predictor, - *(b->base_dst) + b->dst, 16, b->dst_stride, - xd->block[24].diff[i]); - } - else - { - 
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)(xd->block[24].diff[i], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride); - } - } + DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block) + (xd->qcoeff, &xd->block[0].dequant[0][0], + xd->predictor, xd->dst.y_buffer, + xd->dst.y_stride, xd->eobs, xd->block[24].diff); } else if ((xd->frame_type == KEY_FRAME || xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) && xd->mode_info_context->mbmi.mode == B_PRED) { @@ -282,13 +268,17 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd) BLOCKD *b = &xd->block[i]; vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); - if (b->eob > 1) + if (xd->eobs[i] > 1) { - DEQUANT_INVOKE(&pbi->dequant, idct_add)(b->qcoeff, &b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride); + DEQUANT_INVOKE(&pbi->dequant, idct_add) + (b->qcoeff, &b->dequant[0][0], b->predictor, + *(b->base_dst) + b->dst, 16, b->dst_stride); } else { - IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)(b->qcoeff[0] * b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride); + IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add) + (b->qcoeff[0] * b->dequant[0][0], b->predictor, + *(b->base_dst) + b->dst, 16, b->dst_stride); ((int *)b->qcoeff)[0] = 0; } } @@ -296,37 +286,16 @@ void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd) } else { - for (i = 0; i < 16; i++) - { - BLOCKD *b = &xd->block[i]; - - if (b->eob > 1) - { - DEQUANT_INVOKE(&pbi->dequant, idct_add)(b->qcoeff, &b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride); - } - else - { - IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)(b->qcoeff[0] * b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 16, b->dst_stride); - ((int *)b->qcoeff)[0] = 0; - } - } + DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block) + (xd->qcoeff, &xd->block[0].dequant[0][0], + xd->predictor, xd->dst.y_buffer, + xd->dst.y_stride, xd->eobs); } - for (i = 16; i < 24; i++) - { - - BLOCKD *b = &xd->block[i]; - 
- if (b->eob > 1) - { - DEQUANT_INVOKE(&pbi->dequant, idct_add)(b->qcoeff, &b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 8, b->dst_stride); - } - else - { - IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)(b->qcoeff[0] * b->dequant[0][0], b->predictor, *(b->base_dst) + b->dst, 8, b->dst_stride); - ((int *)b->qcoeff)[0] = 0; - } - } + DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block) + (xd->qcoeff+16*16, &xd->block[16].dequant[0][0], + xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.uv_stride, xd->eobs+16); } static int get_delta_q(vp8_reader *bc, int prev, int *q_update) diff --git a/vp8/decoder/dequantize.h b/vp8/decoder/dequantize.h index fbca391..125d35b 100644 --- a/vp8/decoder/dequantize.h +++ b/vp8/decoder/dequantize.h @@ -27,6 +27,21 @@ int pitch, int stride, \ int dc) +#define prototype_dequant_dc_idct_add_y_block(sym) \ + void sym(short *q, short *dq, \ + unsigned char *pre, unsigned char *dst, \ + int stride, char *eobs, short *dc) + +#define prototype_dequant_idct_add_y_block(sym) \ + void sym(short *q, short *dq, \ + unsigned char *pre, unsigned char *dst, \ + int stride, char *eobs) + +#define prototype_dequant_idct_add_uv_block(sym) \ + void sym(short *q, short *dq, \ + unsigned char *pre, unsigned char *dst_u, \ + unsigned char *dst_v, int stride, char *eobs) + #if ARCH_X86 || ARCH_X86_64 #include "x86/dequantize_x86.h" #endif @@ -50,16 +65,42 @@ extern prototype_dequant_idct_add(vp8_dequant_idct_add); #endif extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add); +#ifndef vp8_dequant_dc_idct_add_y_block +#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_c +#endif +extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block); + +#ifndef vp8_dequant_idct_add_y_block +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c +#endif +extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block); + +#ifndef vp8_dequant_idct_add_uv_block +#define 
vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c +#endif +extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block); + + typedef prototype_dequant_block((*vp8_dequant_block_fn_t)); typedef prototype_dequant_idct_add((*vp8_dequant_idct_add_fn_t)); + typedef prototype_dequant_dc_idct_add((*vp8_dequant_dc_idct_add_fn_t)); +typedef prototype_dequant_dc_idct_add_y_block((*vp8_dequant_dc_idct_add_y_block_fn_t)); + +typedef prototype_dequant_idct_add_y_block((*vp8_dequant_idct_add_y_block_fn_t)); + +typedef prototype_dequant_idct_add_uv_block((*vp8_dequant_idct_add_uv_block_fn_t)); + typedef struct { - vp8_dequant_block_fn_t block; - vp8_dequant_idct_add_fn_t idct_add; - vp8_dequant_dc_idct_add_fn_t dc_idct_add; + vp8_dequant_block_fn_t block; + vp8_dequant_idct_add_fn_t idct_add; + vp8_dequant_dc_idct_add_fn_t dc_idct_add; + vp8_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block; + vp8_dequant_idct_add_y_block_fn_t idct_add_y_block; + vp8_dequant_idct_add_uv_block_fn_t idct_add_uv_block; } vp8_dequant_rtcd_vtable_t; #if CONFIG_RUNTIME_CPU_DETECT diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c index 34faae3..9cbea23 100644 --- a/vp8/decoder/detokenize.c +++ b/vp8/decoder/detokenize.c @@ -266,6 +266,8 @@ int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x) BOOL_DECODER *bc = x->current_bc; + char *eobs = x->eobs; + ENTROPY_CONTEXT *a; ENTROPY_CONTEXT *l; int i; @@ -416,8 +418,8 @@ ONE_CONTEXT_NODE_0_: qcoeff_ptr [ scan[15] ] = (INT16) v; BLOCK_FINISHED: - t = ((x->block[i].eob = c) != !type); // any nonzero data? - eobtotal += x->block[i].eob; + t = ((eobs[i] = c) != !type); // any nonzero data? 
+ eobtotal += c; *a = *l = t; qcoeff_ptr += 16; diff --git a/vp8/decoder/generic/dsystemdependent.c b/vp8/decoder/generic/dsystemdependent.c index ab085e2..e8104dc 100644 --- a/vp8/decoder/generic/dsystemdependent.c +++ b/vp8/decoder/generic/dsystemdependent.c @@ -19,12 +19,15 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi) { // Pure C: #if CONFIG_RUNTIME_CPU_DETECT - pbi->mb.rtcd = &pbi->common.rtcd; - pbi->dequant.block = vp8_dequantize_b_c; - pbi->dequant.idct_add = vp8_dequant_idct_add_c; - pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_c; - pbi->dboolhuff.start = vp8dx_start_decode_c; - pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c; + pbi->mb.rtcd = &pbi->common.rtcd; + pbi->dequant.block = vp8_dequantize_b_c; + pbi->dequant.idct_add = vp8_dequant_idct_add_c; + pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_c; + pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_c; + pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_c; + pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_c; + pbi->dboolhuff.start = vp8dx_start_decode_c; + pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c; #if 0 //For use with RTCD, when implemented pbi->dboolhuff.debool = vp8dx_decode_bool_c; pbi->dboolhuff.devalue = vp8dx_decode_value_c; diff --git a/vp8/decoder/idct_blk.c b/vp8/decoder/idct_blk.c new file mode 100644 index 0000000..b18984b --- /dev/null +++ b/vp8/decoder/idct_blk.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "vpx_ports/config.h" +#include "idct.h" +#include "dequantize.h" +/* Reference C walk over 16 4x4 blocks arranged as 4 rows of 4, with a separate dc array. Per block one eobs byte is consumed: a value above 1 selects vp8_dequant_dc_idct_add_c, anything else selects vp8_dc_only_idct_add_c, and both are fed dc[0]. After each block q moves on 16 shorts, pre and dst 4 bytes, dc one entry. NOTE(review): the literal 16 is presumably the predictor pitch; confirm against the callee prototypes. */ +void vp8_dequant_dc_idct_add_y_block_c + (short *q, short *dq, unsigned char *pre, + unsigned char *dst, int stride, char *eobs, short *dc) +{ + int i, j; + + for (i = 0; i < 4; i++) + { + for (j = 0; j < 4; j++) + { + if (*eobs++ > 1) + vp8_dequant_dc_idct_add_c (q, dq, pre, dst, 16, stride, dc[0]); + else + vp8_dc_only_idct_add_c (dc[0], pre, dst, 16, stride); + + q += 16; + pre += 4; + dst += 4; + dc ++; + } + + pre += 64 - 16; /* the inner loop already moved pre by 16, so the net step per block row is 64 bytes */ + dst += 4*stride - 16; /* undo the 16 bytes walked by the inner loop and step 4*stride bytes on */ + } +} +/* Same 4 by 4 walk without a dc array. An eobs byte above 1 runs vp8_dequant_idct_add_c; otherwise the product q[0]*dq[0] is handed to vp8_dc_only_idct_add_c and the int stored at q is zeroed afterwards. */ +void vp8_dequant_idct_add_y_block_c + (short *q, short *dq, unsigned char *pre, + unsigned char *dst, int stride, char *eobs) +{ + int i, j; + + for (i = 0; i < 4; i++) + { + for (j = 0; j < 4; j++) + { + if (*eobs++ > 1) + vp8_dequant_idct_add_c (q, dq, pre, dst, 16, stride); + else + { + vp8_dc_only_idct_add_c (q[0]*dq[0], pre, dst, 16, stride); + ((int *)q)[0] = 0; /* clears q[0] together with whatever else shares that int */ + } + + q += 16; + pre += 4; + dst += 4; + } + + pre += 64 - 16; /* net step per block row is 64 bytes */ + dst += 4*stride - 16; + } +} +/* Two passes of 2 rows by 2 blocks each: the first pass writes through dstu, the second through dstv. q, pre and eobs are not reset between the passes, so the second pass continues where the first stopped. The literal 8 is passed where the Y variants pass 16. */ +void vp8_dequant_idct_add_uv_block_c + (short *q, short *dq, unsigned char *pre, + unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) +{ + int i, j; + + for (i = 0; i < 2; i++) + { + for (j = 0; j < 2; j++) + { + if (*eobs++ > 1) + vp8_dequant_idct_add_c (q, dq, pre, dstu, 8, stride); + else + { + vp8_dc_only_idct_add_c (q[0]*dq[0], pre, dstu, 8, stride); + ((int *)q)[0] = 0; + } + + q += 16; + pre += 4; + dstu += 4; + } + + pre += 32 - 8; /* the inner loop moved pre by 8, so the net step per block row is 32 bytes */ + dstu += 4*stride - 8; + } + + for (i = 0; i < 2; i++) + { + for (j = 0; j < 2; j++) + { + if (*eobs++ > 1) + vp8_dequant_idct_add_c (q, dq, pre, dstv, 8, stride); + else + { + vp8_dc_only_idct_add_c (q[0]*dq[0], pre, dstv, 8, stride); + ((int *)q)[0] = 0; + } + + q += 16; + pre += 4; + dstv += 4; + } + + pre += 32 - 8; + dstv += 4*stride - 8; + } +} diff --git a/vp8/decoder/x86/dequantize_x86.h b/vp8/decoder/x86/dequantize_x86.h index 4492676..201479c 100644 --- a/vp8/decoder/x86/dequantize_x86.h +++
b/vp8/decoder/x86/dequantize_x86.h @@ -23,7 +23,9 @@ extern prototype_dequant_block(vp8_dequantize_b_mmx); extern prototype_dequant_idct_add(vp8_dequant_idct_add_mmx); extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_mmx); - +extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_mmx); +extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_mmx); +extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_mmx); #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_dequant_block @@ -35,6 +37,33 @@ extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_mmx); #undef vp8_dequant_dc_idct_add #define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_mmx +#undef vp8_dequant_dc_idct_add_y_block +#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_mmx + +#undef vp8_dequant_idct_add_y_block +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_mmx + +#undef vp8_dequant_idct_add_uv_block +#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_mmx + +#endif +#endif + +#if HAVE_SSE2 +extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_sse2); +extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_sse2); +extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_sse2); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_dequant_dc_idct_add_y_block +#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_sse2 + +#undef vp8_dequant_idct_add_y_block +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2 + +#undef vp8_dequant_idct_add_uv_block +#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_sse2 + #endif #endif diff --git a/vp8/decoder/x86/idct_blk_mmx.c b/vp8/decoder/x86/idct_blk_mmx.c new file mode 100644 index 0000000..1522a80 --- /dev/null +++ b/vp8/decoder/x86/idct_blk_mmx.c @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_ports/config.h" +#include "idct.h" +#include "dequantize.h" +/* MMX-backed Y walk with a separate dc array. Each of the 4 loop iterations handles 4 blocks written out by hand at q, q+16, q+32 and q+48, with pre and dst offsets 0, 4, 8 and 12. For every block the matching eobs byte selects vp8_dequant_dc_idct_add_mmx when it is above 1 and vp8_dc_only_idct_add_mmx otherwise. */ +void vp8_dequant_dc_idct_add_y_block_mmx + (short *q, short *dq, unsigned char *pre, + unsigned char *dst, int stride, char *eobs, short *dc) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (eobs[0] > 1) + vp8_dequant_dc_idct_add_mmx (q, dq, pre, dst, 16, stride, dc[0]); + else + vp8_dc_only_idct_add_mmx (dc[0], pre, dst, 16, stride); + + if (eobs[1] > 1) + vp8_dequant_dc_idct_add_mmx (q+16, dq, pre+4, dst+4, 16, stride, dc[1]); + else + vp8_dc_only_idct_add_mmx (dc[1], pre+4, dst+4, 16, stride); + + if (eobs[2] > 1) + vp8_dequant_dc_idct_add_mmx (q+32, dq, pre+8, dst+8, 16, stride, dc[2]); + else + vp8_dc_only_idct_add_mmx (dc[2], pre+8, dst+8, 16, stride); + + if (eobs[3] > 1) + vp8_dequant_dc_idct_add_mmx (q+48, dq, pre+12, dst+12, 16, stride, dc[3]); + else + vp8_dc_only_idct_add_mmx (dc[3], pre+12, dst+12, 16, stride); + + q += 64; /* 4 blocks of 16 coefficients consumed this iteration */ + dc += 4; + pre += 64; + dst += 4*stride; + eobs += 4; + } +} +/* Same hand-unrolled walk without a dc array. The DC-only branch for the block at offset k passes q[k]*dq[0] to vp8_dc_only_idct_add_mmx and then zeroes the int stored at q+k. */ +void vp8_dequant_idct_add_y_block_mmx + (short *q, short *dq, unsigned char *pre, + unsigned char *dst, int stride, char *eobs) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_mmx (q, dq, pre, dst, 16, stride); + else + { + vp8_dc_only_idct_add_mmx (q[0]*dq[0], pre, dst, 16, stride); + ((int *)q)[0] = 0; /* clears q[0] together with whatever else shares that int */ + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_mmx (q+16, dq, pre+4, dst+4, 16, stride); + else + { + vp8_dc_only_idct_add_mmx (q[16]*dq[0], pre+4, dst+4, 16, stride); + ((int *)(q+16))[0] = 0; + } + + if (eobs[2] > 1) + vp8_dequant_idct_add_mmx (q+32, dq, pre+8, dst+8, 16, stride); + else + { +
vp8_dc_only_idct_add_mmx (q[32]*dq[0], pre+8, dst+8, 16, stride); + ((int *)(q+32))[0] = 0; + } + + if (eobs[3] > 1) + vp8_dequant_idct_add_mmx (q+48, dq, pre+12, dst+12, 16, stride); + else + { + vp8_dc_only_idct_add_mmx (q[48]*dq[0], pre+12, dst+12, 16, stride); + ((int *)(q+48))[0] = 0; + } + + q += 64; /* 4 blocks of 16 coefficients consumed this iteration */ + pre += 64; + dst += 4*stride; + eobs += 4; + } +} +/* Two loops of 2 iterations with 2 blocks per iteration: the first loop writes through dstu, the second through dstv. q, pre and eobs are not reset in between, so the second loop continues where the first stopped. The literal 8 is passed where the Y variants pass 16. */ +void vp8_dequant_idct_add_uv_block_mmx + (short *q, short *dq, unsigned char *pre, + unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) +{ + int i; + + for (i = 0; i < 2; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_mmx (q, dq, pre, dstu, 8, stride); + else + { + vp8_dc_only_idct_add_mmx (q[0]*dq[0], pre, dstu, 8, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_mmx (q+16, dq, pre+4, dstu+4, 8, stride); + else + { + vp8_dc_only_idct_add_mmx (q[16]*dq[0], pre+4, dstu+4, 8, stride); + ((int *)(q+16))[0] = 0; + } + + q += 32; /* 2 blocks of 16 coefficients consumed this iteration */ + pre += 32; + dstu += 4*stride; + eobs += 2; + } + + for (i = 0; i < 2; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_mmx (q, dq, pre, dstv, 8, stride); + else + { + vp8_dc_only_idct_add_mmx (q[0]*dq[0], pre, dstv, 8, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_mmx (q+16, dq, pre+4, dstv+4, 8, stride); + else + { + vp8_dc_only_idct_add_mmx (q[16]*dq[0], pre+4, dstv+4, 8, stride); + ((int *)(q+16))[0] = 0; + } + + q += 32; + pre += 32; + dstv += 4*stride; + eobs += 2; + } +} diff --git a/vp8/decoder/x86/idct_blk_sse2.c b/vp8/decoder/x86/idct_blk_sse2.c new file mode 100644 index 0000000..c5e4ad3 --- /dev/null +++ b/vp8/decoder/x86/idct_blk_sse2.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS.
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_ports/config.h" +#include "idct.h" +#include "dequantize.h" +/* Helpers that are only declared here. idctllm_sse2.asm in the same patch defines idct_dequant_0_2x_sse2 and idct_dequant_full_2x_sse2. NOTE(review): the two dc variants are presumably defined there as well; confirm. */ +void idct_dequant_dc_0_2x_sse2 + (short *q, short *dq, unsigned char *pre, + unsigned char *dst, int dst_stride, short *dc); +void idct_dequant_dc_full_2x_sse2 + (short *q, short *dq, unsigned char *pre, + unsigned char *dst, int dst_stride, short *dc); + +void idct_dequant_0_2x_sse2 + (short *q, short *dq ,unsigned char *pre, + unsigned char *dst, int dst_stride, int blk_stride); +void idct_dequant_full_2x_sse2 + (short *q, short *dq ,unsigned char *pre, + unsigned char *dst, int dst_stride, int blk_stride); +/* SSE2 Y walk with a separate dc array, one helper call per pair of blocks. Each test reads two neighbouring eobs bytes as a single short; the mask 0xfefe drops bit 0 of both bytes, so the result is nonzero exactly when at least one of the two bytes is something other than 0 or 1. Nonzero selects the full helper, zero selects the 0 helper. NOTE(review): the short read relies on eobs being at least 2-byte aligned; confirm for every caller. */ +void vp8_dequant_dc_idct_add_y_block_sse2 + (short *q, short *dq, unsigned char *pre, + unsigned char *dst, int stride, char *eobs, short *dc) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (((short *)(eobs))[0] & 0xfefe) + idct_dequant_dc_full_2x_sse2 (q, dq, pre, dst, stride, dc); + else + idct_dequant_dc_0_2x_sse2 (q, dq, pre, dst, stride, dc); + + if (((short *)(eobs))[1] & 0xfefe) + idct_dequant_dc_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2); + else + idct_dequant_dc_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2); + + q += 64; /* 4 blocks of 16 coefficients consumed this iteration */ + dc += 4; + pre += 64; + dst += stride*4; + eobs += 4; + } +} +/* Same pairwise walk without a dc array. The trailing 16 lands in the blk_stride parameter of the helper prototypes declared earlier in this file. */ +void vp8_dequant_idct_add_y_block_sse2 + (short *q, short *dq, unsigned char *pre, + unsigned char *dst, int stride, char *eobs) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (((short *)(eobs))[0] & 0xfefe) + idct_dequant_full_2x_sse2 (q, dq, pre, dst, stride, 16); + else + idct_dequant_0_2x_sse2 (q, dq, pre, dst, stride, 16); + + if (((short *)(eobs))[1] & 0xfefe) + idct_dequant_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16); + else + idct_dequant_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16); + + q += 64; /* 4 blocks of 16 coefficients consumed this iteration */ + pre += 64; + dst += stride*4; + eobs += 4; + } +} +/* Four helper calls written out without a loop: eobs pairs 0 and 1 write through dstu, pairs 2 and 3 through dstv. Each destination pointer steps stride*4 between its two calls, q and pre step 32 between all calls, and blk_stride is 8. */ +void vp8_dequant_idct_add_uv_block_sse2 + (short *q, short *dq, unsigned char *pre, + unsigned char
*dstu, unsigned char *dstv, int stride, char *eobs) +{ + if (((short *)(eobs))[0] & 0xfefe) + idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8); + else + idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8); + + q += 32; + pre += 32; + dstu += stride*4; + + if (((short *)(eobs))[1] & 0xfefe) + idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8); + else + idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8); + + q += 32; + pre += 32; /* dstu is finished; the remaining two pairs are written through dstv */ + + if (((short *)(eobs))[2] & 0xfefe) + idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8); + else + idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8); + + q += 32; + pre += 32; + dstv += stride*4; + + if (((short *)(eobs))[3] & 0xfefe) + idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8); + else + idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8); +} diff --git a/vp8/decoder/x86/x86_dsystemdependent.c b/vp8/decoder/x86/x86_dsystemdependent.c index 7891051..eb8198f 100644 --- a/vp8/decoder/x86/x86_dsystemdependent.c +++ b/vp8/decoder/x86/x86_dsystemdependent.c @@ -39,14 +39,24 @@ void vp8_arch_x86_decode_init(VP8D_COMP *pbi) #if CONFIG_RUNTIME_CPU_DETECT /* Override default functions with fastest ones for this CPU.
*/ #if HAVE_MMX - if (flags & HAS_MMX) { - pbi->dequant.block = vp8_dequantize_b_mmx; - pbi->dequant.idct_add = vp8_dequant_idct_add_mmx; - pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_mmx; + pbi->dequant.block = vp8_dequantize_b_mmx; + pbi->dequant.idct_add = vp8_dequant_idct_add_mmx; + pbi->dequant.dc_idct_add = vp8_dequant_dc_idct_add_mmx; + pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_mmx; + pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_mmx; + pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_mmx; + } +#endif +#if HAVE_SSE2 + if (flags & HAS_SSE2) + { + pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_sse2; + pbi->dequant.idct_add_y_block = vp8_dequant_idct_add_y_block_sse2; + pbi->dequant.idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2; } - #endif + #endif } diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index dea2373..3aad7b7 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -103,6 +103,7 @@ VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm +VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm diff --git a/vp8/vp8dx.mk b/vp8/vp8dx.mk index 8ab9425..f6b7d94 100644 --- a/vp8/vp8dx.mk +++ b/vp8/vp8dx.mk @@ -68,9 +68,12 @@ VP8_DX_SRCS-yes += decoder/onyxd_int.h VP8_DX_SRCS-yes += decoder/treereader.h VP8_DX_SRCS-yes += decoder/onyxd_if.c VP8_DX_SRCS-yes += decoder/threading.c +VP8_DX_SRCS-yes += decoder/idct_blk.c VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes)) VP8_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/dequantize_x86.h VP8_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += 
decoder/x86/x86_dsystemdependent.c VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/dequantize_mmx.asm +VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/idct_blk_mmx.c +VP8_DX_SRCS-$(HAVE_SSE2) += decoder/x86/idct_blk_sse2.c diff --git a/vp8/vp8dx_arm.mk b/vp8/vp8dx_arm.mk index 61a1ce4..c4e79af 100644 --- a/vp8/vp8dx_arm.mk +++ b/vp8/vp8dx_arm.mk @@ -15,14 +15,17 @@ VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/dequantize_arm.c VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/dsystemdependent.c VP8_DX_SRCS_REMOVE-$(HAVE_ARMV6) += decoder/generic/dsystemdependent.c VP8_DX_SRCS_REMOVE-$(HAVE_ARMV6) += decoder/dequantize.c +VP8_DX_SRCS_REMOVE-$(HAVE_ARMV6) += decoder/idct_blk.c VP8_DX_SRCS-$(CONFIG_ARM_ASM_DETOK) += decoder/arm/detokenize$(ASM) #File list for armv6 VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_dc_idct_v6$(ASM) VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_idct_v6$(ASM) VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantize_v6$(ASM) +VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/idct_blk_v6.c #File list for neon VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_dc_idct_neon$(ASM) VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_idct_neon$(ASM) VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantizeb_neon$(ASM) +VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_blk_neon.c