From f13b69d07c337bf8f7165b01934ae8ad971694bd Mon Sep 17 00:00:00 2001 From: Christian Duvivier Date: Mon, 25 Mar 2013 16:18:38 -0700 Subject: [PATCH] Faster vp9_short_fdct4x4 and vp9_short_fdct8x4. Scalar path is about 1.3x faster (2.1% overall encoder speedup). SSE2 path is about 5.0x faster (8.4% overall encoder speedup). Change-Id: I360d167b5ad6f387bba00406129323e2fe6e7dda --- vp9/common/vp9_rtcd_defs.sh | 4 +- vp9/encoder/vp9_dct.c | 82 ++++-- vp9/encoder/x86/vp9_dct_sse2.asm | 432 ------------------------------ vp9/encoder/x86/vp9_dct_sse2_intrinsics.c | 105 ++++++++ vp9/vp9cx.mk | 1 - 5 files changed, 167 insertions(+), 457 deletions(-) delete mode 100644 vp9/encoder/x86/vp9_dct_sse2.asm diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 0073660..d787a26 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -662,10 +662,10 @@ prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int p specialize vp9_short_fdct8x8 sse2 prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_fdct4x4 +specialize vp9_short_fdct4x4 sse2 prototype void vp9_short_fdct8x4 "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_fdct8x4 +specialize vp9_short_fdct8x4 sse2 prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch" specialize vp9_short_fdct32x32 diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index 6365ed9..ecd3e2d 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -37,30 +37,68 @@ static void fdct4_1d(int16_t *input, int16_t *output) { } void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) { - int16_t out[4 * 4]; - int16_t *outptr = &out[0]; - const int short_pitch = pitch >> 1; - int i, j; - int16_t temp_in[4], temp_out[4]; - - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = input[j * short_pitch + i] << 4; - if (i == 0 && temp_in[0]) - temp_in[0] += 1; - fdct4_1d(temp_in, temp_out); - for (j = 0; j < 4; ++j) - outptr[j * 4 + i] = temp_out[j]; + // The 2D transform is done with two passes which are actually pretty + // similar. In the first one, we transform the columns and transpose + // the results. In the second one, we transform the rows. To achieve that, + // as the first pass results are transposed, we tranpose the columns (that + // is the transposed rows) and transpose the results (so that it goes back + // in normal/row positions). + const int stride = pitch >> 1; + int pass; + // We need an intermediate buffer between passes. + int16_t intermediate[4 * 4]; + int16_t *in = input; + int16_t *out = intermediate; + // Do the two transform/transpose passes + for (pass = 0; pass < 2; ++pass) { + /*canbe16*/ int input[4]; + /*canbe16*/ int step[4]; + /*needs32*/ int temp1, temp2; + int i; + for (i = 0; i < 4; ++i) { + // Load inputs. + if (0 == pass) { + input[0] = in[0 * stride] << 4; + input[1] = in[1 * stride] << 4; + input[2] = in[2 * stride] << 4; + input[3] = in[3 * stride] << 4; + if (i == 0 && input[0]) { + input[0] += 1; + } + } else { + input[0] = in[0 * 4]; + input[1] = in[1 * 4]; + input[2] = in[2 * 4]; + input[3] = in[3 * 4]; + } + // Transform. + step[0] = input[0] + input[3]; + step[1] = input[1] + input[2]; + step[2] = input[1] - input[2]; + step[3] = input[0] - input[3]; + temp1 = (step[0] + step[1]) * cospi_16_64; + temp2 = (step[0] - step[1]) * cospi_16_64; + out[0] = dct_const_round_shift(temp1); + out[2] = dct_const_round_shift(temp2); + temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; + temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; + out[1] = dct_const_round_shift(temp1); + out[3] = dct_const_round_shift(temp2); + // Do next column (which is a transposed row in second/horizontal pass) + in++; + out += 4; + } + // Setup in/out for next pass. + in = intermediate; + out = output; } - // Rows - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = out[j + i * 4]; - fdct4_1d(temp_in, temp_out); - for (j = 0; j < 4; ++j) - output[j + i * 4] = (temp_out[j] + 1) >> 2; + { + int i, j; + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + output[j + i * 4] = (output[j + i * 4] + 1) >> 2; + } } } diff --git a/vp9/encoder/x86/vp9_dct_sse2.asm b/vp9/encoder/x86/vp9_dct_sse2.asm deleted file mode 100644 index bbd6086..0000000 --- a/vp9/encoder/x86/vp9_dct_sse2.asm +++ /dev/null @@ -1,432 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%macro STACK_FRAME_CREATE 0 -%if ABI_IS_32BIT - %define input rsi - %define output rdi - %define pitch rax - push rbp - mov rbp, rsp - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) - mov rdi, arg(1) - - movsxd rax, dword ptr arg(2) - lea rcx, [rsi + rax*2] -%else - %if LIBVPX_YASM_WIN64 - %define input rcx - %define output rdx - %define pitch r8 - SAVE_XMM 7, u - %else - %define input rdi - %define output rsi - %define pitch rdx - %endif -%endif -%endmacro - -%macro STACK_FRAME_DESTROY 0 - %define input - %define output - %define pitch - -%if ABI_IS_32BIT - pop rdi - pop rsi - RESTORE_GOT - pop rbp -%else - %if LIBVPX_YASM_WIN64 - RESTORE_XMM - %endif -%endif - ret -%endmacro - -;void vp9_short_fdct4x4_sse2(short *input, short *output, int pitch) -global sym(vp9_short_fdct4x4_sse2) PRIVATE -sym(vp9_short_fdct4x4_sse2): - - STACK_FRAME_CREATE - - movq xmm0, MMWORD PTR[input ] ;03 02 01 00 - movq xmm2, MMWORD PTR[input+ pitch] ;13 12 11 10 - lea input, [input+2*pitch] - movq xmm1, MMWORD PTR[input ] ;23 22 21 20 - movq xmm3, MMWORD PTR[input+ pitch] ;33 32 31 30 - - punpcklqdq xmm0, xmm2 ;13 12 11 10 03 02 01 00 - punpcklqdq xmm1, xmm3 ;33 32 31 30 23 22 21 20 - - movdqa xmm2, xmm0 - punpckldq xmm0, xmm1 ;23 22 03 02 21 20 01 00 - punpckhdq xmm2, xmm1 ;33 32 13 12 31 30 11 10 - movdqa xmm1, xmm0 - punpckldq xmm0, xmm2 ;31 21 30 20 11 10 01 00 - pshufhw xmm1, xmm1, 0b1h ;22 23 02 03 xx xx xx xx - pshufhw xmm2, xmm2, 0b1h ;32 33 12 13 xx xx xx xx - - punpckhdq xmm1, xmm2 ;32 33 22 23 12 13 02 03 - movdqa xmm3, xmm0 - paddw xmm0, xmm1 ;b1 a1 b1 a1 b1 a1 b1 a1 - psubw xmm3, xmm1 ;c1 d1 c1 d1 c1 d1 c1 d1 - psllw xmm0, 3 ;b1 <<= 3 a1 <<= 3 - psllw xmm3, 3 ;c1 <<= 3 d1 <<= 3 - - movdqa xmm1, xmm0 - pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1 - pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1 - movdqa xmm4, xmm3 - pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352 - pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352 - - paddd xmm3, XMMWORD PTR[GLOBAL(_14500)] - paddd xmm4, XMMWORD PTR[GLOBAL(_7500)] - psrad xmm3, 12 ;(c1 * 2217 + d1 * 5352 + 14500)>>12 - psrad xmm4, 12 ;(d1 * 2217 - c1 * 5352 + 7500)>>12 - - packssdw xmm0, xmm1 ;op[2] op[0] - packssdw xmm3, xmm4 ;op[3] op[1] - ; 23 22 21 20 03 02 01 00 - ; - ; 33 32 31 30 13 12 11 10 - ; - movdqa xmm2, xmm0 - punpcklqdq xmm0, xmm3 ;13 12 11 10 03 02 01 00 - punpckhqdq xmm2, xmm3 ;23 22 21 20 33 32 31 30 - - movdqa xmm3, xmm0 - punpcklwd xmm0, xmm2 ;32 30 22 20 12 10 02 00 - punpckhwd xmm3, xmm2 ;33 31 23 21 13 11 03 01 - movdqa xmm2, xmm0 - punpcklwd xmm0, xmm3 ;13 12 11 10 03 02 01 00 - punpckhwd xmm2, xmm3 ;33 32 31 30 23 22 21 20 - - movdqa xmm5, XMMWORD PTR[GLOBAL(_7)] - pshufd xmm2, xmm2, 04eh - movdqa xmm3, xmm0 - paddw xmm0, xmm2 ;b1 b1 b1 b1 a1 a1 a1 a1 - psubw xmm3, xmm2 ;c1 c1 c1 c1 d1 d1 d1 d1 - - pshufd xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 b1 a1 a1 - movdqa xmm2, xmm3 ;save d1 for compare - pshufd xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 c1 d1 d1 - pshuflw xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 a1 b1 a1 - pshuflw xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 d1 c1 d1 - pshufhw xmm0, xmm0, 0d8h ;b1 a1 b1 a1 b1 a1 b1 a1 - pshufhw xmm3, xmm3, 0d8h ;c1 d1 c1 d1 c1 d1 c1 d1 - movdqa xmm1, xmm0 - pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1 - pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1 - - pxor xmm4, xmm4 ;zero out for compare - paddd xmm0, xmm5 - paddd xmm1, xmm5 - pcmpeqw xmm2, xmm4 - psrad xmm0, 4 ;(a1 + b1 + 7)>>4 - psrad xmm1, 4 ;(a1 - b1 + 7)>>4 - pandn xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper, - ;and keep bit 0 of lower - - movdqa xmm4, xmm3 - pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352 - pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352 - paddd xmm3, XMMWORD PTR[GLOBAL(_12000)] - paddd xmm4, XMMWORD PTR[GLOBAL(_51000)] - packssdw xmm0, xmm1 ;op[8] op[0] - psrad xmm3, 16 ;(c1 * 2217 + d1 * 5352 + 12000)>>16 - psrad xmm4, 16 ;(d1 * 2217 - c1 * 5352 + 51000)>>16 - - packssdw xmm3, xmm4 ;op[12] op[4] - movdqa xmm1, xmm0 - paddw xmm3, xmm2 ;op[4] += (d1!=0) - punpcklqdq xmm0, xmm3 ;op[4] op[0] - punpckhqdq xmm1, xmm3 ;op[12] op[8] - - movdqa XMMWORD PTR[output + 0], xmm0 - movdqa XMMWORD PTR[output + 16], xmm1 - - STACK_FRAME_DESTROY - -;void vp9_short_fdct8x4_sse2(short *input, short *output, int pitch) -global sym(vp9_short_fdct8x4_sse2) PRIVATE -sym(vp9_short_fdct8x4_sse2): - - STACK_FRAME_CREATE - - ; read the input data - movdqa xmm0, [input ] - movdqa xmm2, [input+ pitch] - lea input, [input+2*pitch] - movdqa xmm4, [input ] - movdqa xmm3, [input+ pitch] - - ; transpose for the first stage - movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07 - movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27 - - punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13 - punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17 - - punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33 - punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37 - - movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13 - punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31 - - punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33 - - movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17 - punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35 - - punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37 - movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33 - - punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37 - punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36 - - movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31 - punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34 - - punpckhqdq xmm1, xmm4 ; 01 11 21 32 05 15 25 35 - - ; xmm0 0 - ; xmm1 1 - ; xmm2 2 - ; xmm3 3 - - ; first stage - movdqa xmm5, xmm0 - movdqa xmm4, xmm1 - - paddw xmm0, xmm3 ; a1 = 0 + 3 - paddw xmm1, xmm2 ; b1 = 1 + 2 - - psubw xmm4, xmm2 ; c1 = 1 - 2 - psubw xmm5, xmm3 ; d1 = 0 - 3 - - psllw xmm5, 3 - psllw xmm4, 3 - - psllw xmm0, 3 - psllw xmm1, 3 - - ; output 0 and 2 - movdqa xmm2, xmm0 ; a1 - - paddw xmm0, xmm1 ; op[0] = a1 + b1 - psubw xmm2, xmm1 ; op[2] = a1 - b1 - - ; output 1 and 3 - ; interleave c1, d1 - movdqa xmm1, xmm5 ; d1 - punpcklwd xmm1, xmm4 ; c1 d1 - punpckhwd xmm5, xmm4 ; c1 d1 - - movdqa xmm3, xmm1 - movdqa xmm4, xmm5 - - pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - - pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - - paddd xmm1, XMMWORD PTR[GLOBAL(_14500)] - paddd xmm4, XMMWORD PTR[GLOBAL(_14500)] - paddd xmm3, XMMWORD PTR[GLOBAL(_7500)] - paddd xmm5, XMMWORD PTR[GLOBAL(_7500)] - - psrad xmm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 - psrad xmm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 - psrad xmm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 - psrad xmm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 - - packssdw xmm1, xmm4 ; op[1] - packssdw xmm3, xmm5 ; op[3] - - ; done with vertical - ; transpose for the second stage - movdqa xmm4, xmm0 ; 00 10 20 30 04 14 24 34 - movdqa xmm5, xmm2 ; 02 12 22 32 06 16 26 36 - - punpcklwd xmm0, xmm1 ; 00 01 10 11 20 21 30 31 - punpckhwd xmm4, xmm1 ; 04 05 14 15 24 25 34 35 - - punpcklwd xmm2, xmm3 ; 02 03 12 13 22 23 32 33 - punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37 - - movdqa xmm1, xmm0 ; 00 01 10 11 20 21 30 31 - punpckldq xmm0, xmm2 ; 00 01 02 03 10 11 12 13 - - punpckhdq xmm1, xmm2 ; 20 21 22 23 30 31 32 33 - - movdqa xmm2, xmm4 ; 04 05 14 15 24 25 34 35 - punpckldq xmm2, xmm5 ; 04 05 06 07 14 15 16 17 - - punpckhdq xmm4, xmm5 ; 24 25 26 27 34 35 36 37 - movdqa xmm3, xmm1 ; 20 21 22 23 30 31 32 33 - - punpckhqdq xmm3, xmm4 ; 30 31 32 33 34 35 36 37 - punpcklqdq xmm1, xmm4 ; 20 21 22 23 24 25 26 27 - - movdqa xmm4, xmm0 ; 00 01 02 03 10 11 12 13 - punpcklqdq xmm0, xmm2 ; 00 01 02 03 04 05 06 07 - - punpckhqdq xmm4, xmm2 ; 10 11 12 13 14 15 16 17 - - ; xmm0 0 - ; xmm1 4 - ; xmm2 1 - ; xmm3 3 - - movdqa xmm5, xmm0 - movdqa xmm2, xmm1 - - paddw xmm0, xmm3 ; a1 = 0 + 3 - paddw xmm1, xmm4 ; b1 = 1 + 2 - - psubw xmm4, xmm2 ; c1 = 1 - 2 - psubw xmm5, xmm3 ; d1 = 0 - 3 - - pxor xmm6, xmm6 ; zero out for compare - - pcmpeqw xmm6, xmm5 ; d1 != 0 - - pandn xmm6, XMMWORD PTR[GLOBAL(_cmp_mask8x4)] ; clear upper, - ; and keep bit 0 of lower - - ; output 0 and 2 - movdqa xmm2, xmm0 ; a1 - - paddw xmm0, xmm1 ; a1 + b1 - psubw xmm2, xmm1 ; a1 - b1 - - paddw xmm0, XMMWORD PTR[GLOBAL(_7w)] - paddw xmm2, XMMWORD PTR[GLOBAL(_7w)] - - psraw xmm0, 4 ; op[0] = (a1 + b1 + 7)>>4 - psraw xmm2, 4 ; op[8] = (a1 - b1 + 7)>>4 - - ; output 1 and 3 - ; interleave c1, d1 - movdqa xmm1, xmm5 ; d1 - punpcklwd xmm1, xmm4 ; c1 d1 - punpckhwd xmm5, xmm4 ; c1 d1 - - movdqa xmm3, xmm1 - movdqa xmm4, xmm5 - - pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - - pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - - paddd xmm1, XMMWORD PTR[GLOBAL(_12000)] - paddd xmm4, XMMWORD PTR[GLOBAL(_12000)] - paddd xmm3, XMMWORD PTR[GLOBAL(_51000)] - paddd xmm5, XMMWORD PTR[GLOBAL(_51000)] - - psrad xmm1, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16 - psrad xmm4, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16 - psrad xmm3, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16 - psrad xmm5, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16 - - packssdw xmm1, xmm4 ; op[4] - packssdw xmm3, xmm5 ; op[12] - - paddw xmm1, xmm6 ; op[4] += (d1!=0) - - movdqa xmm4, xmm0 - movdqa xmm5, xmm2 - - punpcklqdq xmm0, xmm1 - punpckhqdq xmm4, xmm1 - - punpcklqdq xmm2, xmm3 - punpckhqdq xmm5, xmm3 - - movdqa XMMWORD PTR[output + 0 ], xmm0 - movdqa XMMWORD PTR[output + 16], xmm2 - movdqa XMMWORD PTR[output + 32], xmm4 - movdqa XMMWORD PTR[output + 48], xmm5 - - STACK_FRAME_DESTROY - -SECTION_RODATA -align 16 -_5352_2217: - dw 5352 - dw 2217 - dw 5352 - dw 2217 - dw 5352 - dw 2217 - dw 5352 - dw 2217 -align 16 -_2217_neg5352: - dw 2217 - dw -5352 - dw 2217 - dw -5352 - dw 2217 - dw -5352 - dw 2217 - dw -5352 -align 16 -_mult_add: - times 8 dw 1 -align 16 -_cmp_mask: - times 4 dw 1 - times 4 dw 0 -align 16 -_cmp_mask8x4: - times 8 dw 1 -align 16 -_mult_sub: - dw 1 - dw -1 - dw 1 - dw -1 - dw 1 - dw -1 - dw 1 - dw -1 -align 16 -_7: - times 4 dd 7 -align 16 -_7w: - times 8 dw 7 -align 16 -_14500: - times 4 dd 14500 -align 16 -_7500: - times 4 dd 7500 -align 16 -_12000: - times 4 dd 12000 -align 16 -_51000: - times 4 dd 51000 diff --git a/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c b/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c index 358d979..49cb837 100644 --- a/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c +++ b/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c @@ -11,6 +11,111 @@ #include // SSE2 #include "vp9/common/vp9_idct.h" // for cospi constants +void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) { + // The 2D transform is done with two passes which are actually pretty + // similar. In the first one, we transform the columns and transpose + // the results. In the second one, we transform the rows. To achieve that, + // as the first pass results are transposed, we tranpose the columns (that + // is the transposed rows) and transpose the results (so that it goes back + // in normal/row positions). + const int stride = pitch >> 1; + int pass; + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); + const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); + const __m128i kOne = _mm_set1_epi16(1); + __m128i in0, in1, in2, in3; + // Load inputs. + { + in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); + in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); + // x = x << 4 + in0 = _mm_slli_epi16(in0, 4); + in1 = _mm_slli_epi16(in1, 4); + in2 = _mm_slli_epi16(in2, 4); + in3 = _mm_slli_epi16(in3, 4); + // if (i == 0 && input[0]) input[0] += 1; + { + // The mask will only contain wether the first value is zero, all + // other comparison will fail as something shifted by 4 (above << 4) + // can never be equal to one. To increment in the non-zero case, we + // add the mask and one for the first element: + // - if zero, mask = -1, v = v - 1 + 1 = v + // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 + __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); + in0 = _mm_add_epi16(in0, mask); + in0 = _mm_add_epi16(in0, k__nonzero_bias_b); + } + } + // Do the two transform/transpose passes + for (pass = 0; pass < 2; ++pass) { + // Transform 1/2: Add/substract + const __m128i r0 = _mm_add_epi16(in0, in3); + const __m128i r1 = _mm_add_epi16(in1, in2); + const __m128i r2 = _mm_sub_epi16(in1, in2); + const __m128i r3 = _mm_sub_epi16(in0, in3); + // Transform 1/2: Interleave to do the multiply by constants which gets us + // into 32 bits. + const __m128i t0 = _mm_unpacklo_epi16(r0, r1); + const __m128i t2 = _mm_unpacklo_epi16(r2, r3); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + // Combine and transpose + const __m128i res0 = _mm_packs_epi32(w0, w2); + const __m128i res1 = _mm_packs_epi32(w4, w6); + // 00 01 02 03 20 21 22 23 + // 10 11 12 13 30 31 32 33 + const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); + const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + in2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + // 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1 + // 02 12 22 32 03 13 23 33 in2 contains 2 followed by 3 + if (0 == pass) { + // Extract values in the high part for second pass as transform code + // only uses the first four values. + in1 = _mm_unpackhi_epi64(in0, in0); + in3 = _mm_unpackhi_epi64(in2, in2); + } else { + // Post-condition output and store it (v + 1) >> 2, taking advantage + // of the fact 1/3 are stored just after 0/2. + __m128i out01 = _mm_add_epi16(in0, kOne); + __m128i out23 = _mm_add_epi16(in2, kOne); + out01 = _mm_srai_epi16(out01, 2); + out23 = _mm_srai_epi16(out23, 2); + _mm_storeu_si128((__m128i *)(output + 0 * 4), out01); + _mm_storeu_si128((__m128i *)(output + 2 * 4), out23); + } + } +} + +void vp9_short_fdct8x4_sse2(int16_t *input, int16_t *output, int pitch) { + vp9_short_fdct4x4_sse2(input, output, pitch); + vp9_short_fdct4x4_sse2(input + 4, output + 16, pitch); +} + void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) { const int stride = pitch >> 1; int pass; diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 43dba13..13785f7 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -90,7 +90,6 @@ VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.h VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_subtract_mmx.asm -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm -- 2.7.4