From 8f9d94ec17eea893ce35188416a9492317119d77 Mon Sep 17 00:00:00 2001
From: levytamar82
Date: Fri, 5 Dec 2014 11:14:33 -0700
Subject: [PATCH] SSSE3 Optimization for Atom processors using new instruction
 selection and ordering

The function vp9_filter_block1d16_h8_ssse3 uses the PSHUFB instruction,
which has a 3 cycle latency on Atom processors and slows execution when
issued in blocks of 5 or more. Replacing the PSHUFB instructions with
more efficient single cycle instructions (PUNPCKLBW + PUNPCKHBW +
PALIGNR) improves performance.

In the original code, PSHUFB duplicates each source byte into
consecutive positions. The same result is produced more efficiently by
PUNPCKLBW and PUNPCKHBW, with PALIGNR concatenating the two
intermediate results and shifting right to produce each consecutive
16 bytes of the final result.

For example:
filter = 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8
Reg    = 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15

REG1 = PUNPCKLBW Reg, Reg = 0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7
REG2 = PUNPCKHBW Reg, Reg = 8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15
PALIGNR REG2, REG1, 1     = 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8

This optimization improved the function performance by 23% and produced
a 3% user level gain on 1080p content on Atom processors. As expected,
no performance impact was observed on Core processors.

Change-Id: I3cec701158993d95ed23ff04516942b5a4a461c0
---
 vp9/common/x86/vp9_subpixel_8t_ssse3.asm | 48 +++++++++++++++++++------------
 1 file changed, 29 insertions(+), 19 deletions(-)

diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
index fd781d4..a5b9a79 100644
--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -765,40 +765,50 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
     movq        xmm0,   [rsi - 3]           ;load src data
     movq        xmm4,   [rsi + 5]
-    movq        xmm7,   [rsi + 13]
+    movq        xmm6,   [rsi + 13]
     punpcklqdq  xmm0,   xmm4
-    punpcklqdq  xmm4,   xmm7
+    punpcklqdq  xmm4,   xmm6
+
+    movdqa      xmm7,   xmm0
+    punpcklbw   xmm7,   xmm7
+    punpckhbw   xmm0,   xmm0
     movdqa      xmm1,   xmm0
     movdqa      xmm2,   xmm0
     movdqa      xmm3,   xmm0
-    movdqa      xmm5,   xmm4
-    movdqa      xmm6,   xmm4
-    movdqa      xmm7,   xmm4
-
-    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
-    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
-    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
-    pshufb      xmm3,   [GLOBAL(shuf_t6t7)]
-    pshufb      xmm4,   [GLOBAL(shuf_t0t1)]
-    pshufb      xmm5,   [GLOBAL(shuf_t2t3)]
-    pshufb      xmm6,   [GLOBAL(shuf_t4t5)]
-    pshufb      xmm7,   [GLOBAL(shuf_t6t7)]
+    palignr     xmm0,   xmm7, 1
+    palignr     xmm1,   xmm7, 5
     pmaddubsw   xmm0,   k0k1
+    palignr     xmm2,   xmm7, 9
     pmaddubsw   xmm1,   k2k3
+    palignr     xmm3,   xmm7, 13
+
     pmaddubsw   xmm2,   k4k5
     pmaddubsw   xmm3,   k6k7
-    pmaddubsw   xmm4,   k0k1
-    pmaddubsw   xmm5,   k2k3
-    pmaddubsw   xmm6,   k4k5
-    pmaddubsw   xmm7,   k6k7
-
     paddsw      xmm0,   xmm3
+
+    movdqa      xmm3,   xmm4
+    punpcklbw   xmm3,   xmm3
+    punpckhbw   xmm4,   xmm4
+
+    movdqa      xmm5,   xmm4
+    movdqa      xmm6,   xmm4
+    movdqa      xmm7,   xmm4
+
+    palignr     xmm4,   xmm3, 1
+    palignr     xmm5,   xmm3, 5
+    palignr     xmm6,   xmm3, 9
+    palignr     xmm7,   xmm3, 13
+
     movdqa      xmm3,   xmm1
+    pmaddubsw   xmm4,   k0k1
     pmaxsw      xmm1,   xmm2
+    pmaddubsw   xmm5,   k2k3
     pminsw      xmm2,   xmm3
+    pmaddubsw   xmm6,   k4k5
     paddsw      xmm0,   xmm2
+    pmaddubsw   xmm7,   k6k7
     paddsw      xmm0,   xmm1

     paddsw      xmm4,   xmm7
--
2.7.4
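
For reference, below is a minimal, standalone C sketch (not part of the
patch) that reproduces the byte pattern from the commit-message example
with SSSE3 intrinsics; the intrinsics map one-to-one onto PSHUFB,
PUNPCKLBW/PUNPCKHBW and PALIGNR. The mask value mirrors the
0,1,1,2,...,7,8 filter pattern shown above; the exact contents of the
shuf_t0t1 table in the assembly are assumed, not quoted. Compile with,
for example, gcc -mssse3.

/*
 * Illustrative sketch only: verify that PUNPCKLBW + PUNPCKHBW + PALIGNR
 * produces the same result as PSHUFB with the 0,1,1,2,...,7,8 mask
 * described in the commit message.
 */
#include <stdio.h>
#include <string.h>
#include <tmmintrin.h>  /* SSSE3: _mm_shuffle_epi8, _mm_alignr_epi8 */

int main(void) {
    /* Source register: bytes 0..15, as in the example above. */
    __m128i reg = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                                8, 9, 10, 11, 12, 13, 14, 15);

    /* Reference: PSHUFB with the assumed shuf_t0t1-style mask. */
    __m128i mask = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4,
                                 4, 5, 5, 6, 6, 7, 7, 8);
    __m128i ref = _mm_shuffle_epi8(reg, mask);

    /* Replacement: PUNPCKLBW + PUNPCKHBW, then PALIGNR by 1 byte. */
    __m128i lo  = _mm_unpacklo_epi8(reg, reg);  /* 0,0,1,1,...,7,7    */
    __m128i hi  = _mm_unpackhi_epi8(reg, reg);  /* 8,8,9,9,...,15,15  */
    __m128i res = _mm_alignr_epi8(hi, lo, 1);   /* 0,1,1,2,...,7,7,8  */

    printf("match: %s\n",
           memcmp(&ref, &res, sizeof(ref)) == 0 ? "yes" : "no");
    return 0;
}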