From 8f9d94ec17eea893ce35188416a9492317119d77 Mon Sep 17 00:00:00 2001
From: levytamar82
Date: Fri, 5 Dec 2014 11:14:33 -0700
Subject: [PATCH] SSSE3 Optimization for Atom processors using new instruction
 selection and ordering

The function vp9_filter_block1d16_h8_ssse3 uses the PSHUFB instruction,
which has a 3 cycle latency on Atom processors and slows execution when
issued in blocks of 5 or more. Replacing the PSHUFB instructions with
more efficient single cycle instructions (PUNPCKLBW + PUNPCKHBW +
PALIGNR) improves performance.

In the original code, PSHUFB duplicates each source byte into
consecutive positions. The same result is produced more efficiently by
PUNPCKLBW and PUNPCKHBW, with PALIGNR concatenating the two
intermediate results and shifting right to produce each consecutive
16 bytes of the final result.

For example:
filter = 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8
Reg    = 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15

REG1 = PUNPCKLBW Reg, Reg = 0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7
REG2 = PUNPCKHBW Reg, Reg = 8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15
PALIGNR REG2, REG1, 1     = 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8

This optimization improved the function performance by 23% and produced
a 3% user level gain on 1080p content on Atom processors. As expected,
no performance impact was observed on Core processors.

Change-Id: I3cec701158993d95ed23ff04516942b5a4a461c0
---
 vp9/common/x86/vp9_subpixel_8t_ssse3.asm | 48 +++++++++++++++++++------------
 1 file changed, 29 insertions(+), 19 deletions(-)

diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
index fd781d4..a5b9a79 100644
--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -765,40 +765,50 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
     movq        xmm0,   [rsi - 3]           ;load src data
     movq        xmm4,   [rsi + 5]
-    movq        xmm7,   [rsi + 13]
+    movq        xmm6,   [rsi + 13]
     punpcklqdq  xmm0,   xmm4
-    punpcklqdq  xmm4,   xmm7
+    punpcklqdq  xmm4,   xmm6
+
+    movdqa      xmm7,   xmm0
+    punpcklbw   xmm7,   xmm7
+    punpckhbw   xmm0,   xmm0
     movdqa      xmm1,   xmm0
     movdqa      xmm2,   xmm0
     movdqa      xmm3,   xmm0
-    movdqa      xmm5,   xmm4
-    movdqa      xmm6,   xmm4
-    movdqa      xmm7,   xmm4
-
-    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
-    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
-    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
-    pshufb      xmm3,   [GLOBAL(shuf_t6t7)]
-    pshufb      xmm4,   [GLOBAL(shuf_t0t1)]
-    pshufb      xmm5,   [GLOBAL(shuf_t2t3)]
-    pshufb      xmm6,   [GLOBAL(shuf_t4t5)]
-    pshufb      xmm7,   [GLOBAL(shuf_t6t7)]
+    palignr     xmm0,   xmm7, 1
+    palignr     xmm1,   xmm7, 5
     pmaddubsw   xmm0,   k0k1
+    palignr     xmm2,   xmm7, 9
     pmaddubsw   xmm1,   k2k3
+    palignr     xmm3,   xmm7, 13
+
     pmaddubsw   xmm2,   k4k5
     pmaddubsw   xmm3,   k6k7
-    pmaddubsw   xmm4,   k0k1
-    pmaddubsw   xmm5,   k2k3
-    pmaddubsw   xmm6,   k4k5
-    pmaddubsw   xmm7,   k6k7
-
     paddsw      xmm0,   xmm3
+
+    movdqa      xmm3,   xmm4
+    punpcklbw   xmm3,   xmm3
+    punpckhbw   xmm4,   xmm4
+
+    movdqa      xmm5,   xmm4
+    movdqa      xmm6,   xmm4
+    movdqa      xmm7,   xmm4
+
+    palignr     xmm4,   xmm3, 1
+    palignr     xmm5,   xmm3, 5
+    palignr     xmm6,   xmm3, 9
+    palignr     xmm7,   xmm3, 13
+
     movdqa      xmm3,   xmm1
+    pmaddubsw   xmm4,   k0k1
     pmaxsw      xmm1,   xmm2
+    pmaddubsw   xmm5,   k2k3
     pminsw      xmm2,   xmm3
+    pmaddubsw   xmm6,   k4k5
     paddsw      xmm0,   xmm2
+    pmaddubsw   xmm7,   k6k7
     paddsw      xmm0,   xmm1

     paddsw      xmm4,   xmm7
--
2.7.4
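
For reference, below is a minimal, standalone C sketch (not part of the
patch) that reproduces the byte pattern from the commit-message example
with SSSE3 intrinsics; the intrinsics map one-to-one onto PSHUFB,
PUNPCKLBW/PUNPCKHBW and PALIGNR. The mask value mirrors the
0,1,1,2,...,7,8 filter pattern shown above; the exact contents of the
shuf_t0t1 table in the assembly are assumed, not quoted. Compile with,
for example, gcc -mssse3.

/*
 * Illustrative sketch only: verify that PUNPCKLBW + PUNPCKHBW + PALIGNR
 * produces the same result as PSHUFB with the 0,1,1,2,...,7,8 mask
 * described in the commit message.
 */
#include <stdio.h>
#include <string.h>
#include <tmmintrin.h>  /* SSSE3: _mm_shuffle_epi8, _mm_alignr_epi8 */

int main(void) {
    /* Source register: bytes 0..15, as in the example above. */
    __m128i reg = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                                8, 9, 10, 11, 12, 13, 14, 15);

    /* Reference: PSHUFB with the assumed shuf_t0t1-style mask. */
    __m128i mask = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4,
                                 4, 5, 5, 6, 6, 7, 7, 8);
    __m128i ref = _mm_shuffle_epi8(reg, mask);

    /* Replacement: PUNPCKLBW + PUNPCKHBW, then PALIGNR by 1 byte. */
    __m128i lo  = _mm_unpacklo_epi8(reg, reg);  /* 0,0,1,1,...,7,7    */
    __m128i hi  = _mm_unpackhi_epi8(reg, reg);  /* 8,8,9,9,...,15,15  */
    __m128i res = _mm_alignr_epi8(hi, lo, 1);   /* 0,1,1,2,...,7,7,8  */

    printf("match: %s\n",
           memcmp(&ref, &res, sizeof(ref)) == 0 ? "yes" : "no");
    return 0;
}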