From 5b8a7d6e2556e16bfeee2ba8101080f079051aa3 Mon Sep 17 00:00:00 2001
From: John Koleszar <jkoleszar@google.com>
Date: Fri, 19 Apr 2013 08:24:33 -0700
Subject: [PATCH] Use SSSE3 for 2d filters larger than 16

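The 2D convolve functions fell back to the C code whenever the block was
wider than 16, a fallback that only the 2D case had. Route the 2D case
through the SSSE3 horizontal and vertical passes instead, using a 64-wide
intermediate buffer.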

Change-Id: I1e2e6da9e4b28bd88bde9ba4dd32724ce466cf6f
---
 vp9/common/x86/vp9_asm_stubs.c | 91 +++++++++++-------------------------------
 1 file changed, 23 insertions(+), 68 deletions(-)

diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index 310f8ed..2b66834 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -278,43 +278,20 @@ void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
                          const int16_t *filter_x, int x_step_q4,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*71);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71);
 
+  assert(w <= 64);
   assert(h <= 64);
-
-  if (x_step_q4 == 16 && y_step_q4 == 16 &&
-      filter_x[3] != 128 && filter_y[3] != 128) {
-    if (w == 16) {
-      vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride,
-                                    fdata2, 16,
-                                    h + 7, filter_x);
-      vp9_filter_block1d16_v8_ssse3(fdata2, 16,
-                                    dst, dst_stride,
-                                    h, filter_y);
-      return;
-    }
-    if (w == 8) {
-      vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride,
-                                   fdata2, 16,
-                                   h + 7, filter_x);
-      vp9_filter_block1d8_v8_ssse3(fdata2, 16,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      return;
-    }
-    if (w == 4) {
-      vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride,
-                                   fdata2, 16,
-                                   h + 7, filter_x);
-      vp9_filter_block1d4_v8_ssse3(fdata2, 16,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      return;
-    }
+  if (x_step_q4 == 16 && y_step_q4 == 16) {
+    vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h + 7);
+    vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
+                             filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+  } else {
+    vp9_convolve8_c(src, src_stride, dst, dst_stride,
+                    filter_x, x_step_q4, filter_y, y_step_q4, w, h);
   }
-  vp9_convolve8_c(src, src_stride, dst, dst_stride,
-                  filter_x, x_step_q4, filter_y, y_step_q4,
-                  w, h);
 }
 
 void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride,
@@ -322,42 +299,20 @@ void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride,
                          const int16_t *filter_x, int x_step_q4,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*71);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71);
 
+  assert(w <= 64);
   assert(h <= 64);
-
-  if (x_step_q4 == 16 && y_step_q4 == 16 &&
-      filter_x[3] != 128 && filter_y[3] != 128) {
-    if (w == 16) {
-      vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride,
-                                    fdata2, 16,
-                                    h + 7, filter_x);
-      vp9_filter_block1d16_v8_avg_ssse3(fdata2, 16,
-                                        dst, dst_stride,
-                                        h, filter_y);
-      return;
-    }
-    if (w == 8) {
-      vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride,
-                                   fdata2, 16,
-                                   h + 7, filter_x);
-      vp9_filter_block1d8_v8_avg_ssse3(fdata2, 16,
-                                       dst, dst_stride,
-                                       h, filter_y);
-      return;
-    }
-    if (w == 4) {
-      vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride,
-                                   fdata2, 16,
-                                   h + 7, filter_x);
-      vp9_filter_block1d4_v8_avg_ssse3(fdata2, 16,
-                                       dst, dst_stride,
-                                       h, filter_y);
-      return;
-    }
+  if (x_step_q4 == 16 && y_step_q4 == 16) {
+    vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h + 7);
+    vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
+                                 filter_x, x_step_q4, filter_y, y_step_q4,
+                                 w, h);
+  } else {
+    vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
+                        filter_x, x_step_q4, filter_y, y_step_q4, w, h);
   }
-  vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h);
 }
 #endif
-- 
2.7.4