Make horizontal UV prediction ~2x faster (73 vs. 132 cycles) using SSSE3.
author: Ronald S. Bultje <rbultje@google.com>
Fri, 29 Apr 2011 18:51:37 +0000 (11:51 -0700)
committer: Ronald S. Bultje <rbultje@google.com>
Fri, 29 Apr 2011 18:52:09 +0000 (11:52 -0700)
Change-Id: I658a1df7d825f820573cb2d11ad402f9d2791035

vp8/common/x86/recon_sse2.asm
vp8/common/x86/recon_wrapper_sse2.c

index 97dc4f6..aaa6a8f 100644 (file)
@@ -578,23 +578,35 @@ sym(vp8_intra_pred_uv_ve_mmx):
 ;    unsigned char *src,
 ;    int src_stride,
 ;    )
-global sym(vp8_intra_pred_uv_ho_mmx2)
-sym(vp8_intra_pred_uv_ho_mmx2):
+%macro vp8_intra_pred_uv_ho 1
+global sym(vp8_intra_pred_uv_ho_%1)
+sym(vp8_intra_pred_uv_ho_%1):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 4
     push        rsi
     push        rdi
+%ifidn %1, ssse3
+    push        rbx
+%endif
     ; end prolog
 
     ; read from left and write out
+%ifidn %1, mmx2
     mov         edx,        4
+%endif
     mov         rsi,        arg(2) ;src;
     movsxd      rax,        dword ptr arg(3) ;src_stride;
     mov         rdi,        arg(0) ;dst;
     movsxd      rcx,        dword ptr arg(1) ;dst_stride
+%ifidn %1, ssse3
+    lea         rbx,        [rax*3]
+    lea         rdx,        [rcx*3]
+    movdqa      xmm2,       [GLOBAL(dc_00001111)]
+%endif
     dec         rsi
-vp8_intra_pred_uv_ho_mmx2_loop:
+%ifidn %1, mmx2
+vp8_intra_pred_uv_ho_%1_loop:
     movd        mm0,        [rsi]
     movd        mm1,        [rsi+rax]
     punpcklbw   mm0,        mm0
@@ -606,14 +618,49 @@ vp8_intra_pred_uv_ho_mmx2_loop:
     lea         rsi,        [rsi+rax*2]
     lea         rdi,        [rdi+rcx*2]
     dec         edx
-    jnz vp8_intra_pred_uv_ho_mmx2_loop
+    jnz vp8_intra_pred_uv_ho_%1_loop
+%else
+    movd        xmm0,       [rsi]
+    movd        xmm3,       [rsi+rax]
+    movd        xmm1,       [rsi+rax*2]
+    movd        xmm4,       [rsi+rbx]
+    punpcklbw   xmm0,       xmm3
+    punpcklbw   xmm1,       xmm4
+    pshufb      xmm0,       xmm2
+    pshufb      xmm1,       xmm2
+    movq   [rdi    ],       xmm0
+    movhps [rdi+rcx],       xmm0
+    movq [rdi+rcx*2],       xmm1
+    movhps [rdi+rdx],       xmm1
+    lea         rsi,        [rsi+rax*4]
+    lea         rdi,        [rdi+rcx*4]
+    movd        xmm0,       [rsi]
+    movd        xmm3,       [rsi+rax]
+    movd        xmm1,       [rsi+rax*2]
+    movd        xmm4,       [rsi+rbx]
+    punpcklbw   xmm0,       xmm3
+    punpcklbw   xmm1,       xmm4
+    pshufb      xmm0,       xmm2
+    pshufb      xmm1,       xmm2
+    movq   [rdi    ],       xmm0
+    movhps [rdi+rcx],       xmm0
+    movq [rdi+rcx*2],       xmm1
+    movhps [rdi+rdx],       xmm1
+%endif
 
     ; begin epilog
+%ifidn %1, ssse3
+    pop         rbx
+%endif
     pop         rdi
     pop         rsi
     UNSHADOW_ARGS
     pop         rbp
     ret
+%endmacro
+
+vp8_intra_pred_uv_ho mmx2
+vp8_intra_pred_uv_ho ssse3
 
 SECTION_RODATA
 dc_128:
@@ -623,3 +670,7 @@ dc_4:
 align 16
 dc_1024:
     times 8 dw 0x400
+align 16
+dc_00001111:
+    times 8 db 0
+    times 8 db 1
index 86b4da2..cb7b69c 100644 (file)
@@ -23,6 +23,7 @@ extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dctop_mmx2);
 extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dcleft_mmx2);
 extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc128_mmx);
 extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_ssse3);
 extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ve_mmx);
 extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_sse2);
 extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3);
@@ -31,7 +32,8 @@ static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
                                                 unsigned char *dst_u,
                                                 unsigned char *dst_v,
                                                 int dst_stride,
-                                                build_intra_predictors_mbuv_fn_t tm_func)
+                                                build_intra_predictors_mbuv_fn_t tm_func,
+                                                build_intra_predictors_mbuv_fn_t ho_func)
 {
     int mode = x->mode_info_context->mbmi.uv_mode;
     build_intra_predictors_mbuv_fn_t fn;
@@ -39,7 +41,7 @@ static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
 
     switch (mode) {
         case  V_PRED: fn = vp8_intra_pred_uv_ve_mmx; break;
-        case  H_PRED: fn = vp8_intra_pred_uv_ho_mmx2; break;
+        case  H_PRED: fn = ho_func; break;
         case TM_PRED: fn = tm_func; break;
         case DC_PRED:
             if (x->up_available) {
@@ -65,26 +67,30 @@ void vp8_build_intra_predictors_mbuv_sse2(MACROBLOCKD *x)
 {
     vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
                                         &x->predictor[320], 8,
-                                        vp8_intra_pred_uv_tm_sse2);
+                                        vp8_intra_pred_uv_tm_sse2,
+                                        vp8_intra_pred_uv_ho_mmx2);
 }
 
 void vp8_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *x)
 {
     vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
                                         &x->predictor[320], 8,
-                                        vp8_intra_pred_uv_tm_ssse3);
+                                        vp8_intra_pred_uv_tm_ssse3,
+                                        vp8_intra_pred_uv_ho_ssse3);
 }
 
 void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x)
 {
     vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
                                         x->dst.v_buffer, x->dst.uv_stride,
-                                        vp8_intra_pred_uv_tm_sse2);
+                                        vp8_intra_pred_uv_tm_sse2,
+                                        vp8_intra_pred_uv_ho_mmx2);
 }
 
 void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x)
 {
     vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
                                         x->dst.v_buffer, x->dst.uv_stride,
-                                        vp8_intra_pred_uv_tm_ssse3);
+                                        vp8_intra_pred_uv_tm_ssse3,
+                                        vp8_intra_pred_uv_ho_ssse3);
 }