Updated vp8_build_intra_predictors_mbuv_s(sse2/ssse3)
author: Scott LaVarnway <slavarnway@google.com>
Tue, 20 Mar 2012 20:32:54 +0000 (16:32 -0400)
committer: Scott LaVarnway <slavarnway@google.com>
Mon, 26 Mar 2012 17:40:14 +0000 (13:40 -0400)
to work with the latest code.

Change-Id: Ie382bb55d00ea5929bdadba859eea15f696d4cd9

vp8/common/rtcd_defs.sh
vp8/common/x86/recon_sse2.asm
vp8/common/x86/recon_wrapper_sse2.c

index fee8965..c069a21 100644 (file)
@@ -126,7 +126,7 @@ prototype void vp8_build_intra_predictors_mby_s "struct macroblockd *x, unsigned
 #TODO: fix assembly --- specialize vp8_build_intra_predictors_mby_s sse2 ssse3 neon
 
 prototype void vp8_build_intra_predictors_mbuv_s "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride"
-#TODO: fix assembly --- specialize vp8_build_intra_predictors_mbuv_s sse2 ssse3
+specialize vp8_build_intra_predictors_mbuv_s sse2 ssse3
 
 prototype void vp8_intra4x4_predict "unsigned char *src, int src_stride, int b_mode, unsigned char *dst, int dst_stride"
 specialize vp8_intra4x4_predict media
index 4b68ef5..d371ebd 100644 (file)
@@ -119,35 +119,39 @@ sym(vp8_copy_mem16x16_sse2):
 ;void vp8_intra_pred_uv_dc_mmx2(
 ;    unsigned char *dst,
 ;    int dst_stride
-;    unsigned char *src,
-;    int src_stride,
+;    unsigned char *above,
+;    unsigned char *left,
+;    int left_stride,
 ;    )
 global sym(vp8_intra_pred_uv_dc_mmx2)
 sym(vp8_intra_pred_uv_dc_mmx2):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 5
     push        rsi
     push        rdi
     ; end prolog
 
     ; from top
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
-    sub         rsi,        rax
+    mov         rsi,        arg(2) ;above;
     pxor        mm0,        mm0
     movq        mm1,        [rsi]
     psadbw      mm1,        mm0
 
     ; from left
-    dec         rsi
+    mov         rsi,        arg(3) ;left;
+    movsxd      rax,        dword ptr arg(4) ;left_stride;
     lea         rdi,        [rax*3]
-    movzx       ecx,        byte [rsi+rax]
+    movzx       ecx,        byte [rsi]
+    movzx       edx,        byte [rsi+rax*1]
+    add         ecx,        edx
     movzx       edx,        byte [rsi+rax*2]
     add         ecx,        edx
+
+
     movzx       edx,        byte [rsi+rdi]
-    add         ecx,        edx
     lea         rsi,        [rsi+rax*4]
+    add         ecx,        edx
     movzx       edx,        byte [rsi]
     add         ecx,        edx
     movzx       edx,        byte [rsi+rax]
@@ -156,8 +160,6 @@ sym(vp8_intra_pred_uv_dc_mmx2):
     add         ecx,        edx
     movzx       edx,        byte [rsi+rdi]
     add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*4]
-    add         ecx,        edx
 
     ; add up
     pextrw      edx,        mm1, 0x0
@@ -192,23 +194,24 @@ sym(vp8_intra_pred_uv_dc_mmx2):
 ;void vp8_intra_pred_uv_dctop_mmx2(
 ;    unsigned char *dst,
 ;    int dst_stride
-;    unsigned char *src,
-;    int src_stride,
+;    unsigned char *above,
+;    unsigned char *left,
+;    int left_stride,
 ;    )
 global sym(vp8_intra_pred_uv_dctop_mmx2)
 sym(vp8_intra_pred_uv_dctop_mmx2):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 5
     GET_GOT     rbx
     push        rsi
     push        rdi
     ; end prolog
 
+    ;arg(3), arg(4) not used
+
     ; from top
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
-    sub         rsi,        rax
+    mov         rsi,        arg(2) ;above;
     pxor        mm0,        mm0
     movq        mm1,        [rsi]
     psadbw      mm1,        mm0
@@ -245,22 +248,24 @@ sym(vp8_intra_pred_uv_dctop_mmx2):
 ;void vp8_intra_pred_uv_dcleft_mmx2(
 ;    unsigned char *dst,
 ;    int dst_stride
-;    unsigned char *src,
-;    int src_stride,
+;    unsigned char *above,
+;    unsigned char *left,
+;    int left_stride,
 ;    )
 global sym(vp8_intra_pred_uv_dcleft_mmx2)
 sym(vp8_intra_pred_uv_dcleft_mmx2):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 5
     push        rsi
     push        rdi
     ; end prolog
 
+    ;arg(2) not used
+
     ; from left
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
-    dec         rsi
+    mov         rsi,        arg(3) ;left;
+    movsxd      rax,        dword ptr arg(4) ;left_stride;
     lea         rdi,        [rax*3]
     movzx       ecx,        byte [rsi]
     movzx       edx,        byte [rsi+rax]
@@ -310,17 +315,20 @@ sym(vp8_intra_pred_uv_dcleft_mmx2):
 ;void vp8_intra_pred_uv_dc128_mmx(
 ;    unsigned char *dst,
 ;    int dst_stride
-;    unsigned char *src,
-;    int src_stride,
+;    unsigned char *above,
+;    unsigned char *left,
+;    int left_stride,
 ;    )
 global sym(vp8_intra_pred_uv_dc128_mmx)
 sym(vp8_intra_pred_uv_dc128_mmx):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 5
     GET_GOT     rbx
     ; end prolog
 
+    ;arg(2), arg(3), arg(4) not used
+
     ; write out
     movq        mm1,        [GLOBAL(dc_128)]
     mov         rax,        arg(0) ;dst;
@@ -346,15 +354,16 @@ sym(vp8_intra_pred_uv_dc128_mmx):
 ;void vp8_intra_pred_uv_tm_sse2(
 ;    unsigned char *dst,
 ;    int dst_stride
-;    unsigned char *src,
-;    int src_stride,
+;    unsigned char *above,
+;    unsigned char *left,
+;    int left_stride,
 ;    )
 %macro vp8_intra_pred_uv_tm 1
 global sym(vp8_intra_pred_uv_tm_%1)
 sym(vp8_intra_pred_uv_tm_%1):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 5
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -362,9 +371,8 @@ sym(vp8_intra_pred_uv_tm_%1):
 
     ; read top row
     mov         edx,        4
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
-    sub         rsi,        rax
+    mov         rsi,        arg(2) ;above
+    movsxd      rax,        dword ptr arg(4) ;left_stride;
     pxor        xmm0,       xmm0
 %ifidn %1, ssse3
     movdqa      xmm2,       [GLOBAL(dc_1024)]
@@ -374,7 +382,7 @@ sym(vp8_intra_pred_uv_tm_%1):
 
     ; set up left ptrs ans subtract topleft
     movd        xmm3,       [rsi-1]
-    lea         rsi,        [rsi+rax-1]
+    mov         rsi,        arg(3) ;left;
 %ifidn %1, sse2
     punpcklbw   xmm3,       xmm0
     pshuflw     xmm3,       xmm3, 0x0
@@ -427,20 +435,22 @@ vp8_intra_pred_uv_tm ssse3
 ;void vp8_intra_pred_uv_ve_mmx(
 ;    unsigned char *dst,
 ;    int dst_stride
-;    unsigned char *src,
-;    int src_stride,
+;    unsigned char *above,
+;    unsigned char *left,
+;    int left_stride,
 ;    )
 global sym(vp8_intra_pred_uv_ve_mmx)
 sym(vp8_intra_pred_uv_ve_mmx):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 5
     ; end prolog
 
+    ; arg(3), arg(4) not used
+
     ; read from top
     mov         rax,        arg(2) ;src;
-    movsxd      rdx,        dword ptr arg(3) ;src_stride;
-    sub         rax,        rdx
+
     movq        mm1,        [rax]
 
     ; write out
@@ -466,15 +476,16 @@ sym(vp8_intra_pred_uv_ve_mmx):
 ;void vp8_intra_pred_uv_ho_mmx2(
 ;    unsigned char *dst,
 ;    int dst_stride
-;    unsigned char *src,
-;    int src_stride,
+;    unsigned char *above,
+;    unsigned char *left,
+;    int left_stride,
 ;    )
 %macro vp8_intra_pred_uv_ho 1
 global sym(vp8_intra_pred_uv_ho_%1)
 sym(vp8_intra_pred_uv_ho_%1):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 5
     push        rsi
     push        rdi
 %ifidn %1, ssse3
@@ -485,12 +496,14 @@ sym(vp8_intra_pred_uv_ho_%1):
 %endif
     ; end prolog
 
+    ;arg(2) not used
+
     ; read from left and write out
 %ifidn %1, mmx2
     mov         edx,        4
 %endif
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    mov         rsi,        arg(3) ;left
+    movsxd      rax,        dword ptr arg(4) ;left_stride;
     mov         rdi,        arg(0) ;dst;
     movsxd      rcx,        dword ptr arg(1) ;dst_stride
 %ifidn %1, ssse3
@@ -498,7 +511,7 @@ sym(vp8_intra_pred_uv_ho_%1):
     movdqa      xmm2,       [GLOBAL(dc_00001111)]
     lea         rbx,        [rax*3]
 %endif
-    dec         rsi
+
 %ifidn %1, mmx2
 .vp8_intra_pred_uv_ho_%1_loop:
     movd        mm0,        [rsi]
index cb9ab80..949b2fb 100644 (file)
@@ -15,7 +15,8 @@
 
 #define build_intra_predictors_mbuv_prototype(sym) \
     void sym(unsigned char *dst, int dst_stride, \
-             const unsigned char *src, int src_stride)
+             const unsigned char *above, \
+             const unsigned char *left, int left_stride)
 typedef build_intra_predictors_mbuv_prototype((*build_intra_predictors_mbuv_fn_t));
 
 extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc_mmx2);
@@ -29,15 +30,19 @@ extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_sse2);
 extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3);
 
 static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
+                                                unsigned char * uabove_row,
+                                                unsigned char * vabove_row,
                                                 unsigned char *dst_u,
                                                 unsigned char *dst_v,
                                                 int dst_stride,
+                                                unsigned char * uleft,
+                                                unsigned char * vleft,
+                                                int left_stride,
                                                 build_intra_predictors_mbuv_fn_t tm_func,
                                                 build_intra_predictors_mbuv_fn_t ho_func)
 {
     int mode = x->mode_info_context->mbmi.uv_mode;
     build_intra_predictors_mbuv_fn_t fn;
-    int src_stride = x->dst.uv_stride;
 
     switch (mode) {
         case  V_PRED: fn = vp8_intra_pred_uv_ve_mmx; break;
@@ -59,38 +64,48 @@ static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
         default: return;
     }
 
-    fn(dst_u, dst_stride, x->dst.u_buffer, src_stride);
-    fn(dst_v, dst_stride, x->dst.v_buffer, src_stride);
+    fn(dst_u, dst_stride, uabove_row, uleft, left_stride);
+    fn(dst_v, dst_stride, vabove_row, vleft, left_stride);
 }
 
-void vp8_build_intra_predictors_mbuv_sse2(MACROBLOCKD *x)
+void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x,
+                                            unsigned char * uabove_row,
+                                            unsigned char * vabove_row,
+                                            unsigned char * uleft,
+                                            unsigned char * vleft,
+                                            int left_stride,
+                                            unsigned char * upred_ptr,
+                                            unsigned char * vpred_ptr,
+                                            int pred_stride)
 {
-    vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
-                                        &x->predictor[320], 8,
+    vp8_build_intra_predictors_mbuv_x86(x,
+                                        uabove_row, vabove_row,
+                                        upred_ptr,
+                                        vpred_ptr, pred_stride,
+                                        uleft,
+                                        vleft,
+                                        left_stride,
                                         vp8_intra_pred_uv_tm_sse2,
                                         vp8_intra_pred_uv_ho_mmx2);
 }
 
-void vp8_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *x)
+void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x,
+                                             unsigned char * uabove_row,
+                                             unsigned char * vabove_row,
+                                             unsigned char * uleft,
+                                             unsigned char * vleft,
+                                             int left_stride,
+                                             unsigned char * upred_ptr,
+                                             unsigned char * vpred_ptr,
+                                             int pred_stride)
 {
-    vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
-                                        &x->predictor[320], 8,
-                                        vp8_intra_pred_uv_tm_ssse3,
-                                        vp8_intra_pred_uv_ho_ssse3);
-}
-
-void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x)
-{
-    vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
-                                        x->dst.v_buffer, x->dst.uv_stride,
-                                        vp8_intra_pred_uv_tm_sse2,
-                                        vp8_intra_pred_uv_ho_mmx2);
-}
-
-void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x)
-{
-    vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
-                                        x->dst.v_buffer, x->dst.uv_stride,
+    vp8_build_intra_predictors_mbuv_x86(x,
+                                        uabove_row, vabove_row,
+                                        upred_ptr,
+                                        vpred_ptr, pred_stride,
+                                        uleft,
+                                        vleft,
+                                        left_stride,
                                         vp8_intra_pred_uv_tm_ssse3,
                                         vp8_intra_pred_uv_ho_ssse3);
 }
@@ -132,22 +147,10 @@ static void vp8_build_intra_predictors_mby_x86(MACROBLOCKD *x,
         default: return;
     }
 
-    fn(dst_y, dst_stride, x->dst.y_buffer, src_stride);
+//    fn(dst_y, dst_stride, x->dst.y_buffer, src_stride);
     return;
 }
 
-void vp8_build_intra_predictors_mby_sse2(MACROBLOCKD *x)
-{
-    vp8_build_intra_predictors_mby_x86(x, x->predictor, 16,
-                                       vp8_intra_pred_y_tm_sse2);
-}
-
-void vp8_build_intra_predictors_mby_ssse3(MACROBLOCKD *x)
-{
-    vp8_build_intra_predictors_mby_x86(x, x->predictor, 16,
-                                       vp8_intra_pred_y_tm_ssse3);
-}
-
 void vp8_build_intra_predictors_mby_s_sse2(MACROBLOCKD *x)
 {
     vp8_build_intra_predictors_mby_x86(x, x->dst.y_buffer, x->dst.y_stride,