Add SSE2 subtract functions
authorYunqing Wang <yunqingwang@google.com>
Mon, 18 Oct 2010 18:15:15 +0000 (14:15 -0400)
committerYunqing Wang <yunqingwang@google.com>
Mon, 18 Oct 2010 18:15:15 +0000 (14:15 -0400)
Instead of doing 8-bit data unpack and 16-bit subtraction, use
psubb to do 16 8-bit subtractions and pcmpgtb to preserve the
sign information. This does not bring noticeable gain since
these functions are not called frequently.

Change-Id: I90a0dfaa3db9d422e4ada324076596ffb178548e

vp8/encoder/x86/encodemb_x86.h
vp8/encoder/x86/subtract_mmx.asm
vp8/encoder/x86/subtract_sse2.asm [new file with mode: 0644]
vp8/encoder/x86/x86_csystemdependent.c
vp8/vp8cx.mk

index d090b2d..69b3edd 100644 (file)
@@ -55,7 +55,9 @@ extern prototype_submbuv(vp8_subtract_mbuv_mmx);
 extern prototype_berr(vp8_block_error_xmm);
 extern prototype_mberr(vp8_mbblock_error_xmm);
 extern prototype_mbuverr(vp8_mbuverror_xmm);
-
+extern prototype_subb(vp8_subtract_b_sse2);
+extern prototype_submby(vp8_subtract_mby_sse2);
+extern prototype_submbuv(vp8_subtract_mbuv_sse2);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_encodemb_berr
@@ -67,6 +69,15 @@ extern prototype_mbuverr(vp8_mbuverror_xmm);
 #undef  vp8_encodemb_mbuverr
 #define vp8_encodemb_mbuverr vp8_mbuverror_xmm
 
+#undef  vp8_encodemb_subb
+#define vp8_encodemb_subb vp8_subtract_b_sse2
+
+#undef  vp8_encodemb_submby
+#define vp8_encodemb_submby vp8_subtract_mby_sse2
+
+#undef  vp8_encodemb_submbuv
+#define vp8_encodemb_submbuv vp8_subtract_mbuv_sse2
+
 #endif
 #endif
 
index 8fe3ee1..a47e1f0 100644 (file)
@@ -12,7 +12,7 @@
 %include "vpx_ports/x86_abi_support.asm"
 
 ;void vp8_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
-;                            unsigned short *diff, unsigned char *Predictor,
+;                            short *diff, unsigned char *Predictor,
 ;                            int pitch);
 global sym(vp8_subtract_b_mmx_impl)
 sym(vp8_subtract_b_mmx_impl):
diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm
new file mode 100644 (file)
index 0000000..ef329de
--- /dev/null
@@ -0,0 +1,348 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
+;                            short *diff, unsigned char *Predictor,
+;                            int pitch);
+;
+; Writes the 4x4 block of 16-bit differences z - Predictor.
+;   z         : source pixels, rows src_stride bytes apart
+;   Predictor : predicted pixels, rows pitch bytes apart
+;   diff      : output, rows pitch shorts (pitch*2 bytes) apart
+; Only XMM registers xmm0-xmm2 are used, so the MMX/x87 register state is
+; left untouched and no emms is needed on account of this routine.
+global sym(vp8_subtract_b_sse2_impl)
+sym(vp8_subtract_b_sse2_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov     rdi,        arg(2) ;diff
+        mov     rax,        arg(3) ;Predictor
+        mov     rsi,        arg(0) ;z
+        movsxd  rdx,        dword ptr arg(1);src_stride;
+        movsxd  rcx,        dword ptr arg(4);pitch
+        pxor    xmm2,       xmm2            ; zero, used to widen bytes to words
+
+        ; row 0
+        movd    xmm0,       [rsi]           ; 4 source bytes
+        movd    xmm1,       [rax]           ; 4 predictor bytes
+        punpcklbw   xmm0,   xmm2            ; zero-extend to 16 bits
+        punpcklbw   xmm1,   xmm2
+        psubw   xmm0,       xmm1
+        movq    [rdi],      xmm0            ; store 4 shorts
+
+        ; row 1
+        movd    xmm0,       [rsi+rdx]
+        movd    xmm1,       [rax+rcx]
+        punpcklbw   xmm0,   xmm2
+        punpcklbw   xmm1,   xmm2
+        psubw   xmm0,       xmm1
+        movq    [rdi+rcx*2],xmm0            ; diff row stride is pitch shorts
+
+        ; row 2
+        movd    xmm0,       [rsi+rdx*2]
+        movd    xmm1,       [rax+rcx*2]
+        punpcklbw   xmm0,   xmm2
+        punpcklbw   xmm1,   xmm2
+        psubw   xmm0,       xmm1
+        movq    [rdi+rcx*4],        xmm0
+
+        lea     rsi,        [rsi+rdx*2]     ; z += 2*src_stride
+        lea     rcx,        [rcx+rcx*2]     ; rcx = 3*pitch
+
+        ; row 3
+        movd    xmm0,       [rsi+rdx]       ; z + 3*src_stride
+        movd    xmm1,       [rax+rcx]       ; Predictor + 3*pitch
+        punpcklbw   xmm0,   xmm2
+        punpcklbw   xmm1,   xmm2
+        psubw   xmm0,       xmm1
+        movq    [rdi+rcx*2],        xmm0    ; diff + 3*pitch shorts
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
+;
+; Writes the 16x16 block of 16-bit differences src - pred.
+;   src  : rows are stride bytes apart
+;   pred : rows are 16 bytes apart
+;   diff : rows are 16 shorts (32 bytes) apart
+; All loads and stores use movdqa, which faults on addresses that are not
+; 16-byte aligned.
+;
+; Method: psubb yields the low 8 bits of each difference. XORing both inputs
+; with 0x80 lets the signed compare pcmpgtb order the unsigned bytes, giving
+; 0xff where pred > src (negative difference) and 0x00 otherwise. Interleaving
+; the low byte with that mask produces the sign-extended 16-bit result.
+;
+; Only xmm0-xmm5 are used, so no callee-saved XMM register (xmm6 and up in the
+; Win64 ABI) is modified. GET_GOT/RESTORE_GOT bracket the GLOBAL(t80)
+; references; NOTE(review): this mirrors how the macros from
+; x86_abi_support.asm are expected to be used for PIC builds - confirm there.
+global sym(vp8_subtract_mby_sse2)
+sym(vp8_subtract_mby_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+            mov         rsi,            arg(1) ;src
+            mov         rdi,            arg(0) ;diff
+
+            mov         rax,            arg(2) ;pred
+            movsxd      rdx,            dword ptr arg(3) ;stride
+
+            mov         rcx,            8      ; do two lines at one time
+
+submby_loop:
+            ; first line of the pair
+            movdqa      xmm0,           [rsi]   ; src
+            movdqa      xmm1,           [rax]   ; pred
+
+            movdqa      xmm2,           xmm0
+            psubb       xmm0,           xmm1            ; low 8 bits of src - pred
+
+            pxor        xmm1,           [GLOBAL(t80)]   ;convert to signed values
+            pxor        xmm2,           [GLOBAL(t80)]
+            pcmpgtb     xmm1,           xmm2            ; 0xff where pred > src
+
+            movdqa      xmm2,           xmm0
+            punpcklbw   xmm0,           xmm1            ; put sign back to subtraction
+            punpckhbw   xmm2,           xmm1            ; put sign back to subtraction
+
+            movdqa      [rdi],          xmm0
+            movdqa      [rdi +16],      xmm2
+
+            ; second line of the pair
+            movdqa      xmm3,           [rsi + rdx]     ; src
+            movdqa      xmm4,           [rax + 16]      ; pred
+
+            movdqa      xmm5,           xmm3
+            psubb       xmm3,           xmm4            ; low 8 bits of src - pred
+
+            pxor        xmm4,           [GLOBAL(t80)]   ;convert to signed values
+            pxor        xmm5,           [GLOBAL(t80)]
+            pcmpgtb     xmm4,           xmm5            ; 0xff where pred > src
+
+            movdqa      xmm5,           xmm3
+            punpcklbw   xmm3,           xmm4            ; put sign back to subtraction
+            punpckhbw   xmm5,           xmm4            ; put sign back to subtraction
+
+            movdqa      [rdi +32],      xmm3
+            movdqa      [rdi +48],      xmm5
+
+            add         rdi,            64              ; two rows of 16 shorts
+            add         rax,            32              ; two rows of 16 pred bytes
+            lea         rsi,            [rsi+rdx*2]     ; two source rows
+
+            sub         rcx,            1
+            jnz         submby_loop
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+;
+; Writes the two 8x8 chroma blocks of 16-bit differences src - pred.
+;   usrc, vsrc : source rows, stride bytes apart
+;   pred       : U predictor starts at pred + 256, V at pred + 320,
+;                rows are 8 bytes apart
+;   diff       : U output starts at diff + 256 shorts, V at diff + 320 shorts,
+;                rows are 8 shorts apart
+; pred and diff are accessed with movdqa, which faults on addresses that are
+; not 16-byte aligned; the source rows are read with 8-byte movq loads.
+;
+; GET_GOT/RESTORE_GOT bracket the GLOBAL(t80) references; NOTE(review): this
+; mirrors how the macros from x86_abi_support.asm are expected to be used for
+; PIC builds - confirm there.
+
+; SUBTRACT_MBUV_2ROWS src_row0, src_row1, pred_offset, diff_offset
+; Subtracts two 8-pixel rows at once.
+;   %1, %2 : memory operands of the two source rows
+;   %3     : byte offset from rax of the 16 predictor bytes
+;   %4     : byte offset from rdi of the 16 output shorts
+; psubb yields the low 8 bits of each difference. XORing both inputs with 0x80
+; lets pcmpgtb order the unsigned bytes, giving 0xff where pred > src. The
+; low byte interleaved with that mask is the sign-extended 16-bit result.
+; Clobbers xmm0, xmm1, xmm2.
+%macro SUBTRACT_MBUV_2ROWS 4
+            movq       xmm0,    %1              ; src, first row
+            movq       xmm2,    %2              ; src, second row
+            movdqa     xmm1,    [rax + %3]      ; pred, both rows
+            punpcklqdq xmm0,    xmm2            ; pack both src rows together
+
+            movdqa     xmm2,    xmm0
+            psubb      xmm0,    xmm1            ; subtraction with sign missed
+
+            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
+            pxor       xmm2,    [GLOBAL(t80)]
+            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+
+            movdqa     xmm2,    xmm0
+            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
+            punpckhbw  xmm2,    xmm1            ; put sign back to subtraction
+
+            movdqa     [rdi + %4],      xmm0
+            movdqa     [rdi + %4 + 16], xmm2
+%endmacro
+
+global sym(vp8_subtract_mbuv_sse2)
+sym(vp8_subtract_mbuv_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+            mov     rdi,        arg(0) ;diff
+            mov     rax,        arg(3) ;pred
+            mov     rsi,        arg(1) ;z = usrc
+            add     rdi,        256*2  ;diff = diff + 256 (shorts)
+            add     rax,        256    ;Predictor = pred + 256
+            movsxd  rdx,        dword ptr arg(4) ;stride;
+            lea     rcx,        [rdx + rdx*2]    ;rcx = 3*stride
+
+            ;u
+            SUBTRACT_MBUV_2ROWS [rsi],       [rsi+rdx], 0,  0    ;line 0 1
+            SUBTRACT_MBUV_2ROWS [rsi+rdx*2], [rsi+rcx], 16, 32   ;line 2 3
+
+            lea        rsi,     [rsi + rdx*4]
+
+            SUBTRACT_MBUV_2ROWS [rsi],       [rsi+rdx], 32, 64   ;line 4 5
+            SUBTRACT_MBUV_2ROWS [rsi+rdx*2], [rsi+rcx], 48, 96   ;line 6 7
+
+            ;v
+            mov     rsi,        arg(2) ;z = vsrc
+            add     rdi,        64*2  ;diff = diff + 320 (shorts)
+            add     rax,        64    ;Predictor = pred + 320
+
+            SUBTRACT_MBUV_2ROWS [rsi],       [rsi+rdx], 0,  0    ;line 0 1
+            SUBTRACT_MBUV_2ROWS [rsi+rdx*2], [rsi+rcx], 16, 32   ;line 2 3
+
+            lea        rsi,     [rsi + rdx*4]
+
+            SUBTRACT_MBUV_2ROWS [rsi],       [rsi+rdx], 32, 64   ;line 4 5
+            SUBTRACT_MBUV_2ROWS [rsi+rdx*2], [rsi+rcx], 48, 96   ;line 6 7
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16                   ; t80 is read as a 16-byte pxor memory operand, which must be aligned
+t80:                       ; 0x80 in every byte: XOR mask that flips the sign bit of each byte
+    times 16 db 0x80
index 6d36aff..9b753bf 100644 (file)
@@ -165,6 +165,18 @@ int vp8_mbuverror_xmm(MACROBLOCK *mb)
     return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
 }
 
+/* Assembly kernel: writes the 4x4 block of differences z - predictor to diff.
+ * Source rows are src_stride bytes apart; predictor rows are pitch bytes
+ * apart and diff rows are pitch shorts apart.
+ */
+void vp8_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
+                             short *diff, unsigned char *predictor,
+                             int pitch);
+
+/* Unpacks the source pointer, source stride, residual buffer and predictor
+ * from the block descriptors and hands them to the assembly kernel.
+ */
+void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
+{
+    unsigned char *z = *(be->base_src) + be->src;
+    /* Kept as int to match the kernel's src_stride parameter, so the value
+     * is not round-tripped through an unsigned type on the way. */
+    int src_stride = be->src_stride;
+    short *diff = &be->src_diff[0];
+    unsigned char *predictor = &bd->predictor[0];
+
+    vp8_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
+}
+
 #endif
 
 void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
@@ -282,12 +294,12 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.encodemb.berr                  = vp8_block_error_xmm;
         cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_xmm;
         cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_xmm;
-        /* cpi->rtcd.encodemb.sub* not implemented for wmt */
+        cpi->rtcd.encodemb.subb                  = vp8_subtract_b_sse2;
+        cpi->rtcd.encodemb.submby                = vp8_subtract_mby_sse2;
+        cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_sse2;
 
         /*cpi->rtcd.quantize.quantb            = vp8_regular_quantize_b_sse2;*/
-
         cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_sse2;
-
     }
 
 #endif
index 4ce18b6..2a84402 100644 (file)
@@ -104,6 +104,7 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_impl_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/sad_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm