vp10/common/x86/vp9_mfqe_sse2.asm

   1 ;
   2 ;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
   3 ;
   4 ;  Use of this source code is governed by a BSD-style license
   5 ;  that can be found in the LICENSE file in the root of the source
   6 ;  tree. An additional intellectual property rights grant can be found
   7 ;  in the file PATENTS.  All contributing project authors may
   8 ;  be found in the AUTHORS file in the root of the source tree.
   9 ;
  10
  11 ;  This file is a duplicate of mfqe_sse2.asm in VP8.
  12 ;  TODO(jackychen): Find a way to fix the duplicate.
  13 %include "vpx_ports/x86_abi_support.asm"
  14
  15 ;void vp10_filter_by_weight16x16_sse2
  16 ;(
  17 ;    unsigned char *src,
  18 ;    int            src_stride,
  19 ;    unsigned char *dst,
  20 ;    int            dst_stride,
  21 ;    int            src_weight
  22 ;)
  23 global sym(vp10_filter_by_weight16x16_sse2) PRIVATE
  24 sym(vp10_filter_by_weight16x16_sse2):
  25     push        rbp
  26     mov         rbp, rsp
  27     SHADOW_ARGS_TO_STACK 5
  28     SAVE_XMM 6
  29     GET_GOT     rbx
  30     push        rsi
  31     push        rdi
  32     ; end prolog
  33
  34     movd        xmm0, arg(4)                ; src_weight
  35     pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
  36     punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
  37
  38     movdqa      xmm1, [GLOBAL(tMFQE)]
  39     psubw       xmm1, xmm0                  ; dst_weight
  40
  41     mov         rax, arg(0)                 ; src
  42     mov         rsi, arg(1)                 ; src_stride
  43     mov         rdx, arg(2)                 ; dst
  44     mov         rdi, arg(3)                 ; dst_stride
  45
  46     mov         rcx, 16                     ; loop count
  47     pxor        xmm6, xmm6
  48
  49 .combine
  50     movdqa      xmm2, [rax]
  51     movdqa      xmm4, [rdx]
  52     add         rax, rsi
  53
  54     ; src * src_weight
  55     movdqa      xmm3, xmm2
  56     punpcklbw   xmm2, xmm6
  57     punpckhbw   xmm3, xmm6
  58     pmullw      xmm2, xmm0
  59     pmullw      xmm3, xmm0
  60
  61     ; dst * dst_weight
  62     movdqa      xmm5, xmm4
  63     punpcklbw   xmm4, xmm6
  64     punpckhbw   xmm5, xmm6
  65     pmullw      xmm4, xmm1
  66     pmullw      xmm5, xmm1
  67
  68     ; sum, round and shift
  69     paddw       xmm2, xmm4
  70     paddw       xmm3, xmm5
  71     paddw       xmm2, [GLOBAL(tMFQE_round)]
  72     paddw       xmm3, [GLOBAL(tMFQE_round)]
  73     psrlw       xmm2, 4
  74     psrlw       xmm3, 4
  75
  76     packuswb    xmm2, xmm3
  77     movdqa      [rdx], xmm2
  78     add         rdx, rdi
  79
  80     dec         rcx
  81     jnz         .combine
  82
  83     ; begin epilog
  84     pop         rdi
  85     pop         rsi
  86     RESTORE_GOT
  87     RESTORE_XMM
  88     UNSHADOW_ARGS
  89     pop         rbp
  90
  91     ret
  92
  93 ;void vp10_filter_by_weight8x8_sse2
  94 ;(
  95 ;    unsigned char *src,
  96 ;    int            src_stride,
  97 ;    unsigned char *dst,
  98 ;    int            dst_stride,
  99 ;    int            src_weight
 100 ;)
 101 global sym(vp10_filter_by_weight8x8_sse2) PRIVATE
 102 sym(vp10_filter_by_weight8x8_sse2):
 103     push        rbp
 104     mov         rbp, rsp
 105     SHADOW_ARGS_TO_STACK 5
 106     GET_GOT     rbx
 107     push        rsi
 108     push        rdi
 109     ; end prolog
 110
 111     movd        xmm0, arg(4)                ; src_weight
 112     pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
 113     punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
 114
 115     movdqa      xmm1, [GLOBAL(tMFQE)]
 116     psubw       xmm1, xmm0                  ; dst_weight
 117
 118     mov         rax, arg(0)                 ; src
 119     mov         rsi, arg(1)                 ; src_stride
 120     mov         rdx, arg(2)                 ; dst
 121     mov         rdi, arg(3)                 ; dst_stride
 122
 123     mov         rcx, 8                      ; loop count
 124     pxor        xmm4, xmm4
 125
 126 .combine
 127     movq        xmm2, [rax]
 128     movq        xmm3, [rdx]
 129     add         rax, rsi
 130
 131     ; src * src_weight
 132     punpcklbw   xmm2, xmm4
 133     pmullw      xmm2, xmm0
 134
 135     ; dst * dst_weight
 136     punpcklbw   xmm3, xmm4
 137     pmullw      xmm3, xmm1
 138
 139     ; sum, round and shift
 140     paddw       xmm2, xmm3
 141     paddw       xmm2, [GLOBAL(tMFQE_round)]
 142     psrlw       xmm2, 4
 143
 144     packuswb    xmm2, xmm4
 145     movq        [rdx], xmm2
 146     add         rdx, rdi
 147
 148     dec         rcx
 149     jnz         .combine
 150
 151     ; begin epilog
 152     pop         rdi
 153     pop         rsi
 154     RESTORE_GOT
 155     UNSHADOW_ARGS
 156     pop         rbp
 157
 158     ret
 159
 160 ;void vp10_variance_and_sad_16x16_sse2 | arg
 161 ;(
 162 ;    unsigned char *src1,          0
 163 ;    int            stride1,       1
 164 ;    unsigned char *src2,          2
 165 ;    int            stride2,       3
 166 ;    unsigned int  *variance,      4
 167 ;    unsigned int  *sad,           5
 168 ;)
 169 global sym(vp10_variance_and_sad_16x16_sse2) PRIVATE
 170 sym(vp10_variance_and_sad_16x16_sse2):
 171     push        rbp
 172     mov         rbp, rsp
 173     SHADOW_ARGS_TO_STACK 6
 174     GET_GOT     rbx
 175     push        rsi
 176     push        rdi
 177     ; end prolog
 178
 179     mov         rax,        arg(0)          ; src1
 180     mov         rcx,        arg(1)          ; stride1
 181     mov         rdx,        arg(2)          ; src2
 182     mov         rdi,        arg(3)          ; stride2
 183
 184     mov         rsi,        16              ; block height
 185
 186     ; Prep accumulator registers
 187     pxor        xmm3, xmm3                  ; SAD
 188     pxor        xmm4, xmm4                  ; sum of src2
 189     pxor        xmm5, xmm5                  ; sum of src2^2
 190
 191     ; Because we're working with the actual output frames
 192     ; we can't depend on any kind of data alignment.
 193 .accumulate
 194     movdqa      xmm0, [rax]                 ; src1
 195     movdqa      xmm1, [rdx]                 ; src2
 196     add         rax, rcx                    ; src1 + stride1
 197     add         rdx, rdi                    ; src2 + stride2
 198
 199     ; SAD(src1, src2)
 200     psadbw      xmm0, xmm1
 201     paddusw     xmm3, xmm0
 202
 203     ; SUM(src2)
 204     pxor        xmm2, xmm2
 205     psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
 206     paddusw     xmm4, xmm2
 207
 208     ; pmaddubsw would be ideal if it took two unsigned values. instead,
 209     ; it expects a signed and an unsigned value. so instead we zero extend
 210     ; and operate on words.
 211     pxor        xmm2, xmm2
 212     movdqa      xmm0, xmm1
 213     punpcklbw   xmm0, xmm2
 214     punpckhbw   xmm1, xmm2
 215     pmaddwd     xmm0, xmm0
 216     pmaddwd     xmm1, xmm1
 217     paddd       xmm5, xmm0
 218     paddd       xmm5, xmm1
 219
 220     sub         rsi,        1
 221     jnz         .accumulate
 222
 223     ; phaddd only operates on adjacent double words.
 224     ; Finalize SAD and store
 225     movdqa      xmm0, xmm3
 226     psrldq      xmm0, 8
 227     paddusw     xmm0, xmm3
 228     paddd       xmm0, [GLOBAL(t128)]
 229     psrld       xmm0, 8
 230
 231     mov         rax,  arg(5)
 232     movd        [rax], xmm0
 233
 234     ; Accumulate sum of src2
 235     movdqa      xmm0, xmm4
 236     psrldq      xmm0, 8
 237     paddusw     xmm0, xmm4
 238     ; Square src2. Ignore high value
 239     pmuludq     xmm0, xmm0
 240     psrld       xmm0, 8
 241
 242     ; phaddw could be used to sum adjacent values but we want
 243     ; all the values summed. promote to doubles, accumulate,
 244     ; shift and sum
 245     pxor        xmm2, xmm2
 246     movdqa      xmm1, xmm5
 247     punpckldq   xmm1, xmm2
 248     punpckhdq   xmm5, xmm2
 249     paddd       xmm1, xmm5
 250     movdqa      xmm2, xmm1
 251     psrldq      xmm1, 8
 252     paddd       xmm1, xmm2
 253
 254     psubd       xmm1, xmm0
 255
 256     ; (variance + 128) >> 8
 257     paddd       xmm1, [GLOBAL(t128)]
 258     psrld       xmm1, 8
 259     mov         rax,  arg(4)
 260
 261     movd        [rax], xmm1
 262
 263
 264     ; begin epilog
 265     pop         rdi
 266     pop         rsi
 267     RESTORE_GOT
 268     UNSHADOW_ARGS
 269     pop         rbp
 270     ret
 271
 272 SECTION_RODATA
 273 align 16
 274 t128:
 275 %ifndef __NASM_VER__
 276     ddq 128
 277 %elif CONFIG_BIG_ENDIAN
 278     dq  0, 128
 279 %else
 280     dq  128, 0
 281 %endif
 282 align 16
 283 tMFQE: ; 1 << MFQE_PRECISION
 284     times 8 dw 0x10
 285 align 16
 286 tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
 287     times 8 dw 0x08