2 ; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
11 ; This file is a duplicate of mfqe_sse2.asm in VP8.
12 ; TODO(jackychen): Find a way to fix the duplicate.
13 %include "vpx_ports/x86_abi_support.asm"
15 ;void vp10_filter_by_weight16x16_sse2
;
; Weighted blend of a 16x16 pixel block (MFQE post-filter).
; Args (via arg(n), after SHADOW_ARGS_TO_STACK):
;   arg(0) src, arg(1) src_stride, arg(2) dst, arg(3) dst_stride,
;   arg(4) src_weight  (dst weight is derived as tMFQE - src_weight,
;   where tMFQE holds 1 << MFQE_PRECISION in every word lane).
; NOTE(review): the inner loop body and epilogue are not visible in this
; chunk; per-pixel blend formula (src*w + dst*(T-w) + round) >> PRECISION
; is inferred from the weight/round setup below -- confirm in full file.
23 global sym(vp10_filter_by_weight16x16_sse2) PRIVATE
24 sym(vp10_filter_by_weight16x16_sse2):
27 SHADOW_ARGS_TO_STACK 5 ; make the 5 C args readable via arg(n)
; Broadcast the 16-bit src_weight into all eight word lanes of xmm0.
34 movd xmm0, arg(4) ; src_weight
35 pshuflw xmm0, xmm0, 0x0 ; replicate to all low words
36 punpcklqdq xmm0, xmm0 ; replicate to all hi words
; xmm1 = (1 << MFQE_PRECISION) - src_weight, i.e. the dst weight.
38 movdqa xmm1, [GLOBAL(tMFQE)]
39 psubw xmm1, xmm0 ; dst_weight
42 mov rsi, arg(1) ; src_stride
44 mov rdi, arg(3) ; dst_stride
46 mov rcx, 16 ; loop count
; Per-row loop (body not visible here): weighted sums accumulate in
; xmm2/xmm3 before being rounded back to pixel range.
68 ; sum, round and shift
; Add 1 << (MFQE_PRECISION - 1) for round-to-nearest before the shift.
71 paddw xmm2, [GLOBAL(tMFQE_round)]
72 paddw xmm3, [GLOBAL(tMFQE_round)]
93 ;void vp10_filter_by_weight8x8_sse2
;
; Weighted blend of an 8x8 pixel block; 8-wide variant of the 16x16
; routine above with the same argument layout:
;   arg(0) src, arg(1) src_stride, arg(2) dst, arg(3) dst_stride,
;   arg(4) src_weight.
; NOTE(review): the inner loop body and epilogue are not visible in
; this chunk.
101 global sym(vp10_filter_by_weight8x8_sse2) PRIVATE
102 sym(vp10_filter_by_weight8x8_sse2):
105 SHADOW_ARGS_TO_STACK 5 ; make the 5 C args readable via arg(n)
; Broadcast the 16-bit src_weight into all eight word lanes of xmm0.
111 movd xmm0, arg(4) ; src_weight
112 pshuflw xmm0, xmm0, 0x0 ; replicate to all low words
113 punpcklqdq xmm0, xmm0 ; replicate to all hi words
; xmm1 = (1 << MFQE_PRECISION) - src_weight, i.e. the dst weight.
115 movdqa xmm1, [GLOBAL(tMFQE)]
116 psubw xmm1, xmm0 ; dst_weight
118 mov rax, arg(0) ; src
119 mov rsi, arg(1) ; src_stride
120 mov rdx, arg(2) ; dst
121 mov rdi, arg(3) ; dst_stride
123 mov rcx, 8 ; loop count
; Per-row loop (body not visible here); rounding constant applied
; before the final shift back to pixel range.
139 ; sum, round and shift
141 paddw xmm2, [GLOBAL(tMFQE_round)]
160 ;void vp10_variance_and_sad_16x16_sse2 | arg
162 ; unsigned char *src1, 0
; int stride1, 1 (loaded below from arg(1))
164 ; unsigned char *src2, 2
; int stride2, 3 (loaded below from arg(3))
166 ; unsigned int *variance, 4
167 ; unsigned int *sad, 5
;
; Computes, over a 16x16 block, the SAD between src1 and src2 and the
; variance of src2, writing the results through the two output pointers.
; NOTE(review): the row loop body, stores and epilogue are not visible
; in this chunk.
169 global sym(vp10_variance_and_sad_16x16_sse2) PRIVATE
170 sym(vp10_variance_and_sad_16x16_sse2):
173 SHADOW_ARGS_TO_STACK 6 ; make the 6 C args readable via arg(n)
179 mov rax, arg(0) ; src1
180 mov rcx, arg(1) ; stride1
181 mov rdx, arg(2) ; src2
182 mov rdi, arg(3) ; stride2
184 mov rsi, 16 ; block height
186 ; Prep accumulator registers
187 pxor xmm3, xmm3 ; SAD
188 pxor xmm4, xmm4 ; sum of src2
189 pxor xmm5, xmm5 ; sum of src2^2
191 ; Because we're working with the actual output frames
192 ; we can't depend on any kind of data alignment.
; FIX: use movdqu (unaligned load). The original used movdqa, which
; raises #GP on any row address that is not 16-byte aligned -- directly
; contradicting the invariant documented just above.
194 movdqu xmm0, [rax] ; src1
195 movdqu xmm1, [rdx] ; src2
196 add rax, rcx ; src1 + stride1
197 add rdx, rdi ; src2 + stride2
205 psadbw xmm2, xmm1 ; sum src2 by misusing SAD against 0
208 ; pmaddubsw would be ideal if it took two unsigned values. instead,
209 ; it expects a signed and an unsigned value. so instead we zero extend
210 ; and operate on words.
223 ; phaddd only operates on adjacent double words.
224 ; Finalize SAD and store
; (sad + 128) >> 8: t128 supplies the rounding bias before the shift.
228 paddd xmm0, [GLOBAL(t128)]
234 ; Accumulate sum of src2
238 ; Square src2. Ignore high value
242 ; phaddw could be used to sum adjacent values but we want
243 ; all the values summed. promote to doubles, accumulate,
256 ; (variance + 128) >> 8
257 paddd xmm1, [GLOBAL(t128)]
; NOTE(review): the opening %if of this conditional and the dw/dd data
; values are outside the visible chunk -- presumably a read-only data
; section with per-lane constants; verify against the full file.
277 %elif CONFIG_BIG_ENDIAN
283 tMFQE: ; 1 << MFQE_PRECISION
286 tMFQE_round: ; 1 << (MFQE_PRECISION - 1)