2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
; Shift count handed to every psraw in the code below; each of those shifts
; directly follows a paddw of xmm_bi_rd. 1 << 7 = 128, the offset=0 filter
; coefficient quoted in this file's header note.
14 %define xmm_filter_shift 7
17 ;void vp8_filter_block2d_bil_var_ssse3
19 ; unsigned char *ref_ptr,
20 ; int ref_pixels_per_line,
21 ; unsigned char *src_ptr,
22 ; int src_pixels_per_line,
23 ; unsigned int Height,
27 ; unsigned int *sumsquared
30 ;Note: The filter coefficient at offset=0 is 128. Since the second register
31 ;for pmaddubsw is signed bytes, we must calculate zero offset separately.
; vp8_filter_block2d_bil_var_ssse3 -- entry point.
;
; Arguments are read through arg(n):
;   arg(0) ref_ptr              arg(1) ref_pixels_per_line
;   arg(2) src_ptr              arg(3) src_pixels_per_line
;   arg(4) Height               arg(5) xoffset
;   arg(6) yoffset              arg(7) Sum pointer
;   arg(8) SSE pointer
;
; Dispatch on the two offsets:
;   xoffset != 0, yoffset != 0 -> .filter_block2d_bil_var_ssse3_loop
;   xoffset == 0, yoffset != 0 -> .filter_block2d_bil_var_ssse3_sp_only
;   xoffset != 0, yoffset == 0 -> .filter_block2d_bil_var_ssse3_fp_only
;   xoffset == 0, yoffset == 0 -> .filter_block2d_bil_var_ssse3_full_pixel
; Every path finishes by jumping to .filter_block2d_bil_variance.
;
; NOTE(review): this view of the routine is not contiguous. The instruction
; that sets the flags tested by each jnz, the producers of some xmm registers
; that are rounded below, and any %if/%else directives are not visible here.
; The comments describe only the instructions that are shown.
32 global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE
33 sym(vp8_filter_block2d_bil_var_ssse3):
; 9 matches the argument slots used below, arg(0)..arg(8).
; (Macro presumably comes from the included x86_abi_support.asm -- confirm.)
36 SHADOW_ARGS_TO_STACK 9
; rcx = base address of the filter table vp8_bilinear_filters_ssse3.
46 lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
47 movsxd rax, dword ptr arg(5) ; xoffset
49 cmp rax, 0 ; skip first_pass filter if xoffset=0
50 je .filter_block2d_bil_var_ssse3_sp_only
; xoffset * 16: table entries are addressed with a 16-byte stride.
52 shl rax, 4 ; point to filter coeff with xoffset
; rax stays live as the HFilter pointer; the fp_only path also reads [rax].
53 lea rax, [rax + rcx] ; HFilter
55 movsxd rdx, dword ptr arg(6) ; yoffset
57 cmp rdx, 0 ; skip second_pass filter if yoffset=0
58 je .filter_block2d_bil_var_ssse3_fp_only
; NOTE(review): no scaling of rdx (cf. the shl applied to rax above) is
; visible between the compare and this lea -- confirm against the full file.
61 lea rdx, [rdx + rcx] ; VFilter
; Both passes: rsi walks the reference rows, rdi the source rows, and rcx
; holds Height.
63 mov rsi, arg(0) ;ref_ptr
64 mov rdi, arg(2) ;src_ptr
65 movsxd rcx, dword ptr arg(4) ;Height
; Unaligned 16-byte loads of the first reference row at byte offsets 0 and 1.
67 movdqu xmm0, XMMWORD PTR [rsi]
68 movdqu xmm1, XMMWORD PTR [rsi+1]
; Add xmm_bi_rd to each 16-bit lane, then arithmetic-shift right by
; xmm_filter_shift (presumably round-to-nearest; xmm_bi_rd is defined
; outside this view).
76 paddw xmm0, [GLOBAL(xmm_bi_rd)]
77 paddw xmm2, [GLOBAL(xmm_bi_rd)]
78 psraw xmm0, xmm_filter_shift
79 psraw xmm2, xmm_filter_shift
; Step rsi to the next reference row.
; NOTE(review): this add takes a dword memory operand, while the next two
; lines sign-extend the same strides into r8/r9. They look like the two arms
; of an ABI-conditional block whose directives are not visible -- confirm.
84 add rsi, dword ptr arg(1) ;ref_pixels_per_line
86 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
87 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
; Per-row loop for the path where both xoffset and yoffset are non-zero.
91 .filter_block2d_bil_var_ssse3_loop:
; Load the current reference row at byte offsets 0 and 1.
92 movdqu xmm1, XMMWORD PTR [rsi]
93 movdqu xmm2, XMMWORD PTR [rsi+1]
; Add xmm_bi_rd and shift right by xmm_filter_shift.
101 paddw xmm1, [GLOBAL(xmm_bi_rd)]
102 paddw xmm3, [GLOBAL(xmm_bi_rd)]
103 psraw xmm1, xmm_filter_shift
104 psraw xmm3, xmm_filter_shift
; Multiply-add byte pairs in xmm2/xmm3 against the coefficients at [rdx]
; (VFilter), then add xmm_bi_rd and shift as above.
113 pmaddubsw xmm2, [rdx]
114 pmaddubsw xmm3, [rdx]
116 paddw xmm2, [GLOBAL(xmm_bi_rd)]
117 paddw xmm3, [GLOBAL(xmm_bi_rd)]
118 psraw xmm2, xmm_filter_shift
119 psraw xmm3, xmm_filter_shift
; Load the source row as two 8-byte halves.
121 movq xmm1, QWORD PTR [rdi]
124 movq xmm5, QWORD PTR [rdi+8]
; Advance both row pointers by their strides.
137 add rsi, dword ptr arg(1) ;ref_pixels_per_line
138 add rdi, dword ptr arg(3) ;src_pixels_per_line
; Repeat while ZF is clear. The flag-setting instruction is not visible here
; (presumably a decrement of the row count in rcx -- confirm).
145 jnz .filter_block2d_bil_var_ssse3_loop
147 jmp .filter_block2d_bil_variance
; xoffset == 0: only the [rdx] (VFilter) filter is applied on this path.
149 .filter_block2d_bil_var_ssse3_sp_only:
150 movsxd rdx, dword ptr arg(6) ; yoffset
152 cmp rdx, 0 ; Both xoffset =0 and yoffset=0
153 je .filter_block2d_bil_var_ssse3_full_pixel
; rcx still holds the filter table base loaded at entry.
; NOTE(review): no scaling of rdx is visible before this lea -- confirm.
156 lea rdx, [rdx + rcx] ; VFilter
158 mov rsi, arg(0) ;ref_ptr
159 mov rdi, arg(2) ;src_ptr
160 movsxd rcx, dword ptr arg(4) ;Height
; rax is reused here as the reference stride; the HFilter pointer is not
; needed because xoffset == 0.
161 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
; Load the first reference row before entering the loop.
163 movdqu xmm1, XMMWORD PTR [rsi]
; NOTE(review): r9 is loaded here but the visible loop advances rdi with
; "add rdi, dword ptr arg(3)" -- likely alternate arms of a conditional
; block; confirm.
167 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
172 .filter_block2d_bil_sp_only_loop:
; Load the reference row currently addressed by rsi.
173 movdqu xmm3, XMMWORD PTR [rsi]
; Multiply-add against the coefficients at [rdx], add xmm_bi_rd, shift.
179 pmaddubsw xmm1, [rdx]
180 pmaddubsw xmm2, [rdx]
182 paddw xmm1, [GLOBAL(xmm_bi_rd)]
183 paddw xmm2, [GLOBAL(xmm_bi_rd)]
184 psraw xmm1, xmm_filter_shift
185 psraw xmm2, xmm_filter_shift
; Load the source row as two 8-byte halves.
187 movq xmm3, QWORD PTR [rdi]
190 movq xmm5, QWORD PTR [rdi+8]
; Advance the reference pointer (stride in rax) and the source pointer.
203 lea rsi, [rsi + rax] ;ref_pixels_per_line
206 add rdi, dword ptr arg(3) ;src_pixels_per_line
; Loop condition: the flag-setting instruction is not visible in this view.
212 jnz .filter_block2d_bil_sp_only_loop
214 jmp .filter_block2d_bil_variance
; xoffset == 0 and yoffset == 0: no filter coefficients are referenced in the
; visible instructions of this path.
216 .filter_block2d_bil_var_ssse3_full_pixel:
217 mov rsi, arg(0) ;ref_ptr
218 mov rdi, arg(2) ;src_ptr
219 movsxd rcx, dword ptr arg(4) ;Height
; Both strides are kept in registers on this path (rax = ref, rdx = src).
220 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
221 movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
224 .filter_block2d_bil_full_pixel_loop:
; 16 reference bytes and 16 source bytes, each loaded as two 8-byte halves.
225 movq xmm1, QWORD PTR [rsi]
227 movq xmm2, QWORD PTR [rsi+8]
230 movq xmm3, QWORD PTR [rdi]
232 movq xmm4, QWORD PTR [rdi+8]
; lea advances the pointers without modifying the flags.
244 lea rsi, [rsi + rax] ;ref_pixels_per_line
245 lea rdi, [rdi + rdx] ;src_pixels_per_line
; Loop condition: the flag-setting instruction is not visible in this view.
247 jnz .filter_block2d_bil_full_pixel_loop
249 jmp .filter_block2d_bil_variance
; yoffset == 0: only the [rax] (HFilter) filter is applied on this path.
; rax was set to the HFilter pointer before the branch to this label.
251 .filter_block2d_bil_var_ssse3_fp_only:
252 mov rsi, arg(0) ;ref_ptr
253 mov rdi, arg(2) ;src_ptr
254 movsxd rcx, dword ptr arg(4) ;Height
; rdx is reused as the reference stride; no VFilter pointer is needed here.
255 movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
260 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
263 .filter_block2d_bil_fp_only_loop:
; Load the reference row at byte offsets 0 and 1.
264 movdqu xmm1, XMMWORD PTR [rsi]
265 movdqu xmm2, XMMWORD PTR [rsi+1]
; Multiply-add against the coefficients at [rax], add xmm_bi_rd, shift.
270 pmaddubsw xmm1, [rax]
271 pmaddubsw xmm3, [rax]
273 paddw xmm1, [GLOBAL(xmm_bi_rd)]
274 paddw xmm3, [GLOBAL(xmm_bi_rd)]
275 psraw xmm1, xmm_filter_shift
276 psraw xmm3, xmm_filter_shift
; Load the source row as two 8-byte halves.
; NOTE(review): this movq is written with XMMWORD PTR, whereas every other
; movq in this routine uses QWORD PTR. Confirm how the build's PTR/XMMWORD
; definitions treat this operand and consider making it QWORD PTR.
278 movq xmm2, XMMWORD PTR [rdi]
281 movq xmm5, QWORD PTR [rdi+8]
; Advance the source pointer by one row.
295 add rdi, dword ptr arg(3) ;src_pixels_per_line
; Loop condition: the flag-setting instruction is not visible in this view.
301 jnz .filter_block2d_bil_fp_only_loop
303 jmp .filter_block2d_bil_variance
; Common tail reached from all four paths: loads the two output pointers.
; The stores through them are not visible in this view.
305 .filter_block2d_bil_variance:
335 mov rsi, arg(7) ;[Sum]
336 mov rdi, arg(8) ;[SSE]
356 vp8_bilinear_filters_ssse3: