Make libvpx Chromium build friendly
[profile/ivi/libvpx.git] / vp8 / common / x86 / variance_impl_ssse3.asm
1 ;
2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ;  Use of this source code is governed by a BSD-style license
5 ;  that can be found in the LICENSE file in the root of the source
6 ;  tree. An additional intellectual property rights grant can be found
7 ;  in the file PATENTS.  All contributing project authors may
8 ;  be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
; Right shift applied after each bilinear tap pair has been summed and
; rounded: the two taps of every filter row add up to 128 (1 << 7).
%define xmm_filter_shift            7


;void vp8_filter_block2d_bil_var_ssse3
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int  xoffset,
;    int  yoffset,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Bilinearly filters a 16-pixel-wide, Height-row block of ref_ptr with the
; filter rows selected by xoffset (horizontal) and yoffset (vertical),
; subtracts the matching 16 src_ptr pixels of every row, and stores
;     *sum        = sum of the differences
;     *sumsquared = sum of the squared differences
;
; Four code paths are selected from the offsets:
;   xoffset != 0, yoffset != 0 : horizontal then vertical filter
;   xoffset == 0, yoffset != 0 : vertical filter only       (sp_only)
;   xoffset != 0, yoffset == 0 : horizontal filter only     (fp_only)
;   xoffset == 0, yoffset == 0 : plain difference           (full_pixel)
;
; Bytes read from ref_ptr: the horizontal filter loads 16 bytes at [row] and
; 16 bytes at [row + 1]; the vertical filter reads Height + 1 rows.
;
; Register roles inside the loops:
;   rsi = current ref row        rdi = current src row
;   rcx = rows remaining         rax / rdx = filter row pointers or strides
;   xmm6 = eight 16-bit partial sums of differences
;   xmm7 = four 32-bit partial sums of squared differences
;
;Note: The filter coefficient at offset=0 is 128. Since the second register
;for Pmaddubsw is signed bytes, we must calculate zero offset separately.
global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE
sym(vp8_filter_block2d_bil_var_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        pxor            xmm6,           xmm6                 ; sum accumulator
        pxor            xmm7,           xmm7                 ; squared-sum accumulator

        lea             rcx,            [GLOBAL(vp8_bilinear_filters_ssse3)]
        movsxd          rax,            dword ptr arg(5)     ; xoffset

        test            rax,            rax                  ; skip first_pass filter if xoffset=0
        je              .filter_block2d_bil_var_ssse3_sp_only

        shl             rax,            4                    ; 16 bytes per filter row
        lea             rax,            [rax + rcx]          ; HFilter

        movsxd          rdx,            dword ptr arg(6)     ; yoffset

        test            rdx,            rdx                  ; skip second_pass filter if yoffset=0
        je              .filter_block2d_bil_var_ssse3_fp_only

        shl             rdx,            4                    ; 16 bytes per filter row
        lea             rdx,            [rdx + rcx]          ; VFilter

        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height

        ; Horizontally filter the first ref row; the loop below keeps the
        ; previous filtered row in xmm0.
        movdqu          xmm0,           XMMWORD PTR [rsi]
        movdqu          xmm1,           XMMWORD PTR [rsi+1]
        movdqa          xmm2,           xmm0

        punpcklbw       xmm0,           xmm1                 ; interleave pixel i with pixel i+1
        punpckhbw       xmm2,           xmm1
        pmaddubsw       xmm0,           [rax]
        pmaddubsw       xmm2,           [rax]

        paddw           xmm0,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm0,           xmm_filter_shift
        psraw           xmm2,           xmm_filter_shift

        packuswb        xmm0,           xmm2

        pxor            xmm4,           xmm4                 ; zero used to widen src bytes; loop-invariant

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
%else
        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
        lea             rsi,            [rsi + r8]
%endif

.filter_block2d_bil_var_ssse3_loop:
        ; Horizontal pass on the next ref row -> xmm1 (16 bytes).
        movdqu          xmm1,           XMMWORD PTR [rsi]
        movdqu          xmm2,           XMMWORD PTR [rsi+1]
        movdqa          xmm3,           xmm1

        punpcklbw       xmm1,           xmm2
        punpckhbw       xmm3,           xmm2
        pmaddubsw       xmm1,           [rax]
        pmaddubsw       xmm3,           [rax]

        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm1,           xmm_filter_shift
        psraw           xmm3,           xmm_filter_shift
        packuswb        xmm1,           xmm3

        ; Vertical pass between the previous (xmm0) and the new (xmm1) row;
        ; the new row becomes "previous" for the next iteration.
        movdqa          xmm2,           xmm0
        movdqa          xmm0,           xmm1
        movdqa          xmm3,           xmm2

        punpcklbw       xmm2,           xmm1
        punpckhbw       xmm3,           xmm1
        pmaddubsw       xmm2,           [rdx]
        pmaddubsw       xmm3,           [rdx]

        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm2,           xmm_filter_shift
        psraw           xmm3,           xmm_filter_shift

        ; Widen the 16 src pixels to words.
        movq            xmm1,           QWORD PTR [rdi]
        punpcklbw       xmm1,           xmm4
        movq            xmm5,           QWORD PTR [rdi+8]
        punpcklbw       xmm5,           xmm4

        ; diff = filtered - src; accumulate diff and diff*diff.
        psubw           xmm2,           xmm1
        psubw           xmm3,           xmm5
        paddw           xmm6,           xmm2
        paddw           xmm6,           xmm3
        pmaddwd         xmm2,           xmm2
        pmaddwd         xmm3,           xmm3
        paddd           xmm7,           xmm2
        paddd           xmm7,           xmm3

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1)     ;ref_pixels_per_line
        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
%else
        lea             rsi,            [rsi + r8]
        lea             rdi,            [rdi + r9]
%endif

        sub             rcx,            1
        jnz             .filter_block2d_bil_var_ssse3_loop

        jmp             .filter_block2d_bil_variance

.filter_block2d_bil_var_ssse3_sp_only:
        movsxd          rdx,            dword ptr arg(6)     ; yoffset

        test            rdx,            rdx                  ; Both xoffset =0 and yoffset=0
        je              .filter_block2d_bil_var_ssse3_full_pixel

        shl             rdx,            4                    ; 16 bytes per filter row
        lea             rdx,            [rdx + rcx]          ; VFilter (rcx still holds the table base)

        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height
        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line

        movdqu          xmm1,           XMMWORD PTR [rsi]    ; first ref row = "previous" row

        pxor            xmm4,           xmm4                 ; zero used to widen src bytes; loop-invariant

%if ABI_IS_32BIT=0
        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
%endif

        lea             rsi,            [rsi + rax]

.filter_block2d_bil_sp_only_loop:
        movdqu          xmm3,           XMMWORD PTR [rsi]    ; next ref row
        movdqa          xmm2,           xmm1
        movdqa          xmm0,           xmm3                 ; keep a copy for the next iteration

        punpcklbw       xmm1,           xmm3                 ; interleave previous row with next row
        punpckhbw       xmm2,           xmm3
        pmaddubsw       xmm1,           [rdx]
        pmaddubsw       xmm2,           [rdx]

        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm1,           xmm_filter_shift
        psraw           xmm2,           xmm_filter_shift

        ; Widen the 16 src pixels to words.
        movq            xmm3,           QWORD PTR [rdi]
        punpcklbw       xmm3,           xmm4
        movq            xmm5,           QWORD PTR [rdi+8]
        punpcklbw       xmm5,           xmm4

        psubw           xmm1,           xmm3
        psubw           xmm2,           xmm5
        paddw           xmm6,           xmm1
        paddw           xmm6,           xmm2
        pmaddwd         xmm1,           xmm1
        pmaddwd         xmm2,           xmm2
        paddd           xmm7,           xmm1
        paddd           xmm7,           xmm2

        movdqa          xmm1,           xmm0                 ; next row becomes previous row
        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line

%if ABI_IS_32BIT
        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
%else
        lea             rdi,            [rdi + r9]
%endif

        sub             rcx,            1
        jnz             .filter_block2d_bil_sp_only_loop

        jmp             .filter_block2d_bil_variance

.filter_block2d_bil_var_ssse3_full_pixel:
        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height
        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3)     ;src_pixels_per_line
        pxor            xmm0,           xmm0                 ; zero used to widen bytes to words

.filter_block2d_bil_full_pixel_loop:
        movq            xmm1,           QWORD PTR [rsi]
        punpcklbw       xmm1,           xmm0
        movq            xmm2,           QWORD PTR [rsi+8]
        punpcklbw       xmm2,           xmm0

        movq            xmm3,           QWORD PTR [rdi]
        punpcklbw       xmm3,           xmm0
        movq            xmm4,           QWORD PTR [rdi+8]
        punpcklbw       xmm4,           xmm0

        psubw           xmm1,           xmm3
        psubw           xmm2,           xmm4
        paddw           xmm6,           xmm1
        paddw           xmm6,           xmm2
        pmaddwd         xmm1,           xmm1
        pmaddwd         xmm2,           xmm2
        paddd           xmm7,           xmm1
        paddd           xmm7,           xmm2

        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
        lea             rdi,            [rdi + rdx]          ;src_pixels_per_line
        sub             rcx,            1
        jnz             .filter_block2d_bil_full_pixel_loop

        jmp             .filter_block2d_bil_variance

.filter_block2d_bil_var_ssse3_fp_only:
        ; rax already points at the horizontal filter row (HFilter).
        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height
        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line

        pxor            xmm4,           xmm4                 ; zero used to widen src bytes; loop-invariant

%if ABI_IS_32BIT=0
        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
%endif

.filter_block2d_bil_fp_only_loop:
        movdqu          xmm1,           XMMWORD PTR [rsi]
        movdqu          xmm2,           XMMWORD PTR [rsi+1]
        movdqa          xmm3,           xmm1

        punpcklbw       xmm1,           xmm2                 ; interleave pixel i with pixel i+1
        punpckhbw       xmm3,           xmm2
        pmaddubsw       xmm1,           [rax]
        pmaddubsw       xmm3,           [rax]

        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm1,           xmm_filter_shift
        psraw           xmm3,           xmm_filter_shift

        ; Widen the 16 src pixels to words (8-byte loads, as in the other paths).
        movq            xmm2,           QWORD PTR [rdi]
        punpcklbw       xmm2,           xmm4
        movq            xmm5,           QWORD PTR [rdi+8]
        punpcklbw       xmm5,           xmm4

        psubw           xmm1,           xmm2
        psubw           xmm3,           xmm5
        paddw           xmm6,           xmm1
        paddw           xmm6,           xmm3
        pmaddwd         xmm1,           xmm1
        pmaddwd         xmm3,           xmm3
        paddd           xmm7,           xmm1
        paddd           xmm7,           xmm3

        lea             rsi,            [rsi + rdx]
%if ABI_IS_32BIT
        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
%else
        lea             rdi,            [rdi + r9]
%endif

        sub             rcx,            1
        jnz             .filter_block2d_bil_fp_only_loop

        ; fall through to the reduction below

.filter_block2d_bil_variance:
        pxor        xmm0,           xmm0
        pxor        xmm1,           xmm1
        pxor        xmm5,           xmm5

        ; Sign-extend the eight 16-bit partial sums to 32 bits: place each
        ; word in the high half of a dword, then arithmetic-shift it down.
        punpcklwd   xmm0,           xmm6
        punpckhwd   xmm1,           xmm6
        psrad       xmm0,           16
        psrad       xmm1,           16
        paddd       xmm0,           xmm1
        movdqa      xmm1,           xmm0

        ; Horizontal add of the four squared-sum dwords (xmm7 -> xmm6).
        movdqa      xmm6,           xmm7
        punpckldq   xmm6,           xmm5
        punpckhdq   xmm7,           xmm5
        paddd       xmm6,           xmm7

        ; Horizontal add of the four sum dwords (xmm0).
        punpckldq   xmm0,           xmm5
        punpckhdq   xmm1,           xmm5
        paddd       xmm0,           xmm1

        movdqa      xmm7,           xmm6
        movdqa      xmm1,           xmm0

        psrldq      xmm7,           8
        psrldq      xmm1,           8

        paddd       xmm6,           xmm7
        paddd       xmm0,           xmm1

        mov         rsi,            arg(7) ;[Sum]
        mov         rdi,            arg(8) ;[SSE]

        movd        [rsi],       xmm0
        movd        [rdi],       xmm6

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
349
350
SECTION_RODATA

; Rounding term: 64 in each of the eight 16-bit lanes.
align 16
xmm_bi_rd:
    dw 64, 64, 64, 64, 64, 64, 64, 64

; Bilinear tap table: eight 16-byte rows. Row k holds the byte pair
; (128 - 16*k, 16*k) repeated eight times.
align 16
vp8_bilinear_filters_ssse3:
%assign second_tap 0
%rep 8
    times 8 db (128 - second_tap), second_tap
%assign second_tap second_tap + 16
%endrep