2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
13 extern sym(vp8_bilinear_filters_x86_8)
; Not referenced in the visible portion of this file.
16 %define BLOCK_HEIGHT_WIDTH 4
; 128 == 1 << VP8_FILTER_SHIFT. Not referenced in the visible portion of this file.
17 %define vp8_filter_weight 128
; Right-shift count applied (via psraw) after the rounding constant rd is added.
18 %define VP8_FILTER_SHIFT 7
21 ;void vp8_filter_block1d_h6_mmx
23 ; unsigned char *src_ptr,
24 ; unsigned short *output_ptr,
25 ; unsigned int src_pixels_per_line,
26 ; unsigned int pixel_step,
27 ; unsigned int output_height,
28 ; unsigned int output_width,
;
; Horizontal 6-tap filter (MMX). The code below reads 8 source bytes starting
; at src_ptr - 2, widens 4-pixel windows to 16-bit words, multiplies each
; window by one 16-byte row of the filter table (rows at offsets 0, 16, 32,
; 48, 64, 80), accumulates with signed saturation, adds the rounding constant
; rd, shifts right by VP8_FILTER_SHIFT and packs with unsigned saturation.
;
; Arguments as read by the code:
;   arg(0) src_ptr              arg(1) output_ptr
;   arg(2) src_pixels_per_line  arg(4) output_height
;   arg(5) output_width         arg(6) vp8_filter
;   (arg(6) is read below but is not listed in the signature comment above.)
;
; Register roles:
;   rsi = source pointer        rdi = output pointer
;   rcx = row counter           rdx = filter table base
;   rax = output_width          mm0 = zero (for byte->word unpacking)
;   mm1, mm2, mm6, mm7 = filter rows 1, 2, 3, 4
;
; NOTE(review): this excerpt is missing lines from the routine (prologue and
; epilogue, the .nextrow label, the load feeding the p3..p6 window, and the
; conditional-assembly directives around the pointer advance). Verify against
; the complete file before relying on these comments.
31 global sym(vp8_filter_block1d_h6_mmx) PRIVATE
32 sym(vp8_filter_block1d_h6_mmx):
35 SHADOW_ARGS_TO_STACK 7
41 mov rdx, arg(6) ;vp8_filter
43 movq mm1, [rdx + 16] ; mm1 = kernel 1 (original note: do both the negative taps first)
44 movq mm2, [rdx + 32] ; mm2 = kernel 2
45 movq mm6, [rdx + 48] ; mm6 = kernel 3
46 movq mm7, [rdx + 64] ; mm7 = kernel 4
48 mov rdi, arg(1) ;output_ptr
49 mov rsi, arg(0) ;src_ptr
50 movsxd rcx, dword ptr arg(4) ;output_height (row counter)
51 movsxd rax, dword ptr arg(5) ;output_width ; destination pitch?
52 pxor mm0, mm0 ; mm0 = 00000000
55 movq mm3, [rsi-2] ; mm3 = p-2..p5
56 movq mm4, mm3 ; mm4 = p-2..p5
57 psrlq mm3, 8 ; mm3 = p-1..p5
58 punpcklbw mm3, mm0 ; mm3 = p-1..p2 (as words)
59 pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
61 movq mm5, mm4 ; mm5 = p-2..p5
62 punpckhbw mm4, mm0 ; mm4 = p2..p5 (as words)
63 pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers
64 paddsw mm3, mm4 ; mm3 += mm4
66 movq mm4, mm5 ; mm4 = p-2..p5;
67 psrlq mm5, 16 ; mm5 = p0..p5;
68 punpcklbw mm5, mm0 ; mm5 = p0..p3 (as words)
69 pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers
70 paddsw mm3, mm5 ; mm3 += mm5
72 movq mm5, mm4 ; mm5 = p-2..p5
73 psrlq mm4, 24 ; mm4 = p1..p5
74 punpcklbw mm4, mm0 ; mm4 = p1..p4 (as words)
75 pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers
76 paddsw mm3, mm4 ; mm3 += mm4
78 ; do outer positive taps
80 punpcklbw mm4, mm0 ; mm4 = p3..p6 -- NOTE(review): the load of p3..p6 into mm4 is not visible in this excerpt
81 pmullw mm4, [rdx+80] ; mm4 *= kernel 5 modifiers (table row at offset 80)
82 paddsw mm3, mm4 ; mm3 += mm4
84 punpcklbw mm5, mm0 ; mm5 = p-2..p1 (as words)
85 pmullw mm5, [rdx] ; mm5 *= kernel 0 modifiers (table row at offset 0)
86 paddsw mm3, mm5 ; mm3 += mm5
88 paddsw mm3, [GLOBAL(rd)] ; mm3 += round value
89 psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
90 packuswb mm3, mm0 ; pack to bytes with unsigned saturation; NOTE(review): a following unpack back to words is expected (output_ptr is unsigned short*) but is not visible here
93 movq [rdi], mm3 ; store the results in the destination
; NOTE(review): the next three instructions look like two alternative forms
; of "rsi += src_pixels_per_line" (memory-operand form vs. sign-extended
; register form), presumably selected by a conditional-assembly directive
; that is not visible in this excerpt -- confirm.
96 add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line
99 movsxd r8, dword ptr arg(2) ;src_pixels_per_line
102 add rsi, r8 ; next line
105 dec rcx ; decrement count
106 jnz .nextrow ; next row (label not visible in this excerpt)
117 ;void vp8_filter_block1dc_v6_mmx
120 ; unsigned char *output_ptr,
122 ; unsigned int pixels_per_line,
123 ; unsigned int pixel_step,
124 ; unsigned int output_height,
125 ; unsigned int output_width,
;
; Vertical 6-tap filter (MMX). The code below loads six rows of four 16-bit
; values (rows -2..3 relative to the output row, rdx bytes apart), multiplies
; each by one 16-byte row of the filter table, accumulates with signed
; saturation, adds the rounding constant, shifts right by VP8_FILTER_SHIFT,
; packs with unsigned saturation and stores 4 bytes.
;
; Arguments as read by the code:
;   arg(0) src_ptr          arg(1) output_ptr
;   arg(2) output_pitch     arg(3) pixels_per_line
;   arg(5) output_height    arg(7) vp8_filter
;   (The signature comment above lists only some of these.)
;
; Register roles:
;   rsi = source pointer    rdi = output pointer
;   rdx = source row step   rcx = row counter
;   rax = output_pitch      rbx = filter table base
;   mm0 = zero              mm5 = rounding constant rd
;   mm1, mm2, mm6, mm7 = filter rows 1, 2, 3, 4
;
; NOTE(review): rbx is callee-saved in the common x86 ABIs; its save/restore,
; the rest of the prologue/epilogue, the .nextrow_cv label and the output
; pointer advance are not visible in this excerpt -- verify in the full file.
128 global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
129 sym(vp8_filter_block1dc_v6_mmx):
132 SHADOW_ARGS_TO_STACK 8
138 movq mm5, [GLOBAL(rd)]
140 mov rbx, arg(7) ;vp8_filter
141 movq mm1, [rbx + 16] ; mm1 = kernel 1 (original note: do both the negative taps first)
142 movq mm2, [rbx + 32] ; mm2 = kernel 2
143 movq mm6, [rbx + 48] ; mm6 = kernel 3
144 movq mm7, [rbx + 64] ; mm7 = kernel 4
146 movsxd rdx, dword ptr arg(3) ;pixels_per_line
147 mov rdi, arg(1) ;output_ptr
148 mov rsi, arg(0) ;src_ptr
151 movsxd rcx, DWORD PTR arg(5) ;output_height (row counter)
152 movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch?
153 pxor mm0, mm0 ; mm0 = 00000000
157 movq mm3, [rsi+rdx] ; mm3 = p0..p3 = row -1
158 pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
161 movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2
162 pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers.
163 paddsw mm3, mm4 ; mm3 += mm4
165 movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0
166 pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers.
167 paddsw mm3, mm4 ; mm3 += mm4
169 movq mm4, [rsi] ; mm4 = p0..p3 = row -2
170 pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers.
171 paddsw mm3, mm4 ; mm3 += mm4
174 add rsi, rdx ; move source forward 1 line to avoid 3 * pitch
175 movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1
176 pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers.
177 paddsw mm3, mm4 ; mm3 += mm4
179 movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3
180 pmullw mm4, [rbx +80] ; mm4 *= kernel 5 modifiers (table row at offset 80).
181 paddsw mm3, mm4 ; mm3 += mm4
184 paddsw mm3, mm5 ; mm3 += round value
185 psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
186 packuswb mm3, mm0 ; pack and saturate
188 movd [rdi],mm3 ; store the results (4 bytes) in the destination
189 ; the subsequent iterations repeat 3 out of 4 of these reads. Since the
190 ; recon block should be in cache this shouldn't cost much. It's obviously ... (NOTE(review): the rest of this comment is missing from this excerpt)
193 dec rcx ; decrement count
194 jnz .nextrow_cv ; next row (label not visible in this excerpt)
207 ;void bilinear_predict8x8_mmx
209 ; unsigned char *src_ptr,
210 ; int src_pixels_per_line,
213 ; unsigned char *dst_ptr,
;
; 8x8 bilinear prediction (MMX).
;
; Arguments as read by the code:
;   arg(0) src_ptr   arg(1) src_pixels_per_line   arg(2) xoffset
;   arg(3) yoffset   arg(4) dst_ptr               arg(5) dst_pitch
;
; Register roles visible below:
;   rsi = source pointer, rdi = destination pointer,
;   rdx = dst_pitch at first, then src_pixels_per_line,
;   rax = HFilter at first, then VFilter,
;   rcx = filter table base at first, then dst_ptr + 8 * dst_pitch.
;
; NOTE(review): this excerpt is missing many lines of the routine (prologue
; and epilogue, clearing of mm0, the loads of the filter rows into MMX
; registers, the horizontal multiplies, the loop label/compare/branch, and
; the final pack). The comments describe only what is visible.
216 global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
217 sym(vp8_bilinear_predict8x8_mmx):
220 SHADOW_ARGS_TO_STACK 6
226 ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
227 ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
229 movsxd rax, dword ptr arg(2) ;xoffset
230 mov rdi, arg(4) ;dst_ptr
232 shl rax, 5 ; offset * 32 (each table entry is 32 bytes)
233 lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
235 add rax, rcx ; rax = HFilter
236 mov rsi, arg(0) ;src_ptr
238 movsxd rdx, dword ptr arg(5) ;dst_pitch
242 movsxd rax, dword ptr arg(3) ;yoffset (overwrites HFilter; NOTE(review): HFilter is presumably consumed by lines not visible here)
246 shl rax, 5 ; offset*32
247 add rax, rcx ; rax = VFilter
249 lea rcx, [rdi+rdx*8] ; rcx = dst_ptr + 8 * dst_pitch (address just past the 8th destination row)
250 movsxd rdx, dword ptr arg(1) ;src_pixels_per_line (rdx is now the source row step)
254 ; get the first horizontal line done
255 movq mm3, [rsi] ; mm3 = 8 source bytes of the current line
256 movq mm4, mm3 ; make a copy of current line
258 punpcklbw mm3, mm0 ; widen low 4 bytes to words (assumes mm0 == 0; its clearing is not visible here)
276 paddw mm3, [GLOBAL(rd)] ; mm3 += round value
277 psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
279 paddw mm4, [GLOBAL(rd)] ; mm4 += round value
280 psraw mm4, VP8_FILTER_SHIFT ; mm4 /= 128
285 add rsi, rdx ; next line
287 movq mm3, [rsi] ; mm3 = 8 source bytes of the current line
288 movq mm4, mm3 ; make a copy of current line
290 punpcklbw mm3, mm0 ; widen low 4 bytes to words (assumes mm0 == 0)
317 paddw mm3, [GLOBAL(rd)] ; mm3 += round value
318 psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
320 paddw mm4, [GLOBAL(rd)] ; mm4 += round value
321 psraw mm4, VP8_FILTER_SHIFT ; mm4 /= 128
327 pmullw mm3, [rax+16] ; mm3 *= 16-byte row at VFilter + 16
328 pmullw mm4, [rax+16] ; mm4 *= 16-byte row at VFilter + 16
334 paddw mm3, [GLOBAL(rd)] ; mm3 += round value
335 psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
337 paddw mm4, [GLOBAL(rd)] ; mm4 += round value
338 psraw mm4, VP8_FILTER_SHIFT ; mm4 /= 128
342 movq [rdi], mm3 ; store the results (8 bytes) in the destination
; NOTE(review): the remaining instructions look like two alternative forms of
; "rsi += src_pixels_per_line; rdi += dst_pitch" (memory-operand form vs.
; sign-extended register form), presumably selected by a conditional-assembly
; directive that is not visible in this excerpt -- confirm.
345 add rsi, rdx ; next line
346 add rdi, dword ptr arg(5) ;dst_pitch
348 movsxd r8, dword ptr arg(5) ;dst_pitch
349 add rsi, rdx ; next line
350 add rdi, r8 ;dst_pitch
364 ;void bilinear_predict8x4_mmx
366 ; unsigned char *src_ptr,
367 ; int src_pixels_per_line,
370 ; unsigned char *dst_ptr,
;
; 8x4 bilinear prediction (MMX).
;
; Arguments as read by the code:
;   arg(0) src_ptr   arg(1) src_pixels_per_line   arg(2) xoffset
;   arg(3) yoffset   arg(4) dst_ptr               arg(5) dst_pitch
;
; NOTE(review): this excerpt is missing many lines of the routine (prologue
; and epilogue, clearing of mm0, the shl/add that turn the offsets into
; HFilter/VFilter pointers, the filter loads, the horizontal multiplies, the
; loop label/compare/branch, the final pack and the 64-bit destination
; advance). The comments describe only what is visible.
373 global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
374 sym(vp8_bilinear_predict8x4_mmx):
377 SHADOW_ARGS_TO_STACK 6
383 ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
384 ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
386 movsxd rax, dword ptr arg(2) ;xoffset
387 mov rdi, arg(4) ;dst_ptr
389 lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
392 mov rsi, arg(0) ;src_ptr
395 movsxd rdx, dword ptr arg(5) ;dst_pitch
399 movsxd rax, dword ptr arg(3) ;yoffset
405 lea rcx, [rdi+rdx*4] ; rcx = dst_ptr + 4 * dst_pitch (address just past the 4th destination row)
407 movsxd rdx, dword ptr arg(1) ;src_pixels_per_line (rdx is now the source row step)
409 ; get the first horizontal line done
410 movq mm3, [rsi] ; mm3 = 8 source bytes of the current line
411 movq mm4, mm3 ; make a copy of current line
413 punpcklbw mm3, mm0 ; widen low 4 bytes to words (assumes mm0 == 0; its clearing is not visible here)
431 paddw mm3, [GLOBAL(rd)] ; mm3 += round value
432 psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
434 paddw mm4, [GLOBAL(rd)] ; mm4 += round value
435 psraw mm4, VP8_FILTER_SHIFT ; mm4 /= 128
440 add rsi, rdx ; next line
442 movq mm3, [rsi] ; mm3 = 8 source bytes of the current line
443 movq mm4, mm3 ; make a copy of current line
445 punpcklbw mm3, mm0 ; widen low 4 bytes to words (assumes mm0 == 0)
472 paddw mm3, [GLOBAL(rd)] ; mm3 += round value
473 psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
475 paddw mm4, [GLOBAL(rd)] ; mm4 += round value
476 psraw mm4, VP8_FILTER_SHIFT ; mm4 /= 128
482 pmullw mm3, [rax+16] ; mm3 *= 16-byte row at rax + 16 (NOTE(review): rax is expected to be VFilter here; only the yoffset load is visible)
483 pmullw mm4, [rax+16] ; mm4 *= 16-byte row at rax + 16
489 paddw mm3, [GLOBAL(rd)] ; mm3 += round value
490 psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
492 paddw mm4, [GLOBAL(rd)] ; mm4 += round value
493 psraw mm4, VP8_FILTER_SHIFT ; mm4 /= 128
497 movq [rdi], mm3 ; store the results (8 bytes) in the destination
; NOTE(review): the remaining instructions look like two alternative forms of
; the per-row pointer advance (memory-operand form vs. sign-extended register
; form), presumably selected by a conditional-assembly directive that is not
; visible in this excerpt -- confirm.
500 add rsi, rdx ; next line
501 add rdi, dword ptr arg(5) ;dst_pitch
503 movsxd r8, dword ptr arg(5) ;dst_pitch
504 add rsi, rdx ; next line
519 ;void bilinear_predict4x4_mmx
521 ; unsigned char *src_ptr,
522 ; int src_pixels_per_line,
525 ; unsigned char *dst_ptr,
;
; 4x4 bilinear prediction (MMX). Unlike the 8-wide variants, each row is
; loaded and stored with movd (4 bytes) and only mm3 is processed.
;
; Arguments as read by the code:
;   arg(0) src_ptr   arg(1) src_pixels_per_line   arg(2) xoffset
;   arg(3) yoffset   arg(4) dst_ptr               arg(5) dst_pitch
;
; NOTE(review): this excerpt is missing many lines of the routine (prologue
; and epilogue, clearing of mm0, the shl that scales the offsets, the filter
; loads, the horizontal multiplies, the loop label/compare/branch, the final
; pack and the 64-bit destination advance). The comments describe only what
; is visible.
528 global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
529 sym(vp8_bilinear_predict4x4_mmx):
532 SHADOW_ARGS_TO_STACK 6
538 ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
539 ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
541 movsxd rax, dword ptr arg(2) ;xoffset
542 mov rdi, arg(4) ;dst_ptr
544 lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
547 add rax, rcx ; HFilter (NOTE(review): the scaling of xoffset before this add is not visible here)
548 mov rsi, arg(0) ;src_ptr
550 movsxd rdx, dword ptr arg(5) ;dst_pitch
554 movsxd rax, dword ptr arg(3) ;yoffset
560 lea rcx, [rdi+rdx*4] ; rcx = dst_ptr + 4 * dst_pitch (address just past the 4th destination row)
562 movsxd rdx, dword ptr arg(1) ;src_pixels_per_line (rdx is now the source row step)
564 ; get the first horizontal line done
565 movd mm3, [rsi] ; mm3 = 4 source bytes of the current line
566 punpcklbw mm3, mm0 ; widen the 4 bytes to words (assumes mm0 == 0; its clearing is not visible here)
575 paddw mm3, [GLOBAL(rd)] ; mm3 += round value
577 psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
582 add rsi, rdx ; next line
584 movd mm3, [rsi] ; mm3 = 4 source bytes of the current line
585 punpcklbw mm3, mm0 ; widen the 4 bytes to words (assumes mm0 == 0)
599 paddw mm3, [GLOBAL(rd)] ; mm3 += round value
601 psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
606 pmullw mm3, [rax+16] ; mm3 *= 16-byte row at rax + 16 (NOTE(review): rax is expected to be VFilter here; only the yoffset load is visible)
610 paddw mm3, [GLOBAL(rd)] ; mm3 += round value
611 psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
614 movd [rdi], mm3 ; store the results (4 bytes) in the destination
; NOTE(review): the remaining instructions look like two alternative forms of
; the per-row pointer advance (memory-operand form vs. sign-extended register
; form), presumably selected by a conditional-assembly directive that is not
; visible in this excerpt -- confirm.
617 add rsi, rdx ; next line
618 add rdi, dword ptr arg(5) ;dst_pitch
620 movsxd r8, dword ptr arg(5) ;dst_pitch
621 add rsi, rdx ; next line
644 global HIDDEN_DATA(sym(vp8_six_tap_mmx))
645 sym(vp8_six_tap_mmx):