2 ; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
14 ;Note: tap3 and tap4 have to be applied and added after the other taps to avoid
;overflow.
; HIGH_GET_FILTERS_4: filter and clamp setup for the 4-wide routines.
; Reads arg(5) (filter pointer) and arg(6) (bps).
; Writes rdx, rcx, xmm0-xmm7 and the stack slots named max and min, which
; the calling routine must have %define'd.
; The filter table is read with movdqa, so arg(5) must be 16-byte aligned.
17 %macro HIGH_GET_FILTERS_4 0
18 mov rdx, arg(5) ;filter ptr
21 movdqa xmm7, [rdx] ;load filters (aligned 16-byte load)
; pshuflw replicates the selected word across the low four words.
22 pshuflw xmm0, xmm7, 0b ;k0
23 pshuflw xmm1, xmm7, 01010101b ;k1
24 pshuflw xmm2, xmm7, 10101010b ;k2
25 pshuflw xmm3, xmm7, 11111111b ;k3
; NOTE(review): the next four shuffles reuse the word selectors of k0..k3 on
; xmm7, so they only yield k4..k7 if the upper four filter words have first
; been moved into the low half of xmm7 - confirm that step is present.
27 pshuflw xmm4, xmm7, 0b ;k4
28 pshuflw xmm5, xmm7, 01010101b ;k5
29 pshuflw xmm6, xmm7, 10101010b ;k6
30 pshuflw xmm7, xmm7, 11111111b ;k7
46 ;Compute max and min values of a pixel
48 movsxd rcx, DWORD PTR arg(6) ;bps (sign-extended 32-bit argument)
56 movdqa max, xmm0 ;max value (for clamping)
57 movdqa min, xmm1 ;min value (for clamping)
; HIGH_APPLY_FILTER_4 %1: filter arithmetic for the 4-wide routines.
; %1: call sites pass 0 from the plain routines and 1 from the *_avg routines.
; Uses the stack slots k0k6 and krd, which the calling routine must have
; %define'd.
61 %macro HIGH_APPLY_FILTER_4 1
62 punpcklwd xmm0, xmm6 ;interleave the low words of xmm0 and xmm6 (two rows in one register)
67 pmaddwd xmm0, k0k6 ;multiply word pairs by the filter factors, summing each pair into a dword
76 paddd xmm0, krd ;rounding
78 packssdw xmm0, xmm0 ;pack dwords back to words (signed saturation)
; HIGH_GET_FILTERS: filter, rounding and clamp setup for the 8/16-wide routines.
; Reads arg(0) (src), arg(2) (output), arg(5) (filter pointer), arg(6) (bps).
; Writes rdx, rsi, rdi, rcx, xmm0-xmm7 and the stack slots k0k1, krd, max and
; min, which the calling routine must have %define'd.
; The filter table is read with movdqa, so arg(5) must be 16-byte aligned.
91 %macro HIGH_GET_FILTERS 0
92 mov rdx, arg(5) ;filter ptr
93 mov rsi, arg(0) ;src_ptr
94 mov rdi, arg(2) ;output_ptr
97 movdqa xmm7, [rdx] ;load filters (aligned 16-byte load)
; pshuflw replicates the selected word across the low four words.
98 pshuflw xmm0, xmm7, 0b ;k0
99 pshuflw xmm1, xmm7, 01010101b ;k1
100 pshuflw xmm2, xmm7, 10101010b ;k2
101 pshuflw xmm3, xmm7, 11111111b ;k3
; pshufhw does the same within the high four words.
102 pshufhw xmm4, xmm7, 0b ;k4
103 pshufhw xmm5, xmm7, 01010101b ;k5
104 pshufhw xmm6, xmm7, 10101010b ;k6
105 pshufhw xmm7, xmm7, 11111111b ;k7
106 punpcklqdq xmm2, xmm2 ;copy the low qword into both halves
107 punpcklqdq xmm3, xmm3 ;copy the low qword into both halves
113 movdqa k0k1, xmm0 ;store filter factors on stack
120 movdqa krd, xmm6 ;rounding
122 ;Compute max and min values of a pixel
124 movsxd rcx, DWORD PTR arg(6) ;bps (sign-extended 32-bit argument)
127 pshufd xmm0, xmm0, 0b ;broadcast dword 0 to all four dwords
132 movdqa max, xmm0 ;max value (for clamping)
133 movdqa min, xmm1 ;min value (for clamping)
; NOTE(review): the loads below reference macro parameter %1, but
; HIGH_GET_FILTERS is declared with 0 parameters; they presumably belong to a
; separate one-parameter macro - confirm where that macro begins.
; Each movdqu reads 16 bytes from one source row at byte offset %1; the
; trailing number is the row index. rax is the row stride in bytes; the
; routines that set rdx load it with 3 * rax.
137 movdqu xmm0, [rsi + %1] ;0
138 movdqu xmm1, [rsi + rax + %1] ;1
139 movdqu xmm6, [rsi + rdx * 2 + %1] ;6
; NOTE(review): the address expressions from here on match rows 7,2,3,4,5
; only if rsi has been advanced by one row (rax bytes) after the row-6 load -
; confirm that advance is present.
141 movdqu xmm7, [rsi + rdx * 2 + %1] ;7
142 movdqu xmm2, [rsi + rax + %1] ;2
143 movdqu xmm3, [rsi + rax * 2 + %1] ;3
144 movdqu xmm4, [rsi + rdx + %1] ;4
145 movdqu xmm5, [rsi + rax * 4 + %1] ;5
; HIGH_APPLY_FILTER_8 %1, %2: filter arithmetic and store for 8 outputs.
; %1: call sites pass 0 from the plain routines and 1 from the *_avg routines.
; %2: byte offset added to rdi for the destination access (call sites pass
;     0 or 16).
; Uses the stack slot krd, which the calling routine must have %define'd.
148 %macro HIGH_APPLY_FILTER_8 2
183 paddd xmm0, krd ;rounding
187 packssdw xmm0, xmm5 ;pack back to word: xmm0 dwords -> low 4 words, xmm5 dwords -> high 4 words (signed saturation)
; NOTE(review): the destination is read back into xmm1 here; presumably this
; is only needed by the averaging variant (%1 = 1) - confirm how it is guarded.
194 movdqu xmm1, [rdi + %2]
197 movdqu [rdi + %2], xmm0 ;unaligned 16-byte store of the 8 results
200 ;void vp9_highbd_filter_block1d4_v8_sse2
202 ; unsigned char *src_ptr,
203 ; unsigned int src_pitch,
204 ; unsigned char *output_ptr,
205 ; unsigned int out_pitch,
206 ; unsigned int output_height,
; NOTE(review): SHADOW_ARGS_TO_STACK below is given 7, but only five
; arguments are listed above. The filter setup macros in this file read
; arg(5) as the filter pointer and arg(6) as bps - presumably the remaining
; two; confirm.
; NOTE(review): the pitch is doubled below to get bytes per line, which
; suggests 2-byte samples rather than unsigned char - confirm pointer types.
209 global sym(vp9_highbd_filter_block1d4_v8_sse2) PRIVATE
210 sym(vp9_highbd_filter_block1d4_v8_sse2):
213 SHADOW_ARGS_TO_STACK 7
; 16-byte stack slots referenced by name from the 4-wide filter macros.
222 %define k0k6 [rsp + 16 * 0]
223 %define k2k5 [rsp + 16 * 1]
224 %define k3k4 [rsp + 16 * 2]
225 %define k1k7 [rsp + 16 * 3]
226 %define krd [rsp + 16 * 4]
227 %define max [rsp + 16 * 5]
228 %define min [rsp + 16 * 6]
232 mov rsi, arg(0) ;src_ptr
233 mov rdi, arg(2) ;output_ptr
235 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
; NOTE(review): rbx is callee-saved under the SysV and Win64 ABIs; confirm it
; is saved and restored around this use.
236 movsxd rbx, DWORD PTR arg(3) ;out_pitch
237 lea rax, [rax + rax] ;bytes per line
239 lea rdx, [rax + rax * 2] ;rdx = 3 * bytes per line
240 movsxd rcx, DWORD PTR arg(4) ;output_height
; Each movq reads 8 bytes from one source row; the trailing number is the
; row index.
243 movq xmm0, [rsi] ;load src: row 0
244 movq xmm1, [rsi + rax] ;1
245 movq xmm6, [rsi + rdx * 2] ;6
; NOTE(review): the address expressions from here on match rows 7,2,3,4,5
; only if rsi has been advanced by one row (rax bytes) after the row-6 load -
; confirm that advance is present.
247 movq xmm7, [rsi + rdx * 2] ;7
248 movq xmm2, [rsi + rax] ;2
249 movq xmm3, [rsi + rax * 2] ;3
250 movq xmm4, [rsi + rdx] ;4
251 movq xmm5, [rsi + rax * 4] ;5
253 HIGH_APPLY_FILTER_4 0 ;%1 = 0 (the *_avg routines pass 1)
270 ;void vp9_highbd_filter_block1d8_v8_sse2
272 ; unsigned char *src_ptr,
273 ; unsigned int src_pitch,
274 ; unsigned char *output_ptr,
275 ; unsigned int out_pitch,
276 ; unsigned int output_height,
; NOTE(review): SHADOW_ARGS_TO_STACK below is given 7, but only five
; arguments are listed above. The filter setup macros in this file read
; arg(5) as the filter pointer and arg(6) as bps - presumably the remaining
; two; confirm.
; NOTE(review): the pitch is doubled below to get bytes per line, which
; suggests 2-byte samples rather than unsigned char - confirm pointer types.
279 global sym(vp9_highbd_filter_block1d8_v8_sse2) PRIVATE
280 sym(vp9_highbd_filter_block1d8_v8_sse2):
283 SHADOW_ARGS_TO_STACK 7
; 16-byte stack slots referenced by name from the 8-wide filter macros.
292 %define k0k1 [rsp + 16 * 0]
293 %define k6k7 [rsp + 16 * 1]
294 %define k2k5 [rsp + 16 * 2]
295 %define k3k4 [rsp + 16 * 3]
296 %define krd [rsp + 16 * 4]
297 %define temp [rsp + 16 * 5]
298 %define max [rsp + 16 * 6]
299 %define min [rsp + 16 * 7]
; NOTE(review): rsi and rdi are not loaded by this routine's own code;
; HIGH_GET_FILTERS loads them from arg(0)/arg(2) - presumably it is invoked
; here; confirm.
303 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
; NOTE(review): rbx is callee-saved under the SysV and Win64 ABIs; confirm it
; is saved and restored around this use.
304 movsxd rbx, DWORD PTR arg(3) ;out_pitch
305 lea rax, [rax + rax] ;bytes per line
307 lea rdx, [rax + rax * 2] ;rdx = 3 * bytes per line
308 movsxd rcx, DWORD PTR arg(4) ;output_height
312 HIGH_APPLY_FILTER_8 0, 0 ;%1 = 0 (the *_avg routines pass 1), destination offset 0
329 ;void vp9_highbd_filter_block1d16_v8_sse2
331 ; unsigned char *src_ptr,
332 ; unsigned int src_pitch,
333 ; unsigned char *output_ptr,
334 ; unsigned int out_pitch,
335 ; unsigned int output_height,
; NOTE(review): SHADOW_ARGS_TO_STACK below is given 7, but only five
; arguments are listed above. The filter setup macros in this file read
; arg(5) as the filter pointer and arg(6) as bps - presumably the remaining
; two; confirm.
; NOTE(review): the pitch is doubled below to get bytes per line, which
; suggests 2-byte samples rather than unsigned char - confirm pointer types.
338 global sym(vp9_highbd_filter_block1d16_v8_sse2) PRIVATE
339 sym(vp9_highbd_filter_block1d16_v8_sse2):
342 SHADOW_ARGS_TO_STACK 7
; 16-byte stack slots referenced by name from the 8-wide filter macros.
351 %define k0k1 [rsp + 16 * 0]
352 %define k6k7 [rsp + 16 * 1]
353 %define k2k5 [rsp + 16 * 2]
354 %define k3k4 [rsp + 16 * 3]
355 %define krd [rsp + 16 * 4]
356 %define temp [rsp + 16 * 5]
357 %define max [rsp + 16 * 6]
358 %define min [rsp + 16 * 7]
; NOTE(review): rsi and rdi are not loaded by this routine's own code;
; HIGH_GET_FILTERS loads them from arg(0)/arg(2) - presumably it is invoked
; here; confirm.
362 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
; NOTE(review): rbx is callee-saved under the SysV and Win64 ABIs; confirm it
; is saved and restored around this use.
363 movsxd rbx, DWORD PTR arg(3) ;out_pitch
364 lea rax, [rax + rax] ;bytes per line
366 lea rdx, [rax + rax * 2] ;rdx = 3 * bytes per line
367 movsxd rcx, DWORD PTR arg(4) ;output_height
; The 16-wide row is produced as two 8-output halves, stored at destination
; byte offsets 0 and 16.
371 HIGH_APPLY_FILTER_8 0, 0 ;%1 = 0 (the *_avg routines pass 1), destination offset 0
375 HIGH_APPLY_FILTER_8 0, 16 ;second half, destination offset 16 bytes
; vp9_highbd_filter_block1d4_v8_avg_sse2: 4-wide vertical routine that invokes
; HIGH_APPLY_FILTER_4 with %1 = 1 (the non-avg counterpart passes 0).
; Reads arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
; arg(3) out_pitch, arg(4) output_height. SHADOW_ARGS_TO_STACK is given 7;
; the filter setup macros in this file read arg(5) as the filter pointer and
; arg(6) as bps.
392 global sym(vp9_highbd_filter_block1d4_v8_avg_sse2) PRIVATE
393 sym(vp9_highbd_filter_block1d4_v8_avg_sse2):
396 SHADOW_ARGS_TO_STACK 7
; 16-byte stack slots referenced by name from the 4-wide filter macros.
405 %define k0k6 [rsp + 16 * 0]
406 %define k2k5 [rsp + 16 * 1]
407 %define k3k4 [rsp + 16 * 2]
408 %define k1k7 [rsp + 16 * 3]
409 %define krd [rsp + 16 * 4]
410 %define max [rsp + 16 * 5]
411 %define min [rsp + 16 * 6]
415 mov rsi, arg(0) ;src_ptr
416 mov rdi, arg(2) ;output_ptr
418 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
; NOTE(review): rbx is callee-saved under the SysV and Win64 ABIs; confirm it
; is saved and restored around this use.
419 movsxd rbx, DWORD PTR arg(3) ;out_pitch
420 lea rax, [rax + rax] ;bytes per line
422 lea rdx, [rax + rax * 2] ;rdx = 3 * bytes per line
423 movsxd rcx, DWORD PTR arg(4) ;output_height
; Each movq reads 8 bytes from one source row; the trailing number is the
; row index.
426 movq xmm0, [rsi] ;load src: row 0
427 movq xmm1, [rsi + rax] ;1
428 movq xmm6, [rsi + rdx * 2] ;6
; NOTE(review): the address expressions from here on match rows 7,2,3,4,5
; only if rsi has been advanced by one row (rax bytes) after the row-6 load -
; confirm that advance is present.
430 movq xmm7, [rsi + rdx * 2] ;7
431 movq xmm2, [rsi + rax] ;2
432 movq xmm3, [rsi + rax * 2] ;3
433 movq xmm4, [rsi + rdx] ;4
434 movq xmm5, [rsi + rax * 4] ;5
436 HIGH_APPLY_FILTER_4 1 ;%1 = 1 (the non-avg routines pass 0)
; vp9_highbd_filter_block1d8_v8_avg_sse2: 8-wide vertical routine that invokes
; HIGH_APPLY_FILTER_8 with %1 = 1 (the non-avg counterpart passes 0).
; Reads arg(1) pixels_per_line, arg(3) out_pitch, arg(4) output_height.
; SHADOW_ARGS_TO_STACK is given 7; HIGH_GET_FILTERS reads arg(0), arg(2),
; arg(5) (filter pointer) and arg(6) (bps).
453 global sym(vp9_highbd_filter_block1d8_v8_avg_sse2) PRIVATE
454 sym(vp9_highbd_filter_block1d8_v8_avg_sse2):
457 SHADOW_ARGS_TO_STACK 7
; 16-byte stack slots referenced by name from the 8-wide filter macros.
466 %define k0k1 [rsp + 16 * 0]
467 %define k6k7 [rsp + 16 * 1]
468 %define k2k5 [rsp + 16 * 2]
469 %define k3k4 [rsp + 16 * 3]
470 %define krd [rsp + 16 * 4]
471 %define temp [rsp + 16 * 5]
472 %define max [rsp + 16 * 6]
473 %define min [rsp + 16 * 7]
; NOTE(review): rsi and rdi are not loaded by this routine's own code;
; HIGH_GET_FILTERS loads them from arg(0)/arg(2) - presumably it is invoked
; here; confirm.
477 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
; NOTE(review): rbx is callee-saved under the SysV and Win64 ABIs; confirm it
; is saved and restored around this use.
478 movsxd rbx, DWORD PTR arg(3) ;out_pitch
479 lea rax, [rax + rax] ;bytes per line
481 lea rdx, [rax + rax * 2] ;rdx = 3 * bytes per line
482 movsxd rcx, DWORD PTR arg(4) ;output_height
485 HIGH_APPLY_FILTER_8 1, 0 ;%1 = 1 (the non-avg routines pass 0), destination offset 0
; vp9_highbd_filter_block1d16_v8_avg_sse2: 16-wide vertical routine that
; invokes HIGH_APPLY_FILTER_8 twice with %1 = 1 (the non-avg counterpart
; passes 0), at destination byte offsets 0 and 16.
; Reads arg(1) pixels_per_line, arg(3) out_pitch, arg(4) output_height.
; SHADOW_ARGS_TO_STACK is given 7; HIGH_GET_FILTERS reads arg(0), arg(2),
; arg(5) (filter pointer) and arg(6) (bps).
502 global sym(vp9_highbd_filter_block1d16_v8_avg_sse2) PRIVATE
503 sym(vp9_highbd_filter_block1d16_v8_avg_sse2):
506 SHADOW_ARGS_TO_STACK 7
; 16-byte stack slots referenced by name from the 8-wide filter macros.
515 %define k0k1 [rsp + 16 * 0]
516 %define k6k7 [rsp + 16 * 1]
517 %define k2k5 [rsp + 16 * 2]
518 %define k3k4 [rsp + 16 * 3]
519 %define krd [rsp + 16 * 4]
520 %define temp [rsp + 16 * 5]
521 %define max [rsp + 16 * 6]
522 %define min [rsp + 16 * 7]
; NOTE(review): rsi and rdi are not loaded by this routine's own code;
; HIGH_GET_FILTERS loads them from arg(0)/arg(2) - presumably it is invoked
; here; confirm.
526 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
; NOTE(review): rbx is callee-saved under the SysV and Win64 ABIs; confirm it
; is saved and restored around this use.
527 movsxd rbx, DWORD PTR arg(3) ;out_pitch
528 lea rax, [rax + rax] ;bytes per line
530 lea rdx, [rax + rax * 2] ;rdx = 3 * bytes per line
531 movsxd rcx, DWORD PTR arg(4) ;output_height
534 HIGH_APPLY_FILTER_8 1, 0 ;%1 = 1 (the non-avg routines pass 0), destination offset 0
538 HIGH_APPLY_FILTER_8 1, 16 ;second half, destination offset 16 bytes
555 ;void vp9_highbd_filter_block1d4_h8_sse2
557 ; unsigned char *src_ptr,
558 ; unsigned int src_pixels_per_line,
559 ; unsigned char *output_ptr,
560 ; unsigned int output_pitch,
561 ; unsigned int output_height,
; NOTE(review): SHADOW_ARGS_TO_STACK below is given 7, but only five
; arguments are listed above. The filter setup macros in this file read
; arg(5) as the filter pointer and arg(6) as bps - presumably the remaining
; two; confirm.
; NOTE(review): the pitch is doubled below to get bytes per line, which
; suggests 2-byte samples rather than unsigned char - confirm pointer types.
564 global sym(vp9_highbd_filter_block1d4_h8_sse2) PRIVATE
565 sym(vp9_highbd_filter_block1d4_h8_sse2):
568 SHADOW_ARGS_TO_STACK 7
; 16-byte stack slots referenced by name from the 4-wide filter macros.
576 %define k0k6 [rsp + 16 * 0]
577 %define k2k5 [rsp + 16 * 1]
578 %define k3k4 [rsp + 16 * 2]
579 %define k1k7 [rsp + 16 * 3]
580 %define krd [rsp + 16 * 4]
581 %define max [rsp + 16 * 5]
582 %define min [rsp + 16 * 6]
586 mov rsi, arg(0) ;src_ptr
587 mov rdi, arg(2) ;output_ptr
589 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
590 movsxd rdx, DWORD PTR arg(3) ;out_pitch
591 lea rax, [rax + rax] ;bytes per line
593 movsxd rcx, DWORD PTR arg(4) ;output_height
; Two unaligned 16-byte loads: one starting 6 bytes before rsi and one
; starting 2 bytes after it (8 bytes further on).
596 movdqu xmm0, [rsi - 6] ;load src
597 movdqu xmm4, [rsi + 2]
612 HIGH_APPLY_FILTER_4 0 ;%1 = 0 (the *_avg routines pass 1)
630 ;void vp9_highbd_filter_block1d8_h8_sse2
632 ; unsigned char *src_ptr,
633 ; unsigned int src_pixels_per_line,
634 ; unsigned char *output_ptr,
635 ; unsigned int output_pitch,
636 ; unsigned int output_height,
; NOTE(review): SHADOW_ARGS_TO_STACK below is given 7, but only five
; arguments are listed above. The filter setup macros in this file read
; arg(5) as the filter pointer and arg(6) as bps - presumably the remaining
; two; confirm.
; NOTE(review): the pitch is doubled below to get bytes per line, which
; suggests 2-byte samples rather than unsigned char - confirm pointer types.
639 global sym(vp9_highbd_filter_block1d8_h8_sse2) PRIVATE
640 sym(vp9_highbd_filter_block1d8_h8_sse2):
643 SHADOW_ARGS_TO_STACK 7
; 16-byte stack slots referenced by name from the 8-wide filter macros.
651 %define k0k1 [rsp + 16 * 0]
652 %define k6k7 [rsp + 16 * 1]
653 %define k2k5 [rsp + 16 * 2]
654 %define k3k4 [rsp + 16 * 3]
655 %define krd [rsp + 16 * 4]
656 %define temp [rsp + 16 * 5]
657 %define max [rsp + 16 * 6]
658 %define min [rsp + 16 * 7]
; NOTE(review): rsi and rdi are not loaded by this routine's own code;
; HIGH_GET_FILTERS loads them from arg(0)/arg(2) - presumably it is invoked
; here; confirm.
662 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
663 movsxd rdx, DWORD PTR arg(3) ;out_pitch
664 lea rax, [rax + rax] ;bytes per line
666 movsxd rcx, DWORD PTR arg(4) ;output_height
; Unaligned 16-byte loads at byte offsets stepping by 2 from -6 to +8.
669 movdqu xmm0, [rsi - 6] ;load src
670 movdqu xmm1, [rsi - 4]
671 movdqu xmm2, [rsi - 2]
; NOTE(review): no load of xmm3 appears in this sequence (offset 0 is
; skipped) - confirm xmm3 is loaded from [rsi].
673 movdqu xmm4, [rsi + 2]
674 movdqu xmm5, [rsi + 4]
675 movdqu xmm6, [rsi + 6]
676 movdqu xmm7, [rsi + 8]
678 HIGH_APPLY_FILTER_8 0, 0 ;%1 = 0 (the *_avg routines pass 1), destination offset 0
696 ;void vp9_highbd_filter_block1d16_h8_sse2
698 ; unsigned char *src_ptr,
699 ; unsigned int src_pixels_per_line,
700 ; unsigned char *output_ptr,
701 ; unsigned int output_pitch,
702 ; unsigned int output_height,
; NOTE(review): SHADOW_ARGS_TO_STACK below is given 7, but only five
; arguments are listed above. The filter setup macros in this file read
; arg(5) as the filter pointer and arg(6) as bps - presumably the remaining
; two; confirm.
; NOTE(review): the pitch is doubled below to get bytes per line, which
; suggests 2-byte samples rather than unsigned char - confirm pointer types.
705 global sym(vp9_highbd_filter_block1d16_h8_sse2) PRIVATE
706 sym(vp9_highbd_filter_block1d16_h8_sse2):
709 SHADOW_ARGS_TO_STACK 7
; 16-byte stack slots referenced by name from the 8-wide filter macros.
717 %define k0k1 [rsp + 16 * 0]
718 %define k6k7 [rsp + 16 * 1]
719 %define k2k5 [rsp + 16 * 2]
720 %define k3k4 [rsp + 16 * 3]
721 %define krd [rsp + 16 * 4]
722 %define temp [rsp + 16 * 5]
723 %define max [rsp + 16 * 6]
724 %define min [rsp + 16 * 7]
; NOTE(review): rsi and rdi are not loaded by this routine's own code;
; HIGH_GET_FILTERS loads them from arg(0)/arg(2) - presumably it is invoked
; here; confirm.
728 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
729 movsxd rdx, DWORD PTR arg(3) ;out_pitch
730 lea rax, [rax + rax] ;bytes per line
732 movsxd rcx, DWORD PTR arg(4) ;output_height
; First half: unaligned 16-byte loads at byte offsets stepping by 2 from
; -6 to +8.
735 movdqu xmm0, [rsi - 6] ;load src
736 movdqu xmm1, [rsi - 4]
737 movdqu xmm2, [rsi - 2]
; NOTE(review): no load of xmm3 appears in this first sequence (offset 0 is
; skipped), although the second sequence loads xmm3 from [rsi + 16] - confirm
; xmm3 is loaded from [rsi].
739 movdqu xmm4, [rsi + 2]
740 movdqu xmm5, [rsi + 4]
741 movdqu xmm6, [rsi + 6]
742 movdqu xmm7, [rsi + 8]
744 HIGH_APPLY_FILTER_8 0, 0 ;%1 = 0 (the *_avg routines pass 1), destination offset 0
; Second half: the same load pattern shifted by 16 bytes (offsets 10..24),
; written to destination byte offset 16.
746 movdqu xmm0, [rsi + 10] ;load src
747 movdqu xmm1, [rsi + 12]
748 movdqu xmm2, [rsi + 14]
749 movdqu xmm3, [rsi + 16]
750 movdqu xmm4, [rsi + 18]
751 movdqu xmm5, [rsi + 20]
752 movdqu xmm6, [rsi + 22]
753 movdqu xmm7, [rsi + 24]
755 HIGH_APPLY_FILTER_8 0, 16 ;second half, destination offset 16 bytes
; vp9_highbd_filter_block1d4_h8_avg_sse2: 4-wide horizontal routine that
; invokes HIGH_APPLY_FILTER_4 with %1 = 1 (the non-avg counterpart passes 0).
; Reads arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
; arg(3) out_pitch, arg(4) output_height. SHADOW_ARGS_TO_STACK is given 7;
; the filter setup macros in this file read arg(5) as the filter pointer and
; arg(6) as bps.
773 global sym(vp9_highbd_filter_block1d4_h8_avg_sse2) PRIVATE
774 sym(vp9_highbd_filter_block1d4_h8_avg_sse2):
777 SHADOW_ARGS_TO_STACK 7
; 16-byte stack slots referenced by name from the 4-wide filter macros.
785 %define k0k6 [rsp + 16 * 0]
786 %define k2k5 [rsp + 16 * 1]
787 %define k3k4 [rsp + 16 * 2]
788 %define k1k7 [rsp + 16 * 3]
789 %define krd [rsp + 16 * 4]
790 %define max [rsp + 16 * 5]
791 %define min [rsp + 16 * 6]
795 mov rsi, arg(0) ;src_ptr
796 mov rdi, arg(2) ;output_ptr
798 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
799 movsxd rdx, DWORD PTR arg(3) ;out_pitch
800 lea rax, [rax + rax] ;bytes per line
802 movsxd rcx, DWORD PTR arg(4) ;output_height
; Two unaligned 16-byte loads: one starting 6 bytes before rsi and one
; starting 2 bytes after it (8 bytes further on).
805 movdqu xmm0, [rsi - 6] ;load src
806 movdqu xmm4, [rsi + 2]
821 HIGH_APPLY_FILTER_4 1 ;%1 = 1 (the non-avg routines pass 0)
; vp9_highbd_filter_block1d8_h8_avg_sse2: 8-wide horizontal routine that
; invokes HIGH_APPLY_FILTER_8 with %1 = 1 (the non-avg counterpart passes 0).
; Reads arg(1) pixels_per_line, arg(3) out_pitch, arg(4) output_height.
; SHADOW_ARGS_TO_STACK is given 7; HIGH_GET_FILTERS reads arg(0), arg(2),
; arg(5) (filter pointer) and arg(6) (bps).
839 global sym(vp9_highbd_filter_block1d8_h8_avg_sse2) PRIVATE
840 sym(vp9_highbd_filter_block1d8_h8_avg_sse2):
843 SHADOW_ARGS_TO_STACK 7
; 16-byte stack slots referenced by name from the 8-wide filter macros.
851 %define k0k1 [rsp + 16 * 0]
852 %define k6k7 [rsp + 16 * 1]
853 %define k2k5 [rsp + 16 * 2]
854 %define k3k4 [rsp + 16 * 3]
855 %define krd [rsp + 16 * 4]
856 %define temp [rsp + 16 * 5]
857 %define max [rsp + 16 * 6]
858 %define min [rsp + 16 * 7]
; NOTE(review): rsi and rdi are not loaded by this routine's own code;
; HIGH_GET_FILTERS loads them from arg(0)/arg(2) - presumably it is invoked
; here; confirm.
862 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
863 movsxd rdx, DWORD PTR arg(3) ;out_pitch
864 lea rax, [rax + rax] ;bytes per line
866 movsxd rcx, DWORD PTR arg(4) ;output_height
; Unaligned 16-byte loads at byte offsets stepping by 2 from -6 to +8.
869 movdqu xmm0, [rsi - 6] ;load src
870 movdqu xmm1, [rsi - 4]
871 movdqu xmm2, [rsi - 2]
; NOTE(review): no load of xmm3 appears in this sequence (offset 0 is
; skipped) - confirm xmm3 is loaded from [rsi].
873 movdqu xmm4, [rsi + 2]
874 movdqu xmm5, [rsi + 4]
875 movdqu xmm6, [rsi + 6]
876 movdqu xmm7, [rsi + 8]
878 HIGH_APPLY_FILTER_8 1, 0 ;%1 = 1 (the non-avg routines pass 0), destination offset 0
; vp9_highbd_filter_block1d16_h8_avg_sse2: 16-wide horizontal routine that
; invokes HIGH_APPLY_FILTER_8 twice with %1 = 1 (the non-avg counterpart
; passes 0), at destination byte offsets 0 and 16.
; Reads arg(1) pixels_per_line, arg(3) out_pitch, arg(4) output_height.
; SHADOW_ARGS_TO_STACK is given 7; HIGH_GET_FILTERS reads arg(0), arg(2),
; arg(5) (filter pointer) and arg(6) (bps).
896 global sym(vp9_highbd_filter_block1d16_h8_avg_sse2) PRIVATE
897 sym(vp9_highbd_filter_block1d16_h8_avg_sse2):
900 SHADOW_ARGS_TO_STACK 7
; 16-byte stack slots referenced by name from the 8-wide filter macros.
908 %define k0k1 [rsp + 16 * 0]
909 %define k6k7 [rsp + 16 * 1]
910 %define k2k5 [rsp + 16 * 2]
911 %define k3k4 [rsp + 16 * 3]
912 %define krd [rsp + 16 * 4]
913 %define temp [rsp + 16 * 5]
914 %define max [rsp + 16 * 6]
915 %define min [rsp + 16 * 7]
; NOTE(review): rsi and rdi are not loaded by this routine's own code;
; HIGH_GET_FILTERS loads them from arg(0)/arg(2) - presumably it is invoked
; here; confirm.
919 movsxd rax, DWORD PTR arg(1) ;pixels_per_line
920 movsxd rdx, DWORD PTR arg(3) ;out_pitch
921 lea rax, [rax + rax] ;bytes per line
923 movsxd rcx, DWORD PTR arg(4) ;output_height
; First half: unaligned 16-byte loads at byte offsets stepping by 2 from
; -6 to +8.
926 movdqu xmm0, [rsi - 6] ;load src
927 movdqu xmm1, [rsi - 4]
928 movdqu xmm2, [rsi - 2]
; NOTE(review): no load of xmm3 appears in this first sequence (offset 0 is
; skipped), although the second sequence loads xmm3 from [rsi + 16] - confirm
; xmm3 is loaded from [rsi].
930 movdqu xmm4, [rsi + 2]
931 movdqu xmm5, [rsi + 4]
932 movdqu xmm6, [rsi + 6]
933 movdqu xmm7, [rsi + 8]
935 HIGH_APPLY_FILTER_8 1, 0 ;%1 = 1 (the non-avg routines pass 0), destination offset 0
; Second half: the same load pattern shifted by 16 bytes (offsets 10..24),
; written to destination byte offset 16.
937 movdqu xmm0, [rsi + 10] ;load src
938 movdqu xmm1, [rsi + 12]
939 movdqu xmm2, [rsi + 14]
940 movdqu xmm3, [rsi + 16]
941 movdqu xmm4, [rsi + 18]
942 movdqu xmm5, [rsi + 20]
943 movdqu xmm6, [rsi + 22]
944 movdqu xmm7, [rsi + 24]
946 HIGH_APPLY_FILTER_8 1, 16 ;second half, destination offset 16 bytes