2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
; Vertical 8-tap filter body, 4-pixel-wide variant: source rows are fetched
; with 4-byte movd loads. Only part of the body is visible in this view; the
; enclosing macro header, the loop control and the multiply/accumulate steps
; are not shown.
; Stacked arguments (per the existing comments): arg(0)=src_ptr,
; arg(1)=pixels_per_line, arg(2)=output_ptr, arg(3)=out_pitch,
; arg(4)=output_height, arg(5)=filter ptr.
15 mov rdx, arg(5) ;filter ptr
16 mov rsi, arg(0) ;src_ptr
17 mov rdi, arg(2) ;output_ptr
; movdqa is an aligned load, so the filter table must be 16-byte aligned.
20 movdqa xmm4, [rdx] ;load filters
; Each pshuflw replicates one 16-bit word of xmm4 (word 0, 1, 2, 3 in turn)
; across the low four words of its destination.
; NOTE(review): the k0_k1 style labels imply each word already holds two
; byte-sized taps; the packing step is not visible here - confirm.
23 pshuflw xmm0, xmm4, 0b ;k0_k1
24 pshuflw xmm1, xmm4, 01010101b ;k2_k3
25 pshuflw xmm2, xmm4, 10101010b ;k4_k5
26 pshuflw xmm3, xmm4, 11111111b ;k6_k7
; rdx is reused: from here on it holds the source pitch, not the filter ptr.
40 movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
43 movsxd r8, DWORD PTR arg(3) ;out_pitch
46 movsxd rcx, DWORD PTR arg(4) ;output_height
; rbx = 5*pitch + pitch = 6*pitch; used below to address rows G and H.
; NOTE(review): rbx is callee-saved in the SysV and Win64 ABIs; its
; save/restore is not visible in these lines - confirm.
49 lea rbx, [rdx + rdx*4]
50 add rbx, rdx ;pitch * 6
; Row loads, 4 bytes each. rax is not assigned in the visible lines; the
; D/F/H labels are consistent with rax = rsi + pitch - TODO confirm.
54 movd xmm1, [rsi + rdx] ;B
55 movd xmm2, [rsi + rdx * 2] ;C
56 movd xmm3, [rax + rdx * 2] ;D
57 movd xmm4, [rsi + rdx * 4] ;E
58 movd xmm5, [rax + rdx * 4] ;F
; Interleave the bytes of two adjacent rows so every 16-bit lane holds a
; (row n, row n+1) pixel pair.
; NOTE(review): the load of row A into xmm0 is not visible here.
60 punpcklbw xmm0, xmm1 ;A B
61 punpcklbw xmm2, xmm3 ;C D
62 punpcklbw xmm4, xmm5 ;E F
64 movd xmm6, [rsi + rbx] ;G
65 movd xmm7, [rax + rbx] ;H
69 punpcklbw xmm6, xmm7 ;G H
; Step the output pointer down one row.
; NOTE(review): r8 already holds out_pitch (loaded above) but is unused in
; the visible lines; presumably this memory-operand form is the 32-bit-ABI
; path and a register add is used otherwise - confirm.
93 add rdi, DWORD PTR arg(3) ;out_pitch
; Vertical 8-tap filter body, 8-pixel-wide variant: source rows are fetched
; with 8-byte movq loads. Only part of the body is visible in this view; the
; enclosing macro header, the loop control and the multiply/accumulate steps
; are not shown.
; Stacked arguments (per the existing comments): arg(0)=src_ptr,
; arg(1)=pixels_per_line, arg(2)=output_ptr, arg(3)=out_pitch,
; arg(4)=output_height, arg(5)=filter ptr.
102 mov rdx, arg(5) ;filter ptr
103 mov rsi, arg(0) ;src_ptr
104 mov rdi, arg(2) ;output_ptr
; movdqa is an aligned load, so the filter table must be 16-byte aligned.
107 movdqa xmm4, [rdx] ;load filters
; Replicate one 16-bit word of xmm4 across the low four words of each
; destination register.
; NOTE(review): the k0_k1 style labels imply each word already holds two
; byte-sized taps; the packing step is not visible here - confirm.
110 pshuflw xmm0, xmm4, 0b ;k0_k1
111 pshuflw xmm1, xmm4, 01010101b ;k2_k3
112 pshuflw xmm2, xmm4, 10101010b ;k4_k5
113 pshuflw xmm3, xmm4, 11111111b ;k6_k7
; Copy the low quadword into the high quadword so the replicated word fills
; all eight 16-bit lanes.
115 punpcklqdq xmm0, xmm0
116 punpcklqdq xmm1, xmm1
117 punpcklqdq xmm2, xmm2
118 punpcklqdq xmm3, xmm3
; rdx is reused: from here on it holds the source pitch, not the filter ptr.
127 movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
130 movsxd r8, DWORD PTR arg(3) ;out_pitch
133 movsxd rcx, DWORD PTR arg(4) ;output_height
; rbx = 5*pitch + pitch = 6*pitch; used below to address rows G and H.
; NOTE(review): rbx is callee-saved in the SysV and Win64 ABIs; its
; save/restore is not visible in these lines - confirm.
136 lea rbx, [rdx + rdx*4]
137 add rbx, rdx ;pitch * 6
; Row loads, 8 bytes each. rax is not assigned in the visible lines; the
; D/F/H labels are consistent with rax = rsi + pitch - TODO confirm.
141 movq xmm1, [rsi + rdx] ;B
142 movq xmm2, [rsi + rdx * 2] ;C
143 movq xmm3, [rax + rdx * 2] ;D
144 movq xmm4, [rsi + rdx * 4] ;E
145 movq xmm5, [rax + rdx * 4] ;F
; Interleave the bytes of two adjacent rows so every 16-bit lane holds a
; (row n, row n+1) pixel pair.
; NOTE(review): the load of row A into xmm0 is not visible here.
147 punpcklbw xmm0, xmm1 ;A B
148 punpcklbw xmm2, xmm3 ;C D
149 punpcklbw xmm4, xmm5 ;E F
151 movq xmm6, [rsi + rbx] ;G
152 movq xmm7, [rax + rbx] ;H
156 punpcklbw xmm6, xmm7 ;G H
; Step the output pointer down one row.
; NOTE(review): r8 already holds out_pitch (loaded above) but is unused in
; the visible lines; presumably this memory-operand form is the 32-bit-ABI
; path and a register add is used otherwise - confirm.
180 add rdi, DWORD PTR arg(3) ;out_pitch
; Vertical 8-tap filter body, 16-pixel-wide variant: each row is handled as
; two 8-byte halves, first at column offset 0 and then at column offset 8.
; Only part of the body is visible in this view; the enclosing macro header,
; the loop control and the multiply/accumulate steps are not shown.
; Stacked arguments (per the existing comments): arg(0)=src_ptr,
; arg(1)=pixels_per_line, arg(2)=output_ptr, arg(3)=out_pitch,
; arg(4)=output_height, arg(5)=filter ptr.
190 mov rdx, arg(5) ;filter ptr
191 mov rsi, arg(0) ;src_ptr
192 mov rdi, arg(2) ;output_ptr
; movdqa is an aligned load, so the filter table must be 16-byte aligned.
195 movdqa xmm4, [rdx] ;load filters
; Replicate one 16-bit word of xmm4 across the low four words of each
; destination register.
; NOTE(review): the k0_k1 style labels imply each word already holds two
; byte-sized taps; the packing step is not visible here - confirm.
198 pshuflw xmm0, xmm4, 0b ;k0_k1
199 pshuflw xmm1, xmm4, 01010101b ;k2_k3
200 pshuflw xmm2, xmm4, 10101010b ;k4_k5
201 pshuflw xmm3, xmm4, 11111111b ;k6_k7
; Copy the low quadword into the high quadword so the replicated word fills
; all eight 16-bit lanes.
203 punpcklqdq xmm0, xmm0
204 punpcklqdq xmm1, xmm1
205 punpcklqdq xmm2, xmm2
206 punpcklqdq xmm3, xmm3
; rdx is reused: from here on it holds the source pitch, not the filter ptr.
215 movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
218 movsxd r8, DWORD PTR arg(3) ;out_pitch
221 movsxd rcx, DWORD PTR arg(4) ;output_height
; rbx = 5*pitch + pitch = 6*pitch; used below to address rows G and H.
; NOTE(review): rbx is callee-saved in the SysV and Win64 ABIs; its
; save/restore is not visible in these lines - confirm.
224 lea rbx, [rdx + rdx*4]
225 add rbx, rdx ;pitch * 6
; First half (columns 0-7). rax is not assigned in the visible lines; the
; D/F/H labels are consistent with rax = rsi + pitch - TODO confirm.
229 movq xmm1, [rsi + rdx] ;B
230 movq xmm2, [rsi + rdx * 2] ;C
231 movq xmm3, [rax + rdx * 2] ;D
232 movq xmm4, [rsi + rdx * 4] ;E
233 movq xmm5, [rax + rdx * 4] ;F
; Interleave the bytes of two adjacent rows so every 16-bit lane holds a
; (row n, row n+1) pixel pair.
; NOTE(review): the load of row A for this half is not visible here.
235 punpcklbw xmm0, xmm1 ;A B
236 punpcklbw xmm2, xmm3 ;C D
237 punpcklbw xmm4, xmm5 ;E F
239 movq xmm6, [rsi + rbx] ;G
240 movq xmm7, [rax + rbx] ;H
244 punpcklbw xmm6, xmm7 ;G H
; Second half (columns 8-15): the same eight rows, read 8 bytes further right.
264 movq xmm0, [rsi + 8] ;A
265 movq xmm1, [rsi + rdx + 8] ;B
266 movq xmm2, [rsi + rdx * 2 + 8] ;C
267 movq xmm3, [rax + rdx * 2 + 8] ;D
268 movq xmm4, [rsi + rdx * 4 + 8] ;E
269 movq xmm5, [rax + rdx * 4 + 8] ;F
271 punpcklbw xmm0, xmm1 ;A B
272 punpcklbw xmm2, xmm3 ;C D
273 punpcklbw xmm4, xmm5 ;E F
275 movq xmm6, [rsi + rbx + 8] ;G
276 movq xmm7, [rax + rbx + 8] ;H
277 punpcklbw xmm6, xmm7 ;G H
; Step the output pointer down one row.
; NOTE(review): r8 already holds out_pitch (loaded above) but is unused in
; the visible lines; presumably this memory-operand form is the 32-bit-ABI
; path and a register add is used otherwise - confirm.
305 add rdi, DWORD PTR arg(3) ;out_pitch
313 ;void vp9_filter_block1d4_v8_ssse3
315 ; unsigned char *src_ptr,
316 ; unsigned int src_pitch,
317 ; unsigned char *output_ptr,
318 ; unsigned int out_pitch,
319 ; unsigned int output_height,
; Exported entry point vp9_filter_block1d4_v8_ssse3. Only the symbol
; declaration, label, argument shadowing and stack-slot names are visible in
; this view; the rest of the function body is not shown.
322 global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
323 sym(vp9_filter_block1d4_v8_ssse3):
; NOTE(review): SHADOW_ARGS_TO_STACK is not defined in this file (presumably
; it comes from vpx_ports/x86_abi_support.asm) and presumably makes the six
; arguments addressable through arg(n) - confirm.
326 SHADOW_ARGS_TO_STACK 6
; Symbolic names for five consecutive 16-byte stack slots at rsp+0 .. rsp+64.
; NOTE(review): the code that reserves and aligns this stack area is not
; visible here - confirm before changing any offset.
335 %define k0k1 [rsp + 16*0]
336 %define k2k3 [rsp + 16*1]
337 %define k4k5 [rsp + 16*2]
338 %define k6k7 [rsp + 16*3]
339 %define krd [rsp + 16*4]
354 ;void vp9_filter_block1d8_v8_ssse3
356 ; unsigned char *src_ptr,
357 ; unsigned int src_pitch,
358 ; unsigned char *output_ptr,
359 ; unsigned int out_pitch,
360 ; unsigned int output_height,
; Exported entry point vp9_filter_block1d8_v8_ssse3. Only the symbol
; declaration, label, argument shadowing and stack-slot names are visible in
; this view; the rest of the function body is not shown.
363 global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
364 sym(vp9_filter_block1d8_v8_ssse3):
; NOTE(review): SHADOW_ARGS_TO_STACK is not defined in this file (presumably
; it comes from vpx_ports/x86_abi_support.asm) and presumably makes the six
; arguments addressable through arg(n) - confirm.
367 SHADOW_ARGS_TO_STACK 6
; Symbolic names for five consecutive 16-byte stack slots at rsp+0 .. rsp+64.
; NOTE(review): the code that reserves and aligns this stack area is not
; visible here - confirm before changing any offset.
376 %define k0k1 [rsp + 16*0]
377 %define k2k3 [rsp + 16*1]
378 %define k4k5 [rsp + 16*2]
379 %define k6k7 [rsp + 16*3]
380 %define krd [rsp + 16*4]
395 ;void vp9_filter_block1d16_v8_ssse3
397 ; unsigned char *src_ptr,
398 ; unsigned int src_pitch,
399 ; unsigned char *output_ptr,
400 ; unsigned int out_pitch,
401 ; unsigned int output_height,
; Exported entry point vp9_filter_block1d16_v8_ssse3. Only the symbol
; declaration, label, argument shadowing and stack-slot names are visible in
; this view; the rest of the function body is not shown.
404 global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
405 sym(vp9_filter_block1d16_v8_ssse3):
; NOTE(review): SHADOW_ARGS_TO_STACK is not defined in this file (presumably
; it comes from vpx_ports/x86_abi_support.asm) and presumably makes the six
; arguments addressable through arg(n) - confirm.
408 SHADOW_ARGS_TO_STACK 6
; Symbolic names for five consecutive 16-byte stack slots at rsp+0 .. rsp+64.
; NOTE(review): the code that reserves and aligns this stack area is not
; visible here - confirm before changing any offset.
417 %define k0k1 [rsp + 16*0]
418 %define k2k3 [rsp + 16*1]
419 %define k4k5 [rsp + 16*2]
420 %define k6k7 [rsp + 16*3]
421 %define krd [rsp + 16*4]
436 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; Exported entry point vp9_filter_block1d4_v8_avg_ssse3. Only the symbol
; declaration, label, argument shadowing and stack-slot names are visible in
; this view; the rest of the function body is not shown.
; NOTE(review): the "_avg" suffix suggests the result is averaged with the
; existing destination pixels; that code is not visible here - confirm.
439 global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
440 sym(vp9_filter_block1d4_v8_avg_ssse3):
; NOTE(review): SHADOW_ARGS_TO_STACK is not defined in this file (presumably
; it comes from vpx_ports/x86_abi_support.asm) and presumably makes the six
; arguments addressable through arg(n) - confirm.
443 SHADOW_ARGS_TO_STACK 6
; Symbolic names for five consecutive 16-byte stack slots at rsp+0 .. rsp+64.
; NOTE(review): the code that reserves and aligns this stack area is not
; visible here - confirm before changing any offset.
452 %define k0k1 [rsp + 16*0]
453 %define k2k3 [rsp + 16*1]
454 %define k4k5 [rsp + 16*2]
455 %define k6k7 [rsp + 16*3]
456 %define krd [rsp + 16*4]
; Exported entry point vp9_filter_block1d8_v8_avg_ssse3. Only the symbol
; declaration, label, argument shadowing and stack-slot names are visible in
; this view; the rest of the function body is not shown.
; NOTE(review): the "_avg" suffix suggests the result is averaged with the
; existing destination pixels; that code is not visible here - confirm.
471 global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
472 sym(vp9_filter_block1d8_v8_avg_ssse3):
; NOTE(review): SHADOW_ARGS_TO_STACK is not defined in this file (presumably
; it comes from vpx_ports/x86_abi_support.asm) and presumably makes the six
; arguments addressable through arg(n) - confirm.
475 SHADOW_ARGS_TO_STACK 6
; Symbolic names for five consecutive 16-byte stack slots at rsp+0 .. rsp+64.
; NOTE(review): the code that reserves and aligns this stack area is not
; visible here - confirm before changing any offset.
484 %define k0k1 [rsp + 16*0]
485 %define k2k3 [rsp + 16*1]
486 %define k4k5 [rsp + 16*2]
487 %define k6k7 [rsp + 16*3]
488 %define krd [rsp + 16*4]
; Exported entry point vp9_filter_block1d16_v8_avg_ssse3. Only the symbol
; declaration, label, argument shadowing and stack-slot names are visible in
; this view; the rest of the function body is not shown.
; NOTE(review): the "_avg" suffix suggests the result is averaged with the
; existing destination pixels; that code is not visible here - confirm.
503 global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
504 sym(vp9_filter_block1d16_v8_avg_ssse3):
; NOTE(review): SHADOW_ARGS_TO_STACK is not defined in this file (presumably
; it comes from vpx_ports/x86_abi_support.asm) and presumably makes the six
; arguments addressable through arg(n) - confirm.
507 SHADOW_ARGS_TO_STACK 6
; Symbolic names for five consecutive 16-byte stack slots at rsp+0 .. rsp+64.
; NOTE(review): the code that reserves and aligns this stack area is not
; visible here - confirm before changing any offset.
516 %define k0k1 [rsp + 16*0]
517 %define k2k3 [rsp + 16*1]
518 %define k4k5 [rsp + 16*2]
519 %define k6k7 [rsp + 16*3]
520 %define krd [rsp + 16*4]
535 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; Per-row step of the 4-wide horizontal filter, operating on the two macro
; parameters %1 and %2.
; NOTE(review): the %macro header is not visible; the two-parameter shape and
; the k0k1k4k5/k2k3k6k7 operands match the "HORIZx4_ROW a, b" invocations
; elsewhere in this file - confirm.
; pshufb reorders the source bytes of each register according to a 16-byte
; index table.
538 pshufb %1, [GLOBAL(shuf_t0t1)]
539 pshufb %2, [GLOBAL(shuf_t2t3)]
; pmaddubsw multiplies the unsigned bytes of the destination by the signed
; bytes of the source operand and adds each adjacent pair into a signed,
; saturated 16-bit word.
540 pmaddubsw %1, k0k1k4k5
541 pmaddubsw %2, k2k3k6k7
; Horizontal 8-tap filter body, 4-pixel-wide variant: handles two output rows
; per pass, then a single trailing row when output_height is odd. Only part of
; the body is visible in this view; the enclosing macro header, labels, loop
; control and several loads/stores are not shown.
; Stacked arguments (per the existing comments): arg(0)=src_ptr,
; arg(1)=src_pixels_per_line, arg(2)=output_ptr, arg(3)=output_pitch,
; arg(4)=output_height, arg(5)=filter ptr.
561 mov rdx, arg(5) ;filter ptr
562 mov rsi, arg(0) ;src_ptr
563 mov rdi, arg(2) ;output_ptr
; movdqa is an aligned load, so the filter table must be 16-byte aligned.
566 movdqa xmm4, [rdx] ;load filters
; pshuflw replicates one word across the low four words; the following
; pshufhw replicates one word of the high quadword across the high four
; words, leaving the low quadword unchanged.
; NOTE(review): the labels imply each word already holds two byte-sized taps;
; the packing step is not visible here - confirm.
569 pshuflw xmm6, xmm4, 0b ;k0_k1
570 pshufhw xmm6, xmm6, 10101010b ;k0_k1_k4_k5
571 pshuflw xmm7, xmm4, 01010101b ;k2_k3
572 pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7
; Broadcast dword 0 of xmm5 to all four dwords.
; NOTE(review): the instruction that loads xmm5 is not visible here.
573 pshufd xmm5, xmm5, 0 ;rounding
; Spill the tap registers to the named stack slots; movdqa requires those
; slots to be 16-byte aligned.
575 movdqa k0k1k4k5, xmm6
576 movdqa k2k3k6k7, xmm7
; rdx is reused: from here on it holds the output pitch, not the filter ptr.
579 movsxd rax, dword ptr arg(1) ;src_pixels_per_line
580 movsxd rdx, dword ptr arg(3) ;output_pitch
581 movsxd rcx, dword ptr arg(4) ;output_height
; Source reads start 3 bytes to the left of src_ptr. For the second row the
; two 8-byte loads at -3 and +5 cover 16 consecutive bytes.
585 movq xmm0, [rsi - 3] ;load src
587 movq xmm2, [rsi + rax - 3]
588 movq xmm3, [rsi + rax + 5]
; Join the two 8-byte halves of each row into one 16-byte register.
; NOTE(review): the load that fills xmm1 for the first row is not visible.
589 punpcklqdq xmm0, xmm1
590 punpcklqdq xmm2, xmm3
592 HORIZx4_ROW xmm0, xmm1
593 HORIZx4_ROW xmm2, xmm3
; Reads 4 bytes already stored in the second output row.
; NOTE(review): the consumer is not visible; presumably this feeds the
; averaging variant - confirm.
597 movd xmm3, [rdi + rdx]
; Store 4 result bytes to the second output row.
601 movd [rdi +rdx], xmm2
; Prefetch hints for upcoming source rows; advance output by two rows.
604 prefetcht0 [rsi + 4 * rax - 3]
606 lea rdi, [rdi + 2 * rdx]
607 prefetcht0 [rsi + 2 * rax - 3]
612 ; Do last row if output_height is odd
613 movsxd rcx, dword ptr arg(4) ;output_height
617 movq xmm0, [rsi - 3] ; load src
; NOTE(review): as above, the load that fills xmm1 is not visible here.
619 punpcklqdq xmm0, xmm1
621 HORIZx4_ROW xmm0, xmm1
; Byte-shuffle step of the 8-wide horizontal filter: each of the four macro
; parameters is reordered with its own 16-byte pshufb index table.
; NOTE(review): the %macro header and the rest of the body are not visible;
; the four-parameter shape matches the "HORIZx8_ROW a, b, c, d" invocations
; elsewhere in this file - confirm.
635 pshufb %1, [GLOBAL(shuf_t0t1)]
636 pshufb %2, [GLOBAL(shuf_t2t3)]
637 pshufb %3, [GLOBAL(shuf_t4t5)]
638 pshufb %4, [GLOBAL(shuf_t6t7)]
; Horizontal 8-tap filter body, 8-pixel-wide variant: handles two output rows
; per pass, then a single trailing row when output_height is odd. Only part of
; the body is visible in this view; the enclosing macro header, labels, loop
; control and several loads/stores are not shown.
; Stacked arguments (per the existing comments): arg(0)=src_ptr,
; arg(1)=src_pixels_per_line, arg(2)=output_ptr, arg(3)=output_pitch,
; arg(4)=output_height, arg(5)=filter ptr.
658 mov rdx, arg(5) ;filter ptr
659 mov rsi, arg(0) ;src_ptr
660 mov rdi, arg(2) ;output_ptr
; movdqa is an aligned load, so the filter table must be 16-byte aligned.
663 movdqa xmm4, [rdx] ;load filters
; Replicate one 16-bit word of xmm4 across the low four words of each
; destination register.
; NOTE(review): the k0_k1 style labels imply each word already holds two
; byte-sized taps; the packing step is not visible here - confirm.
666 pshuflw xmm0, xmm4, 0b ;k0_k1
667 pshuflw xmm1, xmm4, 01010101b ;k2_k3
668 pshuflw xmm2, xmm4, 10101010b ;k4_k5
669 pshuflw xmm3, xmm4, 11111111b ;k6_k7
; Copy the low quadword into the high quadword so the replicated word fills
; all eight 16-bit lanes.
671 punpcklqdq xmm0, xmm0
672 punpcklqdq xmm1, xmm1
673 punpcklqdq xmm2, xmm2
674 punpcklqdq xmm3, xmm3
; rdx is reused: from here on it holds the output pitch, not the filter ptr.
683 movsxd rax, dword ptr arg(1) ;src_pixels_per_line
684 movsxd rdx, dword ptr arg(3) ;output_pitch
685 movsxd rcx, dword ptr arg(4) ;output_height
; Source reads start 3 bytes to the left of src_ptr. For the second row the
; two 8-byte loads at -3 and +5 cover 16 consecutive bytes.
689 movq xmm0, [rsi - 3] ;load src
691 movq xmm4, [rsi + rax - 3]
692 movq xmm7, [rsi + rax + 5]
; Join the two 8-byte halves of each row into one 16-byte register.
; NOTE(review): xmm3 held the k6_k7 taps above; the instructions that save
; the taps and reload xmm3 with source bytes are not visible here - confirm.
693 punpcklqdq xmm0, xmm3
694 punpcklqdq xmm4, xmm7
; NOTE(review): xmm1/xmm2 and xmm5/xmm6 are passed to the row macro, but the
; instructions that fill them are not visible here.
696 HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
697 HORIZx8_ROW xmm4, xmm5, xmm6, xmm7
; Reads 8 bytes already stored in the second output row.
; NOTE(review): the consumer is not visible; presumably this feeds the
; averaging variant - confirm.
700 movq xmm2, [rdi + rdx]
; Store 8 result bytes to the second output row.
705 movq [rdi + rdx], xmm4
; Prefetch hints for upcoming source rows; advance output by two rows.
708 prefetcht0 [rsi + 4 * rax - 3]
710 lea rdi, [rdi + 2 * rdx]
711 prefetcht0 [rsi + 2 * rax - 3]
715 ;Do last row if output_height is odd
716 movsxd rcx, dword ptr arg(4) ;output_height
; NOTE(review): the source loads for this trailing row are not visible here.
722 punpcklqdq xmm0, xmm3
724 HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
; Horizontal 8-tap filter body, 16-pixel-wide variant. Only part of the body
; is visible in this view; the enclosing macro header, labels, loop control,
; the multiply/accumulate steps and the stores are not shown.
; Stacked arguments (per the existing comments): arg(0)=src_ptr,
; arg(1)=src_pixels_per_line, arg(2)=output_ptr, arg(3)=output_pitch,
; arg(4)=output_height, arg(5)=filter ptr.
734 mov rdx, arg(5) ;filter ptr
735 mov rsi, arg(0) ;src_ptr
736 mov rdi, arg(2) ;output_ptr
; movdqa is an aligned load, so the filter table must be 16-byte aligned.
739 movdqa xmm4, [rdx] ;load filters
; Replicate one 16-bit word of xmm4 across the low four words of each
; destination register.
; NOTE(review): the k0_k1 style labels imply each word already holds two
; byte-sized taps; the packing step is not visible here - confirm.
742 pshuflw xmm0, xmm4, 0b ;k0_k1
743 pshuflw xmm1, xmm4, 01010101b ;k2_k3
744 pshuflw xmm2, xmm4, 10101010b ;k4_k5
745 pshuflw xmm3, xmm4, 11111111b ;k6_k7
; Copy the low quadword into the high quadword so the replicated word fills
; all eight 16-bit lanes.
747 punpcklqdq xmm0, xmm0
748 punpcklqdq xmm1, xmm1
749 punpcklqdq xmm2, xmm2
750 punpcklqdq xmm3, xmm3
; rdx is reused: from here on it holds the output pitch, not the filter ptr.
759 movsxd rax, dword ptr arg(1) ;src_pixels_per_line
760 movsxd rdx, dword ptr arg(3) ;output_pitch
761 movsxd rcx, dword ptr arg(4) ;output_height
; Prefetch hint for the source row two pitches ahead.
764 prefetcht0 [rsi + 2 * rax -3]
; Source reads start 3 bytes to the left of src_ptr; the second visible load
; starts 16 bytes after the first.
766 movq xmm0, [rsi - 3] ;load src data
768 movq xmm7, [rsi + 13]
; Build two overlapping 16-byte windows: xmm0 = (first load, low half of
; xmm4), xmm4 = (low half of xmm4, load at +13).
; NOTE(review): xmm4 held the filter taps above; the instruction that reloads
; it with source bytes (presumably from [rsi + 5]) is not visible - confirm.
769 punpcklqdq xmm0, xmm4
770 punpcklqdq xmm4, xmm7
; Reorder the bytes of each register with a 16-byte pshufb index table; the
; same four tables are applied to xmm0-xmm3 and to xmm4-xmm7.
; NOTE(review): the instructions that fill xmm1-xmm3 and xmm5-xmm7 before
; these shuffles are not visible here.
779 pshufb xmm0, [GLOBAL(shuf_t0t1)]
780 pshufb xmm1, [GLOBAL(shuf_t2t3)]
781 pshufb xmm2, [GLOBAL(shuf_t4t5)]
782 pshufb xmm3, [GLOBAL(shuf_t6t7)]
783 pshufb xmm4, [GLOBAL(shuf_t0t1)]
784 pshufb xmm5, [GLOBAL(shuf_t2t3)]
785 pshufb xmm6, [GLOBAL(shuf_t4t5)]
786 pshufb xmm7, [GLOBAL(shuf_t6t7)]
; Place the low quadword of xmm4 in the high half of xmm0.
; NOTE(review): presumably this joins the two 8-pixel results into one
; 16-byte output row; the preceding arithmetic is not visible - confirm.
817 punpcklqdq xmm0, xmm4
831 ;void vp9_filter_block1d4_h8_ssse3
833 ; unsigned char *src_ptr,
834 ; unsigned int src_pixels_per_line,
835 ; unsigned char *output_ptr,
836 ; unsigned int output_pitch,
837 ; unsigned int output_height,
; Exported entry point vp9_filter_block1d4_h8_ssse3. Only the symbol
; declaration, label, argument shadowing and stack-slot names are visible in
; this view; the rest of the function body is not shown.
840 global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
841 sym(vp9_filter_block1d4_h8_ssse3):
; NOTE(review): SHADOW_ARGS_TO_STACK is not defined in this file (presumably
; it comes from vpx_ports/x86_abi_support.asm) and presumably makes the six
; arguments addressable through arg(n) - confirm.
844 SHADOW_ARGS_TO_STACK 6
; Symbolic names for three consecutive 16-byte stack slots at rsp+0 .. rsp+32.
; The 4-wide horizontal body writes k0k1k4k5 and k2k3k6k7 with movdqa, so rsp
; must be 16-byte aligned when those stores execute.
; NOTE(review): the code that reserves and aligns this stack area is not
; visible here - confirm before changing any offset.
853 %define k0k1k4k5 [rsp + 16 * 0]
854 %define k2k3k6k7 [rsp + 16 * 1]
855 %define krd [rsp + 16 * 2]
870 ;void vp9_filter_block1d8_h8_ssse3
872 ; unsigned char *src_ptr,
873 ; unsigned int src_pixels_per_line,
874 ; unsigned char *output_ptr,
875 ; unsigned int output_pitch,
876 ; unsigned int output_height,
; Exported entry point vp9_filter_block1d8_h8_ssse3. Only the symbol
; declaration, label, argument shadowing and stack-slot names are visible in
; this view; the rest of the function body is not shown.
879 global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
880 sym(vp9_filter_block1d8_h8_ssse3):
; NOTE(review): SHADOW_ARGS_TO_STACK is not defined in this file (presumably
; it comes from vpx_ports/x86_abi_support.asm) and presumably makes the six
; arguments addressable through arg(n) - confirm.
883 SHADOW_ARGS_TO_STACK 6
; Symbolic names for five consecutive 16-byte stack slots at rsp+0 .. rsp+64.
; NOTE(review): the code that reserves and aligns this stack area is not
; visible here - confirm before changing any offset.
892 %define k0k1 [rsp + 16*0]
893 %define k2k3 [rsp + 16*1]
894 %define k4k5 [rsp + 16*2]
895 %define k6k7 [rsp + 16*3]
896 %define krd [rsp + 16*4]
912 ;void vp9_filter_block1d16_h8_ssse3
914 ; unsigned char *src_ptr,
915 ; unsigned int src_pixels_per_line,
916 ; unsigned char *output_ptr,
917 ; unsigned int output_pitch,
918 ; unsigned int output_height,
; Exported entry point vp9_filter_block1d16_h8_ssse3. Only the symbol
; declaration, label, argument shadowing and stack-slot names are visible in
; this view; the rest of the function body is not shown.
921 global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
922 sym(vp9_filter_block1d16_h8_ssse3):
; NOTE(review): SHADOW_ARGS_TO_STACK is not defined in this file (presumably
; it comes from vpx_ports/x86_abi_support.asm) and presumably makes the six
; arguments addressable through arg(n) - confirm.
925 SHADOW_ARGS_TO_STACK 6
; Symbolic names for five consecutive 16-byte stack slots at rsp+0 .. rsp+64.
; NOTE(review): the code that reserves and aligns this stack area is not
; visible here - confirm before changing any offset.
934 %define k0k1 [rsp + 16*0]
935 %define k2k3 [rsp + 16*1]
936 %define k4k5 [rsp + 16*2]
937 %define k6k7 [rsp + 16*3]
938 %define krd [rsp + 16*4]
; Exported entry point vp9_filter_block1d4_h8_avg_ssse3. Only the symbol
; declaration, label, argument shadowing and stack-slot names are visible in
; this view; the rest of the function body is not shown.
; NOTE(review): the "_avg" suffix suggests the result is averaged with the
; existing destination pixels; that code is not visible here - confirm.
954 global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
955 sym(vp9_filter_block1d4_h8_avg_ssse3):
; NOTE(review): SHADOW_ARGS_TO_STACK is not defined in this file (presumably
; it comes from vpx_ports/x86_abi_support.asm) and presumably makes the six
; arguments addressable through arg(n) - confirm.
958 SHADOW_ARGS_TO_STACK 6
; Symbolic names for three consecutive 16-byte stack slots at rsp+0 .. rsp+32.
; The 4-wide horizontal body writes k0k1k4k5 and k2k3k6k7 with movdqa, so rsp
; must be 16-byte aligned when those stores execute.
; NOTE(review): the code that reserves and aligns this stack area is not
; visible here - confirm before changing any offset.
967 %define k0k1k4k5 [rsp + 16 * 0]
968 %define k2k3k6k7 [rsp + 16 * 1]
969 %define krd [rsp + 16 * 2]
; Exported entry point vp9_filter_block1d8_h8_avg_ssse3. Only the symbol
; declaration, label, argument shadowing and stack-slot names are visible in
; this view; the rest of the function body is not shown.
; NOTE(review): the "_avg" suffix suggests the result is averaged with the
; existing destination pixels; that code is not visible here - confirm.
984 global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
985 sym(vp9_filter_block1d8_h8_avg_ssse3):
; NOTE(review): SHADOW_ARGS_TO_STACK is not defined in this file (presumably
; it comes from vpx_ports/x86_abi_support.asm) and presumably makes the six
; arguments addressable through arg(n) - confirm.
988 SHADOW_ARGS_TO_STACK 6
; Symbolic names for five consecutive 16-byte stack slots at rsp+0 .. rsp+64.
; NOTE(review): the code that reserves and aligns this stack area is not
; visible here - confirm before changing any offset.
997 %define k0k1 [rsp + 16*0]
998 %define k2k3 [rsp + 16*1]
999 %define k4k5 [rsp + 16*2]
1000 %define k6k7 [rsp + 16*3]
1001 %define krd [rsp + 16*4]
; Exported entry point vp9_filter_block1d16_h8_avg_ssse3. Only the symbol
; declaration, label, argument shadowing and stack-slot names are visible in
; this view; the rest of the function body is not shown.
; NOTE(review): the "_avg" suffix suggests the result is averaged with the
; existing destination pixels; that code is not visible here - confirm.
1017 global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
1018 sym(vp9_filter_block1d16_h8_avg_ssse3):
; NOTE(review): SHADOW_ARGS_TO_STACK is not defined in this file (presumably
; it comes from vpx_ports/x86_abi_support.asm) and presumably makes the six
; arguments addressable through arg(n) - confirm.
1021 SHADOW_ARGS_TO_STACK 6
; Symbolic names for five consecutive 16-byte stack slots at rsp+0 .. rsp+64.
; NOTE(review): the code that reserves and aligns this stack area is not
; visible here - confirm before changing any offset.
1030 %define k0k1 [rsp + 16*0]
1031 %define k2k3 [rsp + 16*1]
1032 %define k4k5 [rsp + 16*2]
1033 %define k6k7 [rsp + 16*3]
1034 %define krd [rsp + 16*4]
; Four 16-byte tables of byte indices. Each table lists eight overlapping
; pairs (i, i+1); the first pair starts at byte 0, 2, 4 and 6 respectively.
; NOTE(review): the labels and alignment directives are not visible here;
; presumably these are the shuf_t0t1 / shuf_t2t3 / shuf_t4t5 / shuf_t6t7
; tables used as pshufb controls by the horizontal filters - confirm.
1052 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
1055 db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
1058 db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
1061 db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14