2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
14 ;void copy_mem16x16_sse2(
;    unsigned char *src, int src_stride,
;    unsigned char *dst, int dst_stride)
; Copies a 16x16 pixel block from src to dst, three rows at a time
; ([base], [base+stride], [base+stride*2]) before advancing the pointers.
; Loads use movdqu (src may be unaligned); stores use movdqa, so dst is
; assumed 16-byte aligned -- TODO confirm against callers.
; NOTE(review): this extract is missing lines (the numbers embedded at the
; start of each line are the original file's line numbers and are not
; contiguous); the prologue/epilogue, the [rsi]/[rdi] row accesses and the
; lea pointer advances between the visible groups are not shown here.
20 global sym(vp8_copy_mem16x16_sse2) PRIVATE
21 sym(vp8_copy_mem16x16_sse2):
; 4 args: src, src_stride, dst, dst_stride
24 SHADOW_ARGS_TO_STACK 4
; rax = src_stride (sign-extended from 32-bit)
32 movsxd rax, dword ptr arg(1) ;src_stride;
35 movdqu xmm1, [rsi+rax]
36 movdqu xmm2, [rsi+rax*2]
; rcx = dst_stride
38 movsxd rcx, dword ptr arg(3) ;dst_stride
44 movdqa [rdi+rcx], xmm1
45 movdqa [rdi+rcx*2],xmm2
; rows 3-5 (pointer advance not visible in this extract)
51 movdqu xmm4, [rsi+rax]
53 movdqu xmm5, [rsi+rax*2]
59 movdqa [rdi+rcx], xmm4
60 movdqa [rdi+rcx*2],xmm5
; rows 6-8
66 movdqu xmm1, [rsi+rax]
68 movdqu xmm2, [rsi+rax*2]
74 movdqa [rdi+rcx], xmm1
76 movdqa [rdi+rcx*2], xmm2
; rows 9-11
79 movdqu xmm4, [rsi+rax]
83 movdqu xmm5, [rsi+rax*2]
89 movdqa [rdi+rcx], xmm4
91 movdqa [rdi+rcx*2],xmm5
; rows 12-14
95 movdqu xmm1, [rsi+rax]
98 movdqu xmm2, [rsi+rax*2]
103 movdqa [rdi+rcx], xmm1
104 movdqa [rdi+rcx*2],xmm2
; final row
106 movdqu xmm3, [rsi+rax]
109 movdqa [rdi+rcx], xmm3
119 ;void vp8_intra_pred_uv_dc_mmx2(
120 ; unsigned char *dst,
; int dst_stride (arg 1),
122 ; unsigned char *above,
123 ; unsigned char *left,
; int left_stride (arg 4)
; 8x8 chroma DC prediction: averages the 8 above and 8 left neighbor
; pixels and fills the 8x8 dst block with that value via movq stores.
; NOTE(review): lines are missing from this extract -- the running-sum
; adds between the movzx loads, the above-row handling, the averaging,
; the mm1 broadcast, and the prologue/epilogue are not visible here.
126 global sym(vp8_intra_pred_uv_dc_mmx2) PRIVATE
127 sym(vp8_intra_pred_uv_dc_mmx2):
130 SHADOW_ARGS_TO_STACK 5
136 mov rdi, arg(2) ;above;
137 mov rsi, arg(3) ;left;
; rax = left_stride
138 movsxd rax, dword ptr arg(4) ;left_stride;
; Gather the 8 left-column pixels one byte at a time (ecx seeds the sum;
; each edx load is presumably accumulated by adds not shown in this
; extract). rdi here appears to have been repurposed as a stride multiple
; (used as [rsi+rdi]) -- TODO confirm against the full source.
144 movzx ecx, byte [rsi]
145 movzx edx, byte [rsi+rax*1]
147 movzx edx, byte [rsi+rax*2]
150 movzx edx, byte [rsi+rdi]
; second group of four left pixels (rsi advanced by code not shown)
153 movzx edx, byte [rsi]
155 movzx edx, byte [rsi+rax]
157 movzx edx, byte [rsi+rax*2]
159 movzx edx, byte [rsi+rdi]
; write out: broadcast DC value (mm1) to all 8 rows of dst
167 movsxd rcx, dword ptr arg(1) ;dst_stride
169 mov rdi, arg(0) ;dst;
178 movq [rdi+rcx*2], mm1
182 movq [rdx+rcx*2], mm1
192 ;void vp8_intra_pred_uv_dctop_mmx2(
193 ; unsigned char *dst,
; int dst_stride (arg 1),
195 ; unsigned char *above,
196 ; unsigned char *left,
; int left_stride (arg 4)
; 8x8 chroma DC prediction using only the above row: sums the 8 above
; pixels, rounds (dc_4) and shifts, then fills the block.
; NOTE(review): this extract omits the summation, shift, byte broadcast,
; and prologue/epilogue lines of the original.
199 global sym(vp8_intra_pred_uv_dctop_mmx2) PRIVATE
200 sym(vp8_intra_pred_uv_dctop_mmx2):
203 SHADOW_ARGS_TO_STACK 5
209 ;arg(3), arg(4) not used
212 mov rsi, arg(2) ;above;
; add rounding constant before the averaging shift
218 paddw mm1, [GLOBAL(dc_4)]
; write out: mm1 holds the broadcast DC byte
224 mov rdi, arg(0) ;dst;
225 movsxd rcx, dword ptr arg(1) ;dst_stride
230 movq [rdi+rcx*2], mm1
235 movq [rdi+rcx*2], mm1
246 ;void vp8_intra_pred_uv_dcleft_mmx2(
247 ; unsigned char *dst,
; int dst_stride (arg 1),
249 ; unsigned char *above,
250 ; unsigned char *left,
; int left_stride (arg 4)
; 8x8 chroma DC prediction using only the left column: sums the 8 left
; pixels, averages, and fills the block.
; NOTE(review): the adds between the movzx loads, the rsi advance between
; the two groups of four, the averaging/broadcast, and prologue/epilogue
; are missing from this extract.
253 global sym(vp8_intra_pred_uv_dcleft_mmx2) PRIVATE
254 sym(vp8_intra_pred_uv_dcleft_mmx2):
257 SHADOW_ARGS_TO_STACK 5
265 mov rsi, arg(3) ;left;
266 movsxd rax, dword ptr arg(4) ;left_stride;
; first four left pixels (ecx seeds the running sum)
268 movzx ecx, byte [rsi]
269 movzx edx, byte [rsi+rax]
271 movzx edx, byte [rsi+rax*2]
; [rsi+rdi]: rdi presumably holds a stride multiple here -- TODO confirm
273 movzx edx, byte [rsi+rdi]
; second four left pixels
276 movzx edx, byte [rsi]
278 movzx edx, byte [rsi+rax]
280 movzx edx, byte [rsi+rax*2]
282 movzx edx, byte [rsi+rdi]
; write out the broadcast DC value (mm1)
292 mov rdi, arg(0) ;dst;
293 movsxd rcx, dword ptr arg(1) ;dst_stride
298 movq [rdi+rcx*2], mm1
303 movq [rdi+rcx*2], mm1
313 ;void vp8_intra_pred_uv_dc128_mmx(
314 ; unsigned char *dst,
; int dst_stride (arg 1),
316 ; unsigned char *above,
317 ; unsigned char *left,
; int left_stride (arg 4)
; 8x8 chroma DC prediction with no neighbors available: fills the block
; with the constant 128 (dc_128 table entry).
; NOTE(review): some store/lea rows and the prologue/epilogue are missing
; from this extract.
320 global sym(vp8_intra_pred_uv_dc128_mmx) PRIVATE
321 sym(vp8_intra_pred_uv_dc128_mmx):
324 SHADOW_ARGS_TO_STACK 5
328 ;arg(2), arg(3), arg(4) not used
; mm1 = eight bytes of 128
331 movq mm1, [GLOBAL(dc_128)]
332 mov rax, arg(0) ;dst;
333 movsxd rdx, dword ptr arg(1) ;dst_stride
338 movq [rax+rdx*2], mm1
343 movq [rax+rdx*2], mm1
352 ;void vp8_intra_pred_uv_tm_sse2(
353 ; unsigned char *dst,
; int dst_stride (arg 1),
355 ; unsigned char *above,
356 ; unsigned char *left,
; int left_stride (arg 4)
; TrueMotion (TM) 8x8 chroma prediction, emitted for both sse2 and ssse3
; via this macro: pred[r][c] = clip(above[c] + left[r] - topleft).
; NOTE(review): this extract omits the topleft load, the row loads inside
; the loop, the add/pack/clip arithmetic, the loop counter, and the
; prologue/epilogue of the original.
359 %macro vp8_intra_pred_uv_tm 1
360 global sym(vp8_intra_pred_uv_tm_%1) PRIVATE
361 sym(vp8_intra_pred_uv_tm_%1):
364 SHADOW_ARGS_TO_STACK 5
; read the above row and the rounding/bias constant
373 mov rsi, arg(2) ;above
374 movsxd rax, dword ptr arg(4) ;left_stride;
377 movdqa xmm2, [GLOBAL(dc_1024)]
382 ; set up left ptrs and subtract topleft
384 mov rsi, arg(3) ;left;
; broadcast low word of xmm3 across the whole register
387 pshuflw xmm3, xmm3, 0x0
388 punpcklqdq xmm3, xmm3
; per-row output loop: two rows per iteration
395 mov rdi, arg(0) ;dst;
396 movsxd rcx, dword ptr arg(1) ;dst_stride
398 .vp8_intra_pred_uv_tm_%1_loop:
; broadcast this iteration's two left-pixel values
407 pshuflw xmm3, xmm3, 0x0
408 pshuflw xmm5, xmm5, 0x0
409 punpcklqdq xmm3, xmm3
410 punpcklqdq xmm5, xmm5
; store second row of the pair (high 8 bytes)
419 movhps[rdi+rcx], xmm3
423 jnz .vp8_intra_pred_uv_tm_%1_loop
; instantiate for both target instruction sets
435 vp8_intra_pred_uv_tm sse2
436 vp8_intra_pred_uv_tm ssse3
438 ;void vp8_intra_pred_uv_ve_mmx(
439 ; unsigned char *dst,
; int dst_stride (arg 1),
441 ; unsigned char *above,
442 ; unsigned char *left,
; int left_stride (arg 4)
; Vertical (VE) 8x8 chroma prediction: copies the 8-pixel above row into
; every row of the destination block.
; NOTE(review): the movq load of the above row, several stores/leas, and
; the prologue/epilogue are missing from this extract.
445 global sym(vp8_intra_pred_uv_ve_mmx) PRIVATE
446 sym(vp8_intra_pred_uv_ve_mmx):
449 SHADOW_ARGS_TO_STACK 5
452 ; arg(3), arg(4) not used
455 mov rax, arg(2) ;src;
; replicate the above row (mm1) down all 8 dst rows
460 mov rax, arg(0) ;dst;
461 movsxd rdx, dword ptr arg(1) ;dst_stride
466 movq [rax+rdx*2], mm1
471 movq [rax+rdx*2], mm1
479 ;void vp8_intra_pred_uv_ho_mmx2(
480 ; unsigned char *dst,
; int dst_stride (arg 1),
482 ; unsigned char *above,
483 ; unsigned char *left,
; int left_stride (arg 4)
; Horizontal (HO) 8x8 chroma prediction, emitted for mmx2 and ssse3 via
; this macro: each row of dst is filled with that row's left pixel.
; NOTE(review): heavily sampled extract -- the loop body's loads and
; byte-broadcasts, loop counter, pointer advances, and prologue/epilogue
; are not visible. The two similar store groups below presumably belong
; to distinct per-variant code paths -- TODO confirm in the full source.
486 %macro vp8_intra_pred_uv_ho 1
487 global sym(vp8_intra_pred_uv_ho_%1) PRIVATE
488 sym(vp8_intra_pred_uv_ho_%1):
491 SHADOW_ARGS_TO_STACK 5
502 ; read from left and write out
506 mov rsi, arg(3) ;left
507 movsxd rax, dword ptr arg(4) ;left_stride;
508 mov rdi, arg(0) ;dst;
509 movsxd rcx, dword ptr arg(1) ;dst_stride
; shuffle-control constant used to splat left pixels across a row
512 movdqa xmm2, [GLOBAL(dc_00001111)]
516 .vp8_intra_pred_uv_ho_%1_loop:
532 jnz .vp8_intra_pred_uv_ho_%1_loop
; four rows per store group; rdx presumably holds rcx*3 -- TODO confirm
552 movhps [rdi+rcx], xmm0
553 movq [rdi+rcx*2], xmm1
554 movhps [rdi+rdx], xmm1
576 movhps [rdi+rcx], xmm0
577 movq [rdi+rcx*2], xmm1
578 movhps [rdi+rdx], xmm1
; instantiate for both target instruction sets
593 vp8_intra_pred_uv_ho mmx2
594 vp8_intra_pred_uv_ho ssse3
596 ;void vp8_intra_pred_y_dc_sse2(
597 ; unsigned char *dst,
; int dst_stride (arg 1),
599 ; unsigned char *above,
600 ; unsigned char *left,
; int left_stride (arg 4)
; 16x16 luma DC prediction: sums the 16 above pixels (via psadbw-style
; reduction into xmm1 -- partially visible) and the 16 left pixels (byte
; by byte into ecx/edx), rounds by 16, averages by 32, and fills the
; block with 16-byte aligned stores.
; NOTE(review): the adds between movzx loads, the rsi advances between
; groups of four, the shift after the +16 round, and prologue/epilogue
; are missing from this extract.
603 global sym(vp8_intra_pred_y_dc_sse2) PRIVATE
604 sym(vp8_intra_pred_y_dc_sse2):
607 SHADOW_ARGS_TO_STACK 5
613 mov rdi, arg(2) ;above
614 mov rsi, arg(3) ;left
615 movsxd rax, dword ptr arg(4) ;left_stride;
; fold high qword of the above-row sum into position
621 punpckhqdq xmm1, xmm1
; left pixels 0-3 (ecx seeds the running sum; [rsi+rdi]: rdi presumably
; a stride multiple here -- TODO confirm)
627 movzx ecx, byte [rsi]
628 movzx edx, byte [rsi+rax]
630 movzx edx, byte [rsi+rax*2]
632 movzx edx, byte [rsi+rdi]
; left pixels 4-7
636 movzx edx, byte [rsi]
638 movzx edx, byte [rsi+rax]
640 movzx edx, byte [rsi+rax*2]
642 movzx edx, byte [rsi+rdi]
; left pixels 8-11
646 movzx edx, byte [rsi]
648 movzx edx, byte [rsi+rax]
650 movzx edx, byte [rsi+rax*2]
652 movzx edx, byte [rsi+rdi]
; left pixels 12-15
656 movzx edx, byte [rsi]
658 movzx edx, byte [rsi+rax]
660 movzx edx, byte [rsi+rax*2]
662 movzx edx, byte [rsi+rdi]
; combine above-sum (xmm1 low word) + left-sum (ecx) + 16 rounding
666 pextrw edx, xmm1, 0x0
667 lea edx, [edx+ecx+16]
670 ; FIXME use pshufb for ssse3 version
; broadcast the DC word across xmm1
671 pshuflw xmm1, xmm1, 0x0
672 punpcklqdq xmm1, xmm1
; write out 16 rows; rax presumably rcx*3 here -- TODO confirm
677 mov rdi, arg(0) ;dst;
678 movsxd rcx, dword ptr arg(1) ;dst_stride
683 movdqa [rdi+rcx ], xmm1
684 movdqa [rdi+rcx*2], xmm1
685 movdqa [rdi+rax ], xmm1
688 movdqa [rdi+rcx ], xmm1
689 movdqa [rdi+rcx*2], xmm1
690 movdqa [rdi+rax ], xmm1
702 ;void vp8_intra_pred_y_dctop_sse2(
703 ; unsigned char *dst,
; int dst_stride (arg 1),
705 ; unsigned char *above,
706 ; unsigned char *left,
; int left_stride (arg 4)
; 16x16 luma DC prediction using only the above row: sum the 16 above
; pixels, round with dc_8, average, broadcast, and fill the block.
; NOTE(review): the initial load/psadbw reduction, the shift, the byte
; pack, several store rows, and prologue/epilogue are missing from this
; extract.
709 global sym(vp8_intra_pred_y_dctop_sse2) PRIVATE
710 sym(vp8_intra_pred_y_dctop_sse2):
713 SHADOW_ARGS_TO_STACK 5
718 ;arg(3), arg(4) not used
721 mov rcx, arg(2) ;above;
; fold high qword of the partial sum, then add the rounding constant
726 punpckhqdq xmm1, xmm1
730 paddw xmm1, [GLOBAL(dc_8)]
732 ; FIXME use pshufb for ssse3 version
; broadcast the DC word across xmm1
733 pshuflw xmm1, xmm1, 0x0
734 punpcklqdq xmm1, xmm1
; write out; rax presumably rcx*3 here -- TODO confirm
739 mov rdx, arg(0) ;dst;
740 movsxd rcx, dword ptr arg(1) ;dst_stride
745 movdqa [rdx+rcx ], xmm1
746 movdqa [rdx+rcx*2], xmm1
747 movdqa [rdx+rax ], xmm1
750 movdqa [rdx+rcx ], xmm1
751 movdqa [rdx+rcx*2], xmm1
752 movdqa [rdx+rax ], xmm1
764 ;void vp8_intra_pred_y_dcleft_sse2(
765 ; unsigned char *dst,
; int dst_stride (arg 1),
767 ; unsigned char *above,
768 ; unsigned char *left,
; int left_stride (arg 4)
; 16x16 luma DC prediction using only the left column: sums the 16 left
; pixels byte by byte, averages, broadcasts, and fills the block.
; NOTE(review): the adds between the movzx loads, the rsi advances
; between groups of four, the averaging shift, and prologue/epilogue are
; missing from this extract.
771 global sym(vp8_intra_pred_y_dcleft_sse2) PRIVATE
772 sym(vp8_intra_pred_y_dcleft_sse2):
775 SHADOW_ARGS_TO_STACK 5
783 mov rsi, arg(3) ;left;
784 movsxd rax, dword ptr arg(4) ;left_stride;
; left pixels 0-3 (ecx seeds the sum; [rsi+rdi]: rdi presumably a stride
; multiple -- TODO confirm)
787 movzx ecx, byte [rsi]
788 movzx edx, byte [rsi+rax]
790 movzx edx, byte [rsi+rax*2]
792 movzx edx, byte [rsi+rdi]
; left pixels 4-7
795 movzx edx, byte [rsi]
797 movzx edx, byte [rsi+rax]
799 movzx edx, byte [rsi+rax*2]
801 movzx edx, byte [rsi+rdi]
; left pixels 8-11
804 movzx edx, byte [rsi]
806 movzx edx, byte [rsi+rax]
808 movzx edx, byte [rsi+rax*2]
810 movzx edx, byte [rsi+rdi]
; left pixels 12-15
813 movzx edx, byte [rsi]
815 movzx edx, byte [rsi+rax]
817 movzx edx, byte [rsi+rax*2]
819 movzx edx, byte [rsi+rdi]
825 ; FIXME use pshufb for ssse3 version
; broadcast the DC word across xmm1
826 pshuflw xmm1, xmm1, 0x0
827 punpcklqdq xmm1, xmm1
; write out; rax presumably rcx*3 here -- TODO confirm
832 mov rdi, arg(0) ;dst;
833 movsxd rcx, dword ptr arg(1) ;dst_stride
838 movdqa [rdi+rcx ], xmm1
839 movdqa [rdi+rcx*2], xmm1
840 movdqa [rdi+rax ], xmm1
843 movdqa [rdi+rcx ], xmm1
844 movdqa [rdi+rcx*2], xmm1
845 movdqa [rdi+rax ], xmm1
857 ;void vp8_intra_pred_y_dc128_sse2(
858 ; unsigned char *dst,
; int dst_stride (arg 1),
860 ; unsigned char *above,
861 ; unsigned char *left,
; int left_stride (arg 4)
; 16x16 luma DC prediction with no neighbors available: fills the block
; with the constant 128 using aligned 16-byte stores.
; NOTE(review): some store/lea rows and the prologue/epilogue are missing
; from this extract; rcx presumably holds rdx*3 -- TODO confirm.
864 global sym(vp8_intra_pred_y_dc128_sse2) PRIVATE
865 sym(vp8_intra_pred_y_dc128_sse2):
868 SHADOW_ARGS_TO_STACK 5
873 ;arg(2), arg(3), arg(4) not used
; xmm1 = sixteen bytes of 128
877 movdqa xmm1, [GLOBAL(dc_128)]
878 mov rax, arg(0) ;dst;
879 movsxd rdx, dword ptr arg(1) ;dst_stride
884 movdqa [rax+rdx ], xmm1
885 movdqa [rax+rdx*2], xmm1
886 movdqa [rax+rcx ], xmm1
889 movdqa [rax+rdx ], xmm1
890 movdqa [rax+rdx*2], xmm1
891 movdqa [rax+rcx ], xmm1
903 ;void vp8_intra_pred_y_tm_sse2(
904 ; unsigned char *dst,
; int dst_stride (arg 1),
906 ; unsigned char *above,
907 ; unsigned char *left,
; int left_stride (arg 4)
; TrueMotion (TM) 16x16 luma prediction, emitted for both sse2 and ssse3
; via this macro: pred[r][c] = clip(above[c] + left[r] - topleft).
; NOTE(review): this extract omits the topleft load, the in-loop
; arithmetic/pack/clip, the loop counter, and the prologue/epilogue.
910 %macro vp8_intra_pred_y_tm 1
911 global sym(vp8_intra_pred_y_tm_%1) PRIVATE
912 sym(vp8_intra_pred_y_tm_%1):
915 SHADOW_ARGS_TO_STACK 5
; read above row and the bias constant
925 mov rsi, arg(2) ;above
926 movsxd rax, dword ptr arg(4) ;left_stride;
929 movdqa xmm3, [GLOBAL(dc_1024)]
936 ; set up left ptrs and subtract topleft
938 mov rsi, arg(3) ;left
; broadcast low word of xmm4 across the register
941 pshuflw xmm4, xmm4, 0x0
942 punpcklqdq xmm4, xmm4
; per-row output loop: two rows per iteration
950 mov rdi, arg(0) ;dst;
951 movsxd rcx, dword ptr arg(1) ;dst_stride
952 vp8_intra_pred_y_tm_%1_loop:
; broadcast this iteration's two left-pixel values
961 pshuflw xmm4, xmm4, 0x0
962 pshuflw xmm5, xmm5, 0x0
963 punpcklqdq xmm4, xmm4
964 punpcklqdq xmm5, xmm5
; store second row of the pair
978 movdqa [rdi+rcx], xmm5
982 jnz vp8_intra_pred_y_tm_%1_loop
; instantiate for both target instruction sets
995 vp8_intra_pred_y_tm sse2
996 vp8_intra_pred_y_tm ssse3
998 ;void vp8_intra_pred_y_ve_sse2(
999 ; unsigned char *dst,
; int dst_stride (arg 1),
1001 ; unsigned char *above,
1002 ; unsigned char *left,
; int left_stride (arg 4)
; Vertical (VE) 16x16 luma prediction: copies the 16-pixel above row into
; every row of the destination block with aligned stores.
; NOTE(review): the above-row load into xmm1, the [rax] row stores, and
; the prologue/epilogue are missing from this extract; rcx presumably
; holds rdx*3 -- TODO confirm.
1005 global sym(vp8_intra_pred_y_ve_sse2) PRIVATE
1006 sym(vp8_intra_pred_y_ve_sse2):
1009 SHADOW_ARGS_TO_STACK 5
1013 ;arg(3), arg(4) not used
1015 mov rax, arg(2) ;above;
1017 movsxd rdx, dword ptr arg(1) ;dst_stride
; replicate the above row (xmm1) down the dst rows, 4 rows per group
1023 mov rax, arg(0) ;dst;
1028 movdqa [rax+rdx ], xmm1
1029 movdqa [rax+rdx*2], xmm1
1030 movdqa [rax+rcx ], xmm1
1031 lea rax, [rax+rdx*4]
1033 movdqa [rax+rdx ], xmm1
1034 movdqa [rax+rdx*2], xmm1
1035 movdqa [rax+rcx ], xmm1
1036 lea rax, [rax+rdx*4]
1046 ;void vp8_intra_pred_y_ho_sse2(
1047 ; unsigned char *dst,
; int dst_stride (arg 1),
1049 ; unsigned char *above,
1050 ; unsigned char *left,
; int left_stride (arg 4)
; Horizontal (HO) 16x16 luma prediction: each dst row is filled with that
; row's left-column pixel; two rows are produced per loop iteration by
; splatting two left bytes across xmm0/xmm1.
; NOTE(review): the per-iteration left-pixel loads, the xmm0 row store,
; the loop counter, and the epilogue lie outside this extract (the
; function continues past the visible lines).
1053 global sym(vp8_intra_pred_y_ho_sse2) PRIVATE
1054 sym(vp8_intra_pred_y_ho_sse2):
1057 SHADOW_ARGS_TO_STACK 5
1065 ; read from left and write out
1067 mov rsi, arg(3) ;left;
1068 movsxd rax, dword ptr arg(4) ;left_stride;
1069 mov rdi, arg(0) ;dst;
1070 movsxd rcx, dword ptr arg(1) ;dst_stride
1072 vp8_intra_pred_y_ho_sse2_loop:
1078 ; FIXME use pshufb for ssse3 version
; splat each left byte across all 16 lanes:
; byte -> word (punpcklbw), word -> qword (pshuflw), qword -> dqword
1079 punpcklbw xmm0, xmm0
1080 punpcklbw xmm1, xmm1
1081 pshuflw xmm0, xmm0, 0x0
1082 pshuflw xmm1, xmm1, 0x0
1083 punpcklqdq xmm0, xmm0
1084 punpcklqdq xmm1, xmm1
; store second of the two rows, then advance src/dst by two rows
1086 movdqa [rdi+rcx], xmm1
1087 lea rsi, [rsi+rax*2]
1088 lea rdi, [rdi+rcx*2]
1090 jnz vp8_intra_pred_y_ho_sse2_loop