2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
; Entry point vp9_get_mb_ss_sse2.
; Only the symbol export, the argument-shadowing prologue and the first
; argument load are visible in this excerpt.
14 ;unsigned int vp9_get_mb_ss_sse2
18 global sym(vp9_get_mb_ss_sse2) PRIVATE
19 sym(vp9_get_mb_ss_sse2):
; One argument is declared to the ABI helper macro; arg(0) below refers to it.
; NOTE(review): sym(), PRIVATE, SHADOW_ARGS_TO_STACK and arg() are defined
; outside this file (vpx_ports/x86_abi_support.asm is included) -- confirm
; their exact behaviour there.
22 SHADOW_ARGS_TO_STACK 1
30 mov rax, arg(0) ;[src_ptr] rax = first argument
; vp9_get16x16var_sse2 -- declared with six arguments (SHADOW_ARGS_TO_STACK 6).
; Register roles established by the lines visible here:
;   rsi  = arg(0) src_ptr          rdi  = arg(2) ref_ptr
;   rax  = arg(1) source_stride    rdx  = arg(3) recon_stride
;          (both strides are sign-extended from 32 to 64 bits)
;   xmm0 = zero, used for unpacking
;   xmm7 = accumulator for differences
;   xmm6 = accumulator for sse
; Results are written through the pointers in arg(5) (Sum) and arg(4) (SSE).
72 ;unsigned int vp9_get16x16var_sse2
74 ; unsigned char * src_ptr,
76 ; unsigned char * ref_ptr,
81 global sym(vp9_get16x16var_sse2) PRIVATE
82 sym(vp9_get16x16var_sse2):
85 SHADOW_ARGS_TO_STACK 6
92 mov rsi, arg(0) ;[src_ptr]
93 mov rdi, arg(2) ;[ref_ptr]
95 movsxd rax, DWORD PTR arg(1) ;[source_stride]
96 movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
; Prefetch upcoming rows of both buffers into cache (base + 2 * stride).
; NOTE(review): rbx is used as a base address in two of these prefetches,
; but the instruction that sets rbx is not visible in this excerpt.
102 prefetcht0 [rsi+rax*2]
107 prefetcht0 [rbx+rax*2]
113 prefetcht0 [rdi+rdx*2]
118 prefetcht0 [rbx+rdx*2]
121 pxor xmm0, xmm0 ; clear xmm0 for unpack
122 pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
124 pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
; Unaligned 16-byte loads: one row of source and one row of reference.
128 movdqu xmm1, XMMWORD PTR [rsi]
129 movdqu xmm2, XMMWORD PTR [rdi]
; Prefetch eight strides ahead of the current row in each buffer.
131 prefetcht0 [rsi+rax*8]
132 prefetcht0 [rdi+rdx*8]
; Output stage: rax and rdi are reused as the destination pointers.
197 mov rax, arg(5) ;[Sum]
198 mov rdi, arg(4) ;[SSE]
; Store the low 32 bits of each result register through the pointers.
; NOTE(review): presumably xmm7/xmm1 hold the reduced sum and sse totals at
; this point; the reduction code is not visible in this excerpt.
200 movd DWORD PTR [rax], xmm7
201 movd DWORD PTR [rdi], xmm1
; vp9_get8x8var_sse2 -- declared with six arguments (SHADOW_ARGS_TO_STACK 6).
; Register roles established by the lines visible here:
;   rsi  = arg(0) src_ptr          rdi  = arg(2) ref_ptr
;   rax  = arg(1) source_stride    rdx  = arg(3) recon_stride
;          (both strides are sign-extended from 32 to 64 bits)
;   xmm0 = zero, used for unpacking
;   xmm7 = accumulator for differences
; Each movq below loads 8 bytes (one 8-pixel row) from source or reference.
; Rows are addressed as base + stride and base + 2 * stride, after which both
; base pointers are advanced by two strides with lea.
; Results are written through the pointers in arg(5) (Sum) and arg(4) (SSE).
216 ;unsigned int vp9_get8x8var_sse2
218 ; unsigned char * src_ptr,
220 ; unsigned char * ref_ptr,
222 ; unsigned int * SSE,
225 global sym(vp9_get8x8var_sse2) PRIVATE
226 sym(vp9_get8x8var_sse2):
229 SHADOW_ARGS_TO_STACK 6
237 mov rsi, arg(0) ;[src_ptr]
238 mov rdi, arg(2) ;[ref_ptr]
240 movsxd rax, DWORD PTR arg(1) ;[source_stride]
241 movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
243 pxor xmm0, xmm0 ; clear xmm0 for unpack
244 pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
; Row at the initial base pointers.
246 movq xmm1, QWORD PTR [rsi]
247 movq xmm2, QWORD PTR [rdi]
; Row at base + 1 stride.
257 movq xmm2, QWORD PTR[rsi + rax]
258 movq xmm3, QWORD PTR[rdi + rdx]
; Row at base + 2 strides.
270 movq xmm2, QWORD PTR[rsi + rax * 2]
271 movq xmm3, QWORD PTR[rdi + rdx * 2]
; Advance both base pointers by two rows.
283 lea rsi, [rsi + rax * 2]
284 lea rdi, [rdi + rdx * 2]
; Row at new base + 1 stride.
285 movq xmm2, QWORD PTR[rsi + rax]
286 movq xmm3, QWORD PTR[rdi + rdx]
; Row at new base + 2 strides.
297 movq xmm2, QWORD PTR[rsi + rax *2]
298 movq xmm3, QWORD PTR[rdi + rdx *2]
; Advance both base pointers by two rows.
310 lea rsi, [rsi + rax * 2]
311 lea rdi, [rdi + rdx * 2]
; Row at new base + 1 stride.
314 movq xmm2, QWORD PTR[rsi + rax]
315 movq xmm3, QWORD PTR[rdi + rdx]
; Row at new base + 2 strides.
326 movq xmm2, QWORD PTR[rsi + rax *2]
327 movq xmm3, QWORD PTR[rdi + rdx *2]
; Advance both base pointers by two rows.
339 lea rsi, [rsi + rax * 2]
340 lea rdi, [rdi + rdx * 2]
; Row at new base + 1 stride.
342 movq xmm2, QWORD PTR[rsi + rax]
343 movq xmm3, QWORD PTR[rdi + rdx]
; Output stage: rax and rdi are reused as the destination pointers.
382 mov rax, arg(5) ;[Sum]
383 mov rdi, arg(4) ;[SSE]
; Store ecx through the Sum pointer and the low 32 bits of xmm1 through the
; SSE pointer.
; NOTE(review): the code that produces ecx and the final xmm1 value is not
; visible in this excerpt -- confirm they hold the sum and sse totals.
388 mov dword ptr [rax], ecx
389 movd DWORD PTR [rdi], xmm1
; vp9_half_horiz_vert_variance8x_h_sse2 -- declared with seven arguments
; (SHADOW_ARGS_TO_STACK 7).
; Register roles established by the lines visible here:
;   rsi  = arg(0) ref_ptr            rdi = arg(2) src_ptr
;   r8   = arg(1) ref_pixels_per_line (sign-extended)
;   r9   = arg(3) src_pixels_per_line (sign-extended)
;   rax  = arg(1) ref_pixels_per_line (sign-extended, second copy)
;   rcx  = arg(4) Height (loaded with movsxd)
;   xmm6 = accumulator for differences, xmm7 = accumulator for squared differences
;   xmm5 = horizontal average of the previous reference row, carried across iterations
; NOTE(review): xmm0 is used as the zero operand of punpcklbw below, but the
; instruction that clears it is not visible in this excerpt.
401 ;void vp9_half_horiz_vert_variance8x_h_sse2
403 ; unsigned char *ref_ptr,
404 ; int ref_pixels_per_line,
405 ; unsigned char *src_ptr,
406 ; int src_pixels_per_line,
407 ; unsigned int Height,
409 ; unsigned int *sumsquared
411 global sym(vp9_half_horiz_vert_variance8x_h_sse2) PRIVATE
412 sym(vp9_half_horiz_vert_variance8x_h_sse2):
415 SHADOW_ARGS_TO_STACK 7
423 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
424 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
427 pxor xmm6, xmm6 ; error accumulator
428 pxor xmm7, xmm7 ; sse accumulator
429 mov rsi, arg(0) ;ref_ptr ;
431 mov rdi, arg(2) ;src_ptr ;
432 movsxd rcx, dword ptr arg(4) ;Height ;
433 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
; Prime xmm5 with the horizontal half-pel average of the first reference row.
437 movq xmm5, QWORD PTR [rsi] ; xmm5 = s0..s7 (8 bytes)
438 movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1..s8 (same row, shifted one byte)
439 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3) horizontal line 1
; NOTE(review): this adds a dword memory operand to the 64-bit rsi, while r8
; already holds the same stride sign-extended. Presumably one arm of a
; conditional-assembly pair whose directives are not visible here -- confirm.
442 add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source
447 .half_horiz_vert_variance8x_h_1:
449 movq xmm1, QWORD PTR [rsi] ;
450 movq xmm2, QWORD PTR [rsi+1] ;
451 pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm2) horizontal line i+1
453 pavgb xmm5, xmm1 ; xmm5 = vertical average of previous and current horizontal averages
454 punpcklbw xmm5, xmm0 ; xmm5 = words of above
456 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0..d7 (8 bytes)
457 punpcklbw xmm3, xmm0 ; xmm3 = words of above
459 psubw xmm5, xmm3 ; xmm5 -= xmm3
460 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
461 pmaddwd xmm5, xmm5 ; square each word difference, summing adjacent pairs into dwords
462 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
464 movdqa xmm5, xmm1 ; save xmm1 for use on the next row
; NOTE(review): these advance the pointers through the 32-bit esi/edi views;
; a 32-bit register write zeroes the upper half of rsi/rdi. r8/r9 hold the
; same strides for 64-bit use. Presumably the 32-bit arm of a
; conditional-assembly pair not visible in this excerpt -- confirm.
467 add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
468 add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
; Loop back while ZF is clear. The flag-setting instruction (presumably a
; decrement of the rcx row counter) is not visible in this excerpt.
475 jnz .half_horiz_vert_variance8x_h_1 ;
; Output stage: rsi and rdi are reused as the destination pointers.
507 mov rsi, arg(5) ; sum
508 mov rdi, arg(6) ; sumsquared
; vp9_half_vert_variance8x_h_sse2 -- declared with seven arguments
; (SHADOW_ARGS_TO_STACK 7).
; Register roles established by the lines visible here:
;   rsi  = arg(0) ref_ptr            rdi = arg(2) src_ptr
;   r8   = arg(1) ref_pixels_per_line (sign-extended)
;   r9   = arg(3) src_pixels_per_line (sign-extended)
;   rax  = arg(1) ref_pixels_per_line (sign-extended), used as the row offset
;   rcx  = arg(4) Height (loaded with movsxd)
;   xmm6 = accumulator for differences, xmm7 = accumulator for squared differences
; Each iteration averages a reference row with the row one stride below it.
; NOTE(review): xmm0 is used as the zero operand of punpcklbw below, but the
; instruction that clears it is not visible in this excerpt.
523 ;void vp9_half_vert_variance8x_h_sse2
525 ; unsigned char *ref_ptr,
526 ; int ref_pixels_per_line,
527 ; unsigned char *src_ptr,
528 ; int src_pixels_per_line,
529 ; unsigned int Height,
531 ; unsigned int *sumsquared
533 global sym(vp9_half_vert_variance8x_h_sse2) PRIVATE
534 sym(vp9_half_vert_variance8x_h_sse2):
537 SHADOW_ARGS_TO_STACK 7
545 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
546 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
549 pxor xmm6, xmm6 ; error accumulator
550 pxor xmm7, xmm7 ; sse accumulator
551 mov rsi, arg(0) ;ref_ptr ;
553 mov rdi, arg(2) ;src_ptr ;
554 movsxd rcx, dword ptr arg(4) ;Height ;
555 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
558 .half_vert_variance8x_h_1:
559 movq xmm5, QWORD PTR [rsi] ; xmm5 = 8 bytes of the current reference row
560 movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = 8 bytes of the next reference row (rsi + ref stride)
562 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3), vertical average
563 punpcklbw xmm5, xmm0 ; xmm5 = words of above
565 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0..d7 (8 bytes)
566 punpcklbw xmm3, xmm0 ; xmm3 = words of above
568 psubw xmm5, xmm3 ; xmm5 -= xmm3
569 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
570 pmaddwd xmm5, xmm5 ; square each word difference, summing adjacent pairs into dwords
571 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
; NOTE(review): these advance the pointers through the 32-bit esi/edi views;
; a 32-bit register write zeroes the upper half of rsi/rdi. r8/r9 hold the
; same strides for 64-bit use. Presumably the 32-bit arm of a
; conditional-assembly pair not visible in this excerpt -- confirm.
574 add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
575 add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
; Loop back while ZF is clear. The flag-setting instruction (presumably a
; decrement of the rcx row counter) is not visible in this excerpt.
582 jnz .half_vert_variance8x_h_1 ;
; Output stage: rsi and rdi are reused as the destination pointers.
614 mov rsi, arg(5) ; sum
615 mov rdi, arg(6) ; sumsquared
631 ;void vp9_half_horiz_variance8x_h_sse2
633 ; unsigned char *ref_ptr,
634 ; int ref_pixels_per_line,
635 ; unsigned char *src_ptr,
636 ; int src_pixels_per_line,
637 ; unsigned int Height,
639 ; unsigned int *sumsquared
641 global sym(vp9_half_horiz_variance8x_h_sse2) PRIVATE
642 sym(vp9_half_horiz_variance8x_h_sse2):
645 SHADOW_ARGS_TO_STACK 7
653 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
654 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
657 pxor xmm6, xmm6 ; error accumulator
658 pxor xmm7, xmm7 ; sse eaccumulator
659 mov rsi, arg(0) ;ref_ptr ;
661 mov rdi, arg(2) ;src_ptr ;
662 movsxd rcx, dword ptr arg(4) ;Height ;
665 .half_horiz_variance8x_h_1:
666 movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
667 movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
669 pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
670 punpcklbw xmm5, xmm0 ; xmm5 = words of above
672 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
673 punpcklbw xmm3, xmm0 ; xmm3 = words of above
675 psubw xmm5, xmm3 ; xmm5 -= xmm3
676 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
677 pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
678 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
681 add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
682 add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
688 jnz .half_horiz_variance8x_h_1 ;
720 mov rsi, arg(5) ; sum
721 mov rdi, arg(6) ; sumsquared