2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
14 ;macros used by the deblock functions
21 ;calculate absolute value
; SECOND_2_ROWS: zero-parameter macro. Its body is not fully visible in this
; excerpt; the surviving comments show it takes an absolute value and then
; decides whether the filtered value is used.
42 %macro SECOND_2_ROWS 0
48 ;calculate absolute value
74 ;decide whether or not to use the filtered value
; UPDATE_FLIMIT: zero-parameter macro. Loads xmm2 with the 16 bytes at [rbx].
; movdqa is an aligned load, so [rbx] must be 16-byte aligned or it faults.
80 %macro UPDATE_FLIMIT 0
81 movdqa xmm2, XMMWORD PTR [rbx]
86 ;void vp8_post_proc_down_and_across_mb_row_sse2
88 ; unsigned char *src_ptr,
89 ; unsigned char *dst_ptr,
90 ; int src_pixels_per_line,
91 ; int dst_pixels_per_line,
; Remaining arguments, as they are read in the body below:
;   arg(4) = cols, arg(5) = flimits pointer, arg(6) = rows in a macroblock.
;
; Per row: a vertical ("down") pass reads 16 bytes at a time from the source
; row and its neighbouring rows and stores the result to the destination row;
; a horizontal ("across") pass then filters the destination row in place.
;
; Register roles visible in this excerpt:
;   rsi = src_ptr        rdi = dst_ptr        rbx = flimits pointer
;   rax = src_pixels_per_line (sign-extended at entry)
;   rcx = rows remaining rdx = column offset, compared against cols
96 global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
97 sym(vp8_post_proc_down_and_across_mb_row_sse2):
100 SHADOW_ARGS_TO_STACK 7
109 ; put flimit on stack
110 mov rbx, arg(5) ;flimits ptr
115 mov rsi, arg(0) ;src_ptr
116 mov rdi, arg(1) ;dst_ptr
118 movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line (sign-extended to 64 bits)
119 movsxd rcx, DWORD PTR arg(6) ;rows in a macroblock
123 ;load current and next 2 rows
124 movdqu xmm0, XMMWORD PTR [rsi]
125 movdqu xmm1, XMMWORD PTR [rsi + rax]
126 movdqu xmm3, XMMWORD PTR [rsi + 2*rax]
; NOTE(review): the two loads below presumably run with rax negated so that
; they address the two rows above; the negation is not visible in this excerpt.
132 movdqu xmm1, XMMWORD PTR [rsi + 2*rax]
133 movdqu xmm3, XMMWORD PTR [rsi + rax]
137 movdqu XMMWORD PTR [rdi], xmm0
139 neg rax ; back to positive stride
144 cmp edx, dword arg(4) ;cols
150 ; done with all the cols, start the across filtering in place
154 mov rbx, arg(5) ; flimits (reloaded for the across pass)
157 ; dup the first byte into the left border 8 times
165 ; dup the last byte into the right border
166 movsxd rdx, dword arg(4)
167 movq mm1, [rdi + rdx + -1] ; 8-byte load starting at the last byte of the row (rdx = cols)
174 movq mm0, QWORD PTR [rdi-16];
175 movq mm1, QWORD PTR [rdi-8];
178 movdqu xmm0, XMMWORD PTR [rdi + rdx]
179 movdqu xmm1, XMMWORD PTR [rdi + rdx -2]
180 movdqu xmm3, XMMWORD PTR [rdi + rdx -1]
184 movdqu xmm1, XMMWORD PTR [rdi + rdx +1]
185 movdqu xmm3, XMMWORD PTR [rdi + rdx +2]
189 movq QWORD PTR [rdi+rdx-16], mm0 ; write the 8 bytes held in mm0, 16 behind the current column
190 movq QWORD PTR [rdi+rdx-8], mm1 ; write the 8 bytes held in mm1, 8 behind the current column
196 cmp edx, dword arg(4) ;cols
203 movq QWORD PTR [rdi+rdx-16], mm0
205 cmp edx, dword arg(4)
207 movq QWORD PTR [rdi+rdx-8], mm1
210 add rsi,rax ;next src line
; NOTE(review): the two `mov eax, dword ...` reloads below zero-extend into
; rax, unlike the movsxd used for src_pixels_per_line at entry. A negative
; stride would therefore be added as a large positive value. Confirm that
; callers only pass non-negative strides.
211 mov eax, dword arg(3) ;dst_pixels_per_line
212 add rdi,rax ;next destination
213 mov eax, dword arg(2) ;src_pixels_per_line
215 mov rbx, arg(5) ;flimits
218 dec rcx ;decrement count
219 jnz .nextrow ;next row
233 ;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
234 ; int pitch, int rows, int cols,int flimit)
;
; Processes the image in 8-pixel-wide column strips (see the c += 8 loop
; comment and the `s += 8` / `cols -= 8` updates at the end of this block).
; Stack layout set up below: d[16][8] at [rsp] (128 bytes), followed by
; flimit4 (flimit replicated as dwords) at [rsp+128].
236 global sym(vp8_mbpost_proc_down_xmm) PRIVATE
237 sym(vp8_mbpost_proc_down_xmm):
240 SHADOW_ARGS_TO_STACK 5
250 ; unsigned char d[16][8] at [rsp]
251 ; create flimit4 at [rsp+128]
252 mov eax, dword ptr arg(4) ;flimit
256 mov [rsp+128+12], eax
257 %define flimit4 [rsp+128]
260 lea r8, [GLOBAL(sym(vp8_rv))] ; r8 = address of vp8_rv, used by the [r8 + rcx*2] load below
266 ;for(c=0; c<cols; c+=8)
271 movsxd rax, dword ptr arg(1) ;pitch ;
273 ; this copies the last row down into the border 8 rows
279 movq xmm1, QWORD ptr[rdi] ; first row
; NOTE(review): the label below has no trailing colon; NASM warns about a
; label alone on a line without a colon.
281 .init_borderd ; initialize borders
288 neg rax ; rax = -pitch
290 ; this copies the first row up into the border 8 rows
292 movq xmm1, QWORD ptr[rdi] ; first row
; NOTE(review): same missing-colon label style as .init_borderd above.
294 .init_border ; initialize borders
303 lea rsi, [rsi + rax*8]; ; rsi = s[-pitch*8]
315 movq xmm1, QWORD PTR [rdi];
316 punpcklbw xmm1, xmm0 ;
322 punpcklwd xmm1, xmm0 ;
324 punpckhwd xmm2, xmm0 ;
332 ;save the var and sum
335 movq xmm1, QWORD PTR [rsi] ; [s-pitch*8]
336 movq xmm2, QWORD PTR [rdi] ; [s+pitch*7]
395 movq xmm1, QWORD PTR [rsi+rax*8]
; Three alternative ways of addressing vp8_rv, selected at assembly time. Only
; the first %if is visible here; the other branch directives are outside this
; excerpt.
404 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
406 lea rax, [GLOBAL(sym(vp8_rv))]
407 movdqu xmm4, [rax + rcx*2] ;vp8_rv[rcx*2]
410 movdqu xmm4, [r8 + rcx*2] ;vp8_rv[rcx*2]
412 movdqu xmm4, [sym(vp8_rv) + rcx*2]
426 movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]
434 movq mm0, [rsp + rcx*8] ;d[rcx*8]
443 cmp edx, dword arg(2) ;rows
; NOTE(review): arg(0) holds the pointer `dst`, but the add below is 32 bits
; wide. On a 64-bit build only the low dword of the stack slot is updated, so a
; carry out of bit 31 would be lost. Compare `add arg(0), rax` in
; vp8_plane_add_noise_wmt, which is a full-width add.
446 add dword arg(0), 8 ; s += 8
447 sub dword arg(3), 8 ; cols -= 8
465 ;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
466 ; int pitch, int rows, int cols,int flimit)
;
; In-place horizontal pass, one row per outer iteration (rows is decremented
; by 1 and pitch is reloaded at the end of this block). flimit4 lives at [rsp].
; Lane comments below use "a--b" for s[+a] - s[-b] and "a+-b" for s[+a] + s[-b].
467 global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
468 sym(vp8_mbpost_proc_across_ip_xmm):
471 SHADOW_ARGS_TO_STACK 5
481 ; create flimit4 at [rsp]
482 mov eax, dword ptr arg(4) ;flimit
487 %define flimit4 [rsp]
493 xor rdx, rdx ;sumsq=0;
498 ; dup the first byte into the left border 8 times
507 ; dup the last byte into the right border
508 movsxd rdx, dword arg(3)
509 movq mm1, [rsi + rdx + -1] ; 8-byte load starting at the last byte of the row (rdx = cols)
518 ; sumsq += s[i]*s[i];
521 movzx eax, byte [rsi+rdi]
541 movsxd rdx, dword arg(3) ;cols
549 movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5
550 movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10
552 punpcklbw xmm1, xmm0 ; expanding
553 punpcklbw xmm2, xmm0 ; expanding
555 punpcklwd xmm1, xmm0 ; expanding to dwords
556 punpcklwd xmm2, xmm0 ; expanding to dwords
558 psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5
559 paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2
561 paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5
562 pmaddwd xmm1, xmm2 ; per lane (sum)*(difference) = s[+a]^2 - s[-b]^2, not the square of the sum
567 pshufd xmm6, xmm6, 0 ; duplicate the last ones
568 pshufd xmm7, xmm7, 0 ; duplicate the last ones
570 psrldq xmm1, 4 ; 8--7 9--6 10--5 0000
571 psrldq xmm2, 4 ; 8--7 9--6 10--5 0000
573 pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared
574 pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared
579 pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared
580 pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared
; Immediate 10111111b picks source dword 3 (zero after the psrldq) for lanes
; 0-2 and source dword 2 (the 10--5 term) for lane 3.
585 pshufd xmm3, xmm1, 10111111b ; 0000 0000 0000 10--5 squared
586 pshufd xmm4, xmm2, 10111111b ; 0000 0000 0000 10--5
606 movd xmm1, DWORD PTR [rsi+rcx]
; four8s is defined outside this excerpt. As a memory operand of paddd it must
; be 16-byte aligned.
613 paddd xmm1, [GLOBAL(four8s)]
624 movd [rsi+rcx-8], mm0 ; write 4 bytes, 8 behind the current column
637 movsxd rax, dword arg(1) ; pitch
640 sub dword arg(2), 1 ;rows-=1
658 ;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
659 ; unsigned char blackclamp[16],
660 ; unsigned char whiteclamp[16],
661 ; unsigned char bothclamp[16],
662 ; unsigned int Width, unsigned int Height, int Pitch)
;
; For each row: clamps 16 source bytes at a time using the three clamp
; vectors, adds the noise bytes, and stores the result back over the source.
; LIBVPX_RAND is called once per row (how its result is used is outside this
; excerpt). Start (arg(0)) and Height (arg(6)) are updated in their stack
; slots at the end of each row.
663 global sym(vp8_plane_add_noise_wmt) PRIVATE
664 sym(vp8_plane_add_noise_wmt):
667 SHADOW_ARGS_TO_STACK 8
674 call sym(LIBVPX_RAND) WRT_PLT
675 mov rcx, arg(1) ;noise
679 ; we rely on the fact that the clamping vectors are stored contiguously
680 ; in black/white/both order. Note that we have to reload this here because
681 ; rdx could be trashed by rand()
682 mov rdx, arg(2) ; blackclamp
; NOTE(review): Width is declared unsigned int but is sign-extended here.
; This only matters for values >= 2^31. Confirm that is out of range.
686 movsxd rcx, dword arg(5) ;[Width]
691 movdqu xmm1,[rsi+rax] ; get the source
; The three clamp operands are memory operands of legacy SSE instructions, so
; rdx (blackclamp) must be 16-byte aligned. Offsets: +0 black, +16 white,
; +32 both.
693 psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
694 paddusb xmm1, [rdx+32] ;bothclamp
695 psubusb xmm1, [rdx+16] ;whiteclamp
697 movdqu xmm2,[rdi+rax] ; get the noise for this line
698 paddb xmm1,xmm2 ; add it in (wrapping add; the clamp above leaves headroom)
699 movdqu [rsi+rax],xmm1 ; store the result
701 add rax,16 ; advance 16 bytes along the current row
706 movsxd rax, dword arg(7) ; Pitch
707 add arg(0), rax ; Start += Pitch
708 sub dword arg(6), 1 ; Height -= 1
708 sub dword arg(6), 1 ; Height -= 1