;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

%macro LF_ABS 2
    ; %1 value not preserved
    ; %2 value preserved
    ; output in %1
    movdqa      scratch1, %2                ; v2

    psubusb     scratch1, %1                ; v2 - v1
    psubusb     %1, %2                      ; v1 - v2
    por         %1, scratch1                ; abs(v2 - v1)
%endmacro
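
; LF_ABS builds a per-byte absolute difference from unsigned saturating
; subtracts: (v2 -us v1) and (v1 -us v2) are each max(diff, 0) and at most
; one of them is nonzero, so OR-ing them gives |v1 - v2| in every lane.
; One lane as a scalar sketch (illustrative only):
;
;     uint8_t lf_abs(uint8_t v1, uint8_t v2) {
;         uint8_t a = v2 > v1 ? v2 - v1 : 0;  /* psubusb scratch1, %1 */
;         uint8_t b = v1 > v2 ? v1 - v2 : 0;  /* psubusb %1, %2       */
;         return a | b;                       /* por                  */
;     }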

%macro LF_FILTER_HEV_MASK 8-9

    LF_ABS      %1, %2                      ; abs(p3 - p2)
    LF_ABS      %2, %3                      ; abs(p2 - p1)
    pmaxub      %1, %2                      ; accumulate mask

    movdqa      scratch2, %3                ; save p1
    LF_ABS      scratch2, %4                ; abs(p1 - p0)

    LF_ABS      %4, %5                      ; abs(p0 - q0)
    LF_ABS      %5, %6                      ; abs(q0 - q1)

    pmaxub      %5, scratch2                ; accumulate hev

    pmaxub      %1, %5                      ; accumulate mask

    LF_ABS      %3, %6                      ; abs(p1 - q1)
    LF_ABS      %6, %7                      ; abs(q1 - q2)
    pmaxub      %1, %6                      ; accumulate mask
    LF_ABS      %7, %8                      ; abs(q2 - q3)
    pmaxub      %1, %7                      ; accumulate mask

    paddusb     %4, %4                      ; 2 * abs(p0 - q0)
    pand        %3, [GLOBAL(tfe)]
    psrlw       %3, 1                       ; abs(p1 - q1) / 2
    paddusb     %4, %3                      ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
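
    ; %4 now holds VP8's edge-strength metric,
    ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2, per byte; it is compared
    ; against blimit when the mask is built. SSE2 has no byte-wide shift,
    ; so the halving runs psrlw on 16-bit words after pand with tfe (0xfe)
    ; clears each byte's low bit, which keeps a neighbor's bit from
    ; shifting in: per byte, ((x & 0xfe) >> 1) == x / 2.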

    pcmpeqb     %1, zero                    ; mask

    pcmpeqb     %5, zero                    ; ~hev
%endmacro

%macro LF_FILTER 6
    ; %1-%4: p1, p0, q0, q1
    ; %5: mask
    ; %6: ~hev (from LF_FILTER_HEV_MASK)

    movdqa      scratch2, %6                ; save ~hev

    pxor        %1, [GLOBAL(t80)]           ; ps1
    pxor        %4, [GLOBAL(t80)]           ; qs1
    movdqa      scratch1, %1                ; ps1
    psubsb      scratch1, %4                ; signed_char_clamp(ps1 - qs1)
    pandn       scratch2, scratch1          ; vp8_filter &= hev

    pxor        %2, [GLOBAL(t80)]           ; ps0
    pxor        %3, [GLOBAL(t80)]           ; qs0
    movdqa      scratch1, %3                ; qs0
    psubsb      scratch1, %2                ; qs0 - ps0
    paddsb      scratch2, scratch1          ; vp8_filter += (qs0 - ps0)
    paddsb      scratch2, scratch1          ; vp8_filter += (qs0 - ps0)
    paddsb      scratch2, scratch1          ; vp8_filter += (qs0 - ps0)
    pand        %5, scratch2                ; &= mask
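
    ; scratch2 now equals the C reference filter value on 0x80-biased
    ; pixels (x ^ 0x80 maps [0,255] to [-128,127], so paddsb/psubsb give
    ; signed_char_clamp for free). One lane as a sketch (illustrative
    ; only):
    ;
    ;     int8_t f = clamp(ps1 - qs1) & hev;  /* inner taps gated by hev */
    ;     f = clamp(f + (qs0 - ps0));         /* three clamped adds      */
    ;     f = clamp(f + (qs0 - ps0));         /*   accumulate            */
    ;     f = clamp(f + (qs0 - ps0)) & mask;  /*   3 * (qs0 - ps0)       */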

    paddsb      %5, [GLOBAL(t4)]            ; Filter1
    paddsb      scratch2, [GLOBAL(t3)]      ; Filter2

    ; Filter1 >> 3
    movdqa      scratch1, zero
    pcmpgtb     scratch1, %5                ; which bytes are negative
    psrlw       %5, 3
    pand        scratch1, [GLOBAL(te0)]     ; preserve the upper 3 bits
    pand        %5, [GLOBAL(t1f)]           ; keep the lower 5 bits
    por         %5, scratch1                ; Filter1 >> 3

    psubsb      %3, %5                      ; qs0 - Filter1
    pxor        %3, [GLOBAL(t80)]

    ; Filter2 >> 3
    movdqa      scratch1, zero
    pcmpgtb     scratch1, scratch2
    psrlw       scratch2, 3
    pand        scratch1, [GLOBAL(te0)]
    pand        scratch2, [GLOBAL(t1f)]
    por         scratch2, scratch1          ; Filter2 >> 3
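
    ; SSE2 also lacks a per-byte arithmetic shift, so ">> 3" is
    ; synthesized: psrlw shifts whole words, t1f (0x1f) keeps each byte's
    ; five valid magnitude bits, and the pcmpgtb/te0 (0xe0) pair restores
    ; three copies of the sign bit. Per byte (illustrative only):
    ;
    ;     int8_t sra3 = (int8_t)((((uint8_t)x >> 3) & 0x1f)
    ;                            | (x < 0 ? 0xe0 : 0x00));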

    paddsb      %2, scratch2                ; ps0 + Filter2
    pxor        %2, [GLOBAL(t80)]

    ; outer tap adjustments
    paddsb      %5, [GLOBAL(t1)]            ; Filter1 + 1
    movdqa      scratch1, zero
    pcmpgtb     scratch1, %5                ; which bytes are negative
    psrlw       %5, 1
    pand        scratch1, [GLOBAL(t80)]     ; preserve the sign bit
    pand        %5, [GLOBAL(t7f)]           ; keep the lower 7 bits
    por         %5, scratch1                ; (Filter1 + 1) >> 1

    pand        %5, %6                      ; vp8_filter &= ~hev

    psubsb      %4, %5                      ; qs1 - vp8_filter
    pxor        %4, [GLOBAL(t80)]

    paddsb      %1, %5                      ; ps1 + vp8_filter
    pxor        %1, [GLOBAL(t80)]
%endmacro
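
; LF_FILTER is VP8's normal (non-macroblock-edge) loop filter applied to
; 16 lanes at once. With f the masked filter value built above, one lane
; corresponds to this reference sketch (illustrative only; clamp() is
; signed 8-bit saturation):
;
;     int8_t F1 = clamp(f + 4) >> 3;          /* Filter1, via t4 */
;     int8_t F2 = clamp(f + 3) >> 3;          /* Filter2, via t3 */
;     qs0 = clamp(qs0 - F1);  ps0 = clamp(ps0 + F2);
;     int8_t u = ((F1 + 1) >> 1) & ~hev;      /* outer tap, rounded via t1 */
;     qs1 = clamp(qs1 - u);   ps1 = clamp(ps1 + u);
;
; all on 0x80-biased values; the final pxor t80 steps restore unsigned
; pixels.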

;void vp8_loop_filter_bh_y_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
sym(vp8_loop_filter_bh_y_sse2):

%if LIBVPX_YASM_WIN64
    %define src    rcx ; src_ptr
    %define stride rdx ; src_pixel_step
%else
    %define src    rdi ; src_ptr
    %define stride rsi ; src_pixel_step
%endif

    %define scratch1 xmm5
    %define scratch2 xmm6

    %define i0  [src]
    %define i1  [spp]
    %define i2  [src + 2 * stride]
    %define i3  [spp + 2 * stride]
    %define i4  [src + 4 * stride]
    %define i5  [spp + 4 * stride]
    %define i6  [src + 2 * stride3]
    %define i7  [spp + 2 * stride3]
    %define i8  [src + 8 * stride]
    %define i9  [spp + 8 * stride]
    %define i10 [src + 2 * stride5]
    %define i11 [spp + 2 * stride5]
    %define i12 [src + 4 * stride3]
    %define i13 [spp + 4 * stride3]
    %define i14 [src + 2 * stride7]
    %define i15 [spp + 2 * stride7]

    lea         spp, [src + stride]
    lea         stride3, [stride + 2 * stride]
    lea         stride5, [stride3 + 2 * stride]
    lea         stride7, [stride3 + 4 * stride]
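
    ; with spp = src + stride, even-numbered rows are addressed off src
    ; and odd rows off spp, so all of i0..i15 (src + 0..15 lines) are
    ; reachable with the hardware's 1/2/4/8 scale factors and a few
    ; lea-built helpers, e.g. i7 = spp + 2 * stride3 = src + 7 * stride.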

    ; load the first set into registers
    movdqa      xmm0, i0                    ; p3
    movdqa      xmm1, i1                    ; p2
    movdqa      xmm2, i2                    ; p1
    movdqa      xmm3, i3                    ; p0
    movdqa      xmm4, i4                    ; q0
    movdqa      xmm8, i5                    ; q1
    movdqa      xmm9, i6                    ; q2, will contain abs(p1-p0)
    movdqa      xmm10, i7                   ; q3
    LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10

    LF_FILTER   xmm1, xmm2, xmm3, xmm8, xmm0, xmm4

    ; second edge
    movdqa      xmm10, i10                  ; q2, will contain abs(p1-p0)
    LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9

    LF_FILTER   xmm0, xmm1, xmm4, xmm8, xmm3, xmm2

    ; third edge
    movdqa      xmm9, i14                   ; q2, will contain abs(p1-p0)
    LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10

    LF_FILTER   xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
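
    ; three passes, one per internal horizontal edge of the 16-row block
    ; (below rows 3, 7 and 11): each LF_FILTER_HEV_MASK / LF_FILTER pair
    ; reads p3..q3 around its edge, so the q-side rows of one pass are
    ; reused as the p-side rows of the next.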

%if LIBVPX_YASM_WIN64

;void vp8_loop_filter_bv_y_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
sym(vp8_loop_filter_bv_y_sse2):
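
    ; vertical edges: transpose the 16x16 block onto the stack so the same
    ; horizontal-edge filter can be applied, then transpose the filtered
    ; result back over the source rows.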

%if LIBVPX_YASM_WIN64
    %define src    rcx ; src_ptr
    %define stride rdx ; src_pixel_step
%else
    %define src    rdi ; src_ptr
    %define stride rsi ; src_pixel_step
%endif

    %define scratch1 xmm5
    %define scratch2 xmm6

    ; source rows, in place in the frame
    %define s0  [src]
    %define s1  [spp]
    %define s2  [src + 2 * stride]
    %define s3  [spp + 2 * stride]
    %define s4  [src + 4 * stride]
    %define s5  [spp + 4 * stride]
    %define s6  [src + 2 * stride3]
    %define s7  [spp + 2 * stride3]
    %define s8  [src + 8 * stride]
    %define s9  [spp + 8 * stride]
    %define s10 [src + 2 * stride5]
    %define s11 [spp + 2 * stride5]
    %define s12 [src + 4 * stride3]
    %define s13 [spp + 4 * stride3]
    %define s14 [src + 2 * stride7]
    %define s15 [spp + 2 * stride7]

    ; transposed rows, in the stack buffer
    %define i0  [rsp]
    %define i1  [rsp + 16]
    %define i2  [rsp + 32]
    %define i3  [rsp + 48]
    %define i4  [rsp + 64]
    %define i5  [rsp + 80]
    %define i6  [rsp + 96]
    %define i7  [rsp + 112]
    %define i8  [rsp + 128]
    %define i9  [rsp + 144]
    %define i10 [rsp + 160]
    %define i11 [rsp + 176]
    %define i12 [rsp + 192]
    %define i13 [rsp + 208]
    %define i14 [rsp + 224]
    %define i15 [rsp + 240]

    ; reserve stack space
    %define temp_storage 0 ; size is 256 (16*16)
    %define stack_size   256
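
    ; i0..i15 above name those 256 bytes as 16 transposed rows, one
    ; 16-byte slot per row at rsp + 16 * n.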

    lea         spp, [src + stride]
    lea         stride3, [stride + 2 * stride]
    lea         stride5, [stride3 + 2 * stride]
    lea         stride7, [stride3 + 4 * stride]

    ; transpose the second half of the rows (8 - f) first
    punpcklbw   xmm0, s9                    ; 80 90
    punpckhbw   xmm1, s9                    ; 88 98

    punpcklbw   xmm2, s11                   ; a0 b0
    punpckhbw   xmm3, s11                   ; a8 b8

    punpcklwd   xmm0, xmm2                  ; 80 90 a0 b0
    punpckhwd   xmm4, xmm2                  ; 84 94 a4 b4

    punpcklwd   xmm1, xmm3                  ; 88 98 a8 b8
    punpckhwd   xmm2, xmm3                  ; 8c 9c ac bc

    ; work on next 4 rows
    punpcklbw   xmm3, s13                   ; c0 d0
    punpckhbw   xmm5, s13                   ; c8 d8

    punpcklbw   xmm6, s15                   ; e0 f0
    punpckhbw   xmm7, s15                   ; e8 f8

    punpcklwd   xmm3, xmm6                  ; c0 d0 e0 f0
    punpckhwd   xmm8, xmm6                  ; c4 d4 e4 f4

    punpcklwd   xmm5, xmm7                  ; c8 d8 e8 f8
    punpckhwd   xmm6, xmm7                  ; cc dc ec fc

    ; pull the third and fourth sets together
    punpckldq   xmm0, xmm3                  ; 80 90 a0 b0 c0 d0 e0 f0
    punpckhdq   xmm7, xmm3                  ; 82 92 a2 b2 c2 d2 e2 f2

    punpckldq   xmm4, xmm8                  ; 84 94 a4 b4 c4 d4 e4 f4
    punpckhdq   xmm3, xmm8                  ; 86 96 a6 b6 c6 d6 e6 f6

    punpckldq   xmm1, xmm5                  ; 88 98 a8 b8 c8 d8 e8 f8
    punpckhdq   xmm8, xmm5                  ; 8a 9a aa ba ca da ea fa

    punpckldq   xmm2, xmm6                  ; 8c 9c ac bc cc dc ec fc
    punpckhdq   xmm5, xmm6                  ; 8e 9e ae be ce de ee fe
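
    ; classic SSE2 byte transpose: punpck(l/h)bw interleaves the bytes of
    ; two rows, punpck(l/h)wd then merges 16-bit pairs and punpck(l/h)dq
    ; 32-bit groups, so after three rounds a register holds the same byte
    ; position of eight different rows. First round for two 4-byte rows
    ; (illustrative only):
    ;
    ;     row8 = 80 81 82 83        row9 = 90 91 92 93
    ;     punpcklbw row8, row9  ->  80 90 81 91 82 92 83 93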

    ; save the calculations. we only have 15 registers ...

    ; now the first half of the rows (0 - 7)
    punpcklbw   xmm0, s1                    ; 00 10
    punpckhbw   xmm1, s1                    ; 08 18

    punpcklbw   xmm2, s3                    ; 20 30
    punpckhbw   xmm3, s3                    ; 28 38

    punpcklwd   xmm0, xmm2                  ; 00 10 20 30
    punpckhwd   xmm4, xmm2                  ; 04 14 24 34

    punpcklwd   xmm1, xmm3                  ; 08 18 28 38
    punpckhwd   xmm2, xmm3                  ; 0c 1c 2c 3c

    ; work on next 4 rows
    punpcklbw   xmm3, s5                    ; 40 50
    punpckhbw   xmm5, s5                    ; 48 58

    punpcklbw   xmm6, s7                    ; 60 70
    punpckhbw   xmm7, s7                    ; 68 78

    punpcklwd   xmm3, xmm6                  ; 40 50 60 70
    punpckhwd   xmm8, xmm6                  ; 44 54 64 74

    punpcklwd   xmm5, xmm7                  ; 48 58 68 78
    punpckhwd   xmm6, xmm7                  ; 4c 5c 6c 7c

    ; pull the first two sets together
    punpckldq   xmm0, xmm3                  ; 00 10 20 30 40 50 60 70
    punpckhdq   xmm7, xmm3                  ; 02 12 22 32 42 52 62 72

    punpckldq   xmm4, xmm8                  ; 04 14 24 34 44 54 64 74
    punpckhdq   xmm3, xmm8                  ; 06 16 26 36 46 56 66 76

    punpckldq   xmm1, xmm5                  ; 08 18 28 38 48 58 68 78
    punpckhdq   xmm8, xmm5                  ; 0a 1a 2a 3a 4a 5a 6a 7a

    punpckldq   xmm2, xmm6                  ; 0c 1c 2c 3c 4c 5c 6c 7c
    punpckhdq   xmm5, xmm6                  ; 0e 1e 2e 3e 4e 5e 6e 7e

    ; TRANSPOSED DATA AVAILABLE ON THE STACK

    LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11

    LF_FILTER   xmm1, xmm2, xmm8, xmm9, xmm0, xmm4

    ; second edge
    movdqa      xmm10, i10                  ; q2, will contain abs(p1-p0)
    LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3

    LF_FILTER   xmm0, xmm1, xmm3, xmm4, xmm8, xmm2

    ; third edge
    movdqa      xmm9, i14                   ; q2, will contain abs(p1-p0)
    LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10

    LF_FILTER   xmm0, xmm1, xmm4, xmm8, xmm3, xmm2

    ; RESHUFFLE AND WRITE OUT
    punpcklbw   xmm0, i9                    ; 80 90
    punpckhbw   xmm1, i9                    ; 88 98

    punpcklbw   xmm2, i11                   ; a0 b0
    punpckhbw   xmm3, i11                   ; a8 b8

    punpcklwd   xmm0, xmm2                  ; 80 90 a0 b0
    punpckhwd   xmm4, xmm2                  ; 84 94 a4 b4

    punpcklwd   xmm1, xmm3                  ; 88 98 a8 b8
    punpckhwd   xmm2, xmm3                  ; 8c 9c ac bc

    ; work on next 4 rows
    punpcklbw   xmm3, i13                   ; c0 d0
    punpckhbw   xmm5, i13                   ; c8 d8

    punpcklbw   xmm6, i15                   ; e0 f0
    punpckhbw   xmm7, i15                   ; e8 f8

    punpcklwd   xmm3, xmm6                  ; c0 d0 e0 f0
    punpckhwd   xmm8, xmm6                  ; c4 d4 e4 f4

    punpcklwd   xmm5, xmm7                  ; c8 d8 e8 f8
    punpckhwd   xmm6, xmm7                  ; cc dc ec fc

    ; pull the third and fourth sets together
    punpckldq   xmm0, xmm3                  ; 80 90 a0 b0 c0 d0 e0 f0
    punpckhdq   xmm7, xmm3                  ; 82 92 a2 b2 c2 d2 e2 f2

    punpckldq   xmm4, xmm8                  ; 84 94 a4 b4 c4 d4 e4 f4
    punpckhdq   xmm3, xmm8                  ; 86 96 a6 b6 c6 d6 e6 f6

    punpckldq   xmm1, xmm5                  ; 88 98 a8 b8 c8 d8 e8 f8
    punpckhdq   xmm8, xmm5                  ; 8a 9a aa ba ca da ea fa

    punpckldq   xmm2, xmm6                  ; 8c 9c ac bc cc dc ec fc
    punpckhdq   xmm5, xmm6                  ; 8e 9e ae be ce de ee fe

    ; save the calculations. we only have 15 registers ...

    ; now the first half of the rows (0 - 7)
    punpcklbw   xmm0, i1                    ; 00 10
    punpckhbw   xmm1, i1                    ; 08 18

    punpcklbw   xmm2, i3                    ; 20 30
    punpckhbw   xmm3, i3                    ; 28 38

    punpcklwd   xmm0, xmm2                  ; 00 10 20 30
    punpckhwd   xmm4, xmm2                  ; 04 14 24 34

    punpcklwd   xmm1, xmm3                  ; 08 18 28 38
    punpckhwd   xmm2, xmm3                  ; 0c 1c 2c 3c

    ; work on next 4 rows
    punpcklbw   xmm3, i5                    ; 40 50
    punpckhbw   xmm5, i5                    ; 48 58

    punpcklbw   xmm6, i7                    ; 60 70
    punpckhbw   xmm7, i7                    ; 68 78

    punpcklwd   xmm3, xmm6                  ; 40 50 60 70
    punpckhwd   xmm8, xmm6                  ; 44 54 64 74

    punpcklwd   xmm5, xmm7                  ; 48 58 68 78
    punpckhwd   xmm6, xmm7                  ; 4c 5c 6c 7c

    ; pull the first two sets together
    punpckldq   xmm0, xmm3                  ; 00 10 20 30 40 50 60 70
    punpckhdq   xmm7, xmm3                  ; 02 12 22 32 42 52 62 72

    punpckldq   xmm4, xmm8                  ; 04 14 24 34 44 54 64 74
    punpckhdq   xmm3, xmm8                  ; 06 16 26 36 46 56 66 76

    punpckldq   xmm1, xmm5                  ; 08 18 28 38 48 58 68 78
    punpckhdq   xmm8, xmm5                  ; 0a 1a 2a 3a 4a 5a 6a 7a

    punpckldq   xmm2, xmm6                  ; 0c 1c 2c 3c 4c 5c 6c 7c
    punpckhdq   xmm5, xmm6                  ; 0e 1e 2e 3e 4e 5e 6e 7e

    punpckhqdq  xmm10, i10
    punpckhqdq  xmm11, i11
    punpckhqdq  xmm12, i12
    punpckhqdq  xmm13, i13
    punpckhqdq  xmm14, i14
    punpckhqdq  xmm15, i15
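
    ; each punpckhqdq glues the high qword of a register (one half of an
    ; output line) to the high qword of the matching stack row (the other
    ; half), rebuilding full 16-pixel lines for the stores back to the
    ; frame; the low halves are combined the same way with punpcklqdq.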

%if LIBVPX_YASM_WIN64