2 ; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
11 %include "third_party/x86inc/x86inc.asm"
16 ; %define USE_PMULHRSW
17 ; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss
18 ; when using this instruction.
; Size in bytes of the stack scratch area that the cglobal declarations in this
; file request for their local variables. Two values are defined back to back
; (four or six 16-byte slots); the conditional that picks one of them is not
; visible here -- NOTE(review): confirm which configuration gets which size.
; SETUP_LOCAL_VARS addresses slots up to [rsp + 16*5], which needs the 16*6 form.
22 %define LOCAL_VARS_SIZE 16*4
24 %define LOCAL_VARS_SIZE 16*6
; SETUP_LOCAL_VARS (no parameters)
; Names the 16-byte stack slots used for the filter constants, fills m0..m3
; with broadcast copies of the four low 16-bit words of m4, and prepares the
; pw_64 constant (referred to as krd).
; Several lines of this macro -- including the conditionals that choose between
; the alternative krd set-ups below and the end of the macro -- are not visible
; in this excerpt.
27 %macro SETUP_LOCAL_VARS 0
28 ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
29 ; pmaddubsw has a higher latency on some platforms, this might be eased by
30 ; interleaving the instructions.
; Stack slots 0..3: one 16-byte slot per coefficient-pair constant.
31 %define k0k1 [rsp + 16*0]
32 %define k2k3 [rsp + 16*1]
33 %define k4k5 [rsp + 16*2]
34 %define k6k7 [rsp + 16*3]
36 ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
; (the sentence above is cut off in this excerpt; presumably it ends with
; "some platforms", which would explain the pshuflw broadcasts -- TODO confirm)
; Each pshuflw copies one 16-bit word of m4 into all four low words of its
; destination: word 0 -> m0, word 1 -> m1, word 2 -> m2, word 3 -> m3.
; Presumably each word packs two adjacent 8-bit filter taps, as the k0_k1
; style comments suggest -- TODO confirm against the code that fills m4.
38 pshuflw m0, m4, 0b ;k0_k1
39 pshuflw m1, m4, 01010101b ;k2_k3
40 pshuflw m2, m4, 10101010b ;k4_k5
41 pshuflw m3, m4, 11111111b ;k6_k7
; Variant 1: krd is loaded from the pw_64 table entry. For this line krd must
; already name a register or slot defined on a line that is not shown here.
53 mova krd, [GLOBAL(pw_64)]
; Variant 2: tmp and krd live in stack slots 4 and 5, so this variant needs
; LOCAL_VARS_SIZE to be 16*6.
55 %define tmp [rsp + 16*4]
56 %define krd [rsp + 16*5]
; m6 = pw_64, read from the global constant table.
58 mova m6, [GLOBAL(pw_64)]
60 ; build constants without accessing global memory
61 pcmpeqb m6, m6 ;all ones
; Shifting left by 6 yields 64 per word only if every word is 1 at this
; point, so a step between these two lines (not shown in this excerpt) has to
; reduce the all-ones pattern to 1 -- TODO confirm.
63 psllw m6, 6 ;aka pw_64
78 pmaddubsw %2, k0k1k4k5
79 pmaddubsw m3, k2k3k6k7
97 ;-------------------------------------------------------------------------------
; SUBPIX_HFILTER4 suffix
; Emits the function filter_block1d4_<suffix> through cglobal.
;   %1 - suffix appended to the function name (this file passes h8_avg).
; cglobal arguments: 6 function arguments, 6 (+2 on x86-64) general-purpose
; registers, 11 SIMD registers, LOCAL_VARS_SIZE bytes of stack, and the
; argument names src, sstride, dst, dstride, height, filter.
; Only part of the body is visible in this excerpt; the conditionals that
; separate the two set-up variants below are among the missing lines.
98 %macro SUBPIX_HFILTER4 1
99 cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \
100 src, sstride, dst, dstride, height, filter
; Variant A: orig_height lives in r7 and the constants are written straight to
; the names k0k1k4k5 / k2k3k6k7 / krd, which must therefore name registers
; defined on lines not shown here -- TODO confirm.
107 %define orig_height r7
108 mova krd, [GLOBAL(pw_64)]
; pshuflw broadcasts one low word of m4 across the low half of the register;
; pshufhw then broadcasts one word of the high half across the high half, so
; each register carries two different coefficient pairs (low half / high half).
109 pshuflw k0k1k4k5, m4, 0b ;k0_k1
110 pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
111 pshuflw k2k3k6k7, m4, 01010101b ;k2_k3
112 pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
; Variant B: the same four values live in stack slots 0..3 instead.
114 %define k0k1k4k5 [rsp + 16*0]
115 %define k2k3k6k7 [rsp + 16*1]
116 %define krd [rsp + 16*2]
117 %define orig_height [rsp + 16*3]
; Build the combined constants in m6 / m7 first; the stores into the stack
; slots are presumably on lines not shown -- TODO confirm.
118 pshuflw m6, m4, 0b ;k0_k1
119 pshufhw m6, m6, 10101010b ;k0_k1_k4_k5
120 pshuflw m7, m4, 01010101b ;k2_k3
121 pshufhw m7, m7, 11111111b ;k2_k3_k6_k7
; m1 = pw_64, either loaded from the constant table ...
123 mova m1, [GLOBAL(pw_64)]
125 ; build constants without accessing global memory
126 pcmpeqb m1, m1 ;all ones
; ... or built in-register. The shift gives 64 only if every word is 1 here,
; so a step between these two lines must be missing from this excerpt --
; TODO confirm.
128 psllw m1, 6 ;aka pw_64
; Keep the full row count; it is read back below for the odd-height tail.
134 mov orig_height, heightq
; Loads from the row after srcq (srcq + sstrideq), at byte offsets -3 and +5.
142 movh m2, [srcq + sstrideq - 3]
143 movh m3, [srcq + sstrideq + 5]
; pmaddubsw multiplies the bytes of the two operands pairwise and adds
; adjacent products into 16-bit words (with signed saturation).
152 pmaddubsw m4, k0k1k4k5
154 pmaddubsw m1, k2k3k6k7
157 pmaddubsw m7, k0k1k4k5
159 pmaddubsw m3, k2k3k6k7
; Reads 4 bytes already present in the second destination row, then stores
; 4 result bytes to that row. Presumably the read feeds the averaging implied
; by the _avg suffix; the combining instruction is not visible -- TODO confirm.
191 movd m2, [dstq + dstrideq]
195 movd [dstq + dstrideq], m7
; Advance src by two rows (one lea per row) and dst by two rows, issuing
; prefetches for source rows further ahead.
197 lea srcq, [srcq + sstrideq ]
198 prefetcht0 [srcq + 4 * sstrideq - 3]
199 lea srcq, [srcq + sstrideq ]
200 lea dstq, [dstq + 2 * dstrideq ]
201 prefetcht0 [srcq + 2 * sstrideq - 3]
206 ; Do last row if output_height is odd
; Restore the saved row count; the parity test itself is not visible here.
207 mov heightq, orig_height
211 movh m0, [srcq - 3] ; load src
255 ;-------------------------------------------------------------------------------
; SUBPIX_HFILTER8 suffix
; Emits the function filter_block1d8_<suffix> through cglobal.
;   %1 - suffix appended to the function name (this file passes h8_avg).
; cglobal arguments: 6 function arguments, 6 (+1 on x86-64) general-purpose
; registers, 13 SIMD registers, LOCAL_VARS_SIZE bytes of stack, and the
; argument names src, sstride, dst, dstride, height, filter.
; Only part of the body is visible in this excerpt.
256 %macro SUBPIX_HFILTER8 1
257 cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*1), 13, LOCAL_VARS_SIZE, \
258 src, sstride, dst, dstride, height, filter
; Two alternative homes for the saved row count; the conditional choosing
; between them is not visible in this excerpt.
; NOTE(review): the cglobal line above asks for 6 + ARCH_X86_64*1
; general-purpose registers, i.e. at most r0..r6, yet orig_height is mapped to
; r7 here. Confirm that r7 may be clobbered without being saved on every
; x86-64 ABI this is built for (x86inc only preserves registers it is told
; are in use).
262 %define orig_height r7
; heightmp: presumably x86inc's memory-operand name for the height argument,
; so the count is kept in the argument's own stack slot -- TODO confirm.
264 %define orig_height heightmp
; Keep the full row count; it is read back below for the odd-height tail.
266 mov orig_height, heightq
; Loads from the row after srcq (srcq + sstrideq), at byte offsets -3 and +5.
272 movh m4, [srcq + sstrideq - 3]
273 movh m7, [srcq + sstrideq + 5]
; Reads what is already in the second destination row, then stores the result
; to that row. Presumably the read feeds the averaging implied by the _avg
; suffix; the combining instruction is not visible -- TODO confirm.
327 movh m2, [dstq + dstrideq]
332 movh [dstq + dstrideq], m6
; Advance src by two rows (one lea per row) and dst by two rows, issuing
; prefetches for source rows further ahead.
334 lea srcq, [srcq + sstrideq ]
335 prefetcht0 [srcq + 4 * sstrideq - 3]
336 lea srcq, [srcq + sstrideq ]
337 lea dstq, [dstq + 2 * dstrideq ]
338 prefetcht0 [srcq + 2 * sstrideq - 3]
342 ;Do last row if output_height is odd
; Restore the saved row count; the parity test itself is not visible here.
343 mov heightq, orig_height
; Invokes the HORIZx8_ROW helper macro (defined outside this excerpt) with
; m0..m4 as its register arguments.
351 HORIZx8_ROW m0, m1, m2, m3, m4
362 ;-------------------------------------------------------------------------------
; SUBPIX_HFILTER16 suffix
; Emits the function filter_block1d16_<suffix> through cglobal.
;   %1 - suffix appended to the function name (this file passes h8_avg).
; cglobal arguments: 6 function arguments, 6 general-purpose registers (the
; ARCH_X86_64 term is multiplied by 0, so no extras), 13 SIMD registers,
; LOCAL_VARS_SIZE bytes of stack, and the argument names src, sstride, dst,
; dstride, height, filter.
; Almost all of the body is missing from this excerpt.
363 %macro SUBPIX_HFILTER16 1
364 cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 13, LOCAL_VARS_SIZE, \
365 src, sstride, dst, dstride, height, filter
; Prefetch the source row two strides ahead, starting 3 bytes before srcq's
; column.
369 prefetcht0 [srcq + 2 * sstrideq -3]
; Advance src and dst by one row each.
427 lea srcq, [srcq + sstrideq]
429 lea dstq, [dstq + dstrideq]
; Instantiate the horizontal filter macros with the h8_avg suffix, producing
; filter_block1d16_h8_avg, filter_block1d8_h8_avg and filter_block1d4_h8_avg
; (cglobal may decorate these names further; its configuration is not visible
; in this excerpt).
437 SUBPIX_HFILTER16 h8_avg
439 SUBPIX_HFILTER8 h8_avg
441 SUBPIX_HFILTER4 h8_avg
443 ;-------------------------------------------------------------------------------
; SUBPIX_VFILTER suffix, width
; Emits the function filter_block1d<width>_<suffix> through cglobal.
;   %1 - suffix appended to the function name (this file passes v8_avg).
;   %2 - number placed after "filter_block1d" (this file passes 8 and 4).
; cglobal arguments: 6 function arguments, 6 (+3 on x86-64) general-purpose
; registers, 14 SIMD registers, LOCAL_VARS_SIZE bytes of stack, and the
; argument names src, sstride, dst, dstride, height, filter.
; Rows are tagged A..H in the comments; punpcklbw interleaves the low bytes of
; two rows so that vertically adjacent pixels sit next to each other.
; movx is defined outside this excerpt; presumably it is a load whose size
; depends on the width -- TODO confirm.
; Only part of the body is visible; the conditionals separating the register
; set-ups and the two load sequences are among the missing lines.
444 %macro SUBPIX_VFILTER 2
445 cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
446 src, sstride, dst, dstride, height, filter
; Set-up where dstride stays in its register.
452 %define dst_stride dstrideq
; Set-up that reuses argument registers: the filter register becomes the second
; source pointer and the dstride register becomes 6*sstride, so the destination
; stride is read through dstridemp instead (presumably the argument's memory
; copy -- TODO confirm).
454 %define src1q filterq
455 %define sstride6q dstrideq
456 %define dst_stride dstridemp
; sstride6q = sstride*5 + sstride = sstride*6.
460 lea sstride6q, [sstrideq + sstrideq * 4]
461 add sstride6q, sstrideq ;pitch * 6
; First load sequence. For D, F and H to be the rows that follow C, E and G,
; src1q has to equal srcq + sstrideq; that assignment is not visible here --
; TODO confirm.
470 movx m1, [srcq + sstrideq ] ;B
471 punpcklbw m0, m1 ;A B
472 movx m2, [srcq + sstrideq * 2 ] ;C
475 movx m3, [src1q + sstrideq * 2] ;D
476 punpcklbw m2, m3 ;C D
478 movx m4, [srcq + sstrideq * 4 ] ;E
480 movx m5, [src1q + sstrideq * 4] ;F
481 punpcklbw m4, m5 ;E F
; The "next iter" pairs are the row pairs shifted down by one row (per the
; existing comments).
483 punpcklbw m1, m6 ;A B next iter
484 movx m6, [srcq + sstride6q ] ;G
485 punpcklbw m5, m6 ;E F next iter
486 punpcklbw m3, m7 ;C D next iter
488 movx m7, [src1q + sstride6q ] ;H
489 punpcklbw m6, m7 ;G H
; Row at srcq + 8*sstride.
496 movx m6, [srcq + sstrideq * 8 ] ;H next iter
; Advance both source pointers by two rows.
514 lea srcq, [srcq + sstrideq * 2 ]
515 lea src1q, [src1q + sstrideq * 2]
; Second load sequence: the same rows, with loads and interleaves ordered
; differently.
538 movx m1, [srcq + sstrideq ] ;B
539 movx m6, [srcq + sstride6q ] ;G
540 punpcklbw m0, m1 ;A B
; NOTE(review): this load and the D load below address through raw rax, while
; the matching loads in the first sequence and the F load below use src1q.
; Confirm that rax holds the same pointer as src1q on this path.
541 movx m7, [rax + sstride6q ] ;H
543 movx m2, [srcq + sstrideq * 2 ] ;C
544 punpcklbw m6, m7 ;G H
545 movx m3, [rax + sstrideq * 2 ] ;D
547 movx m4, [srcq + sstrideq * 4 ] ;E
548 punpcklbw m2, m3 ;C D
549 movx m5, [src1q + sstrideq * 4] ;F
550 punpcklbw m4, m5 ;E F
571 ;-------------------------------------------------------------------------------
; SUBPIX_VFILTER16 suffix
; Emits the function filter_block1d16_<suffix> through cglobal.
;   %1 - suffix appended to the function name (this file passes v8_avg).
; cglobal arguments: 6 function arguments, 6 (+2 on x86-64) general-purpose
; registers, 13 SIMD registers, LOCAL_VARS_SIZE bytes of stack, and the
; argument names src, sstride, dst, dstride, height, filter.
; The visible body loads rows A..H at column offset 0 and then again at column
; offset +8, interleaving row pairs with punpcklbw.
; Only part of the body is visible in this excerpt.
572 %macro SUBPIX_VFILTER16 1
573 cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*2), 13, LOCAL_VARS_SIZE, \
574 src, sstride, dst, dstride, height, filter
; Set-up where dstride stays in its register.
581 %define dst_stride dstrideq
; Set-up that reuses argument registers: the filter register becomes the second
; source pointer and the dstride register becomes 6*sstride, so the destination
; stride is read through dstridemp instead (presumably the argument's memory
; copy -- TODO confirm).
583 %define src1q filterq
584 %define sstride6q dstrideq
585 %define dst_stride dstridemp
; sstride6q = sstride*5 + sstride = sstride*6.
589 lea sstride6q, [sstrideq + sstrideq * 4]
590 add sstride6q, sstrideq ;pitch * 6
; Column offset 0. For D, F and H to be the rows that follow C, E and G, src1q
; has to equal srcq + sstrideq; that assignment is not visible here --
; TODO confirm.
594 movh m1, [srcq + sstrideq ] ;B
595 movh m2, [srcq + sstrideq * 2 ] ;C
596 movh m3, [src1q + sstrideq * 2] ;D
597 movh m4, [srcq + sstrideq * 4 ] ;E
598 movh m5, [src1q + sstrideq * 4] ;F
600 punpcklbw m0, m1 ;A B
601 movh m6, [srcq + sstride6q] ;G
602 punpcklbw m2, m3 ;C D
603 movh m7, [src1q + sstride6q] ;H
604 punpcklbw m4, m5 ;E F
; Column offset +8: the same eight rows, eight bytes to the right. Registers
; are reused, so the pairs end up in m3 (A B), m7 (C D), m1 (E F), m2 (G H).
606 movh m3, [srcq + 8] ;A
608 punpcklbw m6, m7 ;G H
609 movh m5, [srcq + sstrideq + 8] ;B
611 punpcklbw m3, m5 ;A B
612 movh m7, [srcq + sstrideq * 2 + 8] ;C
615 movh m5, [src1q + sstrideq * 2 + 8] ;D
617 punpcklbw m7, m5 ;C D
621 movh m1, [srcq + sstrideq * 4 + 8] ;E
624 movh m6, [src1q + sstrideq * 4 + 8] ;F
625 punpcklbw m1, m6 ;E F
628 movh m2, [srcq + sstride6q + 8] ;G
630 movh m5, [src1q + sstride6q + 8] ;H
632 punpcklbw m2, m5 ;G H
; Instantiate the vertical filter macros with the v8_avg suffix, producing
; filter_block1d16_v8_avg, filter_block1d8_v8_avg and filter_block1d4_v8_avg
; (cglobal may decorate these names further; its configuration is not visible
; in this excerpt).
665 SUBPIX_VFILTER16 v8_avg
667 SUBPIX_VFILTER v8_avg, 8
669 SUBPIX_VFILTER v8_avg, 4