;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2019 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Henrik Gramner <henrik@gramner.com>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Fiona Glaser <fiona@x264.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************
; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.
;
; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org.
%include "vpx_config.asm"

%ifndef private_prefix
%define private_prefix vpx
%endif

%ifndef public_prefix
%define public_prefix private_prefix
%endif

%ifndef STACK_ALIGNMENT
%if ARCH_X86_64
%define STACK_ALIGNMENT 16
%else
%define STACK_ALIGNMENT 4
%endif
%endif
%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
%ifidn __OUTPUT_FORMAT__,win32
%define WIN64  1
%elifidn __OUTPUT_FORMAT__,win64
%define WIN64  1
%elifidn __OUTPUT_FORMAT__,x64
%define WIN64  1
%else
%define UNIX64 1
%endif
%endif
%define FORMAT_ELF 0
%define FORMAT_MACHO 0
%ifidn __OUTPUT_FORMAT__,elf
%define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf32
%define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf64
%define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,macho
%define FORMAT_MACHO 1
%elifidn __OUTPUT_FORMAT__,macho32
%define FORMAT_MACHO 1
%elifidn __OUTPUT_FORMAT__,macho64
%define FORMAT_MACHO 1
%endif
; Set PREFIX for libvpx builds.
%if FORMAT_ELF
%undef PREFIX
%elif WIN64
%undef PREFIX
%else
%define PREFIX
%endif

%ifdef PREFIX
%define mangle(x) _ %+ x
%else
%define mangle(x) x
%endif
; In some instances macho32 tables get misaligned when using .rodata.
; When looking at the disassembly it appears that the offset is either
; correct or consistently off by 90. Placing them in the .text section
; works around the issue. It appears to be specific to the way libvpx
; handles the tables.
%macro SECTION_RODATA 0-1 16
%ifidn __OUTPUT_FORMAT__,win32
SECTION .rdata align=%1
%elif WIN64
SECTION .rdata align=%1
%elifidn __OUTPUT_FORMAT__,macho32
SECTION .text align=%1
fakegot:
%elifidn __OUTPUT_FORMAT__,aout
SECTION .text
%else
SECTION .rodata align=%1
%endif
%endmacro
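
; Illustrative use (the table name below is hypothetical):
;     SECTION_RODATA 32
;     pw_1: times 16 dw 1
; emits the table into .rdata, .text or .rodata as appropriate for the
; current output format, aligned to 32 bytes.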
; PIC macros from vpx_ports/x86_abi_support.asm.
%ifidn __OUTPUT_FORMAT__,elf32
%define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,macho32
%define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,win32
%define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,aout
%define ABI_IS_32BIT 1
%else
%define ABI_IS_32BIT 0
%endif
%if ABI_IS_32BIT
%if CONFIG_PIC=1
%ifidn __OUTPUT_FORMAT__,elf32
%define GET_GOT_DEFINED 1
%define WRT_PLT wrt ..plt
%macro GET_GOT 1
extern _GLOBAL_OFFSET_TABLE_
push %1
call %%get_got
%%sub_offset:
jmp %%exitGG
%%get_got:
mov %1, [esp]
add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc
ret
%%exitGG:
%define GLOBAL(x) x + %1 wrt ..gotoff
%define RESTORE_GOT pop %1
%endmacro
%elifidn __OUTPUT_FORMAT__,macho32
%define GET_GOT_DEFINED 1
%macro GET_GOT 1
push %1
call %%get_got
%%get_got:
pop %1
%define GLOBAL(x) x + %1 - %%get_got
%define RESTORE_GOT pop %1
%endmacro
%endif
%else
%define GET_GOT_DEFINED 0
%endif
%else
%define GLOBAL(x) rel x
%define WRT_PLT wrt ..plt
%if WIN64
%define PIC
%elifidn __OUTPUT_FORMAT__,macho64
%define PIC
%elif CONFIG_PIC
%define PIC
%endif
%endif

%ifdef PIC
default rel
%endif

%ifndef GET_GOT_DEFINED
%define GET_GOT_DEFINED 0
%endif
; End PIC macros from vpx_ports/x86_abi_support.asm.
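
; Rough usage sketch of the PIC macros above (names are illustrative):
;     GET_GOT     r1                        ; 32-bit PIC: load GOT address into r1
;     movdqa      m0, [GLOBAL(some_table)]
;     RESTORE_GOT                           ; restores the register pushed by GET_GOT
; On 64-bit targets GLOBAL(x) simply expands to "rel x" and rip-relative
; addressing is used instead (default rel).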
%define HAVE_PRIVATE_EXTERN 1
%ifdef __NASM_VER__
%if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14
%define HAVE_PRIVATE_EXTERN 0
%endif
%endif
; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = (optional) stack size to be allocated. The stack will be aligned before
;      allocating the specified stack size. If the required stack alignment is
;      larger than the known stack alignment the stack will be manually aligned
;      and an extra register will be allocated to hold the original stack
;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
;      register as stack pointer, request a negative stack size.
; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,7,0x40, dst, src, tmp
; declares a function (foo) that automatically loads two arguments (dst and
; src) into registers, uses one additional register (tmp) plus 7 vector
; registers (m0-m6) and allocates 0x40 bytes of stack space.

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.
; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Use this instead of RET if it's a branch target.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size
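
; Putting the pieces together, a minimal function might look like this
; (a hypothetical example, not part of the abstraction layer itself):
;     INIT_XMM sse2
;     cglobal copy_16, 2,2,1, dst, src
;         movu  m0, [srcq]      ; srcq/dstq are the native-size argument regs
;         movu  [dstq], m0
;         RET
; PROLOGUE loads dst and src from the stack on x86_32, from rcx/rdx on
; win64, and from rdi/rsi on linux64, with no source changes.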
%macro DECLARE_REG 2-3
%define r%1q %2
%define r%1d %2d
%define r%1w %2w
%define r%1b %2b
%define r%1h %2h
%define %2q %2
%if %0 == 2
%define r%1m  %2d
%define r%1mp %2
%elif ARCH_X86_64 ; memory
%define r%1m [rstk + stack_offset + %3]
%define r%1mp qword r %+ %1 %+ m
%else
%define r%1m [rstk + stack_offset + %3]
%define r%1mp dword r %+ %1 %+ m
%endif
%define r%1 %2
%endmacro
%macro DECLARE_REG_SIZE 3
%define r%1q r%1
%define e%1q r%1
%define r%1d e%1
%define e%1d e%1
%define r%1w %1
%define e%1w %1
%define r%1h %3
%define e%1h %3
%define r%1b %2
%define e%1b %2
%if ARCH_X86_64 == 0
%define r%1 e%1
%endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null
; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
%assign %%i 0
%rep %0
CAT_XDEFINE t, %%i, r%1
%assign %%i %%i+1
%rotate 1
%endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
%rep %0
%define t%1q t%1 %+ q
%define t%1d t%1 %+ d
%define t%1w t%1 %+ w
%define t%1h t%1 %+ h
%define t%1b t%1 %+ b
%rotate 1
%endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
call $+5 ; special-cased to not affect the RSB on most CPUs

%macro PUSH 1
push %1
%ifidn rstk, rsp
%assign stack_offset stack_offset+gprsize
%endif
%endmacro

%macro POP 1
pop %1
%ifidn rstk, rsp
%assign stack_offset stack_offset-gprsize
%endif
%endmacro
%macro PUSH_IF_USED 1-*
%rep %0
%if %1 < regs_used
PUSH r%1
%endif
%rotate 1
%endrep
%endmacro

%macro POP_IF_USED 1-*
%rep %0
%if %1 < regs_used
pop r%1
%endif
%rotate 1
%endrep
%endmacro

%macro LOAD_IF_USED 1-*
%rep %0
%if %1 < num_args
mov r%1, r %+ %1 %+ mp
%endif
%rotate 1
%endrep
%endmacro

%macro SUB 2
sub %1, %2
%ifidn %1, rstk
%assign stack_offset stack_offset+(%2)
%endif
%endmacro

%macro ADD 2
add %1, %2
%ifidn %1, rstk
%assign stack_offset stack_offset-(%2)
%endif
%endmacro
%if ARCH_X86_64 == 0
%define movsxd movifnidn
%endif

%macro movsxdifnidn 2
%ifnidn %1, %2
movsxd %1, %2
%endif
%endmacro

%macro ASSERT 1
%if (%1) == 0
%error assertion ``%1'' failed
%endif
%endmacro
%macro DEFINE_ARGS 0-*
%ifdef n_arg_names
%assign %%i 0
%rep n_arg_names
CAT_UNDEF arg_name %+ %%i, q
CAT_UNDEF arg_name %+ %%i, d
CAT_UNDEF arg_name %+ %%i, w
CAT_UNDEF arg_name %+ %%i, h
CAT_UNDEF arg_name %+ %%i, b
CAT_UNDEF arg_name %+ %%i, m
CAT_UNDEF arg_name %+ %%i, mp
CAT_UNDEF arg_name, %%i
%assign %%i %%i+1
%endrep
%endif

%xdefine %%stack_offset stack_offset
%undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
%assign %%i 0
%rep %0
%xdefine %1q r %+ %%i %+ q
%xdefine %1d r %+ %%i %+ d
%xdefine %1w r %+ %%i %+ w
%xdefine %1h r %+ %%i %+ h
%xdefine %1b r %+ %%i %+ b
%xdefine %1m r %+ %%i %+ m
%xdefine %1mp r %+ %%i %+ mp
CAT_XDEFINE arg_name, %%i, %1
%assign %%i %%i+1
%rotate 1
%endrep
%xdefine stack_offset %%stack_offset
%assign n_arg_names %0
%endmacro
%define required_stack_alignment ((mmsize + 15) & ~15)
%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
%define high_mm_regs (16*cpuflag(avx512))
%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
%ifnum %1
%if %1 != 0
%assign %%pad 0
%assign stack_size %1
%if stack_size < 0
%assign stack_size -stack_size
%endif
%if WIN64
%assign %%pad %%pad + 32 ; shadow space
%if mmsize != 8
%assign xmm_regs_used %2
%if xmm_regs_used > 8
%assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
%endif
%endif
%endif
%if required_stack_alignment <= STACK_ALIGNMENT
; maintain the current stack alignment
%assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
SUB rsp, stack_size_padded
%else
%assign %%reg_num (regs_used - 1)
%xdefine rstk r %+ %%reg_num
; align stack, and save original stack location directly above
; it, i.e. in [rsp+stack_size_padded], so we can restore the
; stack in a single instruction (i.e. mov rsp, rstk or mov
; rsp, [rsp+stack_size_padded])
%if %1 < 0 ; need to store rsp on stack
%xdefine rstkm [rsp + stack_size + %%pad]
%assign %%pad %%pad + gprsize
%else ; can keep rsp in rstk during whole function
%xdefine rstkm rstk
%endif
%assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
mov rstk, rsp
and rsp, ~(required_stack_alignment-1)
sub rsp, stack_size_padded
movifnidn rstkm, rstk
%endif
%endif
%endif
%endmacro
%macro SETUP_STACK_POINTER 1
%ifnum %1
%if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
%if %1 > 0
; Reserve an additional register for storing the original stack pointer, but avoid using
; eax/rax for this purpose since it can potentially get overwritten as a return value.
%assign regs_used (regs_used + 1)
%if ARCH_X86_64 && regs_used == 7
%assign regs_used 8
%elif ARCH_X86_64 == 0 && regs_used == 1
%assign regs_used 2
%endif
%endif
%if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax)
; since it's used as a hidden argument in vararg functions to specify the number of vector registers used.
%assign regs_used 5 + UNIX64 * 3
%endif
%endif
%endif
%endmacro
%macro DEFINE_ARGS_INTERNAL 3+
%ifnum %2
DEFINE_ARGS %3
%elif %1 == 4
DEFINE_ARGS %2
%elif %1 > 4
DEFINE_ARGS %2, %3
%endif
%endmacro
%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0, rcx, 8
DECLARE_REG 1, rdx, 16
DECLARE_REG 2, R8, 24
DECLARE_REG 3, R9, 32
DECLARE_REG 4, R10, 40
DECLARE_REG 5, R11, 48
DECLARE_REG 6, rax, 56
DECLARE_REG 7, rdi, 64
DECLARE_REG 8, rsi, 72
DECLARE_REG 9, rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R14, 96
DECLARE_REG 12, R15, 104
DECLARE_REG 13, R12, 112
DECLARE_REG 14, R13, 120

%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
%assign regs_used %2
ASSERT regs_used >= num_args
SETUP_STACK_POINTER %4
ASSERT regs_used <= 15
PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
ALLOC_STACK %4, %3
%if mmsize != 8 && stack_size == 0
WIN64_SPILL_XMM %3
%endif
LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
%macro WIN64_PUSH_XMM 0
; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
%if xmm_regs_used > 6 + high_mm_regs
movaps [rstk + stack_offset + 8], xmm6
%endif
%if xmm_regs_used > 7 + high_mm_regs
movaps [rstk + stack_offset + 24], xmm7
%endif
%assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
%if %%xmm_regs_on_stack > 0
%assign %%i 8
%rep %%xmm_regs_on_stack
movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
%assign %%i %%i+1
%endrep
%endif
%endmacro
%macro WIN64_SPILL_XMM 1
%assign xmm_regs_used %1
ASSERT xmm_regs_used <= 16 + high_mm_regs
%assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
%if %%xmm_regs_on_stack > 0
; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
%assign %%pad %%xmm_regs_on_stack*16 + 32
%assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
SUB rsp, stack_size_padded
%endif
WIN64_PUSH_XMM
%endmacro
%macro WIN64_RESTORE_XMM_INTERNAL 0
%assign %%pad_size 0
%assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
%if %%xmm_regs_on_stack > 0
%assign %%i xmm_regs_used - high_mm_regs
%rep %%xmm_regs_on_stack
%assign %%i %%i-1
movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
%endrep
%endif
%if stack_size_padded > 0
%if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
mov rsp, rstkm
%else
add rsp, stack_size_padded
%assign %%pad_size stack_size_padded
%endif
%endif
%if xmm_regs_used > 7 + high_mm_regs
movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
%endif
%if xmm_regs_used > 6 + high_mm_regs
movaps xmm6, [rsp + stack_offset - %%pad_size + 8]
%endif
%endmacro
%macro WIN64_RESTORE_XMM 0
WIN64_RESTORE_XMM_INTERNAL
%assign stack_offset (stack_offset-stack_size_padded)
%assign stack_size_padded 0
%assign xmm_regs_used 0
%endmacro

%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs

%macro RET 0
WIN64_RESTORE_XMM_INTERNAL
POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
%if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
%endmacro
%elif ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0, rdi
DECLARE_REG 1, rsi
DECLARE_REG 2, rdx
DECLARE_REG 3, rcx
DECLARE_REG 4, R8
DECLARE_REG 5, R9
DECLARE_REG 6, rax, 8
DECLARE_REG 7, R10, 16
DECLARE_REG 8, R11, 24
DECLARE_REG 9, rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R14, 48
DECLARE_REG 12, R15, 56
DECLARE_REG 13, R12, 64
DECLARE_REG 14, R13, 72

%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
%assign regs_used %2
%assign xmm_regs_used %3
ASSERT regs_used >= num_args
SETUP_STACK_POINTER %4
ASSERT regs_used <= 15
PUSH_IF_USED 9, 10, 11, 12, 13, 14
ALLOC_STACK %4
LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required

%macro RET 0
%if stack_size_padded > 0
%if required_stack_alignment > STACK_ALIGNMENT
mov rsp, rstkm
%else
add rsp, stack_size_padded
%endif
%endif
POP_IF_USED 14, 13, 12, 11, 10, 9
%if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
%endmacro
%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

%macro DECLARE_ARG 1-*
%rep %0
%define r%1m [rstk + stack_offset + 4*%1 + 4]
%define r%1mp dword r%1m
%rotate 1
%endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
%assign regs_used %2
ASSERT regs_used >= num_args
%if num_args > 7
%assign num_args 7
%endif
%if regs_used > 7
%assign regs_used 7
%endif
SETUP_STACK_POINTER %4
ASSERT regs_used <= 7
PUSH_IF_USED 3, 4, 5, 6
ALLOC_STACK %4
LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required

%macro RET 0
%if stack_size_padded > 0
%if required_stack_alignment > STACK_ALIGNMENT
mov rsp, rstkm
%else
add rsp, stack_size_padded
%endif
%endif
POP_IF_USED 6, 5, 4, 3
%if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
%endmacro

%endif ;======================================================================
%if WIN64 == 0
%macro WIN64_SPILL_XMM 1
%endmacro
%macro WIN64_RESTORE_XMM 0
%endmacro
%macro WIN64_PUSH_XMM 0
%endmacro
%endif
; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
; a branch or a branch target. So switch to a 2-byte form of ret in that case.
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
%if has_epilogue || cpuflag(ssse3)
RET
%else
rep ret
%endif
annotate_function_size
%endmacro
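
; For example (illustrative), when a return is itself a branch target:
;     test  r2d, r2d
;     jz    .skip
;     ...
; .skip:
;     REP_RET    ; the jz above can land directly on this return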
%define last_branch_adr $$
%macro AUTO_REP_RET 0
%if notcpuflag(ssse3)
times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
%endif
ret
annotate_function_size
%endmacro
%macro BRANCH_INSTR 0-*
%rep %0
%macro %1 1-2 %1
%2 %1
%if notcpuflag(ssse3)
%%branch_instr equ $
%xdefine last_branch_adr %%branch_instr
%endif
%endmacro
%rotate 1
%endrep
%endmacro

BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
%if has_epilogue
call %1
RET
%elif %2
jmp %1
%endif
annotate_function_size
%endmacro
;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
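;
; For instance (a sketch; the function name is hypothetical), after
;     INIT_XMM sse2
;     cglobal sum_squares, 2,2,2
; the emitted symbol is vpx_sum_squares_sse2 (the private_prefix plus the
; cpuflags SUFFIX, with a leading underscore where PREFIX applies), and any
; later reference to sum_squares in this cpu context resolves to that name.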
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro
%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
%macro cglobal_internal 2-3+
annotate_function_size
%ifndef cglobaled_%2
%if %1
%xdefine %2 mangle(private_prefix %+ _ %+ %2)
%else
%xdefine %2 mangle(public_prefix %+ _ %+ %2)
%endif
%xdefine %2.skip_prologue %2 %+ .skip_prologue
CAT_XDEFINE cglobaled_, %2, 1
%endif
%xdefine current_function %2
%xdefine current_function_section __SECT__
%if FORMAT_ELF
%if %1
global %2:function hidden
%else
global %2:function
%endif
%elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1
global %2:private_extern
%else
global %2
%endif
align function_align
%2:
RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer
%xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
%assign stack_offset 0 ; stack pointer offset relative to the return address
%assign stack_size 0 ; amount of stack space that can be freely used inside a function
%assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
%assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
%ifnidn %3, ""
PROLOGUE %3
%endif
%endmacro
; Create a global symbol from a local label with the correct name mangling and type
%macro cglobal_label 1
%if FORMAT_ELF
global current_function %+ %1:function hidden
%elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
global current_function %+ %1:private_extern
%else
global current_function %+ %1
%endif
%1:
%endmacro
%macro cextern 1
%xdefine %1 mangle(private_prefix %+ _ %+ %1)
CAT_XDEFINE cglobaled_, %1, 1
extern %1
%endmacro
; like cextern, but without the prefix
%macro cextern_naked 1
%ifdef PREFIX
%xdefine %1 mangle(%1)
%endif
CAT_XDEFINE cglobaled_, %1, 1
extern %1
%endmacro
%macro const 1-2+
%xdefine %1 mangle(private_prefix %+ _ %+ %1)
%if FORMAT_ELF
global %1:data hidden
%elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
global %1:private_extern
%else
global %1
%endif
%1: %2
%endmacro
; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
%if FORMAT_ELF
[SECTION .note.GNU-stack noalloc noexec nowrite progbits]
%endif
; Tell debuggers how large the function was.
; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
; then its size might be unspecified.
%macro annotate_function_size 0
%if FORMAT_ELF
%ifdef current_function
%%ecf equ $
current_function_section
size current_function %%ecf - current_function
__SECT__
%endif
%endif
%endmacro
%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_lzcnt    (1<<7) | cpuflags_sse2
%assign cpuflags_sse3     (1<<8) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<9) | cpuflags_sse3
%assign cpuflags_sse4     (1<<10)| cpuflags_ssse3
%assign cpuflags_sse42    (1<<11)| cpuflags_sse4
%assign cpuflags_aesni    (1<<12)| cpuflags_sse42
%assign cpuflags_gfni     (1<<13)| cpuflags_sse42
%assign cpuflags_avx      (1<<14)| cpuflags_sse42
%assign cpuflags_xop      (1<<15)| cpuflags_avx
%assign cpuflags_fma4     (1<<16)| cpuflags_avx
%assign cpuflags_fma3     (1<<17)| cpuflags_avx
%assign cpuflags_bmi1     (1<<18)| cpuflags_avx|cpuflags_lzcnt
%assign cpuflags_bmi2     (1<<19)| cpuflags_bmi1
%assign cpuflags_avx2     (1<<20)| cpuflags_fma3|cpuflags_bmi2
%assign cpuflags_avx512   (1<<21)| cpuflags_avx2 ; F, CD, BW, DQ, VL

%assign cpuflags_cache32  (1<<22)
%assign cpuflags_cache64  (1<<23)
%assign cpuflags_aligned  (1<<24) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<25)
; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
%define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
%define notcpuflag(x) (cpuflag(x) ^ 1)
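
; How the cpuflag() expression works (a worked example): cpuflags & cpuflags_x
; keeps the bits required by x, and xor-ing with cpuflags_x yields zero only
; when every required bit is present; subtracting 1 from zero sets the high
; bits, so the shift and mask produce 1. If any bit is missing the
; intermediate value is nonzero and the result is 0. E.g. with
; cpuflags == cpuflags_ssse3, cpuflag(sse2) is 1 and cpuflag(avx) is 0.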
; Takes an arbitrary number of cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-*
%xdefine SUFFIX
%undef cpuname
%assign cpuflags 0

%if %0 >= 1
%rep %0
%ifdef cpuname
%xdefine cpuname cpuname %+ _%1
%else
%xdefine cpuname %1
%endif
%assign cpuflags cpuflags | cpuflags_%1
%rotate 1
%endrep
%xdefine SUFFIX _ %+ cpuname

%if cpuflag(avx)
%assign avx_enabled 1
%endif
%if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
%define mova movaps
%define movu movups
%define movnta movntps
%endif
%if cpuflag(aligned)
%define movu mova
%elif cpuflag(sse3) && notcpuflag(ssse3)
%define movu lddqu
%endif
%endif

%if ARCH_X86_64 || cpuflag(sse2)
%ifdef __NASM_VER__
ALIGNMODE p6
%else
CPU amdnop
%endif
%else
%ifdef __NASM_VER__
ALIGNMODE nop
%else
CPU basicnop
%endif
%endif
%endmacro
; Merge mmx, sse*, and avx*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
; (All 4 remain in sync through SWAP.)
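;
; Illustration of the mapping (not part of the definitions below):
;     INIT_XMM sse2    ; mmsize == 16: m0 == xmm0; xm0 == xmm0; ym0 == xmm0
;     INIT_YMM avx2    ; mmsize == 32: m0 == ymm0; xm0 == xmm0; ym0 == ymm0
; so size-generic code uses m#, while xm#/ym# give explicitly sized views of
; the same register number.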
%macro CAT_XDEFINE 3
%xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
%undef %1%2
%endmacro
%macro DEFINE_MMREGS 1 ; mmtype
%assign %%prev_mmregs 0
%ifdef num_mmregs
%assign %%prev_mmregs num_mmregs
%endif

%assign num_mmregs 8
%if ARCH_X86_64 && mmsize >= 16
%assign num_mmregs 16
%if cpuflag(avx512) || mmsize == 64
%assign num_mmregs 32
%endif
%endif

%assign %%i 0
%rep num_mmregs
CAT_XDEFINE m, %%i, %1 %+ %%i
CAT_XDEFINE nn%1, %%i, %%i
%assign %%i %%i+1
%endrep
%if %%prev_mmregs > num_mmregs
%rep %%prev_mmregs - num_mmregs
CAT_UNDEF m, %%i
CAT_UNDEF nn %+ mmtype, %%i
%assign %%i %%i+1
%endrep
%endif
%xdefine mmtype %1
%endmacro
; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
%if ARCH_X86_64 && cpuflag(avx512)
%assign %%i %1
%rep 16-%1
%assign %%i_high %%i+16
SWAP %%i, %%i_high
%assign %%i %%i+1
%endrep
%endif
%endmacro
%macro INIT_MMX 0-1+
%assign avx_enabled 0
%define RESET_MM_PERMUTATION INIT_MMX %1
%define mmsize 8
%define mova movq
%define movu movq
%define movh movd
%define movnta movntq
INIT_CPUFLAGS %1
DEFINE_MMREGS mm
%endmacro
%macro INIT_XMM 0-1+
%assign avx_enabled 0
%define RESET_MM_PERMUTATION INIT_XMM %1
%define mmsize 16
%define mova movdqa
%define movu movdqu
%define movh movq
%define movnta movntdq
INIT_CPUFLAGS %1
DEFINE_MMREGS xmm
%if WIN64
AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers
%endif
%endmacro
%macro INIT_YMM 0-1+
%assign avx_enabled 1
%define RESET_MM_PERMUTATION INIT_YMM %1
%define mmsize 32
%define mova movdqa
%define movu movdqu
%undef movh
%define movnta movntdq
INIT_CPUFLAGS %1
DEFINE_MMREGS ymm
AVX512_MM_PERMUTATION
%endmacro
%macro INIT_ZMM 0-1+
%assign avx_enabled 1
%define RESET_MM_PERMUTATION INIT_ZMM %1
%define mmsize 64
%define mova movdqa
%define movu movdqu
%undef movh
%define movnta movntdq
INIT_CPUFLAGS %1
DEFINE_MMREGS zmm
AVX512_MM_PERMUTATION
%endmacro
INIT_XMM

%macro DECLARE_MMCAST 1
%define mmmm%1 mm%1
%define mmxmm%1 mm%1
%define mmymm%1 mm%1
%define mmzmm%1 mm%1
%define xmmmm%1 mm%1
%define xmmxmm%1 xmm%1
%define xmmymm%1 xmm%1
%define xmmzmm%1 xmm%1
%define ymmmm%1 mm%1
%define ymmxmm%1 xmm%1
%define ymmymm%1 ymm%1
%define ymmzmm%1 ymm%1
%define zmmmm%1 mm%1
%define zmmxmm%1 xmm%1
%define zmmymm%1 ymm%1
%define zmmzmm%1 zmm%1
%define xm%1 xmm %+ m%1
%define ym%1 ymm %+ m%1
%define zm%1 zmm %+ m%1
%endmacro

%assign i 0
%rep 32
DECLARE_MMCAST i
%assign i i+1
%endrep
; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; registers.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.
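;
; A small illustration: after
;     SWAP 0, 1
; the names m0 and m1 are exchanged, so code that follows and writes to m0
; actually targets the register previously called m1; the "move" costs no
; instructions. PERMUTE 0,1, 1,0 expresses the same exchange as a pair list.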
%macro PERMUTE 2-* ; takes a list of pairs to swap
%rep %0/2
%xdefine %%tmp%2 m%2
%rotate 2
%endrep
%rep %0/2
%xdefine m%1 %%tmp%2
CAT_XDEFINE nn, m%1, %1
%rotate 2
%endrep
%endmacro
%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
%ifnum %1 ; SWAP 0, 1, ...
SWAP_INTERNAL_NUM %1, %2
%else ; SWAP m0, m1, ...
SWAP_INTERNAL_NAME %1, %2
%endif
%endmacro
%macro SWAP_INTERNAL_NUM 2-*
%rep %0-1
%xdefine %%tmp m%1
%xdefine m%1 m%2
%xdefine m%2 %%tmp
CAT_XDEFINE nn, m%1, %1
CAT_XDEFINE nn, m%2, %2
%rotate 1
%endrep
%endmacro
%macro SWAP_INTERNAL_NAME 2-*
%xdefine %%args nn %+ %1
%rep %0-1
%xdefine %%args %%args, nn %+ %2
%rotate 1
%endrep
SWAP_INTERNAL_NUM %%args
%endmacro
; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
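;
; Sketch of the intended pattern (function names hypothetical):
;     cglobal helper
;         ...
;         SWAP 0, 3               ; leave the result in the register named m0
;         SAVE_MM_PERMUTATION     ; record the permutation under "helper"
;         RET
; A later "call helper" (via the call macro below) reloads that permutation,
; so the caller's m0 refers to the same physical register as helper's m0.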
%macro SAVE_MM_PERMUTATION 0-1
%if %0
%xdefine %%f %1_m
%else
%xdefine %%f current_function %+ _m
%endif
%assign %%i 0
%rep num_mmregs
%xdefine %%tmp m %+ %%i
CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp
%assign %%i %%i+1
%endrep
%endmacro
%macro LOAD_MM_PERMUTATION 0-1 ; name to load from
%if %0
%xdefine %%f %1_m
%else
%xdefine %%f current_function %+ _m
%endif
%xdefine %%tmp %%f %+ 0
%ifnum %%tmp
RESET_MM_PERMUTATION
%assign %%i 0
%rep num_mmregs
%xdefine %%tmp %%f %+ %%i
CAT_XDEFINE %%m, %%i, m %+ %%tmp
%assign %%i %%i+1
%endrep
%rep num_mmregs
%assign %%i %%i-1
CAT_XDEFINE m, %%i, %%m %+ %%i
CAT_XDEFINE nn, m %+ %%i, %%i
%endrep
%endif
%endmacro
; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
%ifid %1
call_internal %1 %+ SUFFIX, %1
%else
call %1
%endif
%endmacro
%macro call_internal 2
%xdefine %%i %2
%ifndef cglobaled_%2
%ifdef cglobaled_%1
%xdefine %%i %1
%endif
%endif
call %%i
LOAD_MM_PERMUTATION %%i
%endmacro
; Substitutions that reduce instruction size but are functionally equivalent
;=============================================================================
; AVX abstraction layer
;=============================================================================
%assign i 0
%rep 32
%if i < 8
CAT_XDEFINE sizeofmm, i, 8
CAT_XDEFINE regnumofmm, i, i
%endif
CAT_XDEFINE sizeofxmm, i, 16
CAT_XDEFINE sizeofymm, i, 32
CAT_XDEFINE sizeofzmm, i, 64
CAT_XDEFINE regnumofxmm, i, i
CAT_XDEFINE regnumofymm, i, i
CAT_XDEFINE regnumofzmm, i, i
%assign i i+1
%endrep
%undef i
%macro CHECK_AVX_INSTR_EMU 3-*
%xdefine %%opcode %1
%xdefine %%dst %2
%rep %0-2
%ifidn %%dst, %3
%error non-avx emulation of ``%%opcode'' is not supported
%endif
%rotate 1
%endrep
%endmacro
;%1 == instruction
;%2 == minimal instruction set
;%3 == 1 if float, 0 if int
;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
;%6+: operands
%macro RUN_AVX_INSTR 6-9+
%ifnum sizeof%7
%assign __sizeofreg sizeof%7
%elifnum sizeof%6
%assign __sizeofreg sizeof%6
%else
%assign __sizeofreg mmsize
%endif
%assign __emulate_avx 0
%if avx_enabled && __sizeofreg >= 16
%xdefine __instr v%1
%else
%xdefine __instr %1
%if %0 >= 8+%4
%assign __emulate_avx 1
%endif
%endif
%ifdef cpuname
%if notcpuflag(%2)
%error use of ``%1'' %2 instruction in cpuname function: current_function
%elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2)
%error use of ``%1'' sse2 instruction in cpuname function: current_function
%elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2)
%error use of ``%1'' avx2 instruction in cpuname function: current_function
%elif __sizeofreg == 16 && notcpuflag(sse)
%error use of ``%1'' sse instruction in cpuname function: current_function
%elif __sizeofreg == 32 && notcpuflag(avx)
%error use of ``%1'' avx instruction in cpuname function: current_function
%elif __sizeofreg == 64 && notcpuflag(avx512)
%error use of ``%1'' avx512 instruction in cpuname function: current_function
%elifidn %1, pextrw ; special case because the base instruction is mmx2,
%ifnid %6 ; but sse4 is required for memory operands
%if notcpuflag(sse4)
%error use of ``%1'' sse4 instruction in cpuname function: current_function
%endif
%endif
%endif
%endif
; 3-operand AVX instructions with a memory arg can only have it in src2,
; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
; So, if the instruction is commutative with a memory arg, swap them.
CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9
CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2
%if __sizeofreg == 8
__instr %6, %7, %8, %9
%if avx_enabled && %5
%if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
; Most VEX-encoded instructions require an additional byte to encode when
; src2 is a high register (e.g. m8..15). If the instruction is commutative
; we can swap src1 and src2 when doing so reduces the instruction length.
__instr %6, __src1, __src2
%if avx_enabled && %5
%if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32
__instr %6, __src1, __src2
;%1 == instruction
;%2 == minimal instruction set
;%3 == 1 if float, 0 if int
;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 1-5 fnord, 0, 255, 0
%macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
%ifidn %2, fnord
RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
%elifidn %3, fnord
RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
%elifidn %4, fnord
RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
%elifidn %5, fnord
RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
%else
RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
%endif
%endmacro
%endmacro
; Instructions with both VEX/EVEX and legacy encodings
; Non-destructive instructions are written without parameters
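;
; An illustration of what this buys (not part of the table): written once as
;     paddw m0, m1, m2
; the same source assembles to "vpaddw xmm0, xmm1, xmm2" in AVX functions,
; and to the mov-based emulation
;     movdqa m0, m1
;     paddw  m0, m2
; when only SSE2 is available.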
AVX_INSTR addpd, sse2, 1, 0, 1
AVX_INSTR addps, sse, 1, 0, 1
AVX_INSTR addsd, sse2, 1, 0, 0
AVX_INSTR addss, sse, 1, 0, 0
AVX_INSTR addsubpd, sse3, 1, 0, 0
AVX_INSTR addsubps, sse3, 1, 0, 0
AVX_INSTR aesdec, aesni, 0, 0, 0
AVX_INSTR aesdeclast, aesni, 0, 0, 0
AVX_INSTR aesenc, aesni, 0, 0, 0
AVX_INSTR aesenclast, aesni, 0, 0, 0
AVX_INSTR aesimc, aesni
AVX_INSTR aeskeygenassist, aesni
AVX_INSTR andnpd, sse2, 1, 0, 0
AVX_INSTR andnps, sse, 1, 0, 0
AVX_INSTR andpd, sse2, 1, 0, 1
AVX_INSTR andps, sse, 1, 0, 1
AVX_INSTR blendpd, sse4, 1, 1, 0
AVX_INSTR blendps, sse4, 1, 1, 0
AVX_INSTR blendvpd, sse4 ; can't be emulated
AVX_INSTR blendvps, sse4 ; can't be emulated
AVX_INSTR cmpeqpd, sse2, 1, 0, 1
AVX_INSTR cmpeqps, sse, 1, 0, 1
AVX_INSTR cmpeqsd, sse2, 1, 0, 0
AVX_INSTR cmpeqss, sse, 1, 0, 0
AVX_INSTR cmplepd, sse2, 1, 0, 0
AVX_INSTR cmpleps, sse, 1, 0, 0
AVX_INSTR cmplesd, sse2, 1, 0, 0
AVX_INSTR cmpless, sse, 1, 0, 0
AVX_INSTR cmpltpd, sse2, 1, 0, 0
AVX_INSTR cmpltps, sse, 1, 0, 0
AVX_INSTR cmpltsd, sse2, 1, 0, 0
AVX_INSTR cmpltss, sse, 1, 0, 0
AVX_INSTR cmpneqpd, sse2, 1, 0, 1
AVX_INSTR cmpneqps, sse, 1, 0, 1
AVX_INSTR cmpneqsd, sse2, 1, 0, 0
AVX_INSTR cmpneqss, sse, 1, 0, 0
AVX_INSTR cmpnlepd, sse2, 1, 0, 0
AVX_INSTR cmpnleps, sse, 1, 0, 0
AVX_INSTR cmpnlesd, sse2, 1, 0, 0
AVX_INSTR cmpnless, sse, 1, 0, 0
AVX_INSTR cmpnltpd, sse2, 1, 0, 0
AVX_INSTR cmpnltps, sse, 1, 0, 0
AVX_INSTR cmpnltsd, sse2, 1, 0, 0
AVX_INSTR cmpnltss, sse, 1, 0, 0
AVX_INSTR cmpordpd, sse2, 1, 0, 1
AVX_INSTR cmpordps, sse, 1, 0, 1
AVX_INSTR cmpordsd, sse2, 1, 0, 0
AVX_INSTR cmpordss, sse, 1, 0, 0
AVX_INSTR cmppd, sse2, 1, 1, 0
AVX_INSTR cmpps, sse, 1, 1, 0
AVX_INSTR cmpsd, sse2, 1, 1, 0
AVX_INSTR cmpss, sse, 1, 1, 0
AVX_INSTR cmpunordpd, sse2, 1, 0, 1
AVX_INSTR cmpunordps, sse, 1, 0, 1
AVX_INSTR cmpunordsd, sse2, 1, 0, 0
AVX_INSTR cmpunordss, sse, 1, 0, 0
AVX_INSTR comisd, sse2, 1
AVX_INSTR comiss, sse, 1
AVX_INSTR cvtdq2pd, sse2, 1
AVX_INSTR cvtdq2ps, sse2, 1
AVX_INSTR cvtpd2dq, sse2, 1
AVX_INSTR cvtpd2ps, sse2, 1
AVX_INSTR cvtps2dq, sse2, 1
AVX_INSTR cvtps2pd, sse2, 1
AVX_INSTR cvtsd2si, sse2, 1
AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
AVX_INSTR cvtsi2ss, sse, 1, 0, 0
AVX_INSTR cvtss2sd, sse2, 1, 0, 0
AVX_INSTR cvtss2si, sse, 1
AVX_INSTR cvttpd2dq, sse2, 1
AVX_INSTR cvttps2dq, sse2, 1
AVX_INSTR cvttsd2si, sse2, 1
AVX_INSTR cvttss2si, sse, 1
AVX_INSTR divpd, sse2, 1, 0, 0
AVX_INSTR divps, sse, 1, 0, 0
AVX_INSTR divsd, sse2, 1, 0, 0
AVX_INSTR divss, sse, 1, 0, 0
AVX_INSTR dppd, sse4, 1, 1, 0
AVX_INSTR dpps, sse4, 1, 1, 0
AVX_INSTR extractps, sse4, 1
AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0
AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0
AVX_INSTR gf2p8mulb, gfni, 0, 0, 0
AVX_INSTR haddpd, sse3, 1, 0, 0
AVX_INSTR haddps, sse3, 1, 0, 0
AVX_INSTR hsubpd, sse3, 1, 0, 0
AVX_INSTR hsubps, sse3, 1, 0, 0
AVX_INSTR insertps, sse4, 1, 1, 0
AVX_INSTR lddqu, sse3
AVX_INSTR ldmxcsr, sse, 1
AVX_INSTR maskmovdqu, sse2
AVX_INSTR maxpd, sse2, 1, 0, 1
AVX_INSTR maxps, sse, 1, 0, 1
AVX_INSTR maxsd, sse2, 1, 0, 0
AVX_INSTR maxss, sse, 1, 0, 0
AVX_INSTR minpd, sse2, 1, 0, 1
AVX_INSTR minps, sse, 1, 0, 1
AVX_INSTR minsd, sse2, 1, 0, 0
AVX_INSTR minss, sse, 1, 0, 0
AVX_INSTR movapd, sse2, 1
AVX_INSTR movaps, sse, 1
AVX_INSTR movd, mmx
AVX_INSTR movddup, sse3, 1
AVX_INSTR movdqa, sse2
AVX_INSTR movdqu, sse2
AVX_INSTR movhlps, sse, 1, 0, 0
AVX_INSTR movhpd, sse2, 1, 0, 0
AVX_INSTR movhps, sse, 1, 0, 0
AVX_INSTR movlhps, sse, 1, 0, 0
AVX_INSTR movlpd, sse2, 1, 0, 0
AVX_INSTR movlps, sse, 1, 0, 0
AVX_INSTR movmskpd, sse2, 1
AVX_INSTR movmskps, sse, 1
AVX_INSTR movntdq, sse2
AVX_INSTR movntdqa, sse4
AVX_INSTR movntpd, sse2, 1
AVX_INSTR movntps, sse, 1
AVX_INSTR movq, mmx
AVX_INSTR movsd, sse2, 1, 0, 0
AVX_INSTR movshdup, sse3, 1
AVX_INSTR movsldup, sse3, 1
AVX_INSTR movss, sse, 1, 0, 0
AVX_INSTR movupd, sse2, 1
AVX_INSTR movups, sse, 1
AVX_INSTR mpsadbw, sse4, 0, 1, 0
AVX_INSTR mulpd, sse2, 1, 0, 1
AVX_INSTR mulps, sse, 1, 0, 1
AVX_INSTR mulsd, sse2, 1, 0, 0
AVX_INSTR mulss, sse, 1, 0, 0
AVX_INSTR orpd, sse2, 1, 0, 1
AVX_INSTR orps, sse, 1, 0, 1
AVX_INSTR pabsb, ssse3
AVX_INSTR pabsd, ssse3
AVX_INSTR pabsw, ssse3
AVX_INSTR packsswb, mmx, 0, 0, 0
AVX_INSTR packssdw, mmx, 0, 0, 0
AVX_INSTR packuswb, mmx, 0, 0, 0
AVX_INSTR packusdw, sse4, 0, 0, 0
AVX_INSTR paddb, mmx, 0, 0, 1
AVX_INSTR paddw, mmx, 0, 0, 1
AVX_INSTR paddd, mmx, 0, 0, 1
AVX_INSTR paddq, sse2, 0, 0, 1
AVX_INSTR paddsb, mmx, 0, 0, 1
AVX_INSTR paddsw, mmx, 0, 0, 1
AVX_INSTR paddusb, mmx, 0, 0, 1
AVX_INSTR paddusw, mmx, 0, 0, 1
AVX_INSTR palignr, ssse3, 0, 1, 0
AVX_INSTR pand, mmx, 0, 0, 1
AVX_INSTR pandn, mmx, 0, 0, 0
AVX_INSTR pavgb, mmx2, 0, 0, 1
AVX_INSTR pavgw, mmx2, 0, 0, 1
AVX_INSTR pblendvb, sse4 ; can't be emulated
AVX_INSTR pblendw, sse4, 0, 1, 0
AVX_INSTR pclmulqdq, fnord, 0, 1, 0
AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0
AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0
AVX_INSTR pcmpestri, sse42
AVX_INSTR pcmpestrm, sse42
AVX_INSTR pcmpistri, sse42
AVX_INSTR pcmpistrm, sse42
AVX_INSTR pcmpeqb, mmx, 0, 0, 1
AVX_INSTR pcmpeqw, mmx, 0, 0, 1
AVX_INSTR pcmpeqd, mmx, 0, 0, 1
AVX_INSTR pcmpeqq, sse4, 0, 0, 1
AVX_INSTR pcmpgtb, mmx, 0, 0, 0
AVX_INSTR pcmpgtw, mmx, 0, 0, 0
AVX_INSTR pcmpgtd, mmx, 0, 0, 0
AVX_INSTR pcmpgtq, sse42, 0, 0, 0
AVX_INSTR pextrb, sse4
AVX_INSTR pextrd, sse4
AVX_INSTR pextrq, sse4
AVX_INSTR pextrw, mmx2
AVX_INSTR phaddw, ssse3, 0, 0, 0
AVX_INSTR phaddd, ssse3, 0, 0, 0
AVX_INSTR phaddsw, ssse3, 0, 0, 0
AVX_INSTR phminposuw, sse4
AVX_INSTR phsubw, ssse3, 0, 0, 0
AVX_INSTR phsubd, ssse3, 0, 0, 0
AVX_INSTR phsubsw, ssse3, 0, 0, 0
AVX_INSTR pinsrb, sse4, 0, 1, 0
AVX_INSTR pinsrd, sse4, 0, 1, 0
AVX_INSTR pinsrq, sse4, 0, 1, 0
AVX_INSTR pinsrw, mmx2, 0, 1, 0
AVX_INSTR pmaddwd, mmx, 0, 0, 1
AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
AVX_INSTR pmaxsb, sse4, 0, 0, 1
AVX_INSTR pmaxsw, mmx2, 0, 0, 1
AVX_INSTR pmaxsd, sse4, 0, 0, 1
AVX_INSTR pmaxub, mmx2, 0, 0, 1
AVX_INSTR pmaxuw, sse4, 0, 0, 1
AVX_INSTR pmaxud, sse4, 0, 0, 1
AVX_INSTR pminsb, sse4, 0, 0, 1
AVX_INSTR pminsw, mmx2, 0, 0, 1
AVX_INSTR pminsd, sse4, 0, 0, 1
AVX_INSTR pminub, mmx2, 0, 0, 1
AVX_INSTR pminuw, sse4, 0, 0, 1
AVX_INSTR pminud, sse4, 0, 0, 1
AVX_INSTR pmovmskb, mmx2
AVX_INSTR pmovsxbw, sse4
AVX_INSTR pmovsxbd, sse4
AVX_INSTR pmovsxbq, sse4
AVX_INSTR pmovsxwd, sse4
AVX_INSTR pmovsxwq, sse4
AVX_INSTR pmovsxdq, sse4
AVX_INSTR pmovzxbw, sse4
AVX_INSTR pmovzxbd, sse4
AVX_INSTR pmovzxbq, sse4
AVX_INSTR pmovzxwd, sse4
AVX_INSTR pmovzxwq, sse4
AVX_INSTR pmovzxdq, sse4
AVX_INSTR pmuldq, sse4, 0, 0, 1
AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
AVX_INSTR pmulhuw, mmx2, 0, 0, 1
AVX_INSTR pmulhw, mmx, 0, 0, 1
AVX_INSTR pmullw, mmx, 0, 0, 1
AVX_INSTR pmulld, sse4, 0, 0, 1
AVX_INSTR pmuludq, sse2, 0, 0, 1
AVX_INSTR por, mmx, 0, 0, 1
AVX_INSTR psadbw, mmx2, 0, 0, 1
AVX_INSTR pshufb, ssse3, 0, 0, 0
AVX_INSTR pshufd, sse2
AVX_INSTR pshufhw, sse2
AVX_INSTR pshuflw, sse2
AVX_INSTR psignb, ssse3, 0, 0, 0
AVX_INSTR psignw, ssse3, 0, 0, 0
AVX_INSTR psignd, ssse3, 0, 0, 0
AVX_INSTR psllw, mmx, 0, 0, 0
AVX_INSTR pslld, mmx, 0, 0, 0
AVX_INSTR psllq, mmx, 0, 0, 0
AVX_INSTR pslldq, sse2, 0, 0, 0
AVX_INSTR psraw, mmx, 0, 0, 0
AVX_INSTR psrad, mmx, 0, 0, 0
AVX_INSTR psrlw, mmx, 0, 0, 0
AVX_INSTR psrld, mmx, 0, 0, 0
AVX_INSTR psrlq, mmx, 0, 0, 0
AVX_INSTR psrldq, sse2, 0, 0, 0
AVX_INSTR psubb, mmx, 0, 0, 0
AVX_INSTR psubw, mmx, 0, 0, 0
AVX_INSTR psubd, mmx, 0, 0, 0
AVX_INSTR psubq, sse2, 0, 0, 0
AVX_INSTR psubsb, mmx, 0, 0, 0
AVX_INSTR psubsw, mmx, 0, 0, 0
AVX_INSTR psubusb, mmx, 0, 0, 0
AVX_INSTR psubusw, mmx, 0, 0, 0
AVX_INSTR ptest, sse4
AVX_INSTR punpckhbw, mmx, 0, 0, 0
AVX_INSTR punpckhwd, mmx, 0, 0, 0
AVX_INSTR punpckhdq, mmx, 0, 0, 0
AVX_INSTR punpckhqdq, sse2, 0, 0, 0
AVX_INSTR punpcklbw, mmx, 0, 0, 0
AVX_INSTR punpcklwd, mmx, 0, 0, 0
AVX_INSTR punpckldq, mmx, 0, 0, 0
AVX_INSTR punpcklqdq, sse2, 0, 0, 0
AVX_INSTR pxor, mmx, 0, 0, 1
AVX_INSTR rcpps, sse, 1
AVX_INSTR rcpss, sse, 1, 0, 0
AVX_INSTR roundpd, sse4, 1
AVX_INSTR roundps, sse4, 1
AVX_INSTR roundsd, sse4, 1, 1, 0
AVX_INSTR roundss, sse4, 1, 1, 0
AVX_INSTR rsqrtps, sse, 1
AVX_INSTR rsqrtss, sse, 1, 0, 0
AVX_INSTR shufpd, sse2, 1, 1, 0
AVX_INSTR shufps, sse, 1, 1, 0
AVX_INSTR sqrtpd, sse2, 1
AVX_INSTR sqrtps, sse, 1
AVX_INSTR sqrtsd, sse2, 1, 0, 0
AVX_INSTR sqrtss, sse, 1, 0, 0
AVX_INSTR stmxcsr, sse, 1
AVX_INSTR subpd, sse2, 1, 0, 0
AVX_INSTR subps, sse, 1, 0, 0
AVX_INSTR subsd, sse2, 1, 0, 0
AVX_INSTR subss, sse, 1, 0, 0
AVX_INSTR ucomisd, sse2, 1
AVX_INSTR ucomiss, sse, 1
AVX_INSTR unpckhpd, sse2, 1, 0, 0
AVX_INSTR unpckhps, sse, 1, 0, 0
AVX_INSTR unpcklpd, sse2, 1, 0, 0
AVX_INSTR unpcklps, sse, 1, 0, 0
AVX_INSTR xorpd, sse2, 1, 0, 1
AVX_INSTR xorps, sse, 1, 0, 1
; 3DNow instructions, for sharing code between AVX, SSE and 3DNow
AVX_INSTR pfadd, 3dnow, 1, 0, 1
AVX_INSTR pfsub, 3dnow, 1, 0, 0
AVX_INSTR pfmul, 3dnow, 1, 0, 1
;%1 == instruction
;%2 == minimal instruction set
%macro GPR_INSTR 2
%macro %1 2-5 fnord, %1, %2
%ifdef cpuname
%if notcpuflag(%5)
%error use of ``%4'' %5 instruction in cpuname function: current_function
%endif
%endif
%ifidn %3, fnord
%4 %1, %2
%else
%4 %1, %2, %3
%endif
%endmacro
%endmacro
GPR_INSTR andn, bmi1
GPR_INSTR bextr, bmi1
GPR_INSTR blsi, bmi1
GPR_INSTR blsr, bmi1
GPR_INSTR blsmsk, bmi1
GPR_INSTR bzhi, bmi2
GPR_INSTR mulx, bmi2
GPR_INSTR pdep, bmi2
GPR_INSTR pext, bmi2
GPR_INSTR popcnt, sse42
GPR_INSTR rorx, bmi2
GPR_INSTR sarx, bmi2
GPR_INSTR shlx, bmi2
GPR_INSTR shrx, bmi2
; base-4 constants for shuffles
%assign i 0
%rep 256
%assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
%if j < 10
CAT_XDEFINE q000, j, i
%elif j < 100
CAT_XDEFINE q00, j, i
%elif j < 1000
CAT_XDEFINE q0, j, i
%else
CAT_XDEFINE q, j, i
%endif
%assign i i+1
%endrep
%undef i
%undef j
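
; The digits of a q-constant are the source element indices, most significant
; first: q1032 encodes imm8 0x4E ((1<<6)|(0<<4)|(3<<2)|2). For example,
;     pshufd m0, m1, q1032
; swaps the low and high qword halves within each 16-byte lane.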
%macro FMA_INSTR 3
%macro %1 4-7 %1, %2, %3
%if cpuflag(xop)
v%5 %1, %2, %3, %4
%elifnidn %1, %4
%6 %1, %2, %3
%7 %1, %1, %4
%else
%error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
%endif
%endmacro
%endmacro

FMA_INSTR pmacsww, pmullw, paddw
FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation
FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation
FMA_INSTR pmadcswd, pmaddwd, paddd
; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
; FMA3 is only possible if dst is the same as one of the src registers.
; Either src2 or src3 can be a memory operand.
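;
; Sketch (illustrative): with dst equal to src1,
;     fmaddps m0, m0, m1, m2     ; m0 = m0*m1 + m2
; assembles to vfmaddps m0, m0, m1, m2 on FMA4 and to
; vfmadd213ps m0, m1, m2 on FMA3.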
%macro FMA4_INSTR 2-*
%push fma4_instr
%xdefine %$prefix %1
%rep %0 - 1
%macro %$prefix%2 4-6 %$prefix, %2
%if notcpuflag(fma3) && notcpuflag(fma4)
%error use of ``%5%6'' fma instruction in cpuname function: current_function
%elif cpuflag(fma4)
v%5%6 %1, %2, %3, %4
%elifidn %1, %2
; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
%ifnum sizeof%3
v%{5}213%6 %2, %3, %4
%else
v%{5}132%6 %2, %4, %3
%endif
%elifidn %1, %3
v%{5}213%6 %3, %2, %4
%elifidn %1, %4
v%{5}231%6 %4, %2, %3
%else
%error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
%endif
%endmacro
%rotate 1
%endrep
%pop
%endmacro
FMA4_INSTR fmadd, pd, ps, sd, ss
FMA4_INSTR fmaddsub, pd, ps
FMA4_INSTR fmsub, pd, ps, sd, ss
FMA4_INSTR fmsubadd, pd, ps
FMA4_INSTR fnmadd, pd, ps, sd, ss
FMA4_INSTR fnmsub, pd, ps, sd, ss
; Macros for converting VEX instructions to equivalent EVEX ones.
%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
%macro %1 2-7 fnord, fnord, %1, %2, %3
%ifidn %3, fnord
%define %%args %1, %2
%elifidn %4, fnord
%define %%args %1, %2, %3
%else
%define %%args %1, %2, %3, %4
%endif
%assign %%evex_required cpuflag(avx512) & %7
%ifnum regnumof%1
%if regnumof%1 >= 16 || sizeof%1 > 32
%assign %%evex_required 1
%endif
%endif
%ifnum regnumof%2
%if regnumof%2 >= 16 || sizeof%2 > 32
%assign %%evex_required 1
%endif
%endif
%ifnum regnumof%3
%if regnumof%3 >= 16 || sizeof%3 > 32
%assign %%evex_required 1
%endif
%endif
%if %%evex_required
%6 %%args
%else
%5 %%args ; Prefer VEX over EVEX due to shorter instruction length
%endif
%endmacro
%endmacro
EVEX_INSTR vbroadcastf128, vbroadcastf32x4
EVEX_INSTR vbroadcasti128, vbroadcasti32x4
EVEX_INSTR vextractf128, vextractf32x4
EVEX_INSTR vextracti128, vextracti32x4
EVEX_INSTR vinsertf128, vinsertf32x4
EVEX_INSTR vinserti128, vinserti32x4
EVEX_INSTR vmovdqa, vmovdqa32
EVEX_INSTR vmovdqu, vmovdqu32
EVEX_INSTR vpand, vpandd
EVEX_INSTR vpandn, vpandnd
EVEX_INSTR vpor, vpord
EVEX_INSTR vpxor, vpxord
EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision
EVEX_INSTR vrcpss, vrcp14ss, 1
EVEX_INSTR vrsqrtps, vrsqrt14ps, 1
EVEX_INSTR vrsqrtss, vrsqrt14ss, 1
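
; Example of the rewriting (illustrative): "vmovdqa m0, m16" must use the
; EVEX form vmovdqa32 because register 16 is only encodable with EVEX, while
; "vmovdqa m0, m1" keeps the shorter VEX encoding. vrcpps and vrsqrtps are
; always rewritten under AVX-512 because prefer_evex is set (the vrcp14/
; vrsqrt14 forms have higher precision).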