From: Craig Topper Date: Sat, 23 Jul 2016 07:16:56 +0000 (+0000) Subject: [AVX512] Implement commuting support for EVEX encoded FMA3 instructions. X-Git-Tag: llvmorg-4.0.0-rc1~14402 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b6519db90dcb422020fcc7e78f655ad2f3821453;p=platform%2Fupstream%2Fllvm.git [AVX512] Implement commuting support for EVEX encoded FMA3 instructions. llvm-svn: 276521 --- diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 9c065fb..0a96cc3 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -3157,145 +3157,107 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, static bool isFMA3(unsigned Opcode, bool &IsIntrinsic) { IsIntrinsic = false; +#define FMA3_CASE(Name, Modifier) \ +case X86::Name##r##Modifier: case X86::Name##m##Modifier: + +#define FMA3_SCALAR_PAIR(Name, Form, Modifier) \ + FMA3_CASE(Name##SD##Form, Modifier) \ + FMA3_CASE(Name##SS##Form, Modifier) + +#define FMA3_PACKED_PAIR(Name, Form, Modifier) \ + FMA3_CASE(Name##PD##Form, Modifier) \ + FMA3_CASE(Name##PS##Form, Modifier) + +#define FMA3_PACKED_SET(Form, Modifier) \ + FMA3_PACKED_PAIR(VFMADD, Form, Modifier) \ + FMA3_PACKED_PAIR(VFMSUB, Form, Modifier) \ + FMA3_PACKED_PAIR(VFNMADD, Form, Modifier) \ + FMA3_PACKED_PAIR(VFNMSUB, Form, Modifier) \ + FMA3_PACKED_PAIR(VFMADDSUB, Form, Modifier) \ + FMA3_PACKED_PAIR(VFMSUBADD, Form, Modifier) + +#define FMA3_CASES(Form) \ + FMA3_SCALAR_PAIR(VFMADD, Form, ) \ + FMA3_SCALAR_PAIR(VFMSUB, Form, ) \ + FMA3_SCALAR_PAIR(VFNMADD, Form, ) \ + FMA3_SCALAR_PAIR(VFNMSUB, Form, ) \ + FMA3_PACKED_SET(Form, ) \ + FMA3_PACKED_SET(Form, Y) \ + +#define FMA3_SCALAR_PAIR_AVX512(Name, Modifier) \ + FMA3_CASE(Name##SD, Modifier) \ + FMA3_CASE(Name##SS, Modifier) + +#define FMA3_PACKED_PAIR_AVX512(Name, Size) \ + FMA3_CASE(Name##PD##Size, ) \ + FMA3_CASE(Name##PS##Size, ) + +#define FMA3_PACKED_SET_AVX512(Form, Size) \ + 
FMA3_PACKED_PAIR_AVX512(VFMADD##Form, Size) \ + FMA3_PACKED_PAIR_AVX512(VFMSUB##Form, Size) \ + FMA3_PACKED_PAIR_AVX512(VFNMADD##Form, Size) \ + FMA3_PACKED_PAIR_AVX512(VFNMSUB##Form, Size) \ + FMA3_PACKED_PAIR_AVX512(VFMADDSUB##Form, Size) \ + FMA3_PACKED_PAIR_AVX512(VFMSUBADD##Form, Size) + +#define FMA3_CASES_AVX512(Form) \ + FMA3_SCALAR_PAIR_AVX512(VFMADD##Form, ) \ + FMA3_SCALAR_PAIR_AVX512(VFMSUB##Form, ) \ + FMA3_SCALAR_PAIR_AVX512(VFNMADD##Form, ) \ + FMA3_SCALAR_PAIR_AVX512(VFNMSUB##Form, ) \ + FMA3_PACKED_SET_AVX512(Form, Z128) \ + FMA3_PACKED_SET_AVX512(Form, Z256) \ + FMA3_PACKED_SET_AVX512(Form, Z) + +#define FMA3_CASES_SCALAR_INT(Form) \ + FMA3_SCALAR_PAIR(VFMADD, Form, _Int) \ + FMA3_SCALAR_PAIR(VFMSUB, Form, _Int) \ + FMA3_SCALAR_PAIR(VFNMADD, Form, _Int) \ + FMA3_SCALAR_PAIR(VFNMSUB, Form, _Int) + +#define FMA3_CASES_SCALAR_INT_AVX512(Form) \ + FMA3_SCALAR_PAIR_AVX512(VFMADD##Form, _Int) \ + FMA3_SCALAR_PAIR_AVX512(VFMSUB##Form, _Int) \ + FMA3_SCALAR_PAIR_AVX512(VFNMADD##Form, _Int) \ + FMA3_SCALAR_PAIR_AVX512(VFNMSUB##Form, _Int) + switch (Opcode) { - case X86::VFMADDSDr132r: case X86::VFMADDSDr132m: - case X86::VFMADDSSr132r: case X86::VFMADDSSr132m: - case X86::VFMSUBSDr132r: case X86::VFMSUBSDr132m: - case X86::VFMSUBSSr132r: case X86::VFMSUBSSr132m: - case X86::VFNMADDSDr132r: case X86::VFNMADDSDr132m: - case X86::VFNMADDSSr132r: case X86::VFNMADDSSr132m: - case X86::VFNMSUBSDr132r: case X86::VFNMSUBSDr132m: - case X86::VFNMSUBSSr132r: case X86::VFNMSUBSSr132m: - - case X86::VFMADDSDr213r: case X86::VFMADDSDr213m: - case X86::VFMADDSSr213r: case X86::VFMADDSSr213m: - case X86::VFMSUBSDr213r: case X86::VFMSUBSDr213m: - case X86::VFMSUBSSr213r: case X86::VFMSUBSSr213m: - case X86::VFNMADDSDr213r: case X86::VFNMADDSDr213m: - case X86::VFNMADDSSr213r: case X86::VFNMADDSSr213m: - case X86::VFNMSUBSDr213r: case X86::VFNMSUBSDr213m: - case X86::VFNMSUBSSr213r: case X86::VFNMSUBSSr213m: - - case X86::VFMADDSDr231r: case X86::VFMADDSDr231m: - case 
X86::VFMADDSSr231r: case X86::VFMADDSSr231m: - case X86::VFMSUBSDr231r: case X86::VFMSUBSDr231m: - case X86::VFMSUBSSr231r: case X86::VFMSUBSSr231m: - case X86::VFNMADDSDr231r: case X86::VFNMADDSDr231m: - case X86::VFNMADDSSr231r: case X86::VFNMADDSSr231m: - case X86::VFNMSUBSDr231r: case X86::VFNMSUBSDr231m: - case X86::VFNMSUBSSr231r: case X86::VFNMSUBSSr231m: - - case X86::VFMADDSUBPDr132r: case X86::VFMADDSUBPDr132m: - case X86::VFMADDSUBPSr132r: case X86::VFMADDSUBPSr132m: - case X86::VFMSUBADDPDr132r: case X86::VFMSUBADDPDr132m: - case X86::VFMSUBADDPSr132r: case X86::VFMSUBADDPSr132m: - case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY: - case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY: - case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY: - case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY: - - case X86::VFMADDPDr132r: case X86::VFMADDPDr132m: - case X86::VFMADDPSr132r: case X86::VFMADDPSr132m: - case X86::VFMSUBPDr132r: case X86::VFMSUBPDr132m: - case X86::VFMSUBPSr132r: case X86::VFMSUBPSr132m: - case X86::VFNMADDPDr132r: case X86::VFNMADDPDr132m: - case X86::VFNMADDPSr132r: case X86::VFNMADDPSr132m: - case X86::VFNMSUBPDr132r: case X86::VFNMSUBPDr132m: - case X86::VFNMSUBPSr132r: case X86::VFNMSUBPSr132m: - case X86::VFMADDPDr132rY: case X86::VFMADDPDr132mY: - case X86::VFMADDPSr132rY: case X86::VFMADDPSr132mY: - case X86::VFMSUBPDr132rY: case X86::VFMSUBPDr132mY: - case X86::VFMSUBPSr132rY: case X86::VFMSUBPSr132mY: - case X86::VFNMADDPDr132rY: case X86::VFNMADDPDr132mY: - case X86::VFNMADDPSr132rY: case X86::VFNMADDPSr132mY: - case X86::VFNMSUBPDr132rY: case X86::VFNMSUBPDr132mY: - case X86::VFNMSUBPSr132rY: case X86::VFNMSUBPSr132mY: - - case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPDr213m: - case X86::VFMADDSUBPSr213r: case X86::VFMADDSUBPSr213m: - case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPDr213m: - case X86::VFMSUBADDPSr213r: case X86::VFMSUBADDPSr213m: - case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY: 
- case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY: - case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY: - case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY: - - case X86::VFMADDPDr213r: case X86::VFMADDPDr213m: - case X86::VFMADDPSr213r: case X86::VFMADDPSr213m: - case X86::VFMSUBPDr213r: case X86::VFMSUBPDr213m: - case X86::VFMSUBPSr213r: case X86::VFMSUBPSr213m: - case X86::VFNMADDPDr213r: case X86::VFNMADDPDr213m: - case X86::VFNMADDPSr213r: case X86::VFNMADDPSr213m: - case X86::VFNMSUBPDr213r: case X86::VFNMSUBPDr213m: - case X86::VFNMSUBPSr213r: case X86::VFNMSUBPSr213m: - case X86::VFMADDPDr213rY: case X86::VFMADDPDr213mY: - case X86::VFMADDPSr213rY: case X86::VFMADDPSr213mY: - case X86::VFMSUBPDr213rY: case X86::VFMSUBPDr213mY: - case X86::VFMSUBPSr213rY: case X86::VFMSUBPSr213mY: - case X86::VFNMADDPDr213rY: case X86::VFNMADDPDr213mY: - case X86::VFNMADDPSr213rY: case X86::VFNMADDPSr213mY: - case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPDr213mY: - case X86::VFNMSUBPSr213rY: case X86::VFNMSUBPSr213mY: - - case X86::VFMADDSUBPDr231r: case X86::VFMADDSUBPDr231m: - case X86::VFMADDSUBPSr231r: case X86::VFMADDSUBPSr231m: - case X86::VFMSUBADDPDr231r: case X86::VFMSUBADDPDr231m: - case X86::VFMSUBADDPSr231r: case X86::VFMSUBADDPSr231m: - case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY: - case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY: - case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY: - case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY: - - case X86::VFMADDPDr231r: case X86::VFMADDPDr231m: - case X86::VFMADDPSr231r: case X86::VFMADDPSr231m: - case X86::VFMSUBPDr231r: case X86::VFMSUBPDr231m: - case X86::VFMSUBPSr231r: case X86::VFMSUBPSr231m: - case X86::VFNMADDPDr231r: case X86::VFNMADDPDr231m: - case X86::VFNMADDPSr231r: case X86::VFNMADDPSr231m: - case X86::VFNMSUBPDr231r: case X86::VFNMSUBPDr231m: - case X86::VFNMSUBPSr231r: case X86::VFNMSUBPSr231m: - case X86::VFMADDPDr231rY: case X86::VFMADDPDr231mY: 
- case X86::VFMADDPSr231rY: case X86::VFMADDPSr231mY: - case X86::VFMSUBPDr231rY: case X86::VFMSUBPDr231mY: - case X86::VFMSUBPSr231rY: case X86::VFMSUBPSr231mY: - case X86::VFNMADDPDr231rY: case X86::VFNMADDPDr231mY: - case X86::VFNMADDPSr231rY: case X86::VFNMADDPSr231mY: - case X86::VFNMSUBPDr231rY: case X86::VFNMSUBPDr231mY: - case X86::VFNMSUBPSr231rY: case X86::VFNMSUBPSr231mY: + FMA3_CASES(r132) + FMA3_CASES(r213) + FMA3_CASES(r231) + + // AVX-512 instructions + FMA3_CASES_AVX512(132) + FMA3_CASES_AVX512(213) + FMA3_CASES_AVX512(231) return true; - case X86::VFMADDSDr132r_Int: case X86::VFMADDSDr132m_Int: - case X86::VFMADDSSr132r_Int: case X86::VFMADDSSr132m_Int: - case X86::VFMSUBSDr132r_Int: case X86::VFMSUBSDr132m_Int: - case X86::VFMSUBSSr132r_Int: case X86::VFMSUBSSr132m_Int: - case X86::VFNMADDSDr132r_Int: case X86::VFNMADDSDr132m_Int: - case X86::VFNMADDSSr132r_Int: case X86::VFNMADDSSr132m_Int: - case X86::VFNMSUBSDr132r_Int: case X86::VFNMSUBSDr132m_Int: - case X86::VFNMSUBSSr132r_Int: case X86::VFNMSUBSSr132m_Int: - - case X86::VFMADDSDr213r_Int: case X86::VFMADDSDr213m_Int: - case X86::VFMADDSSr213r_Int: case X86::VFMADDSSr213m_Int: - case X86::VFMSUBSDr213r_Int: case X86::VFMSUBSDr213m_Int: - case X86::VFMSUBSSr213r_Int: case X86::VFMSUBSSr213m_Int: - case X86::VFNMADDSDr213r_Int: case X86::VFNMADDSDr213m_Int: - case X86::VFNMADDSSr213r_Int: case X86::VFNMADDSSr213m_Int: - case X86::VFNMSUBSDr213r_Int: case X86::VFNMSUBSDr213m_Int: - case X86::VFNMSUBSSr213r_Int: case X86::VFNMSUBSSr213m_Int: - - case X86::VFMADDSDr231r_Int: case X86::VFMADDSDr231m_Int: - case X86::VFMADDSSr231r_Int: case X86::VFMADDSSr231m_Int: - case X86::VFMSUBSDr231r_Int: case X86::VFMSUBSDr231m_Int: - case X86::VFMSUBSSr231r_Int: case X86::VFMSUBSSr231m_Int: - case X86::VFNMADDSDr231r_Int: case X86::VFNMADDSDr231m_Int: - case X86::VFNMADDSSr231r_Int: case X86::VFNMADDSSr231m_Int: - case X86::VFNMSUBSDr231r_Int: case X86::VFNMSUBSDr231m_Int: - case X86::VFNMSUBSSr231r_Int: 
case X86::VFNMSUBSSr231m_Int: + FMA3_CASES_SCALAR_INT(r132) + FMA3_CASES_SCALAR_INT(r213) + FMA3_CASES_SCALAR_INT(r231) + + // AVX-512 instructions + FMA3_CASES_SCALAR_INT_AVX512(132) + FMA3_CASES_SCALAR_INT_AVX512(213) + FMA3_CASES_SCALAR_INT_AVX512(231) IsIntrinsic = true; return true; default: return false; } llvm_unreachable("Opcode not handled by the switch"); + +#undef FMA3_CASE +#undef FMA3_SCALAR_PAIR +#undef FMA3_PACKED_PAIR +#undef FMA3_PACKED_SET +#undef FMA3_CASES +#undef FMA3_SCALAR_PAIR_AVX512 +#undef FMA3_PACKED_PAIR_AVX512 +#undef FMA3_PACKED_SET_AVX512 +#undef FMA3_CASES_AVX512 +#undef FMA3_CASES_SCALAR_INT +#undef FMA3_CASES_SCALAR_INT_AVX512 } /// Returns an adjusted FMA opcode that must be used in FMA instruction that @@ -3312,104 +3274,110 @@ static unsigned getFMA3OpcodeToCommuteOperands(unsigned Opc, bool IsIntrinOpcode, unsigned SrcOpIdx1, unsigned SrcOpIdx2) { +#define FMA3_ENTRY(Name, Suffix) \ + { X86::Name##132##Suffix, X86::Name##213##Suffix, X86::Name##231##Suffix }, + +#define FMA3_SCALAR_PAIR(Name, Suffix) \ + FMA3_ENTRY(Name##SSr, Suffix) \ + FMA3_ENTRY(Name##SDr, Suffix) + +#define FMA3_PACKED_PAIR(Name, Suffix) \ + FMA3_ENTRY(Name##PSr, Suffix) \ + FMA3_ENTRY(Name##PDr, Suffix) + +#define FMA3_PACKED_SIZES(Name, Suffix) \ + FMA3_PACKED_PAIR(Name, Suffix) \ + FMA3_PACKED_PAIR(Name, Suffix##Y) + +#define FMA3_TABLE_ALL(Name) \ + FMA3_SCALAR_PAIR(Name, r) \ + FMA3_PACKED_SIZES(Name, r) \ + FMA3_SCALAR_PAIR(Name, m) \ + FMA3_PACKED_SIZES(Name, m) + +#define FMA3_TABLE_PACKED(Name) \ + FMA3_PACKED_SIZES(Name, r) \ + FMA3_PACKED_SIZES(Name, m) + +#define FMA3_TABLE_SCALAR_INT(Name) \ + FMA3_SCALAR_PAIR(Name, r_Int) \ + FMA3_SCALAR_PAIR(Name, m_Int) + +#define FMA3_SCALAR_PAIR_AVX512(Name, Suffix) \ + FMA3_ENTRY(Name, SS##Suffix) \ + FMA3_ENTRY(Name, SD##Suffix) + +#define FMA3_PACKED_PAIR_AVX512(Name, Suffix) \ + FMA3_ENTRY(Name, PS##Suffix) \ + FMA3_ENTRY(Name, PD##Suffix) + +#define FMA3_PACKED_SIZES_AVX512(Name, Suffix) \ + 
FMA3_PACKED_PAIR_AVX512(Name, Z128##Suffix) \ + FMA3_PACKED_PAIR_AVX512(Name, Z256##Suffix) \ + FMA3_PACKED_PAIR_AVX512(Name, Z##Suffix) + +#define FMA3_TABLE_ALL_AVX512(Name) \ + FMA3_SCALAR_PAIR_AVX512(Name, r) \ + FMA3_PACKED_SIZES_AVX512(Name, r) \ + FMA3_SCALAR_PAIR_AVX512(Name, m) \ + FMA3_PACKED_SIZES_AVX512(Name, m) + +#define FMA3_TABLE_PACKED_AVX512(Name) \ + FMA3_PACKED_SIZES_AVX512(Name, r) \ + FMA3_PACKED_SIZES_AVX512(Name, m) + +#define FMA3_TABLE_SCALAR_INT_AVX512(Name) \ + FMA3_SCALAR_PAIR_AVX512(Name, r_Int) \ + FMA3_SCALAR_PAIR_AVX512(Name, m_Int) + // Define the array that holds FMA opcodes in groups // of 3 opcodes(132, 213, 231) in each group. static const uint16_t RegularOpcodeGroups[][3] = { - { X86::VFMADDSSr132r, X86::VFMADDSSr213r, X86::VFMADDSSr231r }, - { X86::VFMADDSDr132r, X86::VFMADDSDr213r, X86::VFMADDSDr231r }, - { X86::VFMADDPSr132r, X86::VFMADDPSr213r, X86::VFMADDPSr231r }, - { X86::VFMADDPDr132r, X86::VFMADDPDr213r, X86::VFMADDPDr231r }, - { X86::VFMADDPSr132rY, X86::VFMADDPSr213rY, X86::VFMADDPSr231rY }, - { X86::VFMADDPDr132rY, X86::VFMADDPDr213rY, X86::VFMADDPDr231rY }, - { X86::VFMADDSSr132m, X86::VFMADDSSr213m, X86::VFMADDSSr231m }, - { X86::VFMADDSDr132m, X86::VFMADDSDr213m, X86::VFMADDSDr231m }, - { X86::VFMADDPSr132m, X86::VFMADDPSr213m, X86::VFMADDPSr231m }, - { X86::VFMADDPDr132m, X86::VFMADDPDr213m, X86::VFMADDPDr231m }, - { X86::VFMADDPSr132mY, X86::VFMADDPSr213mY, X86::VFMADDPSr231mY }, - { X86::VFMADDPDr132mY, X86::VFMADDPDr213mY, X86::VFMADDPDr231mY }, - - { X86::VFMSUBSSr132r, X86::VFMSUBSSr213r, X86::VFMSUBSSr231r }, - { X86::VFMSUBSDr132r, X86::VFMSUBSDr213r, X86::VFMSUBSDr231r }, - { X86::VFMSUBPSr132r, X86::VFMSUBPSr213r, X86::VFMSUBPSr231r }, - { X86::VFMSUBPDr132r, X86::VFMSUBPDr213r, X86::VFMSUBPDr231r }, - { X86::VFMSUBPSr132rY, X86::VFMSUBPSr213rY, X86::VFMSUBPSr231rY }, - { X86::VFMSUBPDr132rY, X86::VFMSUBPDr213rY, X86::VFMSUBPDr231rY }, - { X86::VFMSUBSSr132m, X86::VFMSUBSSr213m, X86::VFMSUBSSr231m }, - 
{ X86::VFMSUBSDr132m, X86::VFMSUBSDr213m, X86::VFMSUBSDr231m }, - { X86::VFMSUBPSr132m, X86::VFMSUBPSr213m, X86::VFMSUBPSr231m }, - { X86::VFMSUBPDr132m, X86::VFMSUBPDr213m, X86::VFMSUBPDr231m }, - { X86::VFMSUBPSr132mY, X86::VFMSUBPSr213mY, X86::VFMSUBPSr231mY }, - { X86::VFMSUBPDr132mY, X86::VFMSUBPDr213mY, X86::VFMSUBPDr231mY }, - - { X86::VFNMADDSSr132r, X86::VFNMADDSSr213r, X86::VFNMADDSSr231r }, - { X86::VFNMADDSDr132r, X86::VFNMADDSDr213r, X86::VFNMADDSDr231r }, - { X86::VFNMADDPSr132r, X86::VFNMADDPSr213r, X86::VFNMADDPSr231r }, - { X86::VFNMADDPDr132r, X86::VFNMADDPDr213r, X86::VFNMADDPDr231r }, - { X86::VFNMADDPSr132rY, X86::VFNMADDPSr213rY, X86::VFNMADDPSr231rY }, - { X86::VFNMADDPDr132rY, X86::VFNMADDPDr213rY, X86::VFNMADDPDr231rY }, - { X86::VFNMADDSSr132m, X86::VFNMADDSSr213m, X86::VFNMADDSSr231m }, - { X86::VFNMADDSDr132m, X86::VFNMADDSDr213m, X86::VFNMADDSDr231m }, - { X86::VFNMADDPSr132m, X86::VFNMADDPSr213m, X86::VFNMADDPSr231m }, - { X86::VFNMADDPDr132m, X86::VFNMADDPDr213m, X86::VFNMADDPDr231m }, - { X86::VFNMADDPSr132mY, X86::VFNMADDPSr213mY, X86::VFNMADDPSr231mY }, - { X86::VFNMADDPDr132mY, X86::VFNMADDPDr213mY, X86::VFNMADDPDr231mY }, - - { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr213r, X86::VFNMSUBSSr231r }, - { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr213r, X86::VFNMSUBSDr231r }, - { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr213r, X86::VFNMSUBPSr231r }, - { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr213r, X86::VFNMSUBPDr231r }, - { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr231rY }, - { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr231rY }, - { X86::VFNMSUBSSr132m, X86::VFNMSUBSSr213m, X86::VFNMSUBSSr231m }, - { X86::VFNMSUBSDr132m, X86::VFNMSUBSDr213m, X86::VFNMSUBSDr231m }, - { X86::VFNMSUBPSr132m, X86::VFNMSUBPSr213m, X86::VFNMSUBPSr231m }, - { X86::VFNMSUBPDr132m, X86::VFNMSUBPDr213m, X86::VFNMSUBPDr231m }, - { X86::VFNMSUBPSr132mY, X86::VFNMSUBPSr213mY, X86::VFNMSUBPSr231mY }, - { X86::VFNMSUBPDr132mY, X86::VFNMSUBPDr213mY, 
X86::VFNMSUBPDr231mY }, - - { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr231r }, - { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr231r }, - { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr231rY }, - { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr231rY }, - { X86::VFMADDSUBPSr132m, X86::VFMADDSUBPSr213m, X86::VFMADDSUBPSr231m }, - { X86::VFMADDSUBPDr132m, X86::VFMADDSUBPDr213m, X86::VFMADDSUBPDr231m }, - { X86::VFMADDSUBPSr132mY, X86::VFMADDSUBPSr213mY, X86::VFMADDSUBPSr231mY }, - { X86::VFMADDSUBPDr132mY, X86::VFMADDSUBPDr213mY, X86::VFMADDSUBPDr231mY }, - - { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr231r }, - { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr231r }, - { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr231rY }, - { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr231rY }, - { X86::VFMSUBADDPSr132m, X86::VFMSUBADDPSr213m, X86::VFMSUBADDPSr231m }, - { X86::VFMSUBADDPDr132m, X86::VFMSUBADDPDr213m, X86::VFMSUBADDPDr231m }, - { X86::VFMSUBADDPSr132mY, X86::VFMSUBADDPSr213mY, X86::VFMSUBADDPSr231mY }, - { X86::VFMSUBADDPDr132mY, X86::VFMSUBADDPDr213mY, X86::VFMSUBADDPDr231mY } + FMA3_TABLE_ALL(VFMADD) + FMA3_TABLE_ALL(VFMSUB) + FMA3_TABLE_ALL(VFNMADD) + FMA3_TABLE_ALL(VFNMSUB) + FMA3_TABLE_PACKED(VFMADDSUB) + FMA3_TABLE_PACKED(VFMSUBADD) + + // AVX-512 instructions + FMA3_TABLE_ALL_AVX512(VFMADD) + FMA3_TABLE_ALL_AVX512(VFMSUB) + FMA3_TABLE_ALL_AVX512(VFNMADD) + FMA3_TABLE_ALL_AVX512(VFNMSUB) + FMA3_TABLE_PACKED_AVX512(VFMADDSUB) + FMA3_TABLE_PACKED_AVX512(VFMSUBADD) }; // Define the array that holds FMA*_Int opcodes in groups // of 3 opcodes(132, 213, 231) in each group. 
static const uint16_t IntrinOpcodeGroups[][3] = { - { X86::VFMADDSSr132r_Int, X86::VFMADDSSr213r_Int, X86::VFMADDSSr231r_Int }, - { X86::VFMADDSDr132r_Int, X86::VFMADDSDr213r_Int, X86::VFMADDSDr231r_Int }, - { X86::VFMADDSSr132m_Int, X86::VFMADDSSr213m_Int, X86::VFMADDSSr231m_Int }, - { X86::VFMADDSDr132m_Int, X86::VFMADDSDr213m_Int, X86::VFMADDSDr231m_Int }, - - { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr231r_Int }, - { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr231r_Int }, - { X86::VFMSUBSSr132m_Int, X86::VFMSUBSSr213m_Int, X86::VFMSUBSSr231m_Int }, - { X86::VFMSUBSDr132m_Int, X86::VFMSUBSDr213m_Int, X86::VFMSUBSDr231m_Int }, - - { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr231r_Int }, - { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr231r_Int }, - { X86::VFNMADDSSr132m_Int, X86::VFNMADDSSr213m_Int, X86::VFNMADDSSr231m_Int }, - { X86::VFNMADDSDr132m_Int, X86::VFNMADDSDr213m_Int, X86::VFNMADDSDr231m_Int }, - - { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr231r_Int }, - { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr231r_Int }, - { X86::VFNMSUBSSr132m_Int, X86::VFNMSUBSSr213m_Int, X86::VFNMSUBSSr231m_Int }, - { X86::VFNMSUBSDr132m_Int, X86::VFNMSUBSDr213m_Int, X86::VFNMSUBSDr231m_Int }, + FMA3_TABLE_SCALAR_INT(VFMADD) + FMA3_TABLE_SCALAR_INT(VFMSUB) + FMA3_TABLE_SCALAR_INT(VFNMADD) + FMA3_TABLE_SCALAR_INT(VFNMSUB) + + // AVX-512 instructions + FMA3_TABLE_SCALAR_INT_AVX512(VFMADD) + FMA3_TABLE_SCALAR_INT_AVX512(VFMSUB) + FMA3_TABLE_SCALAR_INT_AVX512(VFNMADD) + FMA3_TABLE_SCALAR_INT_AVX512(VFNMSUB) }; +#undef FMA3_ENTRY +#undef FMA3_SCALAR_PAIR +#undef FMA3_PACKED_PAIR +#undef FMA3_PACKED_SIZES +#undef FMA3_TABLE_ALL +#undef FMA3_TABLE_PACKED +#undef FMA3_TABLE_SCALAR_INT +#undef FMA3_SCALAR_PAIR_AVX512 +#undef FMA3_PACKED_PAIR_AVX512 +#undef FMA3_PACKED_SIZES_AVX512 +#undef FMA3_TABLE_ALL_AVX512 +#undef FMA3_TABLE_PACKED_AVX512 +#undef 
FMA3_TABLE_SCALAR_INT_AVX512 + const unsigned Form132Index = 0; const unsigned Form213Index = 1; const unsigned Form231Index = 2; diff --git a/llvm/test/CodeGen/X86/avx512-fma.ll b/llvm/test/CodeGen/X86/avx512-fma.ll index b2d0835..f27b732 100644 --- a/llvm/test/CodeGen/X86/avx512-fma.ll +++ b/llvm/test/CodeGen/X86/avx512-fma.ll @@ -67,34 +67,20 @@ define <8 x double> @test_x86_fmsub_pd_z(<8 x double> %a0, <8 x double> %a1, <8 } define double @test_x86_fmsub_213(double %a0, double %a1, double %a2) { -; KNL-LABEL: test_x86_fmsub_213: -; KNL: ## BB#0: -; KNL-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 -; KNL-NEXT: vmovaps %zmm1, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: test_x86_fmsub_213: -; SKX: ## BB#0: -; SKX-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 -; SKX-NEXT: vmovaps %xmm1, %xmm0 -; SKX-NEXT: retq +; ALL-LABEL: test_x86_fmsub_213: +; ALL: ## BB#0: +; ALL-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 +; ALL-NEXT: retq %x = fmul double %a0, %a1 %res = fsub double %x, %a2 ret double %res } define double @test_x86_fmsub_213_m(double %a0, double %a1, double * %a2_ptr) { -; KNL-LABEL: test_x86_fmsub_213_m: -; KNL: ## BB#0: -; KNL-NEXT: vfmsub213sd (%rdi), %xmm0, %xmm1 -; KNL-NEXT: vmovaps %zmm1, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: test_x86_fmsub_213_m: -; SKX: ## BB#0: -; SKX-NEXT: vfmsub213sd (%rdi), %xmm0, %xmm1 -; SKX-NEXT: vmovaps %xmm1, %xmm0 -; SKX-NEXT: retq +; ALL-LABEL: test_x86_fmsub_213_m: +; ALL: ## BB#0: +; ALL-NEXT: vfmsub213sd (%rdi), %xmm1, %xmm0 +; ALL-NEXT: retq %a2 = load double , double *%a2_ptr %x = fmul double %a0, %a1 %res = fsub double %x, %a2 @@ -102,17 +88,10 @@ define double @test_x86_fmsub_213_m(double %a0, double %a1, double * %a2_ptr) { } define double @test_x86_fmsub_231_m(double %a0, double %a1, double * %a2_ptr) { -; KNL-LABEL: test_x86_fmsub_231_m: -; KNL: ## BB#0: -; KNL-NEXT: vfmsub231sd (%rdi), %xmm0, %xmm1 -; KNL-NEXT: vmovaps %zmm1, %zmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: test_x86_fmsub_231_m: -; SKX: ## BB#0: -; SKX-NEXT: vfmsub231sd 
(%rdi), %xmm0, %xmm1 -; SKX-NEXT: vmovaps %xmm1, %xmm0 -; SKX-NEXT: retq +; ALL-LABEL: test_x86_fmsub_231_m: +; ALL: ## BB#0: +; ALL-NEXT: vfmsub132sd (%rdi), %xmm1, %xmm0 +; ALL-NEXT: retq %a2 = load double , double *%a2_ptr %x = fmul double %a0, %a2 %res = fsub double %x, %a1 diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll index 62d1b82..b724da2 100644 --- a/llvm/test/CodeGen/X86/fma_patterns.ll +++ b/llvm/test/CodeGen/X86/fma_patterns.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by update_llc_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA4 @@ -22,8 +22,7 @@ define float @test_f32_fmadd(float %a0, float %a1, float %a2) { ; ; AVX512-LABEL: test_f32_fmadd: ; AVX512: # BB#0: -; AVX512-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 -; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 ; AVX512-NEXT: retq %x = fmul float %a0, %a1 %res = fadd float %x, %a2 @@ -83,8 +82,7 @@ define double @test_f64_fmadd(double %a0, double %a1, double %a2) { ; ; AVX512-LABEL: test_f64_fmadd: ; AVX512: # BB#0: -; AVX512-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 -; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 ; AVX512-NEXT: retq %x = fmul double %a0, %a1 %res = fadd double %x, %a2 @@ -148,8 +146,7 @@ define float @test_f32_fmsub(float %a0, float %a1, float %a2) { ; ; AVX512-LABEL: test_f32_fmsub: ; AVX512: # BB#0: -; AVX512-NEXT: vfmsub213ss %xmm2, 
%xmm0, %xmm1 -; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 ; AVX512-NEXT: retq %x = fmul float %a0, %a1 %res = fsub float %x, %a2 @@ -209,8 +206,7 @@ define double @test_f64_fmsub(double %a0, double %a1, double %a2) { ; ; AVX512-LABEL: test_f64_fmsub: ; AVX512: # BB#0: -; AVX512-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 -; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 ; AVX512-NEXT: retq %x = fmul double %a0, %a1 %res = fsub double %x, %a2 @@ -274,8 +270,7 @@ define float @test_f32_fnmadd(float %a0, float %a1, float %a2) { ; ; AVX512-LABEL: test_f32_fnmadd: ; AVX512: # BB#0: -; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 -; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 ; AVX512-NEXT: retq %x = fmul float %a0, %a1 %res = fsub float %a2, %x @@ -335,8 +330,7 @@ define double @test_f64_fnmadd(double %a0, double %a1, double %a2) { ; ; AVX512-LABEL: test_f64_fnmadd: ; AVX512: # BB#0: -; AVX512-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 -; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 ; AVX512-NEXT: retq %x = fmul double %a0, %a1 %res = fsub double %a2, %x @@ -400,8 +394,7 @@ define float @test_f32_fnmsub(float %a0, float %a1, float %a2) { ; ; AVX512-LABEL: test_f32_fnmsub: ; AVX512: # BB#0: -; AVX512-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 -; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 ; AVX512-NEXT: retq %x = fmul float %a0, %a1 %y = fsub float -0.000000e+00, %x @@ -464,8 +457,7 @@ define double @test_f64_fnmsub(double %a0, double %a1, double %a2) { ; ; AVX512-LABEL: test_f64_fnmsub: ; AVX512: # BB#0: -; AVX512-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 -; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 ; AVX512-NEXT: retq %x = fmul double %a0, %a1 %y = fsub double -0.000000e+00, %x @@ -533,8 +525,7 @@ define <4 x float> @test_4f32_fmadd_load(<4 x float>* %a0, 
<4 x float> %a1, <4 x ; AVX512-LABEL: test_4f32_fmadd_load: ; AVX512: # BB#0: ; AVX512-NEXT: vmovaps (%rdi), %xmm2 -; AVX512-NEXT: vfmadd213ps %xmm1, %xmm0, %xmm2 -; AVX512-NEXT: vmovaps %xmm2, %xmm0 +; AVX512-NEXT: vfmadd213ps %xmm1, %xmm2, %xmm0 ; AVX512-NEXT: retq %x = load <4 x float>, <4 x float>* %a0 %y = fmul <4 x float> %x, %a1 @@ -556,8 +547,7 @@ define <2 x double> @test_2f64_fmsub_load(<2 x double>* %a0, <2 x double> %a1, < ; AVX512-LABEL: test_2f64_fmsub_load: ; AVX512: # BB#0: ; AVX512-NEXT: vmovapd (%rdi), %xmm2 -; AVX512-NEXT: vfmsub213pd %xmm1, %xmm0, %xmm2 -; AVX512-NEXT: vmovaps %xmm2, %xmm0 +; AVX512-NEXT: vfmsub213pd %xmm1, %xmm2, %xmm0 ; AVX512-NEXT: retq %x = load <2 x double>, <2 x double>* %a0 %y = fmul <2 x double> %x, %a1 @@ -829,8 +819,7 @@ define float @test_f32_interp(float %x, float %y, float %t) { ; AVX512-LABEL: test_f32_interp: ; AVX512: # BB#0: ; AVX512-NEXT: vfnmadd213ss %xmm1, %xmm2, %xmm1 -; AVX512-NEXT: vfmadd213ss %xmm1, %xmm0, %xmm2 -; AVX512-NEXT: vmovaps %xmm2, %xmm0 +; AVX512-NEXT: vfmadd213ss %xmm1, %xmm2, %xmm0 ; AVX512-NEXT: retq %t1 = fsub float 1.0, %t %tx = fmul float %x, %t @@ -854,9 +843,8 @@ define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float ; ; AVX512-LABEL: test_v4f32_interp: ; AVX512: # BB#0: -; AVX512-NEXT: vmovaps %xmm2, %xmm3 -; AVX512-NEXT: vfnmadd213ps %xmm1, %xmm1, %xmm3 -; AVX512-NEXT: vfmadd213ps %xmm3, %xmm2, %xmm0 +; AVX512-NEXT: vfnmadd213ps %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vfmadd213ps %xmm1, %xmm2, %xmm0 ; AVX512-NEXT: retq %t1 = fsub <4 x float> , %t %tx = fmul <4 x float> %x, %t @@ -880,9 +868,8 @@ define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float ; ; AVX512-LABEL: test_v8f32_interp: ; AVX512: # BB#0: -; AVX512-NEXT: vmovaps %ymm2, %ymm3 -; AVX512-NEXT: vfnmadd213ps %ymm1, %ymm1, %ymm3 -; AVX512-NEXT: vfmadd213ps %ymm3, %ymm2, %ymm0 +; AVX512-NEXT: vfnmadd213ps %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vfmadd213ps %ymm1, %ymm2, %ymm0 ; 
AVX512-NEXT: retq %t1 = fsub <8 x float> , %t %tx = fmul <8 x float> %x, %t @@ -907,8 +894,7 @@ define double @test_f64_interp(double %x, double %y, double %t) { ; AVX512-LABEL: test_f64_interp: ; AVX512: # BB#0: ; AVX512-NEXT: vfnmadd213sd %xmm1, %xmm2, %xmm1 -; AVX512-NEXT: vfmadd213sd %xmm1, %xmm0, %xmm2 -; AVX512-NEXT: vmovaps %xmm2, %xmm0 +; AVX512-NEXT: vfmadd213sd %xmm1, %xmm2, %xmm0 ; AVX512-NEXT: retq %t1 = fsub double 1.0, %t %tx = fmul double %x, %t @@ -932,9 +918,8 @@ define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x do ; ; AVX512-LABEL: test_v2f64_interp: ; AVX512: # BB#0: -; AVX512-NEXT: vmovaps %xmm2, %xmm3 -; AVX512-NEXT: vfnmadd213pd %xmm1, %xmm1, %xmm3 -; AVX512-NEXT: vfmadd213pd %xmm3, %xmm2, %xmm0 +; AVX512-NEXT: vfnmadd213pd %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vfmadd213pd %xmm1, %xmm2, %xmm0 ; AVX512-NEXT: retq %t1 = fsub <2 x double> , %t %tx = fmul <2 x double> %x, %t @@ -958,9 +943,8 @@ define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x do ; ; AVX512-LABEL: test_v4f64_interp: ; AVX512: # BB#0: -; AVX512-NEXT: vmovaps %ymm2, %ymm3 -; AVX512-NEXT: vfnmadd213pd %ymm1, %ymm1, %ymm3 -; AVX512-NEXT: vfmadd213pd %ymm3, %ymm2, %ymm0 +; AVX512-NEXT: vfnmadd213pd %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vfmadd213pd %ymm1, %ymm2, %ymm0 ; AVX512-NEXT: retq %t1 = fsub <4 x double> , %t %tx = fmul <4 x double> %x, %t @@ -1101,8 +1085,7 @@ define <4 x float> @test_v4f32_fma_fmul_x_c1_c2_y(<4 x float> %x, <4 x float> %y ; ; AVX512-LABEL: test_v4f32_fma_fmul_x_c1_c2_y: ; AVX512: # BB#0: -; AVX512-NEXT: vfmadd231ps {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: vfmadd132ps {{.*}}(%rip), %xmm1, %xmm0 ; AVX512-NEXT: retq %m0 = fmul <4 x float> %x, %m1 = fmul <4 x float> %m0, @@ -1128,8 +1111,7 @@ define double @test_f64_fneg_fmul(double %x, double %y) #0 { ; AVX512-LABEL: test_f64_fneg_fmul: ; AVX512: # BB#0: ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vfnmsub213sd 
%xmm2, %xmm0, %xmm1 -; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 ; AVX512-NEXT: retq %m = fmul nsz double %x, %y %n = fsub double -0.0, %m diff --git a/llvm/test/CodeGen/X86/fma_patterns_wide.ll b/llvm/test/CodeGen/X86/fma_patterns_wide.ll index bf9291c..98b6c49 100644 --- a/llvm/test/CodeGen/X86/fma_patterns_wide.ll +++ b/llvm/test/CodeGen/X86/fma_patterns_wide.ll @@ -218,8 +218,7 @@ define <16 x float> @test_16f32_fmadd_load(<16 x float>* %a0, <16 x float> %a1, ; AVX512-LABEL: test_16f32_fmadd_load: ; AVX512: # BB#0: ; AVX512-NEXT: vmovaps (%rdi), %zmm2 -; AVX512-NEXT: vfmadd213ps %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovaps %zmm2, %zmm0 +; AVX512-NEXT: vfmadd213ps %zmm1, %zmm2, %zmm0 ; AVX512-NEXT: retq %x = load <16 x float>, <16 x float>* %a0 %y = fmul <16 x float> %x, %a1 @@ -243,8 +242,7 @@ define <8 x double> @test_8f64_fmsub_load(<8 x double>* %a0, <8 x double> %a1, < ; AVX512-LABEL: test_8f64_fmsub_load: ; AVX512: # BB#0: ; AVX512-NEXT: vmovapd (%rdi), %zmm2 -; AVX512-NEXT: vfmsub213pd %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovapd %zmm2, %zmm0 +; AVX512-NEXT: vfmsub213pd %zmm1, %zmm2, %zmm0 ; AVX512-NEXT: retq %x = load <8 x double>, <8 x double>* %a0 %y = fmul <8 x double> %x, %a1 @@ -543,9 +541,8 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x ; ; AVX512-LABEL: test_v16f32_interp: ; AVX512: # BB#0: -; AVX512-NEXT: vmovaps %zmm2, %zmm3 -; AVX512-NEXT: vfnmadd213ps %zmm1, %zmm1, %zmm3 -; AVX512-NEXT: vfmadd213ps %zmm3, %zmm2, %zmm0 +; AVX512-NEXT: vfnmadd213ps %zmm1, %zmm2, %zmm1 +; AVX512-NEXT: vfmadd213ps %zmm1, %zmm2, %zmm0 ; AVX512-NEXT: retq %t1 = fsub <16 x float> , %t %tx = fmul <16 x float> %x, %t @@ -573,9 +570,8 @@ define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x do ; ; AVX512-LABEL: test_v8f64_interp: ; AVX512: # BB#0: -; AVX512-NEXT: vmovapd %zmm2, %zmm3 -; AVX512-NEXT: vfnmadd213pd %zmm1, %zmm1, %zmm3 -; AVX512-NEXT: vfmadd213pd %zmm3, %zmm2, 
%zmm0 +; AVX512-NEXT: vfnmadd213pd %zmm1, %zmm2, %zmm1 +; AVX512-NEXT: vfmadd213pd %zmm1, %zmm2, %zmm0 ; AVX512-NEXT: retq %t1 = fsub <8 x double> , %t %tx = fmul <8 x double> %x, %t @@ -728,8 +724,7 @@ define <16 x float> @test_v16f32_fma_fmul_x_c1_c2_y(<16 x float> %x, <16 x float ; ; AVX512-LABEL: test_v16f32_fma_fmul_x_c1_c2_y: ; AVX512: # BB#0: -; AVX512-NEXT: vfmadd231ps {{.*}}(%rip), %zmm0, %zmm1 -; AVX512-NEXT: vmovaps %zmm1, %zmm0 +; AVX512-NEXT: vfmadd132ps {{.*}}(%rip), %zmm1, %zmm0 ; AVX512-NEXT: retq %m0 = fmul <16 x float> %x, %m1 = fmul <16 x float> %m0,