From 6daefcf6262299a4233231c49b9048c1062dbc90 Mon Sep 17 00:00:00 2001 From: Vyacheslav Klochkov Date: Thu, 11 Aug 2016 22:07:33 +0000 Subject: [PATCH] X86-FMA3: Implemented commute transformation for EVEX/AVX512 FMA3 opcodes. This helped to improve memory-folding and register coalescing optimizations. Also, this patch fixed the tracker #17229. Reviewer: Craig Topper. Differential Revision: https://reviews.llvm.org/D23108 llvm-svn: 278431 --- llvm/lib/Target/X86/CMakeLists.txt | 1 + llvm/lib/Target/X86/X86InstrAVX512.td | 57 ++- llvm/lib/Target/X86/X86InstrFMA3Info.cpp | 284 +++++++++++ llvm/lib/Target/X86/X86InstrFMA3Info.h | 315 ++++++++++++ llvm/lib/Target/X86/X86InstrInfo.cpp | 640 ++++--------------------- llvm/lib/Target/X86/X86InstrInfo.h | 32 +- llvm/test/CodeGen/X86/avx512-fma-intrinsics.ll | 36 +- llvm/test/CodeGen/X86/avx512-fma.ll | 12 +- llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll | 18 +- llvm/test/CodeGen/X86/fma-fneg-combine.ll | 7 +- 10 files changed, 792 insertions(+), 610 deletions(-) create mode 100644 llvm/lib/Target/X86/X86InstrFMA3Info.cpp create mode 100644 llvm/lib/Target/X86/X86InstrFMA3Info.h diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt index 894090f..8679278 100644 --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -24,6 +24,7 @@ set(sources X86FrameLowering.cpp X86ISelDAGToDAG.cpp X86ISelLowering.cpp + X86InstrFMA3Info.cpp X86InstrInfo.cpp X86MCInstLower.cpp X86MachineFunctionInfo.cpp diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 44b5bea..50791e9 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -194,7 +194,8 @@ multiclass AVX512_maskable_custom O, Format F, list ZeroMaskingPattern, string MaskingConstraint = "", InstrItinClass itin = NoItinerary, - bit IsCommutable = 0> { + bit IsCommutable = 0, + bit IsKCommutable = 0> { let isCommutable = IsCommutable in def 
NAME: AVX512 O, Format F, Pattern, itin>; // Prefer over VMOV*rrk Pat<> - let AddedComplexity = 20 in + let AddedComplexity = 20, isCommutable = IsKCommutable in def NAME#k: AVX512 O, Format F, EVEX_K { // In case of the 3src subclass this is overridden with a let. string Constraints = MaskingConstraint; - } - let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<> + } + + // Zero mask does not add any restrictions to commute operands transformation. + // So, it is Ok to use IsCommutable instead of IsKCommutable. + let AddedComplexity = 30, isCommutable = IsCommutable in // Prefer over VMOV*rrkz Pat<> def NAME#kz: AVX512 O, Format F, X86VectorVTInfo _, SDNode Select = vselect, string MaskingConstraint = "", InstrItinClass itin = NoItinerary, - bit IsCommutable = 0> : + bit IsCommutable = 0, + bit IsKCommutable = 0> : AVX512_maskable_custom; + MaskingConstraint, NoItinerary, IsCommutable, + IsKCommutable>; // This multiclass generates the unconditional/non-masking, the masking and // the zero-masking variant of the vector instruction. In the masking case, the @@ -248,13 +254,14 @@ multiclass AVX512_maskable O, Format F, X86VectorVTInfo _, string AttSrcAsm, string IntelSrcAsm, dag RHS, InstrItinClass itin = NoItinerary, - bit IsCommutable = 0, SDNode Select = vselect> : + bit IsCommutable = 0, bit IsKCommutable = 0, + SDNode Select = vselect> : AVX512_maskable_common; + "$src0 = $dst", itin, IsCommutable, IsKCommutable>; // This multiclass generates the unconditional/non-masking, the masking and // the zero-masking variant of the scalar instruction. 
@@ -278,15 +285,17 @@ multiclass AVX512_maskable_scalar O, Format F, X86VectorVTInfo _, multiclass AVX512_maskable_3src O, Format F, X86VectorVTInfo _, dag Outs, dag NonTiedIns, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS> : + dag RHS, bit IsCommutable = 0, + bit IsKCommutable = 0> : AVX512_maskable_common; + (vselect _.KRCWM:$mask, RHS, _.RC:$src1), + vselect, "", NoItinerary, IsCommutable, IsKCommutable>; -// Similar to AVX512_maskable_3rc but in this case the input VT for the tied +// Similar to AVX512_maskable_3src but in this case the input VT for the tied // operand differs from the output VT. This requires a bitconvert on // the preserved vector going into the vselect. multiclass AVX512_maskable_3src_cast O, Format F, X86VectorVTInfo OutVT, @@ -305,14 +314,16 @@ multiclass AVX512_maskable_3src_cast O, Format F, X86VectorVTInfo OutVT, multiclass AVX512_maskable_3src_scalar O, Format F, X86VectorVTInfo _, dag Outs, dag NonTiedIns, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS> : + dag RHS, bit IsCommutable = 0, + bit IsKCommutable = 0> : AVX512_maskable_common; + X86selects, "", NoItinerary, IsCommutable, + IsKCommutable>; multiclass AVX512_maskable_in_asm O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, @@ -4842,13 +4853,13 @@ multiclass avx512_fma3p_213_rm opc, string OpcodeStr, SDNode OpNode, defm r: AVX512_maskable_3src, + (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>, AVX512FMA3Base; defm m: AVX512_maskable_3src, + (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>, AVX512FMA3Base; defm mb: AVX512_maskable_3src opc, string OpcodeStr, SDNode OpNode, OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (OpNode _.RC:$src2, - _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>, + _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), 1, 0>, AVX512FMA3Base, EVEX_B; } @@ -4875,7 +4886,7 @@ multiclass 
avx512_fma3_213_round opc, string OpcodeStr, SDNode OpNode, defm rb: AVX512_maskable_3src, + (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))), 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC; } @@ -4917,13 +4928,13 @@ multiclass avx512_fma3p_231_rm opc, string OpcodeStr, SDNode OpNode, defm r: AVX512_maskable_3src, + (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>, AVX512FMA3Base; defm m: AVX512_maskable_3src, + (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>, AVX512FMA3Base; defm mb: AVX512_maskable_3src opc, string OpcodeStr, SDNode OpNode, "$src2, ${src3}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src2, (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), - _.RC:$src1))>, AVX512FMA3Base, EVEX_B; + _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B; } // Additional patterns for folding broadcast nodes in other orders. @@ -4960,7 +4971,7 @@ multiclass avx512_fma3_231_round opc, string OpcodeStr, SDNode OpNode, defm rb: AVX512_maskable_3src, + (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))), 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC; } @@ -6036,7 +6047,7 @@ multiclass avx512_cvtps2ph, AVX512AIi8Base; + NoItinerary, 0, 0, X86select>, AVX512AIi8Base; def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -6056,7 +6067,7 @@ multiclass avx512_cvtps2ph_sae { (X86cvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2), (i32 FROUND_NO_EXC)), - NoItinerary, 0, X86select>, EVEX_B, AVX512AIi8Base; + NoItinerary, 0, 0, X86select>, EVEX_B, AVX512AIi8Base; } let Predicates = [HasAVX512] in { defm VCVTPS2PHZ : avx512_cvtps2ph, diff --git a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp new file mode 100644 index 0000000..7bd8415 --- /dev/null +++ b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp @@ -0,0 +1,284 @@ +//===-- X86InstrFMA3Info.cpp - X86 FMA3 Instruction Information -----------===// +// +// The LLVM 
Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the classes providing information +// about existing X86 FMA3 opcodes, classifying and grouping them. +// +//===----------------------------------------------------------------------===// + +#include "X86InstrFMA3Info.h" +#include "X86InstrInfo.h" +#include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/Threading.h" + +/// This flag is used in the method llvm::call_once() used below to make the +/// initialization of the map 'OpcodeToGroup' thread safe. +LLVM_DEFINE_ONCE_FLAG(InitGroupsOnceFlag); + +static ManagedStatic X86InstrFMA3InfoObj; +X86InstrFMA3Info *X86InstrFMA3Info::getX86InstrFMA3Info() { + return &*X86InstrFMA3InfoObj; +} + +void X86InstrFMA3Info::initRMGroup(const uint16_t *RegOpcodes, + const uint16_t *MemOpcodes, unsigned Attr) { + // Create a new instance of this class that would hold a group of FMA opcodes. + X86InstrFMA3Group *G = new X86InstrFMA3Group(RegOpcodes, MemOpcodes, Attr); + + // Add the references from indvidual opcodes to the group holding them. + assert((!OpcodeToGroup[RegOpcodes[0]] && !OpcodeToGroup[RegOpcodes[1]] && + !OpcodeToGroup[RegOpcodes[2]] && !OpcodeToGroup[MemOpcodes[0]] && + !OpcodeToGroup[MemOpcodes[1]] && !OpcodeToGroup[MemOpcodes[2]]) && + "Duplication or rewrite of elements in OpcodeToGroup."); + OpcodeToGroup[RegOpcodes[0]] = G; + OpcodeToGroup[RegOpcodes[1]] = G; + OpcodeToGroup[RegOpcodes[2]] = G; + OpcodeToGroup[MemOpcodes[0]] = G; + OpcodeToGroup[MemOpcodes[1]] = G; + OpcodeToGroup[MemOpcodes[2]] = G; +} + +void X86InstrFMA3Info::initRGroup(const uint16_t *RegOpcodes, unsigned Attr) { + // Create a new instance of this class that would hold a group of FMA opcodes. 
+ X86InstrFMA3Group *G = new X86InstrFMA3Group(RegOpcodes, nullptr, Attr); + + // Add the references from indvidual opcodes to the group holding them. + assert((!OpcodeToGroup[RegOpcodes[0]] && !OpcodeToGroup[RegOpcodes[1]] && + !OpcodeToGroup[RegOpcodes[2]]) && + "Duplication or rewrite of elements in OpcodeToGroup."); + OpcodeToGroup[RegOpcodes[0]] = G; + OpcodeToGroup[RegOpcodes[1]] = G; + OpcodeToGroup[RegOpcodes[2]] = G; +} + +void X86InstrFMA3Info::initMGroup(const uint16_t *MemOpcodes, unsigned Attr) { + // Create a new instance of this class that would hold a group of FMA opcodes. + X86InstrFMA3Group *G = new X86InstrFMA3Group(nullptr, MemOpcodes, Attr); + + // Add the references from indvidual opcodes to the group holding them. + assert((!OpcodeToGroup[MemOpcodes[0]] && !OpcodeToGroup[MemOpcodes[1]] && + !OpcodeToGroup[MemOpcodes[2]]) && + "Duplication or rewrite of elements in OpcodeToGroup."); + OpcodeToGroup[MemOpcodes[0]] = G; + OpcodeToGroup[MemOpcodes[1]] = G; + OpcodeToGroup[MemOpcodes[2]] = G; +} + +#define FMA3RM(R132, R213, R231, M132, M213, M231) \ + static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \ + static const uint16_t Mem##R132[3] = {X86::M132, X86::M213, X86::M231}; \ + initRMGroup(Reg##R132, Mem##R132); + +#define FMA3RMA(R132, R213, R231, M132, M213, M231, Attrs) \ + static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \ + static const uint16_t Mem##R132[3] = {X86::M132, X86::M213, X86::M231}; \ + initRMGroup(Reg##R132, Mem##R132, (Attrs)); + +#define FMA3R(R132, R213, R231) \ + static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \ + initRGroup(Reg##R132); + +#define FMA3RA(R132, R213, R231, Attrs) \ + static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \ + initRGroup(Reg##R132, (Attrs)); + +#define FMA3M(M132, M213, M231) \ + static const uint16_t Mem##M132[3] = {X86::M132, X86::M213, X86::M231}; \ + initMGroup(Mem##M132); + +#define FMA3MA(M132, M213, 
M231, Attrs) \ + static const uint16_t Mem##M132[3] = {X86::M132, X86::M213, X86::M231}; \ + initMGroup(Mem##M132, (Attrs)); + +#define FMA3_AVX2_VECTOR_GROUP(Name) \ + FMA3RM(Name##132PSr, Name##213PSr, Name##231PSr, \ + Name##132PSm, Name##213PSm, Name##231PSm); \ + FMA3RM(Name##132PDr, Name##213PDr, Name##231PDr, \ + Name##132PDm, Name##213PDm, Name##231PDm); \ + FMA3RM(Name##132PSYr, Name##213PSYr, Name##231PSYr, \ + Name##132PSYm, Name##213PSYm, Name##231PSYm); \ + FMA3RM(Name##132PDYr, Name##213PDYr, Name##231PDYr, \ + Name##132PDYm, Name##213PDYm, Name##231PDYm); + +#define FMA3_AVX2_SCALAR_GROUP(Name) \ + FMA3RM(Name##132SSr, Name##213SSr, Name##231SSr, \ + Name##132SSm, Name##213SSm, Name##231SSm); \ + FMA3RM(Name##132SDr, Name##213SDr, Name##231SDr, \ + Name##132SDm, Name##213SDm, Name##231SDm); \ + FMA3RMA(Name##132SSr_Int, Name##213SSr_Int, Name##231SSr_Int, \ + Name##132SSm_Int, Name##213SSm_Int, Name##231SSm_Int, \ + X86InstrFMA3Group::X86FMA3Intrinsic); \ + FMA3RMA(Name##132SDr_Int, Name##213SDr_Int, Name##231SDr_Int, \ + Name##132SDm_Int, Name##213SDm_Int, Name##231SDm_Int, \ + X86InstrFMA3Group::X86FMA3Intrinsic); + +#define FMA3_AVX2_FULL_GROUP(Name) \ + FMA3_AVX2_VECTOR_GROUP(Name); \ + FMA3_AVX2_SCALAR_GROUP(Name); + +#define FMA3_AVX512_VECTOR_GROUP(Name) \ + FMA3RM(Name##132PSZ128r, Name##213PSZ128r, Name##231PSZ128r, \ + Name##132PSZ128m, Name##213PSZ128m, Name##231PSZ128m); \ + FMA3RM(Name##132PDZ128r, Name##213PDZ128r, Name##231PDZ128r, \ + Name##132PDZ128m, Name##213PDZ128m, Name##231PDZ128m); \ + FMA3RM(Name##132PSZ256r, Name##213PSZ256r, Name##231PSZ256r, \ + Name##132PSZ256m, Name##213PSZ256m, Name##231PSZ256m); \ + FMA3RM(Name##132PDZ256r, Name##213PDZ256r, Name##231PDZ256r, \ + Name##132PDZ256m, Name##213PDZ256m, Name##231PDZ256m); \ + FMA3RM(Name##132PSZr, Name##213PSZr, Name##231PSZr, \ + Name##132PSZm, Name##213PSZm, Name##231PSZm); \ + FMA3RM(Name##132PDZr, Name##213PDZr, Name##231PDZr, \ + Name##132PDZm, Name##213PDZm, 
Name##231PDZm); \ + FMA3RMA(Name##132PSZ128rk, Name##213PSZ128rk, Name##231PSZ128rk, \ + Name##132PSZ128mk, Name##213PSZ128mk, Name##231PSZ128mk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RMA(Name##132PDZ128rk, Name##213PDZ128rk, Name##231PDZ128rk, \ + Name##132PDZ128mk, Name##213PDZ128mk, Name##231PDZ128mk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RMA(Name##132PSZ256rk, Name##213PSZ256rk, Name##231PSZ256rk, \ + Name##132PSZ256mk, Name##213PSZ256mk, Name##231PSZ256mk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RMA(Name##132PDZ256rk, Name##213PDZ256rk, Name##231PDZ256rk, \ + Name##132PDZ256mk, Name##213PDZ256mk, Name##231PDZ256mk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RMA(Name##132PSZrk, Name##213PSZrk, Name##231PSZrk, \ + Name##132PSZmk, Name##213PSZmk, Name##231PSZmk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RMA(Name##132PDZrk, Name##213PDZrk, Name##231PDZrk, \ + Name##132PDZmk, Name##213PDZmk, Name##231PDZmk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RMA(Name##132PSZ128rkz, Name##213PSZ128rkz, Name##231PSZ128rkz, \ + Name##132PSZ128mkz, Name##213PSZ128mkz, Name##231PSZ128mkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3RMA(Name##132PDZ128rkz, Name##213PDZ128rkz, Name##231PDZ128rkz, \ + Name##132PDZ128mkz, Name##213PDZ128mkz, Name##231PDZ128mkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3RMA(Name##132PSZ256rkz, Name##213PSZ256rkz, Name##231PSZ256rkz, \ + Name##132PSZ256mkz, Name##213PSZ256mkz, Name##231PSZ256mkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3RMA(Name##132PDZ256rkz, Name##213PDZ256rkz, Name##231PDZ256rkz, \ + Name##132PDZ256mkz, Name##213PDZ256mkz, Name##231PDZ256mkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3RMA(Name##132PSZrkz, Name##213PSZrkz, Name##231PSZrkz, \ + Name##132PSZmkz, Name##213PSZmkz, Name##231PSZmkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3RMA(Name##132PDZrkz, Name##213PDZrkz, Name##231PDZrkz, \ + Name##132PDZmkz, 
Name##213PDZmkz, Name##231PDZmkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3R(Name##132PSZrb, Name##213PSZrb, Name##231PSZrb); \ + FMA3R(Name##132PDZrb, Name##213PDZrb, Name##231PDZrb); \ + FMA3RA(Name##132PSZrbk, Name##213PSZrbk, Name##231PSZrbk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RA(Name##132PDZrbk, Name##213PDZrbk, Name##231PDZrbk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RA(Name##132PSZrbkz, Name##213PSZrbkz, Name##231PSZrbkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3RA(Name##132PDZrbkz, Name##213PDZrbkz, Name##231PDZrbkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3M(Name##132PSZ128mb, Name##213PSZ128mb, Name##231PSZ128mb); \ + FMA3M(Name##132PDZ128mb, Name##213PDZ128mb, Name##231PDZ128mb); \ + FMA3M(Name##132PSZ256mb, Name##213PSZ256mb, Name##231PSZ256mb); \ + FMA3M(Name##132PDZ256mb, Name##213PDZ256mb, Name##231PDZ256mb); \ + FMA3M(Name##132PSZmb, Name##213PSZmb, Name##231PSZmb); \ + FMA3M(Name##132PDZmb, Name##213PDZmb, Name##231PDZmb); \ + FMA3MA(Name##132PSZ128mbk, Name##213PSZ128mbk, Name##231PSZ128mbk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3MA(Name##132PDZ128mbk, Name##213PDZ128mbk, Name##231PDZ128mbk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3MA(Name##132PSZ256mbk, Name##213PSZ256mbk, Name##231PSZ256mbk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3MA(Name##132PDZ256mbk, Name##213PDZ256mbk, Name##231PDZ256mbk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3MA(Name##132PSZmbk, Name##213PSZmbk, Name##231PSZmbk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3MA(Name##132PDZmbk, Name##213PDZmbk, Name##231PDZmbk, \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3MA(Name##132PSZ128mbkz, Name##213PSZ128mbkz, Name##231PSZ128mbkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3MA(Name##132PDZ128mbkz, Name##213PDZ128mbkz, Name##231PDZ128mbkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3MA(Name##132PSZ256mbkz, Name##213PSZ256mbkz, 
Name##231PSZ256mbkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3MA(Name##132PDZ256mbkz, Name##213PDZ256mbkz, Name##231PDZ256mbkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3MA(Name##132PSZmbkz, Name##213PSZmbkz, Name##231PSZmbkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3MA(Name##132PDZmbkz, Name##213PDZmbkz, Name##231PDZmbkz, \ + X86InstrFMA3Group::X86FMA3KZeroMasked); + +#define FMA3_AVX512_SCALAR_GROUP(Name) \ + FMA3RM(Name##132SSZr, Name##213SSZr, Name##231SSZr, \ + Name##132SSZm, Name##213SSZm, Name##231SSZm); \ + FMA3RM(Name##132SDZr, Name##213SDZr, Name##231SDZr, \ + Name##132SDZm, Name##213SDZm, Name##231SDZm); \ + FMA3RMA(Name##132SSZr_Int, Name##213SSZr_Int, Name##231SSZr_Int, \ + Name##132SSZm_Int, Name##213SSZm_Int, Name##231SSZm_Int, \ + X86InstrFMA3Group::X86FMA3Intrinsic); \ + FMA3RMA(Name##132SDZr_Int, Name##213SDZr_Int, Name##231SDZr_Int, \ + Name##132SDZm_Int, Name##213SDZm_Int, Name##231SDZm_Int, \ + X86InstrFMA3Group::X86FMA3Intrinsic); \ + FMA3RMA(Name##132SSZr_Intk, Name##213SSZr_Intk, Name##231SSZr_Intk, \ + Name##132SSZm_Intk, Name##213SSZm_Intk, Name##231SSZm_Intk, \ + X86InstrFMA3Group::X86FMA3Intrinsic | \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RMA(Name##132SDZr_Intk, Name##213SDZr_Intk, Name##231SDZr_Intk, \ + Name##132SDZm_Intk, Name##213SDZm_Intk, Name##231SDZm_Intk, \ + X86InstrFMA3Group::X86FMA3Intrinsic | \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RMA(Name##132SSZr_Intkz, Name##213SSZr_Intkz, Name##231SSZr_Intkz, \ + Name##132SSZm_Intkz, Name##213SSZm_Intkz, Name##231SSZm_Intkz, \ + X86InstrFMA3Group::X86FMA3Intrinsic | \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3RMA(Name##132SDZr_Intkz, Name##213SDZr_Intkz, Name##231SDZr_Intkz, \ + Name##132SDZm_Intkz, Name##213SDZm_Intkz, Name##231SDZm_Intkz, \ + X86InstrFMA3Group::X86FMA3Intrinsic | \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3RA(Name##132SSZrb_Int, Name##213SSZrb_Int, Name##231SSZrb_Int, \ + 
X86InstrFMA3Group::X86FMA3Intrinsic); \ + FMA3RA(Name##132SDZrb_Int, Name##213SDZrb_Int, Name##231SDZrb_Int, \ + X86InstrFMA3Group::X86FMA3Intrinsic); \ + FMA3RA(Name##132SSZrb_Intk, Name##213SSZrb_Intk, Name##231SSZrb_Intk, \ + X86InstrFMA3Group::X86FMA3Intrinsic | \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RA(Name##132SDZrb_Intk, Name##213SDZrb_Intk, Name##231SDZrb_Intk, \ + X86InstrFMA3Group::X86FMA3Intrinsic | \ + X86InstrFMA3Group::X86FMA3KMergeMasked); \ + FMA3RA(Name##132SSZrb_Intkz, Name##213SSZrb_Intkz, Name##231SSZrb_Intkz, \ + X86InstrFMA3Group::X86FMA3Intrinsic | \ + X86InstrFMA3Group::X86FMA3KZeroMasked); \ + FMA3RA(Name##132SDZrb_Intkz, Name##213SDZrb_Intkz, Name##231SDZrb_Intkz, \ + X86InstrFMA3Group::X86FMA3Intrinsic | \ + X86InstrFMA3Group::X86FMA3KZeroMasked); + +#define FMA3_AVX512_FULL_GROUP(Name) \ + FMA3_AVX512_VECTOR_GROUP(Name); \ + FMA3_AVX512_SCALAR_GROUP(Name); + +void X86InstrFMA3Info::initGroupsOnceImpl() { + FMA3_AVX2_FULL_GROUP(VFMADD); + FMA3_AVX2_FULL_GROUP(VFMSUB); + FMA3_AVX2_FULL_GROUP(VFNMADD); + FMA3_AVX2_FULL_GROUP(VFNMSUB); + + FMA3_AVX2_VECTOR_GROUP(VFMADDSUB); + FMA3_AVX2_VECTOR_GROUP(VFMSUBADD); + + FMA3_AVX512_FULL_GROUP(VFMADD); + FMA3_AVX512_FULL_GROUP(VFMSUB); + FMA3_AVX512_FULL_GROUP(VFNMADD); + FMA3_AVX512_FULL_GROUP(VFNMSUB); + + FMA3_AVX512_VECTOR_GROUP(VFMADDSUB); + FMA3_AVX512_VECTOR_GROUP(VFMSUBADD); +} + +void X86InstrFMA3Info::initGroupsOnce() { + llvm::call_once(InitGroupsOnceFlag, + []() { getX86InstrFMA3Info()->initGroupsOnceImpl(); }); +} diff --git a/llvm/lib/Target/X86/X86InstrFMA3Info.h b/llvm/lib/Target/X86/X86InstrFMA3Info.h new file mode 100644 index 0000000..987ff9e --- /dev/null +++ b/llvm/lib/Target/X86/X86InstrFMA3Info.h @@ -0,0 +1,315 @@ +//===-- X86InstrFMA3Info.h - X86 FMA3 Instruction Information -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the classes providing information +// about existing X86 FMA3 opcodes, classifying and grouping them. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H +#define LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H + +#include "X86.h" +#include "llvm/ADT/DenseMap.h" +#include +#include + +using namespace llvm; + +/// This class is used to group {132, 213, 231} forms of FMA opcodes together. +/// Each of the groups has either 3 register opcodes, 3 memory opcodes, +/// or 6 register and memory opcodes. Also, each group has an attrubutes field +/// describing it. +class X86InstrFMA3Group { +private: + /// Reference to an array holding 3 forms of register FMA opcodes. + /// It may be set to nullptr if the group of FMA opcodes does not have + /// any register form opcodes. + const uint16_t *RegOpcodes; + + /// Reference to an array holding 3 forms of memory FMA opcodes. + /// It may be set to nullptr if the group of FMA opcodes does not have + /// any register form opcodes. + const uint16_t *MemOpcodes; + + /// This bitfield specifies the attributes associated with the created + /// FMA groups of opcodes. + unsigned Attributes; + + static const unsigned Form132 = 0; + static const unsigned Form213 = 1; + static const unsigned Form231 = 2; + +public: + /// This bit must be set in the 'Attributes' field of FMA group if such + /// group of FMA opcodes consists of FMA intrinsic opcodes. + static const unsigned X86FMA3Intrinsic = 0x1; + + /// This bit must be set in the 'Attributes' field of FMA group if such + /// group of FMA opcodes consists of AVX512 opcodes accepting a k-mask and + /// passing the elements from the 1st operand to the result of the operation + /// when the correpondings bits in the k-mask are unset. 
+ static const unsigned X86FMA3KMergeMasked = 0x2; + + /// This bit must be set in the 'Attributes' field of FMA group if such + /// group of FMA opcodes consists of AVX512 opcodes accepting a k-zeromask. + static const unsigned X86FMA3KZeroMasked = 0x4; + + /// Constructor. Creates a new group of FMA opcodes with three register form + /// FMA opcodes \p RegOpcodes and three memory form FMA opcodes \p MemOpcodes. + /// The parameters \p RegOpcodes and \p MemOpcodes may be set to nullptr, + /// which means that the created group of FMA opcodes does not have the + /// corresponding (register or memory) opcodes. + /// The parameter \p Attr specifies the attributes describing the created + /// group. + X86InstrFMA3Group(const uint16_t *RegOpcodes, const uint16_t *MemOpcodes, + unsigned Attr) + : RegOpcodes(RegOpcodes), MemOpcodes(MemOpcodes), Attributes(Attr) { + assert((RegOpcodes || MemOpcodes) && + "Cannot create a group not having any opcodes."); + } + + /// Returns a memory form opcode that is the equivalent of the given register + /// form opcode \p RegOpcode. 0 is returned if the group does not have + /// either register or memory opcodes. + unsigned getMemOpcode(unsigned RegOpcode) const { + if (!RegOpcodes || !MemOpcodes) + return 0; + for (unsigned Form = 0; Form < 3; Form++) + if (RegOpcodes[Form] == RegOpcode) + return MemOpcodes[Form]; + return 0; + } + + /// Returns the 132 form of FMA register opcode. + unsigned getReg132Opcode() const { + assert(RegOpcodes && "The group does not have register opcodes."); + return RegOpcodes[Form132]; + } + + /// Returns the 213 form of FMA register opcode. + unsigned getReg213Opcode() const { + assert(RegOpcodes && "The group does not have register opcodes."); + return RegOpcodes[Form213]; + } + + /// Returns the 231 form of FMA register opcode. 
+ unsigned getReg231Opcode() const { + assert(RegOpcodes && "The group does not have register opcodes."); + return RegOpcodes[Form231]; + } + + /// Returns the 132 form of FMA memory opcode. + unsigned getMem132Opcode() const { + assert(MemOpcodes && "The group does not have memory opcodes."); + return MemOpcodes[Form132]; + } + + /// Returns the 213 form of FMA memory opcode. + unsigned getMem213Opcode() const { + assert(MemOpcodes && "The group does not have memory opcodes."); + return MemOpcodes[Form213]; + } + + /// Returns the 231 form of FMA memory opcode. + unsigned getMem231Opcode() const { + assert(MemOpcodes && "The group does not have memory opcodes."); + return MemOpcodes[Form231]; + } + + /// Returns true iff the group of FMA opcodes holds intrinsic opcodes. + bool isIntrinsic() const { return (Attributes & X86FMA3Intrinsic) != 0; } + + /// Returns true iff the group of FMA opcodes holds k-merge-masked opcodes. + bool isKMergeMasked() const { + return (Attributes & X86FMA3KMergeMasked) != 0; + } + + /// Returns true iff the group of FMA opcodes holds k-zero-masked opcodes. + bool isKZeroMasked() const { return (Attributes & X86FMA3KZeroMasked) != 0; } + + /// Returns true iff the group of FMA opcodes holds any of k-masked opcodes. + bool isKMasked() const { + return (Attributes & (X86FMA3KMergeMasked | X86FMA3KZeroMasked)) != 0; + } + + /// Returns true iff the given \p Opcode is a register opcode from the + /// groups of FMA opcodes. + bool isRegOpcodeFromGroup(unsigned Opcode) const { + if (!RegOpcodes) + return false; + for (unsigned Form = 0; Form < 3; Form++) + if (Opcode == RegOpcodes[Form]) + return true; + return false; + } + + /// Returns true iff the given \p Opcode is a memory opcode from the + /// groups of FMA opcodes. 
+ bool isMemOpcodeFromGroup(unsigned Opcode) const { + if (!MemOpcodes) + return false; + for (unsigned Form = 0; Form < 3; Form++) + if (Opcode == MemOpcodes[Form]) + return true; + return false; + } +}; + +/// This class provides information about all existing FMA3 opcodes +/// +class X86InstrFMA3Info { +private: + /// A map that is used to find the group of FMA opcodes using any FMA opcode + /// from the group. + DenseMap OpcodeToGroup; + + /// Creates groups of FMA opcodes and initializes Opcode-to-Group map. + /// This method can be called many times, but the actual initialization is + /// called only once. + static void initGroupsOnce(); + + /// Creates groups of FMA opcodes and initializes Opcode-to-Group map. + /// This method must be called ONLY from initGroupsOnce(). Otherwise, such + /// call is not thread safe. + void initGroupsOnceImpl(); + + /// Creates one group of FMA opcodes having the register opcodes + /// \p RegOpcodes and memory opcodes \p MemOpcodes. The parameter \p Attr + /// specifies the attributes describing the created group. + void initRMGroup(const uint16_t *RegOpcodes, + const uint16_t *MemOpcodes, unsigned Attr = 0); + + /// Creates one group of FMA opcodes having only the register opcodes + /// \p RegOpcodes. The parameter \p Attr specifies the attributes describing + /// the created group. + void initRGroup(const uint16_t *RegOpcodes, unsigned Attr = 0); + + /// Creates one group of FMA opcodes having only the memory opcodes + /// \p MemOpcodes. The parameter \p Attr specifies the attributes describing + /// the created group. + void initMGroup(const uint16_t *MemOpcodes, unsigned Attr = 0); + +public: + /// Returns the reference to an object of this class. It is assumed that + /// only one object may exist. + static X86InstrFMA3Info *getX86InstrFMA3Info(); + + /// Constructor. Just creates an object of the class. + X86InstrFMA3Info() {} + + /// Destructor. Deallocates the memory used for FMA3 Groups. 
+ ~X86InstrFMA3Info() { + std::set DeletedGroups; + auto E = OpcodeToGroup.end(); + for (auto I = OpcodeToGroup.begin(); I != E; I++) { + const X86InstrFMA3Group *G = I->second; + if (DeletedGroups.find(G) == DeletedGroups.end()) { + DeletedGroups.insert(G); + delete G; + } + } + } + + /// Returns a reference to a group of FMA3 opcodes to where the given + /// \p Opcode is included. If the given \p Opcode is not recognized as FMA3 + /// and not included into any FMA3 group, then nullptr is returned. + static const X86InstrFMA3Group *getFMA3Group(unsigned Opcode) { + // Ensure that the groups of opcodes are initialized. + initGroupsOnce(); + + // Find the group including the given opcode. + const X86InstrFMA3Info *FMA3Info = getX86InstrFMA3Info(); + auto I = FMA3Info->OpcodeToGroup.find(Opcode); + if (I == FMA3Info->OpcodeToGroup.end()) + return nullptr; + + return I->second; + } + + /// Returns true iff the given \p Opcode is recognized as FMA3 by this class. + static bool isFMA3(unsigned Opcode) { + return getFMA3Group(Opcode) != nullptr; + } + + /// Iterator that is used to walk on FMA register opcodes having memory + /// form equivalents. + class rm_iterator { + private: + /// Iterator associated with the OpcodeToGroup map. It must always be + /// initialized with an entry from OpcodeToGroup for which I->first + /// points to a register FMA opcode and I->second points to a group of + /// FMA opcodes having memory form equivalent of I->first. + DenseMap::const_iterator I; + + public: + /// Constructor. Creates rm_iterator. The parameter \p I must be an + /// iterator to OpcodeToGroup map entry having I->first pointing to + /// register form FMA opcode and I->second pointing to a group of FMA + /// opcodes holding memory form equivalent for I->fist. + rm_iterator(DenseMap::const_iterator I) + : I(I) {} + + /// Returns the register form FMA opcode. 
+ unsigned getRegOpcode() const { return I->first; }; + + /// Returns the memory form equivalent opcode for FMA register opcode + /// referenced by I->first. + unsigned getMemOpcode() const { + unsigned Opcode = I->first; + const X86InstrFMA3Group *Group = I->second; + return Group->getMemOpcode(Opcode); + } + + /// Returns a reference to a group of FMA opcodes. + const X86InstrFMA3Group *getGroup() const { return I->second; } + + bool operator==(const rm_iterator &OtherIt) const { return I == OtherIt.I; } + bool operator!=(const rm_iterator &OtherIt) const { return I != OtherIt.I; } + + /// Increment. Advances the 'I' iterator to the next OpcodeToGroup entry + /// having I->first pointing to register form FMA and I->second pointing + /// to a group of FMA opcodes holding memory form equivalen for I->first. + rm_iterator &operator++() { + auto E = getX86InstrFMA3Info()->OpcodeToGroup.end(); + for (++I; I != E; ++I) { + unsigned RegOpcode = I->first; + const X86InstrFMA3Group *Group = I->second; + if (Group->getMemOpcode(RegOpcode) != 0) + break; + } + return *this; + } + }; + + /// Returns rm_iterator pointing to the first entry of OpcodeToGroup map + /// with a register FMA opcode having memory form opcode equivalent. + static rm_iterator rm_begin() { + initGroupsOnce(); + const X86InstrFMA3Info *FMA3Info = getX86InstrFMA3Info(); + auto I = FMA3Info->OpcodeToGroup.begin(); + auto E = FMA3Info->OpcodeToGroup.end(); + while (I != E) { + unsigned Opcode = I->first; + const X86InstrFMA3Group *G = I->second; + if (G->getMemOpcode(Opcode) != 0) + break; + I++; + } + return rm_iterator(I); + } + + /// Returns the last rm_iterator. 
+ static rm_iterator rm_end() { + initGroupsOnce(); + return rm_iterator(getX86InstrFMA3Info()->OpcodeToGroup.end()); + } +}; + +#endif diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 9a83c09..9df179d 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -1855,281 +1855,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) } static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { - // FMA foldable instructions - { X86::VFMADD231SSr, X86::VFMADD231SSm, TB_ALIGN_NONE }, - { X86::VFMADD231SSr_Int, X86::VFMADD231SSm_Int, TB_ALIGN_NONE }, - { X86::VFMADD231SDr, X86::VFMADD231SDm, TB_ALIGN_NONE }, - { X86::VFMADD231SDr_Int, X86::VFMADD231SDm_Int, TB_ALIGN_NONE }, - { X86::VFMADD132SSr, X86::VFMADD132SSm, TB_ALIGN_NONE }, - { X86::VFMADD132SSr_Int, X86::VFMADD132SSm_Int, TB_ALIGN_NONE }, - { X86::VFMADD132SDr, X86::VFMADD132SDm, TB_ALIGN_NONE }, - { X86::VFMADD132SDr_Int, X86::VFMADD132SDm_Int, TB_ALIGN_NONE }, - { X86::VFMADD213SSr, X86::VFMADD213SSm, TB_ALIGN_NONE }, - { X86::VFMADD213SSr_Int, X86::VFMADD213SSm_Int, TB_ALIGN_NONE }, - { X86::VFMADD213SDr, X86::VFMADD213SDm, TB_ALIGN_NONE }, - { X86::VFMADD213SDr_Int, X86::VFMADD213SDm_Int, TB_ALIGN_NONE }, - { X86::VFMADD231SSZr, X86::VFMADD231SSZm, TB_ALIGN_NONE }, - { X86::VFMADD231SSZr_Int, X86::VFMADD231SSZm_Int, TB_ALIGN_NONE }, - { X86::VFMADD231SDZr, X86::VFMADD231SDZm, TB_ALIGN_NONE }, - { X86::VFMADD231SDZr_Int, X86::VFMADD231SDZm_Int, TB_ALIGN_NONE }, - { X86::VFMADD132SSZr, X86::VFMADD132SSZm, TB_ALIGN_NONE }, - { X86::VFMADD132SSZr_Int, X86::VFMADD132SSZm_Int, TB_ALIGN_NONE }, - { X86::VFMADD132SDZr, X86::VFMADD132SDZm, TB_ALIGN_NONE }, - { X86::VFMADD132SDZr_Int, X86::VFMADD132SDZm_Int, TB_ALIGN_NONE }, - { X86::VFMADD213SSZr, X86::VFMADD213SSZm, TB_ALIGN_NONE }, - { X86::VFMADD213SSZr_Int, X86::VFMADD213SSZm_Int, TB_ALIGN_NONE }, - { X86::VFMADD213SDZr, X86::VFMADD213SDZm, TB_ALIGN_NONE }, - { X86::VFMADD213SDZr_Int, 
X86::VFMADD213SDZm_Int, TB_ALIGN_NONE }, - - { X86::VFMADD231PSr, X86::VFMADD231PSm, TB_ALIGN_NONE }, - { X86::VFMADD231PDr, X86::VFMADD231PDm, TB_ALIGN_NONE }, - { X86::VFMADD132PSr, X86::VFMADD132PSm, TB_ALIGN_NONE }, - { X86::VFMADD132PDr, X86::VFMADD132PDm, TB_ALIGN_NONE }, - { X86::VFMADD213PSr, X86::VFMADD213PSm, TB_ALIGN_NONE }, - { X86::VFMADD213PDr, X86::VFMADD213PDm, TB_ALIGN_NONE }, - { X86::VFMADD231PSYr, X86::VFMADD231PSYm, TB_ALIGN_NONE }, - { X86::VFMADD231PDYr, X86::VFMADD231PDYm, TB_ALIGN_NONE }, - { X86::VFMADD132PSYr, X86::VFMADD132PSYm, TB_ALIGN_NONE }, - { X86::VFMADD132PDYr, X86::VFMADD132PDYm, TB_ALIGN_NONE }, - { X86::VFMADD213PSYr, X86::VFMADD213PSYm, TB_ALIGN_NONE }, - { X86::VFMADD213PDYr, X86::VFMADD213PDYm, TB_ALIGN_NONE }, - { X86::VFMADD231PSZr, X86::VFMADD231PSZm, TB_ALIGN_NONE }, - { X86::VFMADD231PDZr, X86::VFMADD231PDZm, TB_ALIGN_NONE }, - { X86::VFMADD132PSZr, X86::VFMADD132PSZm, TB_ALIGN_NONE }, - { X86::VFMADD132PDZr, X86::VFMADD132PDZm, TB_ALIGN_NONE }, - { X86::VFMADD213PSZr, X86::VFMADD213PSZm, TB_ALIGN_NONE }, - { X86::VFMADD213PDZr, X86::VFMADD213PDZm, TB_ALIGN_NONE }, - { X86::VFMADD231PSZ128r, X86::VFMADD231PSZ128m, TB_ALIGN_NONE }, - { X86::VFMADD231PDZ128r, X86::VFMADD231PDZ128m, TB_ALIGN_NONE }, - { X86::VFMADD132PSZ128r, X86::VFMADD132PSZ128m, TB_ALIGN_NONE }, - { X86::VFMADD132PDZ128r, X86::VFMADD132PDZ128m, TB_ALIGN_NONE }, - { X86::VFMADD213PSZ128r, X86::VFMADD213PSZ128m, TB_ALIGN_NONE }, - { X86::VFMADD213PDZ128r, X86::VFMADD213PDZ128m, TB_ALIGN_NONE }, - { X86::VFMADD231PSZ256r, X86::VFMADD231PSZ256m, TB_ALIGN_NONE }, - { X86::VFMADD231PDZ256r, X86::VFMADD231PDZ256m, TB_ALIGN_NONE }, - { X86::VFMADD132PSZ256r, X86::VFMADD132PSZ256m, TB_ALIGN_NONE }, - { X86::VFMADD132PDZ256r, X86::VFMADD132PDZ256m, TB_ALIGN_NONE }, - { X86::VFMADD213PSZ256r, X86::VFMADD213PSZ256m, TB_ALIGN_NONE }, - { X86::VFMADD213PDZ256r, X86::VFMADD213PDZ256m, TB_ALIGN_NONE }, - - { X86::VFNMADD231SSr, X86::VFNMADD231SSm, TB_ALIGN_NONE }, - { 
X86::VFNMADD231SSr_Int, X86::VFNMADD231SSm_Int, TB_ALIGN_NONE }, - { X86::VFNMADD231SDr, X86::VFNMADD231SDm, TB_ALIGN_NONE }, - { X86::VFNMADD231SDr_Int, X86::VFNMADD231SDm_Int, TB_ALIGN_NONE }, - { X86::VFNMADD132SSr, X86::VFNMADD132SSm, TB_ALIGN_NONE }, - { X86::VFNMADD132SSr_Int, X86::VFNMADD132SSm_Int, TB_ALIGN_NONE }, - { X86::VFNMADD132SDr, X86::VFNMADD132SDm, TB_ALIGN_NONE }, - { X86::VFNMADD132SDr_Int, X86::VFNMADD132SDm_Int, TB_ALIGN_NONE }, - { X86::VFNMADD213SSr, X86::VFNMADD213SSm, TB_ALIGN_NONE }, - { X86::VFNMADD213SSr_Int, X86::VFNMADD213SSm_Int, TB_ALIGN_NONE }, - { X86::VFNMADD213SDr, X86::VFNMADD213SDm, TB_ALIGN_NONE }, - { X86::VFNMADD213SDr_Int, X86::VFNMADD213SDm_Int, TB_ALIGN_NONE }, - { X86::VFNMADD231SSZr, X86::VFNMADD231SSZm, TB_ALIGN_NONE }, - { X86::VFNMADD231SSZr_Int, X86::VFNMADD231SSZm_Int, TB_ALIGN_NONE }, - { X86::VFNMADD231SDZr, X86::VFNMADD231SDZm, TB_ALIGN_NONE }, - { X86::VFNMADD231SDZr_Int, X86::VFNMADD231SDZm_Int, TB_ALIGN_NONE }, - { X86::VFNMADD132SSZr, X86::VFNMADD132SSZm, TB_ALIGN_NONE }, - { X86::VFNMADD132SSZr_Int, X86::VFNMADD132SSZm_Int, TB_ALIGN_NONE }, - { X86::VFNMADD132SDZr, X86::VFNMADD132SDZm, TB_ALIGN_NONE }, - { X86::VFNMADD132SDZr_Int, X86::VFNMADD132SDZm_Int, TB_ALIGN_NONE }, - { X86::VFNMADD213SSZr, X86::VFNMADD213SSZm, TB_ALIGN_NONE }, - { X86::VFNMADD213SSZr_Int, X86::VFNMADD213SSZm_Int, TB_ALIGN_NONE }, - { X86::VFNMADD213SDZr, X86::VFNMADD213SDZm, TB_ALIGN_NONE }, - { X86::VFNMADD213SDZr_Int, X86::VFNMADD213SDZm_Int, TB_ALIGN_NONE }, - - { X86::VFNMADD231PSr, X86::VFNMADD231PSm, TB_ALIGN_NONE }, - { X86::VFNMADD231PDr, X86::VFNMADD231PDm, TB_ALIGN_NONE }, - { X86::VFNMADD132PSr, X86::VFNMADD132PSm, TB_ALIGN_NONE }, - { X86::VFNMADD132PDr, X86::VFNMADD132PDm, TB_ALIGN_NONE }, - { X86::VFNMADD213PSr, X86::VFNMADD213PSm, TB_ALIGN_NONE }, - { X86::VFNMADD213PDr, X86::VFNMADD213PDm, TB_ALIGN_NONE }, - { X86::VFNMADD231PSYr, X86::VFNMADD231PSYm, TB_ALIGN_NONE }, - { X86::VFNMADD231PDYr, X86::VFNMADD231PDYm, 
TB_ALIGN_NONE }, - { X86::VFNMADD132PSYr, X86::VFNMADD132PSYm, TB_ALIGN_NONE }, - { X86::VFNMADD132PDYr, X86::VFNMADD132PDYm, TB_ALIGN_NONE }, - { X86::VFNMADD213PSYr, X86::VFNMADD213PSYm, TB_ALIGN_NONE }, - { X86::VFNMADD213PDYr, X86::VFNMADD213PDYm, TB_ALIGN_NONE }, - { X86::VFNMADD231PSZr, X86::VFNMADD231PSZm, TB_ALIGN_NONE }, - { X86::VFNMADD231PDZr, X86::VFNMADD231PDZm, TB_ALIGN_NONE }, - { X86::VFNMADD132PSZr, X86::VFNMADD132PSZm, TB_ALIGN_NONE }, - { X86::VFNMADD132PDZr, X86::VFNMADD132PDZm, TB_ALIGN_NONE }, - { X86::VFNMADD213PSZr, X86::VFNMADD213PSZm, TB_ALIGN_NONE }, - { X86::VFNMADD213PDZr, X86::VFNMADD213PDZm, TB_ALIGN_NONE }, - { X86::VFNMADD231PSZ128r, X86::VFNMADD231PSZ128m, TB_ALIGN_NONE }, - { X86::VFNMADD231PDZ128r, X86::VFNMADD231PDZ128m, TB_ALIGN_NONE }, - { X86::VFNMADD132PSZ128r, X86::VFNMADD132PSZ128m, TB_ALIGN_NONE }, - { X86::VFNMADD132PDZ128r, X86::VFNMADD132PDZ128m, TB_ALIGN_NONE }, - { X86::VFNMADD213PSZ128r, X86::VFNMADD213PSZ128m, TB_ALIGN_NONE }, - { X86::VFNMADD213PDZ128r, X86::VFNMADD213PDZ128m, TB_ALIGN_NONE }, - { X86::VFNMADD231PSZ256r, X86::VFNMADD231PSZ256m, TB_ALIGN_NONE }, - { X86::VFNMADD231PDZ256r, X86::VFNMADD231PDZ256m, TB_ALIGN_NONE }, - { X86::VFNMADD132PSZ256r, X86::VFNMADD132PSZ256m, TB_ALIGN_NONE }, - { X86::VFNMADD132PDZ256r, X86::VFNMADD132PDZ256m, TB_ALIGN_NONE }, - { X86::VFNMADD213PSZ256r, X86::VFNMADD213PSZ256m, TB_ALIGN_NONE }, - { X86::VFNMADD213PDZ256r, X86::VFNMADD213PDZ256m, TB_ALIGN_NONE }, - - { X86::VFMSUB231SSr, X86::VFMSUB231SSm, TB_ALIGN_NONE }, - { X86::VFMSUB231SSr_Int, X86::VFMSUB231SSm_Int, TB_ALIGN_NONE }, - { X86::VFMSUB231SDr, X86::VFMSUB231SDm, TB_ALIGN_NONE }, - { X86::VFMSUB231SDr_Int, X86::VFMSUB231SDm_Int, TB_ALIGN_NONE }, - { X86::VFMSUB132SSr, X86::VFMSUB132SSm, TB_ALIGN_NONE }, - { X86::VFMSUB132SSr_Int, X86::VFMSUB132SSm_Int, TB_ALIGN_NONE }, - { X86::VFMSUB132SDr, X86::VFMSUB132SDm, TB_ALIGN_NONE }, - { X86::VFMSUB132SDr_Int, X86::VFMSUB132SDm_Int, TB_ALIGN_NONE }, - { 
X86::VFMSUB213SSr, X86::VFMSUB213SSm, TB_ALIGN_NONE }, - { X86::VFMSUB213SSr_Int, X86::VFMSUB213SSm_Int, TB_ALIGN_NONE }, - { X86::VFMSUB213SDr, X86::VFMSUB213SDm, TB_ALIGN_NONE }, - { X86::VFMSUB213SDr_Int, X86::VFMSUB213SDm_Int, TB_ALIGN_NONE }, - { X86::VFMSUB231SSZr, X86::VFMSUB231SSZm, TB_ALIGN_NONE }, - { X86::VFMSUB231SSZr_Int, X86::VFMSUB231SSZm_Int, TB_ALIGN_NONE }, - { X86::VFMSUB231SDZr, X86::VFMSUB231SDZm, TB_ALIGN_NONE }, - { X86::VFMSUB231SDZr_Int, X86::VFMSUB231SDZm_Int, TB_ALIGN_NONE }, - { X86::VFMSUB132SSZr, X86::VFMSUB132SSZm, TB_ALIGN_NONE }, - { X86::VFMSUB132SSZr_Int, X86::VFMSUB132SSZm_Int, TB_ALIGN_NONE }, - { X86::VFMSUB132SDZr, X86::VFMSUB132SDZm, TB_ALIGN_NONE }, - { X86::VFMSUB132SDZr_Int, X86::VFMSUB132SDZm_Int, TB_ALIGN_NONE }, - { X86::VFMSUB213SSZr, X86::VFMSUB213SSZm, TB_ALIGN_NONE }, - { X86::VFMSUB213SSZr_Int, X86::VFMSUB213SSZm_Int, TB_ALIGN_NONE }, - { X86::VFMSUB213SDZr, X86::VFMSUB213SDZm, TB_ALIGN_NONE }, - { X86::VFMSUB213SDZr_Int, X86::VFMSUB213SDZm_Int, TB_ALIGN_NONE }, - - { X86::VFMSUB231PSr, X86::VFMSUB231PSm, TB_ALIGN_NONE }, - { X86::VFMSUB231PDr, X86::VFMSUB231PDm, TB_ALIGN_NONE }, - { X86::VFMSUB132PSr, X86::VFMSUB132PSm, TB_ALIGN_NONE }, - { X86::VFMSUB132PDr, X86::VFMSUB132PDm, TB_ALIGN_NONE }, - { X86::VFMSUB213PSr, X86::VFMSUB213PSm, TB_ALIGN_NONE }, - { X86::VFMSUB213PDr, X86::VFMSUB213PDm, TB_ALIGN_NONE }, - { X86::VFMSUB231PSYr, X86::VFMSUB231PSYm, TB_ALIGN_NONE }, - { X86::VFMSUB231PDYr, X86::VFMSUB231PDYm, TB_ALIGN_NONE }, - { X86::VFMSUB132PSYr, X86::VFMSUB132PSYm, TB_ALIGN_NONE }, - { X86::VFMSUB132PDYr, X86::VFMSUB132PDYm, TB_ALIGN_NONE }, - { X86::VFMSUB213PSYr, X86::VFMSUB213PSYm, TB_ALIGN_NONE }, - { X86::VFMSUB213PDYr, X86::VFMSUB213PDYm, TB_ALIGN_NONE }, - { X86::VFMSUB231PSZr, X86::VFMSUB231PSZm, TB_ALIGN_NONE }, - { X86::VFMSUB231PDZr, X86::VFMSUB231PDZm, TB_ALIGN_NONE }, - { X86::VFMSUB132PSZr, X86::VFMSUB132PSZm, TB_ALIGN_NONE }, - { X86::VFMSUB132PDZr, X86::VFMSUB132PDZm, TB_ALIGN_NONE }, - { 
X86::VFMSUB213PSZr, X86::VFMSUB213PSZm, TB_ALIGN_NONE }, - { X86::VFMSUB213PDZr, X86::VFMSUB213PDZm, TB_ALIGN_NONE }, - { X86::VFMSUB231PSZ128r, X86::VFMSUB231PSZ128m, TB_ALIGN_NONE }, - { X86::VFMSUB231PDZ128r, X86::VFMSUB231PDZ128m, TB_ALIGN_NONE }, - { X86::VFMSUB132PSZ128r, X86::VFMSUB132PSZ128m, TB_ALIGN_NONE }, - { X86::VFMSUB132PDZ128r, X86::VFMSUB132PDZ128m, TB_ALIGN_NONE }, - { X86::VFMSUB213PSZ128r, X86::VFMSUB213PSZ128m, TB_ALIGN_NONE }, - { X86::VFMSUB213PDZ128r, X86::VFMSUB213PDZ128m, TB_ALIGN_NONE }, - { X86::VFMSUB231PSZ256r, X86::VFMSUB231PSZ256m, TB_ALIGN_NONE }, - { X86::VFMSUB231PDZ256r, X86::VFMSUB231PDZ256m, TB_ALIGN_NONE }, - { X86::VFMSUB132PSZ256r, X86::VFMSUB132PSZ256m, TB_ALIGN_NONE }, - { X86::VFMSUB132PDZ256r, X86::VFMSUB132PDZ256m, TB_ALIGN_NONE }, - { X86::VFMSUB213PSZ256r, X86::VFMSUB213PSZ256m, TB_ALIGN_NONE }, - { X86::VFMSUB213PDZ256r, X86::VFMSUB213PDZ256m, TB_ALIGN_NONE }, - - { X86::VFNMSUB231SSr, X86::VFNMSUB231SSm, TB_ALIGN_NONE }, - { X86::VFNMSUB231SSr_Int, X86::VFNMSUB231SSm_Int, TB_ALIGN_NONE }, - { X86::VFNMSUB231SDr, X86::VFNMSUB231SDm, TB_ALIGN_NONE }, - { X86::VFNMSUB231SDr_Int, X86::VFNMSUB231SDm_Int, TB_ALIGN_NONE }, - { X86::VFNMSUB132SSr, X86::VFNMSUB132SSm, TB_ALIGN_NONE }, - { X86::VFNMSUB132SSr_Int, X86::VFNMSUB132SSm_Int, TB_ALIGN_NONE }, - { X86::VFNMSUB132SDr, X86::VFNMSUB132SDm, TB_ALIGN_NONE }, - { X86::VFNMSUB132SDr_Int, X86::VFNMSUB132SDm_Int, TB_ALIGN_NONE }, - { X86::VFNMSUB213SSr, X86::VFNMSUB213SSm, TB_ALIGN_NONE }, - { X86::VFNMSUB213SSr_Int, X86::VFNMSUB213SSm_Int, TB_ALIGN_NONE }, - { X86::VFNMSUB213SDr, X86::VFNMSUB213SDm, TB_ALIGN_NONE }, - { X86::VFNMSUB213SDr_Int, X86::VFNMSUB213SDm_Int, TB_ALIGN_NONE }, - - { X86::VFNMSUB231PSr, X86::VFNMSUB231PSm, TB_ALIGN_NONE }, - { X86::VFNMSUB231PDr, X86::VFNMSUB231PDm, TB_ALIGN_NONE }, - { X86::VFNMSUB132PSr, X86::VFNMSUB132PSm, TB_ALIGN_NONE }, - { X86::VFNMSUB132PDr, X86::VFNMSUB132PDm, TB_ALIGN_NONE }, - { X86::VFNMSUB213PSr, X86::VFNMSUB213PSm, 
TB_ALIGN_NONE }, - { X86::VFNMSUB213PDr, X86::VFNMSUB213PDm, TB_ALIGN_NONE }, - { X86::VFNMSUB231PSYr, X86::VFNMSUB231PSYm, TB_ALIGN_NONE }, - { X86::VFNMSUB231PDYr, X86::VFNMSUB231PDYm, TB_ALIGN_NONE }, - { X86::VFNMSUB132PSYr, X86::VFNMSUB132PSYm, TB_ALIGN_NONE }, - { X86::VFNMSUB132PDYr, X86::VFNMSUB132PDYm, TB_ALIGN_NONE }, - { X86::VFNMSUB213PSYr, X86::VFNMSUB213PSYm, TB_ALIGN_NONE }, - { X86::VFNMSUB213PDYr, X86::VFNMSUB213PDYm, TB_ALIGN_NONE }, - { X86::VFNMSUB231PSZr, X86::VFNMSUB231PSZm, TB_ALIGN_NONE }, - { X86::VFNMSUB231PDZr, X86::VFNMSUB231PDZm, TB_ALIGN_NONE }, - { X86::VFNMSUB132PSZr, X86::VFNMSUB132PSZm, TB_ALIGN_NONE }, - { X86::VFNMSUB132PDZr, X86::VFNMSUB132PDZm, TB_ALIGN_NONE }, - { X86::VFNMSUB213PSZr, X86::VFNMSUB213PSZm, TB_ALIGN_NONE }, - { X86::VFNMSUB213PDZr, X86::VFNMSUB213PDZm, TB_ALIGN_NONE }, - { X86::VFNMSUB231PSZ128r, X86::VFNMSUB231PSZ128m, TB_ALIGN_NONE }, - { X86::VFNMSUB231PDZ128r, X86::VFNMSUB231PDZ128m, TB_ALIGN_NONE }, - { X86::VFNMSUB132PSZ128r, X86::VFNMSUB132PSZ128m, TB_ALIGN_NONE }, - { X86::VFNMSUB132PDZ128r, X86::VFNMSUB132PDZ128m, TB_ALIGN_NONE }, - { X86::VFNMSUB213PSZ128r, X86::VFNMSUB213PSZ128m, TB_ALIGN_NONE }, - { X86::VFNMSUB213PDZ128r, X86::VFNMSUB213PDZ128m, TB_ALIGN_NONE }, - { X86::VFNMSUB231PSZ256r, X86::VFNMSUB231PSZ256m, TB_ALIGN_NONE }, - { X86::VFNMSUB231PDZ256r, X86::VFNMSUB231PDZ256m, TB_ALIGN_NONE }, - { X86::VFNMSUB132PSZ256r, X86::VFNMSUB132PSZ256m, TB_ALIGN_NONE }, - { X86::VFNMSUB132PDZ256r, X86::VFNMSUB132PDZ256m, TB_ALIGN_NONE }, - { X86::VFNMSUB213PSZ256r, X86::VFNMSUB213PSZ256m, TB_ALIGN_NONE }, - { X86::VFNMSUB213PDZ256r, X86::VFNMSUB213PDZ256m, TB_ALIGN_NONE }, - - { X86::VFMADDSUB231PSr, X86::VFMADDSUB231PSm, TB_ALIGN_NONE }, - { X86::VFMADDSUB231PDr, X86::VFMADDSUB231PDm, TB_ALIGN_NONE }, - { X86::VFMADDSUB132PSr, X86::VFMADDSUB132PSm, TB_ALIGN_NONE }, - { X86::VFMADDSUB132PDr, X86::VFMADDSUB132PDm, TB_ALIGN_NONE }, - { X86::VFMADDSUB213PSr, X86::VFMADDSUB213PSm, TB_ALIGN_NONE }, - { 
X86::VFMADDSUB213PDr, X86::VFMADDSUB213PDm, TB_ALIGN_NONE }, - { X86::VFMADDSUB231PSYr, X86::VFMADDSUB231PSYm, TB_ALIGN_NONE }, - { X86::VFMADDSUB231PDYr, X86::VFMADDSUB231PDYm, TB_ALIGN_NONE }, - { X86::VFMADDSUB132PSYr, X86::VFMADDSUB132PSYm, TB_ALIGN_NONE }, - { X86::VFMADDSUB132PDYr, X86::VFMADDSUB132PDYm, TB_ALIGN_NONE }, - { X86::VFMADDSUB213PSYr, X86::VFMADDSUB213PSYm, TB_ALIGN_NONE }, - { X86::VFMADDSUB213PDYr, X86::VFMADDSUB213PDYm, TB_ALIGN_NONE }, - { X86::VFMADDSUB231PSZr, X86::VFMADDSUB231PSZm, TB_ALIGN_NONE }, - { X86::VFMADDSUB231PDZr, X86::VFMADDSUB231PDZm, TB_ALIGN_NONE }, - { X86::VFMADDSUB132PSZr, X86::VFMADDSUB132PSZm, TB_ALIGN_NONE }, - { X86::VFMADDSUB132PDZr, X86::VFMADDSUB132PDZm, TB_ALIGN_NONE }, - { X86::VFMADDSUB213PSZr, X86::VFMADDSUB213PSZm, TB_ALIGN_NONE }, - { X86::VFMADDSUB213PDZr, X86::VFMADDSUB213PDZm, TB_ALIGN_NONE }, - { X86::VFMADDSUB231PSZ128r, X86::VFMADDSUB231PSZ128m, TB_ALIGN_NONE }, - { X86::VFMADDSUB231PDZ128r, X86::VFMADDSUB231PDZ128m, TB_ALIGN_NONE }, - { X86::VFMADDSUB132PSZ128r, X86::VFMADDSUB132PSZ128m, TB_ALIGN_NONE }, - { X86::VFMADDSUB132PDZ128r, X86::VFMADDSUB132PDZ128m, TB_ALIGN_NONE }, - { X86::VFMADDSUB213PSZ128r, X86::VFMADDSUB213PSZ128m, TB_ALIGN_NONE }, - { X86::VFMADDSUB213PDZ128r, X86::VFMADDSUB213PDZ128m, TB_ALIGN_NONE }, - { X86::VFMADDSUB231PSZ256r, X86::VFMADDSUB231PSZ256m, TB_ALIGN_NONE }, - { X86::VFMADDSUB231PDZ256r, X86::VFMADDSUB231PDZ256m, TB_ALIGN_NONE }, - { X86::VFMADDSUB132PSZ256r, X86::VFMADDSUB132PSZ256m, TB_ALIGN_NONE }, - { X86::VFMADDSUB132PDZ256r, X86::VFMADDSUB132PDZ256m, TB_ALIGN_NONE }, - { X86::VFMADDSUB213PSZ256r, X86::VFMADDSUB213PSZ256m, TB_ALIGN_NONE }, - { X86::VFMADDSUB213PDZ256r, X86::VFMADDSUB213PDZ256m, TB_ALIGN_NONE }, - - { X86::VFMSUBADD231PSr, X86::VFMSUBADD231PSm, TB_ALIGN_NONE }, - { X86::VFMSUBADD231PDr, X86::VFMSUBADD231PDm, TB_ALIGN_NONE }, - { X86::VFMSUBADD132PSr, X86::VFMSUBADD132PSm, TB_ALIGN_NONE }, - { X86::VFMSUBADD132PDr, X86::VFMSUBADD132PDm, TB_ALIGN_NONE 
}, - { X86::VFMSUBADD213PSr, X86::VFMSUBADD213PSm, TB_ALIGN_NONE }, - { X86::VFMSUBADD213PDr, X86::VFMSUBADD213PDm, TB_ALIGN_NONE }, - { X86::VFMSUBADD231PSYr, X86::VFMSUBADD231PSYm, TB_ALIGN_NONE }, - { X86::VFMSUBADD231PDYr, X86::VFMSUBADD231PDYm, TB_ALIGN_NONE }, - { X86::VFMSUBADD132PSYr, X86::VFMSUBADD132PSYm, TB_ALIGN_NONE }, - { X86::VFMSUBADD132PDYr, X86::VFMSUBADD132PDYm, TB_ALIGN_NONE }, - { X86::VFMSUBADD213PSYr, X86::VFMSUBADD213PSYm, TB_ALIGN_NONE }, - { X86::VFMSUBADD213PDYr, X86::VFMSUBADD213PDYm, TB_ALIGN_NONE }, - { X86::VFMSUBADD231PSZr, X86::VFMSUBADD231PSZm, TB_ALIGN_NONE }, - { X86::VFMSUBADD231PDZr, X86::VFMSUBADD231PDZm, TB_ALIGN_NONE }, - { X86::VFMSUBADD132PSZr, X86::VFMSUBADD132PSZm, TB_ALIGN_NONE }, - { X86::VFMSUBADD132PDZr, X86::VFMSUBADD132PDZm, TB_ALIGN_NONE }, - { X86::VFMSUBADD213PSZr, X86::VFMSUBADD213PSZm, TB_ALIGN_NONE }, - { X86::VFMSUBADD213PDZr, X86::VFMSUBADD213PDZm, TB_ALIGN_NONE }, - { X86::VFMSUBADD231PSZ128r, X86::VFMSUBADD231PSZ128m, TB_ALIGN_NONE }, - { X86::VFMSUBADD231PDZ128r, X86::VFMSUBADD231PDZ128m, TB_ALIGN_NONE }, - { X86::VFMSUBADD132PSZ128r, X86::VFMSUBADD132PSZ128m, TB_ALIGN_NONE }, - { X86::VFMSUBADD132PDZ128r, X86::VFMSUBADD132PDZ128m, TB_ALIGN_NONE }, - { X86::VFMSUBADD213PSZ128r, X86::VFMSUBADD213PSZ128m, TB_ALIGN_NONE }, - { X86::VFMSUBADD213PDZ128r, X86::VFMSUBADD213PDZ128m, TB_ALIGN_NONE }, - { X86::VFMSUBADD231PSZ256r, X86::VFMSUBADD231PSZ256m, TB_ALIGN_NONE }, - { X86::VFMSUBADD231PDZ256r, X86::VFMSUBADD231PDZ256m, TB_ALIGN_NONE }, - { X86::VFMSUBADD132PSZ256r, X86::VFMSUBADD132PSZ256m, TB_ALIGN_NONE }, - { X86::VFMSUBADD132PDZ256r, X86::VFMSUBADD132PDZ256m, TB_ALIGN_NONE }, - { X86::VFMSUBADD213PSZ256r, X86::VFMSUBADD213PSZ256m, TB_ALIGN_NONE }, - { X86::VFMSUBADD213PDZ256r, X86::VFMSUBADD213PDZ256m, TB_ALIGN_NONE }, - // FMA4 foldable patterns { X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_NONE }, { X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_NONE }, @@ -2234,6 +1959,13 @@ 
X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // Index 3, folded load Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD); } + auto I = X86InstrFMA3Info::rm_begin(); + auto E = X86InstrFMA3Info::rm_end(); + for (; I != E; ++I) + if (!I.getGroup()->isKMasked()) + AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, + I.getRegOpcode(), I.getMemOpcode(), + TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD); static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { // AVX-512 foldable instructions @@ -2283,6 +2015,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // Index 4, folded load Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD); } + for (I = X86InstrFMA3Info::rm_begin(); I != E; ++I) + if (I.getGroup()->isKMasked()) + AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable, + I.getRegOpcode(), I.getMemOpcode(), + TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD); } void @@ -3345,241 +3082,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return NewMI; } -/// Returns true if the given instruction opcode is FMA3. -/// Otherwise, returns false. -/// The second parameter is optional and is used as the second return from -/// the function. It is set to true if the given instruction has FMA3 opcode -/// that is used for lowering of scalar FMA intrinsics, and it is set to false -/// otherwise. 
-static bool isFMA3(unsigned Opcode, bool &IsIntrinsic) { - IsIntrinsic = false; - -#define FMA3_CASE(Name, Modifier) \ -case X86::Name##r##Modifier: case X86::Name##m##Modifier: - -#define FMA3_SCALAR_PAIR(Name, Size, Modifier) \ - FMA3_CASE(Name##SD##Size, Modifier) \ - FMA3_CASE(Name##SS##Size, Modifier) - -#define FMA3_PACKED_PAIR(Name, Size) \ - FMA3_CASE(Name##PD##Size, ) \ - FMA3_CASE(Name##PS##Size, ) - -#define FMA3_PACKED_SET(Form, Size) \ - FMA3_PACKED_PAIR(VFMADD##Form, Size) \ - FMA3_PACKED_PAIR(VFMSUB##Form, Size) \ - FMA3_PACKED_PAIR(VFNMADD##Form, Size) \ - FMA3_PACKED_PAIR(VFNMSUB##Form, Size) \ - FMA3_PACKED_PAIR(VFMADDSUB##Form, Size) \ - FMA3_PACKED_PAIR(VFMSUBADD##Form, Size) - -#define FMA3_CASES(Form) \ - FMA3_SCALAR_PAIR(VFMADD##Form, ,) \ - FMA3_SCALAR_PAIR(VFMSUB##Form, ,) \ - FMA3_SCALAR_PAIR(VFNMADD##Form, ,) \ - FMA3_SCALAR_PAIR(VFNMSUB##Form, ,) \ - FMA3_PACKED_SET(Form, ) \ - FMA3_PACKED_SET(Form, Y) \ - -#define FMA3_CASES_AVX512(Form) \ - FMA3_SCALAR_PAIR(VFMADD##Form, Z, ) \ - FMA3_SCALAR_PAIR(VFMSUB##Form, Z, ) \ - FMA3_SCALAR_PAIR(VFNMADD##Form, Z, ) \ - FMA3_SCALAR_PAIR(VFNMSUB##Form, Z, ) \ - FMA3_PACKED_SET(Form, Z128) \ - FMA3_PACKED_SET(Form, Z256) \ - FMA3_PACKED_SET(Form, Z) - -#define FMA3_CASES_SCALAR_INT(Form) \ - FMA3_SCALAR_PAIR(VFMADD##Form, , _Int) \ - FMA3_SCALAR_PAIR(VFMSUB##Form, , _Int) \ - FMA3_SCALAR_PAIR(VFNMADD##Form, , _Int) \ - FMA3_SCALAR_PAIR(VFNMSUB##Form, , _Int) - -#define FMA3_CASES_SCALAR_INT_AVX512(Form) \ - FMA3_SCALAR_PAIR(VFMADD##Form, Z, _Int) \ - FMA3_SCALAR_PAIR(VFMSUB##Form, Z, _Int) \ - FMA3_SCALAR_PAIR(VFNMADD##Form, Z, _Int) \ - FMA3_SCALAR_PAIR(VFNMSUB##Form, Z, _Int) - - switch (Opcode) { - FMA3_CASES(132) - FMA3_CASES(213) - FMA3_CASES(231) - - // AVX-512 instructions - FMA3_CASES_AVX512(132) - FMA3_CASES_AVX512(213) - FMA3_CASES_AVX512(231) - return true; - - FMA3_CASES_SCALAR_INT(132) - FMA3_CASES_SCALAR_INT(213) - FMA3_CASES_SCALAR_INT(231) - - // AVX-512 instructions - 
FMA3_CASES_SCALAR_INT_AVX512(132) - FMA3_CASES_SCALAR_INT_AVX512(213) - FMA3_CASES_SCALAR_INT_AVX512(231) - IsIntrinsic = true; - return true; - default: - return false; - } - llvm_unreachable("Opcode not handled by the switch"); - -#undef FMA3_CASE -#undef FMA3_SCALAR_PAIR -#undef FMA3_PACKED_PAIR -#undef FMA3_PACKED_SET -#undef FMA3_CASES -#undef FMA3_CASES_AVX512 -#undef FMA3_CASES_SCALAR_INT -#undef FMA3_CASES_SCALAR_INT_AVX512 -} - -/// Returns an adjusted FMA opcode that must be used in FMA instruction that -/// performs the same computations as the given MI but which has the operands -/// \p SrcOpIdx1 and \p SrcOpIdx2 commuted. -/// It may return 0 if it is unsafe to commute the operands. -/// -/// The returned FMA opcode may differ from the opcode in the given \p MI. -/// For example, commuting the operands #1 and #3 in the following FMA -/// FMA213 #1, #2, #3 -/// results into instruction with adjusted opcode: -/// FMA231 #3, #2, #1 -static unsigned getFMA3OpcodeToCommuteOperands(unsigned Opc, - bool IsIntrinOpcode, - unsigned SrcOpIdx1, - unsigned SrcOpIdx2) { -#define FMA3_ENTRY(Name, Suffix) \ - { X86::Name##132##Suffix, X86::Name##213##Suffix, X86::Name##231##Suffix }, - -#define FMA3_SCALAR_PAIR(Name, Suffix) \ - FMA3_ENTRY(Name, SS##Suffix) \ - FMA3_ENTRY(Name, SD##Suffix) - -#define FMA3_PACKED_PAIR(Name, Suffix) \ - FMA3_ENTRY(Name, PS##Suffix) \ - FMA3_ENTRY(Name, PD##Suffix) - -#define FMA3_PACKED_SIZES(Name, Suffix) \ - FMA3_PACKED_PAIR(Name, Suffix) \ - FMA3_PACKED_PAIR(Name, Y##Suffix) - -#define FMA3_TABLE_ALL(Name) \ - FMA3_SCALAR_PAIR(Name, r) \ - FMA3_PACKED_SIZES(Name, r) \ - FMA3_SCALAR_PAIR(Name, m) \ - FMA3_PACKED_SIZES(Name, m) - -#define FMA3_TABLE_PACKED(Name) \ - FMA3_PACKED_SIZES(Name, r) \ - FMA3_PACKED_SIZES(Name, m) - -#define FMA3_TABLE_SCALAR_INT(Name) \ - FMA3_SCALAR_PAIR(Name, r_Int) \ - FMA3_SCALAR_PAIR(Name, m_Int) - -#define FMA3_PACKED_SIZES_AVX512(Name, Suffix) \ - FMA3_PACKED_PAIR(Name, Z128##Suffix) \ - 
FMA3_PACKED_PAIR(Name, Z256##Suffix) \ - FMA3_PACKED_PAIR(Name, Z##Suffix) - -#define FMA3_TABLE_ALL_AVX512(Name) \ - FMA3_SCALAR_PAIR(Name, Zr) \ - FMA3_PACKED_SIZES_AVX512(Name, r) \ - FMA3_SCALAR_PAIR(Name, Zm) \ - FMA3_PACKED_SIZES_AVX512(Name, m) - -#define FMA3_TABLE_PACKED_AVX512(Name) \ - FMA3_PACKED_SIZES_AVX512(Name, r) \ - FMA3_PACKED_SIZES_AVX512(Name, m) - -#define FMA3_TABLE_SCALAR_INT_AVX512(Name) \ - FMA3_SCALAR_PAIR(Name, Zr_Int) \ - FMA3_SCALAR_PAIR(Name, Zm_Int) - - // Define the array that holds FMA opcodes in groups - // of 3 opcodes(132, 213, 231) in each group. - static const uint16_t RegularOpcodeGroups[][3] = { - FMA3_TABLE_ALL(VFMADD) - FMA3_TABLE_ALL(VFMSUB) - FMA3_TABLE_ALL(VFNMADD) - FMA3_TABLE_ALL(VFNMSUB) - FMA3_TABLE_PACKED(VFMADDSUB) - FMA3_TABLE_PACKED(VFMSUBADD) - - // AVX-512 instructions - FMA3_TABLE_ALL_AVX512(VFMADD) - FMA3_TABLE_ALL_AVX512(VFMSUB) - FMA3_TABLE_ALL_AVX512(VFNMADD) - FMA3_TABLE_ALL_AVX512(VFNMSUB) - FMA3_TABLE_PACKED_AVX512(VFMADDSUB) - FMA3_TABLE_PACKED_AVX512(VFMSUBADD) - }; - - // Define the array that holds FMA*_Int opcodes in groups - // of 3 opcodes(132, 213, 231) in each group. 
- static const uint16_t IntrinOpcodeGroups[][3] = { - FMA3_TABLE_SCALAR_INT(VFMADD) - FMA3_TABLE_SCALAR_INT(VFMSUB) - FMA3_TABLE_SCALAR_INT(VFNMADD) - FMA3_TABLE_SCALAR_INT(VFNMSUB) - - // AVX-512 instructions - FMA3_TABLE_SCALAR_INT_AVX512(VFMADD) - FMA3_TABLE_SCALAR_INT_AVX512(VFMSUB) - FMA3_TABLE_SCALAR_INT_AVX512(VFNMADD) - FMA3_TABLE_SCALAR_INT_AVX512(VFNMSUB) - }; - -#undef FMA3_ENTRY -#undef FMA3_SCALAR_PAIR -#undef FMA3_PACKED_PAIR -#undef FMA3_PACKED_SIZES -#undef FMA3_TABLE_ALL -#undef FMA3_TABLE_PACKED -#undef FMA3_TABLE_SCALAR_INT -#undef FMA3_SCALAR_PAIR_AVX512 -#undef FMA3_PACKED_SIZES_AVX512 -#undef FMA3_TABLE_ALL_AVX512 -#undef FMA3_TABLE_PACKED_AVX512 -#undef FMA3_TABLE_SCALAR_INT_AVX512 - - const unsigned Form132Index = 0; - const unsigned Form213Index = 1; - const unsigned Form231Index = 2; - const unsigned FormsNum = 3; - - size_t GroupsNum; - const uint16_t (*OpcodeGroups)[3]; - if (IsIntrinOpcode) { - GroupsNum = array_lengthof(IntrinOpcodeGroups); - OpcodeGroups = IntrinOpcodeGroups; - } else { - GroupsNum = array_lengthof(RegularOpcodeGroups); - OpcodeGroups = RegularOpcodeGroups; - } - - const uint16_t *FoundOpcodesGroup = nullptr; - size_t FormIndex; - - // Look for the input opcode in the corresponding opcodes table. - for (size_t GroupIndex = 0; GroupIndex < GroupsNum && !FoundOpcodesGroup; - ++GroupIndex) { - for (FormIndex = 0; FormIndex < FormsNum; ++FormIndex) { - if (OpcodeGroups[GroupIndex][FormIndex] == Opc) { - FoundOpcodesGroup = OpcodeGroups[GroupIndex]; - break; - } - } - } +unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands( + const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2, + const X86InstrFMA3Group &FMA3Group) const { - // The input opcode does not match with any of the opcodes from the tables. - // The unsupported FMA opcode must be added to one of the two opcode groups - // defined above. 
- assert(FoundOpcodesGroup != nullptr && "Unexpected FMA3 opcode"); + unsigned Opc = MI.getOpcode(); // Put the lowest index to SrcOpIdx1 to simplify the checks below. if (SrcOpIdx1 > SrcOpIdx2) @@ -3591,15 +3098,40 @@ static unsigned getFMA3OpcodeToCommuteOperands(unsigned Opc, // not implemented yet. So, just return 0 in that case. // When such analysis are available this place will be the right place for // calling it. - if (IsIntrinOpcode && SrcOpIdx1 == 1) + if (FMA3Group.isIntrinsic() && SrcOpIdx1 == 1) return 0; + unsigned FMAOp1 = 1, FMAOp2 = 2, FMAOp3 = 3; + if (FMA3Group.isKMasked()) { + // The k-mask operand cannot be commuted. + if (SrcOpIdx1 == 2) + return 0; + + // For k-zero-masked operations it is Ok to commute the first vector + // operand. + // For regular k-masked operations a conservative choice is done as the + // elements of the first vector operand, for which the corresponding bit + // in the k-mask operand is set to 0, are copied to the result of FMA. + // TODO/FIXME: The commute still may be legal if it is known that the + // k-mask operand is set to either all ones or all zeroes. + // It is also Ok to commute the 1st operand if all users of MI use only + // the elements enabled by the k-mask operand. For example, + // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i] + // : v1[i]; + // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 -> + // // Ok, to commute v1 in VFMADD213PSZrk.
+ if (FMA3Group.isKMergeMasked() && SrcOpIdx1 == FMAOp1) + return 0; + FMAOp2++; + FMAOp3++; + } + unsigned Case; - if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2) + if (SrcOpIdx1 == FMAOp1 && SrcOpIdx2 == FMAOp2) Case = 0; - else if (SrcOpIdx1 == 1 && SrcOpIdx2 == 3) + else if (SrcOpIdx1 == FMAOp1 && SrcOpIdx2 == FMAOp3) Case = 1; - else if (SrcOpIdx1 == 2 && SrcOpIdx2 == 3) + else if (SrcOpIdx1 == FMAOp2 && SrcOpIdx2 == FMAOp3) Case = 2; else return 0; @@ -3607,6 +3139,9 @@ static unsigned getFMA3OpcodeToCommuteOperands(unsigned Opc, // Define the FMA forms mapping array that helps to map input FMA form // to output FMA form to preserve the operation semantics after // commuting the operands. + const unsigned Form132Index = 0; + const unsigned Form213Index = 1; + const unsigned Form231Index = 2; static const unsigned FormMapping[][3] = { // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2; // FMA132 A, C, b; ==> FMA231 C, A, b; @@ -3625,9 +3160,24 @@ static unsigned getFMA3OpcodeToCommuteOperands(unsigned Opc, { Form213Index, Form132Index, Form231Index } }; + unsigned FMAForms[3]; + if (FMA3Group.isRegOpcodeFromGroup(Opc)) { + FMAForms[0] = FMA3Group.getReg132Opcode(); + FMAForms[1] = FMA3Group.getReg213Opcode(); + FMAForms[2] = FMA3Group.getReg231Opcode(); + } else { + FMAForms[0] = FMA3Group.getMem132Opcode(); + FMAForms[1] = FMA3Group.getMem213Opcode(); + FMAForms[2] = FMA3Group.getMem231Opcode(); + } + unsigned FormIndex; + for (FormIndex = 0; FormIndex < 3; FormIndex++) + if (Opc == FMAForms[FormIndex]) + break; + // Everything is ready, just adjust the FMA opcode and return it. 
FormIndex = FormMapping[Case][FormIndex]; - return FoundOpcodesGroup[FormIndex]; + return FMAForms[FormIndex]; } MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, @@ -3852,11 +3402,11 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, OpIdx1, OpIdx2); } default: - bool IsIntrinOpcode; - if (isFMA3(MI.getOpcode(), IsIntrinOpcode)) { - unsigned Opc = getFMA3OpcodeToCommuteOperands(MI.getOpcode(), - IsIntrinOpcode, - OpIdx1, OpIdx2); + const X86InstrFMA3Group *FMA3Group = + X86InstrFMA3Info::getFMA3Group(MI.getOpcode()); + if (FMA3Group) { + unsigned Opc = + getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group); if (Opc == 0) return nullptr; auto &WorkingMI = cloneIfNew(MI); @@ -3869,21 +3419,37 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, } } -bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI, - bool IsIntrinOpcode, - unsigned &SrcOpIdx1, - unsigned &SrcOpIdx2) const { +bool X86InstrInfo::findFMA3CommutedOpIndices( + const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2, + const X86InstrFMA3Group &FMA3Group) const { + unsigned FirstCommutableVecOp = 1; + unsigned LastCommutableVecOp = 3; + unsigned KMaskOp = 0; + if (FMA3Group.isKMasked()) { + // The k-mask operand has index = 2 for masked and zero-masked operations. + KMaskOp = 2; + + // The operand with index = 1 is used as a source for those elements for + // which the corresponding bit in the k-mask is set to 0. + if (FMA3Group.isKMergeMasked()) + FirstCommutableVecOp = 3; + + LastCommutableVecOp++; + } - unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3; + if (isMem(MI, LastCommutableVecOp)) + LastCommutableVecOp--; // Only the first RegOpsNum operands are commutable. // Also, the value 'CommuteAnyOperandIndex' is valid here as it means // that the operand is not specified/fixed. 
if (SrcOpIdx1 != CommuteAnyOperandIndex && - (SrcOpIdx1 < 1 || SrcOpIdx1 > RegOpsNum)) + (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp || + SrcOpIdx1 == KMaskOp)) return false; if (SrcOpIdx2 != CommuteAnyOperandIndex && - (SrcOpIdx2 < 1 || SrcOpIdx2 > RegOpsNum)) + (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp || + SrcOpIdx2 == KMaskOp)) return false; // Look for two different register operands assumed to be commutable @@ -3898,7 +3464,7 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI, if (SrcOpIdx1 == SrcOpIdx2) // Both of operands are not fixed. By default set one of commutable // operands to the last register operand of the instruction. - CommutableOpIdx2 = RegOpsNum; + CommutableOpIdx2 = LastCommutableVecOp; else if (SrcOpIdx2 == CommuteAnyOperandIndex) // Only one of operands is not fixed. CommutableOpIdx2 = SrcOpIdx1; @@ -3906,7 +3472,12 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI, // CommutableOpIdx2 is well defined now. Let's choose another commutable // operand and assign its index to CommutableOpIdx1. unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg(); - for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) { + for (CommutableOpIdx1 = LastCommutableVecOp; + CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) { + // Just ignore and skip the k-mask operand. + if (CommutableOpIdx1 == KMaskOp) + continue; + // The commuted operands must have different registers. // Otherwise, the commute transformation does not change anything and // is useless then. @@ -3915,7 +3486,7 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI, } // No appropriate commutable operands were found. 
- if (CommutableOpIdx1 == 0) + if (CommutableOpIdx1 < FirstCommutableVecOp) return false; // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2 @@ -3927,8 +3498,7 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI, // Check if we can adjust the opcode to preserve the semantics when // commute the register operands. - return getFMA3OpcodeToCommuteOperands(MI.getOpcode(), IsIntrinOpcode, - SrcOpIdx1, SrcOpIdx2) != 0; + return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2, FMA3Group) != 0; } bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, @@ -3955,10 +3525,10 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, return false; } default: - bool IsIntrinOpcode; - if (isFMA3(MI.getOpcode(), IsIntrinOpcode)) - return findFMA3CommutedOpIndices(MI, IsIntrinOpcode, - SrcOpIdx1, SrcOpIdx2); + const X86InstrFMA3Group *FMA3Group = + X86InstrFMA3Info::getFMA3Group(MI.getOpcode()); + if (FMA3Group) + return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2, *FMA3Group); return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); } return false; diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 7251aec..5c8de0f 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -15,6 +15,7 @@ #define LLVM_LIB_TARGET_X86_X86INSTRINFO_H #include "MCTargetDesc/X86BaseInfo.h" +#include "X86InstrFMA3Info.h" #include "X86RegisterInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Target/TargetInstrInfo.h" @@ -265,7 +266,7 @@ public: unsigned &SrcOpIdx2) const override; /// Returns true if the routine could find two commutable operands - /// in the given FMA instruction. Otherwise, returns false. + /// in the given FMA instruction \p MI. Otherwise, returns false. /// /// \p SrcOpIdx1 and \p SrcOpIdx2 are INPUT and OUTPUT arguments. 
/// The output indices of the commuted operands are returned in these @@ -274,10 +275,12 @@ public: /// value 'CommuteAnyOperandIndex' which means that the corresponding /// operand index is not set and this method is free to pick any of /// available commutable operands. + /// The parameter \p FMA3Group keeps the reference to the group of related + /// FMA3 opcodes including register/memory forms of 132/213/231 opcodes. /// /// For example, calling this method this way: /// unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex; - /// findFMA3CommutedOpIndices(MI, Idx1, Idx2); + /// findFMA3CommutedOpIndices(MI, Idx1, Idx2, FMA3Group); /// can be interpreted as a query asking if the operand #1 can be swapped /// with any other available operand (e.g. operand #2, operand #3, etc.). /// @@ -286,9 +289,30 @@ public: /// FMA213 #1, #2, #3 /// results into instruction with adjusted opcode: /// FMA231 #3, #2, #1 - bool findFMA3CommutedOpIndices(MachineInstr &MI, bool IsIntrinOpcode, + bool findFMA3CommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, - unsigned &SrcOpIdx2) const; + unsigned &SrcOpIdx2, + const X86InstrFMA3Group &FMA3Group) const; + + /// Returns an adjusted FMA opcode that must be used in FMA instruction that + /// performs the same computations as the given \p MI but which has the + /// operands \p SrcOpIdx1 and \p SrcOpIdx2 commuted. + /// It may return 0 if it is unsafe to commute the operands. + /// Note that a machine instruction (instead of its opcode) is passed as the + /// first parameter to make it possible to analyze the instruction's uses and + /// commute the first operand of FMA even when it seems unsafe when you look + /// at the opcode. For example, it is Ok to commute the first operand of + /// VFMADD*SD_Int, if ONLY the lowest 64-bit element of the result is used. + /// + /// The returned FMA opcode may differ from the opcode in the given \p MI.
+ /// For example, commuting the operands #1 and #3 in the following FMA + /// FMA213 #1, #2, #3 + /// results into instruction with adjusted opcode: + /// FMA231 #3, #2, #1 + unsigned getFMA3OpcodeToCommuteOperands(const MachineInstr &MI, + unsigned SrcOpIdx1, + unsigned SrcOpIdx2, + const X86InstrFMA3Group &FMA3Group) const; // Branch analysis. bool isUnpredicatedTerminator(const MachineInstr &MI) const override; diff --git a/llvm/test/CodeGen/X86/avx512-fma-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-fma-intrinsics.ll index 5f9db2c..27350f5 100644 --- a/llvm/test/CodeGen/X86/avx512-fma-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-fma-intrinsics.ll @@ -310,8 +310,7 @@ define <16 x float> @test_mask_round_vfmadd512_ps_rrb_current(<16 x float> %a0, define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rne: ; CHECK: ## BB#0: -; CHECK-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm0, %zmm1 -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 0) nounwind ret <16 x float> %res @@ -320,8 +319,7 @@ define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rne(<16 x float> %a0, <16 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtn: ; CHECK: ## BB#0: -; CHECK-NEXT: vfmadd213ps {rd-sae}, %zmm2, %zmm0, %zmm1 -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 1) nounwind ret <16 x float> %res @@ -330,8 +328,7 @@ define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtn(<16 x float> %a0, <16 
define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtp: ; CHECK: ## BB#0: -; CHECK-NEXT: vfmadd213ps {ru-sae}, %zmm2, %zmm0, %zmm1 -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 2) nounwind ret <16 x float> %res @@ -340,8 +337,7 @@ define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtp(<16 x float> %a0, <16 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtz: ; CHECK: ## BB#0: -; CHECK-NEXT: vfmadd213ps {rz-sae}, %zmm2, %zmm0, %zmm1 -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 3) nounwind ret <16 x float> %res @@ -443,8 +439,7 @@ define <8 x double> @test_mask_round_vfmadd512_pd_rrb_current(<8 x double> %a0, define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rne: ; CHECK: ## BB#0: -; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm0, %zmm1 -; CHECK-NEXT: vmovapd %zmm1, %zmm0 +; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind ret <8 x double> %res @@ -453,8 +448,7 @@ define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rne(<8 x double> %a0, <8 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: 
test_mask_round_vfmadd512_pd_rrbz_rtn: ; CHECK: ## BB#0: -; CHECK-NEXT: vfmadd213pd {rd-sae}, %zmm2, %zmm0, %zmm1 -; CHECK-NEXT: vmovapd %zmm1, %zmm0 +; CHECK-NEXT: vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind ret <8 x double> %res @@ -463,8 +457,7 @@ define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtn(<8 x double> %a0, <8 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtp: ; CHECK: ## BB#0: -; CHECK-NEXT: vfmadd213pd {ru-sae}, %zmm2, %zmm0, %zmm1 -; CHECK-NEXT: vmovapd %zmm1, %zmm0 +; CHECK-NEXT: vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind ret <8 x double> %res @@ -473,8 +466,7 @@ define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtp(<8 x double> %a0, <8 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtz: ; CHECK: ## BB#0: -; CHECK-NEXT: vfmadd213pd {rz-sae}, %zmm2, %zmm0, %zmm1 -; CHECK-NEXT: vmovapd %zmm1, %zmm0 +; CHECK-NEXT: vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind ret <8 x double> %res @@ -641,8 +633,7 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_current(<8 x double> %a0, define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rne: ; CHECK: ## BB#0: -; CHECK-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm0, %zmm1 -; CHECK-NEXT: vmovapd %zmm1, 
%zmm0 +; CHECK-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind ret <8 x double> %res @@ -651,8 +642,7 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rne(<8 x double> %a0, <8 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtn: ; CHECK: ## BB#0: -; CHECK-NEXT: vfnmsub213pd {rd-sae}, %zmm2, %zmm0, %zmm1 -; CHECK-NEXT: vmovapd %zmm1, %zmm0 +; CHECK-NEXT: vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind ret <8 x double> %res @@ -661,8 +651,7 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtn(<8 x double> %a0, <8 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtp: ; CHECK: ## BB#0: -; CHECK-NEXT: vfnmsub213pd {ru-sae}, %zmm2, %zmm0, %zmm1 -; CHECK-NEXT: vmovapd %zmm1, %zmm0 +; CHECK-NEXT: vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind ret <8 x double> %res @@ -671,8 +660,7 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtp(<8 x double> %a0, <8 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtz: ; CHECK: ## BB#0: -; CHECK-NEXT: vfnmsub213pd {rz-sae}, %zmm2, %zmm0, %zmm1 -; CHECK-NEXT: vmovapd %zmm1, %zmm0 +; CHECK-NEXT: vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> 
@llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind ret <8 x double> %res diff --git a/llvm/test/CodeGen/X86/avx512-fma.ll b/llvm/test/CodeGen/X86/avx512-fma.ll index 3784c85f..9622b81 100644 --- a/llvm/test/CodeGen/X86/avx512-fma.ll +++ b/llvm/test/CodeGen/X86/avx512-fma.ll @@ -101,8 +101,7 @@ define double @test_x86_fmsub_231_m(double %a0, double %a1, double * %a2_ptr) { define <16 x float> @test231_br(<16 x float> %a1, <16 x float> %a2) nounwind { ; ALL-LABEL: test231_br: ; ALL: ## BB#0: -; ALL-NEXT: vfmadd231ps {{.*}}(%rip){1to16}, %zmm0, %zmm1 -; ALL-NEXT: vmovaps %zmm1, %zmm0 +; ALL-NEXT: vfmadd132ps {{.*}}(%rip){1to16}, %zmm1, %zmm0 ; ALL-NEXT: retq %b1 = fmul <16 x float> %a1, %b2 = fadd <16 x float> %b1, %a2 @@ -112,8 +111,7 @@ define <16 x float> @test231_br(<16 x float> %a1, <16 x float> %a2) nounwind { define <16 x float> @test213_br(<16 x float> %a1, <16 x float> %a2) nounwind { ; ALL-LABEL: test213_br: ; ALL: ## BB#0: -; ALL-NEXT: vfmadd213ps {{.*}}(%rip){1to16}, %zmm0, %zmm1 -; ALL-NEXT: vmovaps %zmm1, %zmm0 +; ALL-NEXT: vfmadd213ps {{.*}}(%rip){1to16}, %zmm1, %zmm0 ; ALL-NEXT: retq %b1 = fmul <16 x float> %a1, %a2 %b2 = fadd <16 x float> %b1, @@ -175,8 +173,7 @@ define <16 x float> @test_x86_fmadd213_ps(<16 x float> %a0, <16 x float> %a1, <1 ; KNL-NEXT: vpmovsxbd %xmm2, %zmm2 ; KNL-NEXT: vpslld $31, %zmm2, %zmm2 ; KNL-NEXT: vptestmd %zmm2, %zmm2, %k1 -; KNL-NEXT: vmovups (%rdi), %zmm2 -; KNL-NEXT: vfmadd132ps %zmm0, %zmm2, %zmm1 {%k1} +; KNL-NEXT: vfmadd213ps (%rdi), %zmm0, %zmm1 {%k1} ; KNL-NEXT: vmovaps %zmm1, %zmm0 ; KNL-NEXT: retq ; @@ -184,8 +181,7 @@ define <16 x float> @test_x86_fmadd213_ps(<16 x float> %a0, <16 x float> %a1, <1 ; SKX: ## BB#0: ; SKX-NEXT: vpsllw $7, %xmm2, %xmm2 ; SKX-NEXT: vpmovb2m %xmm2, %k1 -; SKX-NEXT: vmovups (%rdi), %zmm2 -; SKX-NEXT: vfmadd132ps %zmm0, %zmm2, %zmm1 {%k1} +; SKX-NEXT: vfmadd213ps (%rdi), %zmm0, %zmm1 {%k1} ; SKX-NEXT: vmovaps %zmm1, %zmm0 ; 
SKX-NEXT: retq %a2 = load <16 x float>,<16 x float> *%a2_ptrt,align 1 diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll index 1e9df3f..6dd59ac 100644 --- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -1812,8 +1812,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, ; CHECK-LABEL: test_mask_vfmadd128_ps_rmk: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovaps (%rdi), %xmm2 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0x17] -; CHECK-NEXT: vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1] +; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %a2 = load <4 x float>, <4 x float>* %ptr_a2 %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind @@ -1824,8 +1823,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmka: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovups (%rdi), %xmm2 ## encoding: [0x62,0xf1,0x7c,0x08,0x10,0x17] -; CHECK-NEXT: vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1] +; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %a2 = load <4 x float>, <4 x float>* %ptr_a2, align 8 %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind @@ -1885,8 +1883,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1 define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) { ; CHECK-LABEL: 
test_mask_vfmadd128_ps_rmbz: ; CHECK: ## BB#0: -; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x18,0xa8,0x0f] -; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1] +; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %q = load float, float* %ptr_a2 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -1900,8 +1897,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1 define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) { ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbza: ; CHECK: ## BB#0: -; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x18,0xa8,0x0f] -; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1] +; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %q = load float, float* %ptr_a2, align 4 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -1935,8 +1931,7 @@ define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> % ; CHECK-LABEL: test_mask_vfmadd128_pd_rmk: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovapd (%rdi), %xmm2 ## encoding: [0x62,0xf1,0xfd,0x08,0x28,0x17] -; CHECK-NEXT: vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1] +; CHECK-NEXT: vfmadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %a2 = load <2 x double>, <2 x double>* %ptr_a2 %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind @@ -1976,8 +1971,7 @@ define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> % ; CHECK-LABEL: 
test_mask_vfmadd256_pd_rmk: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vmovapd (%rdi), %ymm2 ## encoding: [0x62,0xf1,0xfd,0x28,0x28,0x17] -; CHECK-NEXT: vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1] +; CHECK-NEXT: vfmadd213pd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %a2 = load <4 x double>, <4 x double>* %ptr_a2 %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind diff --git a/llvm/test/CodeGen/X86/fma-fneg-combine.ll b/llvm/test/CodeGen/X86/fma-fneg-combine.ll index 5ce22eb..766bc01 100644 --- a/llvm/test/CodeGen/X86/fma-fneg-combine.ll +++ b/llvm/test/CodeGen/X86/fma-fneg-combine.ll @@ -61,8 +61,7 @@ define <16 x float> @test5(<16 x float> %a, <16 x float> %b, <16 x float> %c) { ; CHECK-LABEL: test5: ; CHECK: # BB#0: # %entry ; CHECK-NEXT: vxorps {{.*}}(%rip), %zmm2, %zmm2 -; CHECK-NEXT: vfmadd213ps {ru-sae}, %zmm2, %zmm0, %zmm1 -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: retq entry: %sub.i = fsub <16 x float> , %c @@ -73,8 +72,8 @@ entry: define <16 x float> @test6(<16 x float> %a, <16 x float> %b, <16 x float> %c) { ; CHECK-LABEL: test6: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vfnmsub213ps {ru-sae}, %zmm2, %zmm0, %zmm1 -; CHECK-NEXT: vxorps {{.*}}(%rip), %zmm1, %zmm0 +; CHECK-NEXT: vfnmsub213ps {ru-sae}, %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: vxorps {{.*}}(%rip), %zmm0, %zmm0 ; CHECK-NEXT: retq entry: %0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 2) #2 -- 2.7.4