From b5421c498d279fdfb5e6a079e7fd05a5fd6cbd7b Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Wed, 10 Oct 2018 21:48:34 +0000
Subject: [PATCH] [X86] Prevent non-temporal loads from folding into
 instructions by blocking them in X86DAGToDAGISel::IsProfitableToFold rather
 than with a predicate.

Remove tryFoldVecLoad since tryFoldLoad would call IsProfitableToFold and
pick up the new check.

This saves about 5K out of ~600K on the generated isel table.

llvm-svn: 344189
---
 llvm/lib/Target/X86/X86ISelDAGToDAG.cpp      | 32 +++++------------
 llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 53 ++++++++++++----------------
 2 files changed, 31 insertions(+), 54 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 25a8567..5eb4dbb 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -239,12 +239,6 @@ namespace {
       return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
     }
 
-    // Try to fold a vector load. This makes sure the load isn't non-temporal.
-    bool tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
-                        SDValue &Base, SDValue &Scale,
-                        SDValue &Index, SDValue &Disp,
-                        SDValue &Segment);
-
     /// Implement addressing mode selection for inline asm expressions.
     bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                       unsigned ConstraintID,
@@ -516,6 +510,10 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
   if (N.getOpcode() != ISD::LOAD)
     return true;
 
+  // Don't fold non-temporal loads if we have an instruction for them.
+  if (useNonTemporalLoad(cast<LoadSDNode>(N)))
+    return false;
+
   // If N is a load, do additional profitability checks.
   if (U == Root) {
     switch (U->getOpcode()) {
@@ -2053,20 +2051,6 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                     N.getOperand(1), Base, Scale, Index, Disp, Segment);
 }
 
-bool X86DAGToDAGISel::tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
-                                     SDValue &Base, SDValue &Scale,
-                                     SDValue &Index, SDValue &Disp,
-                                     SDValue &Segment) {
-  if (!ISD::isNON_EXTLoad(N.getNode()) ||
-      useNonTemporalLoad(cast<LoadSDNode>(N)) ||
-      !IsProfitableToFold(N, P, Root) ||
-      !IsLegalToFold(N, P, Root, OptLevel))
-    return false;
-
-  return selectAddr(N.getNode(),
-                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
-}
-
 /// Return an SDNode that returns the value of the global base register.
 /// Output instructions required to initialize the global base register,
 /// if necessary.
@@ -2595,8 +2579,8 @@ MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
   // alignment on this load.
   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
   if (MayFoldLoad && N1->getOpcode() == ISD::BITCAST && N1->hasOneUse() &&
-      tryFoldVecLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2,
-                     Tmp3, Tmp4)) {
+      tryFoldLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2,
+                  Tmp3, Tmp4)) {
     SDValue Load = N1.getOperand(0);
     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                       Load.getOperand(0) };
@@ -2632,8 +2616,8 @@ MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
   // alignment on this load.
   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
   if (MayFoldLoad && N2->getOpcode() == ISD::BITCAST && N2->hasOneUse() &&
-      tryFoldVecLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2,
-                     Tmp3, Tmp4)) {
+      tryFoldLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2,
+                  Tmp3, Tmp4)) {
     SDValue Load = N2.getOperand(0);
     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                       Load.getOperand(0), InFlag };
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 3aa825e..f750fe3 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -647,28 +647,22 @@ def sdmem : Operand<v2f64> {
 // SSE pattern fragments
 //===----------------------------------------------------------------------===//
 
-// Vector load wrappers to prevent folding of non-temporal aligned loads on
-// supporting targets.
-def vecload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  return !useNonTemporalLoad(cast<LoadSDNode>(N));
-}]>;
-
 // 128-bit load pattern fragments
 // NOTE: all 128-bit integer vector loads are promoted to v2i64
-def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (vecload node:$ptr))>;
-def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (vecload node:$ptr))>;
-def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (vecload node:$ptr))>;
+def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
+def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
+def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
 
 // 256-bit load pattern fragments
 // NOTE: all 256-bit integer vector loads are promoted to v4i64
-def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (vecload node:$ptr))>;
-def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (vecload node:$ptr))>;
-def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (vecload node:$ptr))>;
+def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
+def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
+def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
 
 // 512-bit load pattern fragments
-def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (vecload node:$ptr))>;
-def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (vecload node:$ptr))>;
-def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (vecload node:$ptr))>;
+def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>;
+def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>;
+def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>;
 
 // 128-/256-/512-bit extload pattern fragments
 def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
@@ -682,46 +676,45 @@ def alignedstore : PatFrag<(ops node:$val, node:$ptr),
   return St->getAlignment() >= St->getMemoryVT().getStoreSize();
 }]>;
 
-// Like 'load', but always requires 128-bit vector alignment.
-def alignedvecload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+// Like 'load', but always requires vector size alignment.
+def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
   auto *Ld = cast<LoadSDNode>(N);
-  return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize() &&
-         !useNonTemporalLoad(cast<LoadSDNode>(N));
+  return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
 }]>;
 
 // 128-bit aligned load pattern fragments
 // NOTE: all 128-bit integer vector loads are promoted to v2i64
 def alignedloadv4f32 : PatFrag<(ops node:$ptr),
-                               (v4f32 (alignedvecload node:$ptr))>;
+                               (v4f32 (alignedload node:$ptr))>;
 def alignedloadv2f64 : PatFrag<(ops node:$ptr),
-                               (v2f64 (alignedvecload node:$ptr))>;
+                               (v2f64 (alignedload node:$ptr))>;
 def alignedloadv2i64 : PatFrag<(ops node:$ptr),
-                               (v2i64 (alignedvecload node:$ptr))>;
+                               (v2i64 (alignedload node:$ptr))>;
 
 // 256-bit aligned load pattern fragments
 // NOTE: all 256-bit integer vector loads are promoted to v4i64
 def alignedloadv8f32 : PatFrag<(ops node:$ptr),
-                               (v8f32 (alignedvecload node:$ptr))>;
+                               (v8f32 (alignedload node:$ptr))>;
 def alignedloadv4f64 : PatFrag<(ops node:$ptr),
-                               (v4f64 (alignedvecload node:$ptr))>;
+                               (v4f64 (alignedload node:$ptr))>;
 def alignedloadv4i64 : PatFrag<(ops node:$ptr),
-                               (v4i64 (alignedvecload node:$ptr))>;
+                               (v4i64 (alignedload node:$ptr))>;
 
 // 512-bit aligned load pattern fragments
 def alignedloadv16f32 : PatFrag<(ops node:$ptr),
-                                (v16f32 (alignedvecload node:$ptr))>;
+                                (v16f32 (alignedload node:$ptr))>;
 def alignedloadv8f64 : PatFrag<(ops node:$ptr),
-                               (v8f64 (alignedvecload node:$ptr))>;
+                               (v8f64 (alignedload node:$ptr))>;
 def alignedloadv8i64 : PatFrag<(ops node:$ptr),
-                               (v8i64 (alignedvecload node:$ptr))>;
+                               (v8i64 (alignedload node:$ptr))>;
 
-// Like 'vecload', but uses special alignment checks suitable for use in
+// Like 'load', but uses special alignment checks suitable for use in
 // memory operands in most SSE instructions, which are required to
 // be naturally aligned on some targets but not on others. If the subtarget
 // allows unaligned accesses, match any load, though this may require
 // setting a feature bit in the processor (on startup, for example).
 // Opteron 10h and later implement such a feature.
-def memop : PatFrag<(ops node:$ptr), (vecload node:$ptr), [{
+def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
   auto *Ld = cast<LoadSDNode>(N);
   return Subtarget->hasSSEUnalignedMem() ||
          Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
-- 
2.7.4