From: Arthur Eubanks Date: Tue, 22 Mar 2022 20:27:04 +0000 (-0700) Subject: Revert "Recommit "[SLP] Fix lookahead operand reordering for splat loads."" X-Git-Tag: upstream/15.0.7~12705 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=f7d7d2a08d16356c57f6d2d36bc2fc0589a55df9;p=platform%2Fupstream%2Fllvm.git Revert "Recommit "[SLP] Fix lookahead operand reordering for splat loads."" This reverts commit 79613185d305013de743cdbd6690e4d77c8af27e. Causes crashes, see comments in https://reviews.llvm.org/D121973. --- diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 31eb40e..23a3faa 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -658,10 +658,6 @@ public: /// Return true if the target supports nontemporal load. bool isLegalNTLoad(Type *DataType, Align Alignment) const; - /// \Returns true if the target supports broadcasting a load to a vector of - /// type <NumElements x ElementTy>. - bool isLegalBroadcastLoad(Type *ElementTy, unsigned NumElements) const; - /// Return true if the target supports masked scatter. bool isLegalMaskedScatter(Type *DataType, Align Alignment) const; /// Return true if the target supports masked gather. @@ -1048,14 +1044,11 @@ public: /// The exact mask may be passed as Mask, or else the array will be empty. /// The index and subtype parameters are used by the subvector insertion and /// extraction shuffle kinds to show the insert/extract point and the type of - /// the subvector being inserted/extracted. The operands of the shuffle can be - /// passed through \p Args, which helps improve the cost estimation in some - /// cases, like in broadcast loads. + /// the subvector being inserted/extracted. /// NOTE: For subvector extractions Tp represents the source type. 
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef Mask = None, int Index = 0, - VectorType *SubTp = nullptr, - ArrayRef Args = None) const; + VectorType *SubTp = nullptr) const; /// Represents a hint about the context in which a cast is used. /// @@ -1556,8 +1549,6 @@ public: virtual bool isLegalMaskedLoad(Type *DataType, Align Alignment) = 0; virtual bool isLegalNTStore(Type *DataType, Align Alignment) = 0; virtual bool isLegalNTLoad(Type *DataType, Align Alignment) = 0; - virtual bool isLegalBroadcastLoad(Type *ElementTy, - unsigned NumElements) const = 0; virtual bool isLegalMaskedScatter(Type *DataType, Align Alignment) = 0; virtual bool isLegalMaskedGather(Type *DataType, Align Alignment) = 0; virtual bool forceScalarizeMaskedGather(VectorType *DataType, @@ -1668,8 +1659,7 @@ public: ArrayRef Args, const Instruction *CxtI = nullptr) = 0; virtual InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, int Index, - VectorType *SubTp, - ArrayRef Args) = 0; + VectorType *SubTp) = 0; virtual InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, CastContextHint CCH, TTI::TargetCostKind CostKind, @@ -1962,10 +1952,6 @@ public: bool isLegalNTLoad(Type *DataType, Align Alignment) override { return Impl.isLegalNTLoad(DataType, Alignment); } - bool isLegalBroadcastLoad(Type *ElementTy, - unsigned NumElements) const override { - return Impl.isLegalBroadcastLoad(ElementTy, NumElements); - } bool isLegalMaskedScatter(Type *DataType, Align Alignment) override { return Impl.isLegalMaskedScatter(DataType, Alignment); } @@ -2193,9 +2179,8 @@ public: } InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, int Index, - VectorType *SubTp, - ArrayRef Args) override { - return Impl.getShuffleCost(Kind, Tp, Mask, Index, SubTp, Args); + VectorType *SubTp) override { + return Impl.getShuffleCost(Kind, Tp, Mask, Index, SubTp); } InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type 
*Src, CastContextHint CCH, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 806014e..4599026 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -256,10 +256,6 @@ public: return Alignment >= DataSize && isPowerOf2_32(DataSize); } - bool isLegalBroadcastLoad(Type *ElementTy, unsigned NumElements) const { - return false; - } - bool isLegalMaskedScatter(Type *DataType, Align Alignment) const { return false; } @@ -492,8 +488,7 @@ public: InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Ty, ArrayRef Mask, int Index, - VectorType *SubTp, - ArrayRef Args = None) const { + VectorType *SubTp) const { return 1; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index d3b2272..8e62dff 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -871,8 +871,7 @@ public: InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, int Index, - VectorType *SubTp, - ArrayRef Args = None) { + VectorType *SubTp) { switch (improveShuffleKindFromMask(Kind, Mask)) { case TTI::SK_Broadcast: diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 804331e..ae22214 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -396,11 +396,6 @@ bool TargetTransformInfo::isLegalNTLoad(Type *DataType, Align Alignment) const { return TTIImpl->isLegalNTLoad(DataType, Alignment); } -bool TargetTransformInfo::isLegalBroadcastLoad(Type *ElementTy, - unsigned NumElements) const { - return TTIImpl->isLegalBroadcastLoad(ElementTy, NumElements); -} - bool TargetTransformInfo::isLegalMaskedGather(Type *DataType, Align Alignment) const { return TTIImpl->isLegalMaskedGather(DataType, Alignment); @@ -745,11 +740,12 @@ 
InstructionCost TargetTransformInfo::getArithmeticInstrCost( return Cost; } -InstructionCost TargetTransformInfo::getShuffleCost( - ShuffleKind Kind, VectorType *Ty, ArrayRef Mask, int Index, - VectorType *SubTp, ArrayRef Args) const { - InstructionCost Cost = - TTIImpl->getShuffleCost(Kind, Ty, Mask, Index, SubTp, Args); +InstructionCost TargetTransformInfo::getShuffleCost(ShuffleKind Kind, + VectorType *Ty, + ArrayRef Mask, + int Index, + VectorType *SubTp) const { + InstructionCost Cost = TTIImpl->getShuffleCost(Kind, Ty, Mask, Index, SubTp); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index b9f6f49..2cd3c93 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -2604,8 +2604,7 @@ InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, int Index, - VectorType *SubTp, - ArrayRef Args) { + VectorType *SubTp) { Kind = improveShuffleKindFromMask(Kind, Mask); if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 92005b3..a6029b9 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -330,8 +330,7 @@ public: InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, int Index, - VectorType *SubTp, - ArrayRef Args = None); + VectorType *SubTp); /// @} }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index bdd22a4..a8df778 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1042,8 +1042,7 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT, ArrayRef Mask, - int Index, VectorType *SubTp, - ArrayRef Args) { + int Index, VectorType *SubTp) { Kind = improveShuffleKindFromMask(Kind, Mask); if (ST->hasVOP3PInsts()) { if (cast(VT)->getNumElements() == 2 && diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 4743042..e901b5c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -201,8 +201,7 @@ public: InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, int Index, - VectorType *SubTp, - ArrayRef Args = None); + VectorType *SubTp); bool areInlineCompatible(const Function *Caller, const Function *Callee) const; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 7e80223..d9d563e 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1202,8 +1202,7 @@ InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) { InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, - int Index, VectorType *SubTp, - ArrayRef Args) { + int Index, VectorType *SubTp) { Kind = improveShuffleKindFromMask(Kind, Mask); if (ST->hasNEON()) { if (Kind == TTI::SK_Broadcast) { diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 3139c41..5bb8489 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -213,8 +213,7 @@ public: InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef 
Mask, int Index, - VectorType *SubTp, - ArrayRef Args = None); + VectorType *SubTp); bool preferInLoopReduction(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const; diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index 280d8f1..1bdd8c3 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -223,8 +223,7 @@ HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, InstructionCost HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, ArrayRef Mask, int Index, - Type *SubTp, - ArrayRef Args) { + Type *SubTp) { return 1; } diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index 65eb9d9..9e637df 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -125,8 +125,7 @@ public: Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind); InstructionCost getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, - ArrayRef Mask, int Index, Type *SubTp, - ArrayRef Args = None); + ArrayRef Mask, int Index, Type *SubTp); InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 1ee960f..cc5738a 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -1015,8 +1015,7 @@ InstructionCost PPCTTIImpl::getArithmeticInstrCost( InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, ArrayRef Mask, int Index, - Type *SubTp, - ArrayRef Args) { + Type *SubTp) { InstructionCost CostFactor = vectorCostAdjustmentFactor(Instruction::ShuffleVector, Tp, nullptr); diff --git 
a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index 61cb689..0af6f2a 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -111,8 +111,7 @@ public: ArrayRef Args = ArrayRef(), const Instruction *CxtI = nullptr); InstructionCost getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, - ArrayRef Mask, int Index, Type *SubTp, - ArrayRef Args = None); + ArrayRef Mask, int Index, Type *SubTp); InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 5f797f1..6721a0a 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -175,8 +175,7 @@ InstructionCost RISCVTTIImpl::getSpliceCost(VectorType *Tp, int Index) { InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, - int Index, VectorType *SubTp, - ArrayRef Args) { + int Index, VectorType *SubTp) { if (Kind == TTI::SK_Splice && isa(Tp)) return getSpliceCost(Tp, Index); return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 9088d48..63c7ed0 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -80,8 +80,7 @@ public: InstructionCost getSpliceCost(VectorType *Tp, int Index); InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, int Index, - VectorType *SubTp, - ArrayRef Args = None); + VectorType *SubTp); InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp 
b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 5e0c480..6d66ebf 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -559,8 +559,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost( InstructionCost SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, int Index, - VectorType *SubTp, - ArrayRef Args) { + VectorType *SubTp) { Kind = improveShuffleKindFromMask(Kind, Mask); if (ST->hasVector()) { unsigned NumVectors = getNumVectorRegs(Tp); diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h index acc9aee..db4ec79 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -92,8 +92,7 @@ public: const Instruction *CxtI = nullptr); InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, int Index, - VectorType *SubTp, - ArrayRef Args = None); + VectorType *SubTp); unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy); unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy); unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst, diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index ae3f8d4..32f9f56 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1085,8 +1085,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef Mask, int Index, - VectorType *SubTp, - ArrayRef Args) { + VectorType *SubTp) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. // 64-bit packed integer vectors (v2i32) are widened to type v4i32. 
std::pair LT = TLI->getTypeLegalizationCost(DL, BaseTp); @@ -1546,27 +1545,9 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute }; - static const CostTblEntry SSE3BroadcastLoadTbl[] = { - {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup - }; - - if (ST->hasSSE2()) { - bool IsLoad = !Args.empty() && llvm::all_of(Args, [](const Value *V) { - return isa(V); - }); - if (ST->hasSSE3() && IsLoad) - if (const auto *Entry = - CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) { - assert(isLegalBroadcastLoad( - BaseTp->getElementType(), - cast(BaseTp)->getNumElements()) && - "Table entry missing from isLegalBroadcastLoad()"); - return LT.first * Entry->Cost; - } - + if (ST->hasSSE2()) if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) return LT.first * Entry->Cost; - } static const CostTblEntry SSE1ShuffleTbl[] = { { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps @@ -5137,13 +5118,6 @@ bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) { return true; } -bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy, - unsigned NumElements) const { - // movddup - return ST->hasSSSE3() && NumElements == 2 && - ElementTy == Type::getDoubleTy(ElementTy->getContext()); -} - bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) { if (!isa(DataTy)) return false; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index d262835..a8909ee 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -131,8 +131,7 @@ public: const Instruction *CxtI = nullptr); InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, int Index, - VectorType *SubTp, - ArrayRef Args = None); + VectorType *SubTp); InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, @@ -227,7 
+226,6 @@ public: bool isLegalMaskedStore(Type *DataType, Align Alignment); bool isLegalNTLoad(Type *DataType, Align Alignment); bool isLegalNTStore(Type *DataType, Align Alignment); - bool isLegalBroadcastLoad(Type *ElementTy, unsigned NumElements) const; bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment); bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) { return forceScalarizeMaskedGather(VTy, Alignment); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index c6f1e68..926b76f 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1136,11 +1136,6 @@ public: /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). static const int ScoreConsecutiveLoads = 4; - /// The same load multiple times. This should have a better score than - /// `ScoreSplat` because it in x86 for a 2-lane vector we can represent it - /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for - /// a vector load and 1.0 for a broadcast. - static const int ScoreSplatLoads = 3; /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]). static const int ScoreReversedLoads = 3; /// ExtractElementInst from same vector and consecutive indexes. @@ -1167,18 +1162,9 @@ public: /// MainAltOps. static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL, ScalarEvolution &SE, int NumLanes, - ArrayRef MainAltOps, - const TargetTransformInfo *TTI) { - if (V1 == V2) { - if (isa(V1)) { - // A broadcast of a load can be cheaper on some targets. - // TODO: For now accept a broadcast load with no other internal uses. 
- if (TTI->isLegalBroadcastLoad(V1->getType(), NumLanes) && - (int)V1->getNumUses() == NumLanes) - return VLOperands::ScoreSplatLoads; - } + ArrayRef MainAltOps) { + if (V1 == V2) return VLOperands::ScoreSplat; - } auto *LI1 = dyn_cast(V1); auto *LI2 = dyn_cast(V2); @@ -1357,7 +1343,7 @@ public: // Get the shallow score of V1 and V2. int ShallowScoreAtThisLevel = - getShallowScore(LHS, RHS, DL, SE, getNumLanes(), MainAltOps, R.TTI); + getShallowScore(LHS, RHS, DL, SE, getNumLanes(), MainAltOps); // If reached MaxLevel, // or if V1 and V2 are not instructions, @@ -5251,9 +5237,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // broadcast. assert(VecTy == FinalVecTy && "No reused scalars expected for broadcast."); - return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, - /*Mask=*/None, /*Index=*/0, - /*SubTp=*/nullptr, /*Args=*/VL); + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); } InstructionCost ReuseShuffleCost = 0; if (NeedToShuffleReuses) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll index 0054520..6c456bb 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -643,62 +643,32 @@ define i1 @foo(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) { ; Same as @ChecksExtractScores, but the extratelement vector operands do not match. 
define void @ChecksExtractScores_different_vectors(double* %storeArray, double* %array, <2 x double> *%vecPtr1, <2 x double>* %vecPtr2, <2 x double>* %vecPtr3, <2 x double>* %vecPtr4) { -; SSE-LABEL: @ChecksExtractScores_different_vectors( -; SSE-NEXT: [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0 -; SSE-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1 -; SSE-NEXT: [[TMP1:%.*]] = bitcast double* [[IDX0]] to <2 x double>* -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 -; SSE-NEXT: [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4 -; SSE-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4 -; SSE-NEXT: [[EXTRA0:%.*]] = extractelement <2 x double> [[LOADVEC]], i32 0 -; SSE-NEXT: [[EXTRA1:%.*]] = extractelement <2 x double> [[LOADVEC2]], i32 1 -; SSE-NEXT: [[LOADVEC3:%.*]] = load <2 x double>, <2 x double>* [[VECPTR3:%.*]], align 4 -; SSE-NEXT: [[LOADVEC4:%.*]] = load <2 x double>, <2 x double>* [[VECPTR4:%.*]], align 4 -; SSE-NEXT: [[EXTRB0:%.*]] = extractelement <2 x double> [[LOADVEC3]], i32 0 -; SSE-NEXT: [[EXTRB1:%.*]] = extractelement <2 x double> [[LOADVEC4]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[EXTRA1]], i32 0 -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[EXTRB0]], i32 1 -; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], [[TMP2]] -; SSE-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> -; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[EXTRA0]], i32 0 -; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[EXTRB1]], i32 1 -; SSE-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP2]] -; SSE-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[SHUFFLE]], [[TMP8]] -; SSE-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0 -; SSE-NEXT: 
[[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1 -; SSE-NEXT: [[TMP10:%.*]] = bitcast double* [[SIDX0]] to <2 x double>* -; SSE-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 -; SSE-NEXT: ret void -; -; AVX-LABEL: @ChecksExtractScores_different_vectors( -; AVX-NEXT: [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0 -; AVX-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1 -; AVX-NEXT: [[LOADA0:%.*]] = load double, double* [[IDX0]], align 4 -; AVX-NEXT: [[LOADA1:%.*]] = load double, double* [[IDX1]], align 4 -; AVX-NEXT: [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4 -; AVX-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4 -; AVX-NEXT: [[EXTRA0:%.*]] = extractelement <2 x double> [[LOADVEC]], i32 0 -; AVX-NEXT: [[EXTRA1:%.*]] = extractelement <2 x double> [[LOADVEC2]], i32 1 -; AVX-NEXT: [[LOADVEC3:%.*]] = load <2 x double>, <2 x double>* [[VECPTR3:%.*]], align 4 -; AVX-NEXT: [[LOADVEC4:%.*]] = load <2 x double>, <2 x double>* [[VECPTR4:%.*]], align 4 -; AVX-NEXT: [[EXTRB0:%.*]] = extractelement <2 x double> [[LOADVEC3]], i32 0 -; AVX-NEXT: [[EXTRB1:%.*]] = extractelement <2 x double> [[LOADVEC4]], i32 1 -; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[EXTRA0]], i32 0 -; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[EXTRA1]], i32 1 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0 -; AVX-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[LOADA0]], i32 1 -; AVX-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] -; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[EXTRB0]], i32 0 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[EXTRB1]], i32 1 -; AVX-NEXT: [[TMP8:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0 -; AVX-NEXT: [[TMP9:%.*]] = 
insertelement <2 x double> [[TMP8]], double [[LOADA1]], i32 1 -; AVX-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP7]], [[TMP9]] -; AVX-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP5]], [[TMP10]] -; AVX-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0 -; AVX-NEXT: [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1 -; AVX-NEXT: [[TMP12:%.*]] = bitcast double* [[SIDX0]] to <2 x double>* -; AVX-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8 -; AVX-NEXT: ret void +; CHECK-LABEL: @ChecksExtractScores_different_vectors( +; CHECK-NEXT: [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0 +; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[IDX0]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 +; CHECK-NEXT: [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4 +; CHECK-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4 +; CHECK-NEXT: [[EXTRA0:%.*]] = extractelement <2 x double> [[LOADVEC]], i32 0 +; CHECK-NEXT: [[EXTRA1:%.*]] = extractelement <2 x double> [[LOADVEC2]], i32 1 +; CHECK-NEXT: [[LOADVEC3:%.*]] = load <2 x double>, <2 x double>* [[VECPTR3:%.*]], align 4 +; CHECK-NEXT: [[LOADVEC4:%.*]] = load <2 x double>, <2 x double>* [[VECPTR4:%.*]], align 4 +; CHECK-NEXT: [[EXTRB0:%.*]] = extractelement <2 x double> [[LOADVEC3]], i32 0 +; CHECK-NEXT: [[EXTRB1:%.*]] = extractelement <2 x double> [[LOADVEC4]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[EXTRA1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[EXTRB0]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], [[TMP2]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = 
insertelement <2 x double> poison, double [[EXTRA0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[EXTRB1]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[SHUFFLE]], [[TMP8]] +; CHECK-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0 +; CHECK-NEXT: [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[SIDX0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 +; CHECK-NEXT: ret void ; %idx0 = getelementptr inbounds double, double* %array, i64 0 %idx1 = getelementptr inbounds double, double* %array, i64 1 @@ -731,50 +701,28 @@ define void @ChecksExtractScores_different_vectors(double* %storeArray, double* ; This checks that we we prefer splats rather than reverse load vectors + shuffles. ; 2-wide splat loads in x86 use a single instruction so they are quite cheap. 
define double @splat_loads(double *%array1, double *%array2, double *%ptrA, double *%ptrB) { -; SSE-LABEL: @splat_loads( -; SSE-NEXT: entry: -; SSE-NEXT: [[GEP_1_0:%.*]] = getelementptr inbounds double, double* [[ARRAY1:%.*]], i64 0 -; SSE-NEXT: [[GEP_1_1:%.*]] = getelementptr inbounds double, double* [[ARRAY1]], i64 1 -; SSE-NEXT: [[TMP0:%.*]] = bitcast double* [[GEP_1_0]] to <2 x double>* -; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; SSE-NEXT: [[GEP_2_0:%.*]] = getelementptr inbounds double, double* [[ARRAY2:%.*]], i64 0 -; SSE-NEXT: [[GEP_2_1:%.*]] = getelementptr inbounds double, double* [[ARRAY2]], i64 1 -; SSE-NEXT: [[TMP2:%.*]] = bitcast double* [[GEP_2_0]] to <2 x double>* -; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 -; SSE-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> -; SSE-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[SHUFFLE]] -; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 1 -; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0 -; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 0 -; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP7]], i32 1 -; SSE-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP1]], [[TMP8]] -; SSE-NEXT: [[TMP10:%.*]] = fadd <2 x double> [[TMP4]], [[TMP9]] -; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP10]], i32 0 -; SSE-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP10]], i32 1 -; SSE-NEXT: [[ADD3:%.*]] = fadd double [[TMP11]], [[TMP12]] -; SSE-NEXT: ret double [[ADD3]] -; -; AVX-LABEL: @splat_loads( -; AVX-NEXT: entry: -; AVX-NEXT: [[GEP_1_0:%.*]] = getelementptr inbounds double, double* [[ARRAY1:%.*]], i64 0 -; AVX-NEXT: [[GEP_1_1:%.*]] = getelementptr inbounds double, double* [[ARRAY1]], i64 1 -; AVX-NEXT: [[TMP0:%.*]] = bitcast double* [[GEP_1_0]] to <2 x double>* -; AVX-NEXT: [[TMP1:%.*]] 
= load <2 x double>, <2 x double>* [[TMP0]], align 8 -; AVX-NEXT: [[GEP_2_0:%.*]] = getelementptr inbounds double, double* [[ARRAY2:%.*]], i64 0 -; AVX-NEXT: [[GEP_2_1:%.*]] = getelementptr inbounds double, double* [[ARRAY2]], i64 1 -; AVX-NEXT: [[LD_2_0:%.*]] = load double, double* [[GEP_2_0]], align 8 -; AVX-NEXT: [[LD_2_1:%.*]] = load double, double* [[GEP_2_1]], align 8 -; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[LD_2_0]], i32 1 -; AVX-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[LD_2_1]], i32 1 -; AVX-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP1]], [[TMP6]] -; AVX-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP4]], [[TMP7]] -; AVX-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0 -; AVX-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP8]], i32 1 -; AVX-NEXT: [[ADD3:%.*]] = fadd double [[TMP9]], [[TMP10]] -; AVX-NEXT: ret double [[ADD3]] +; CHECK-LABEL: @splat_loads( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[GEP_1_0:%.*]] = getelementptr inbounds double, double* [[ARRAY1:%.*]], i64 0 +; CHECK-NEXT: [[GEP_1_1:%.*]] = getelementptr inbounds double, double* [[ARRAY1]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[GEP_1_0]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 +; CHECK-NEXT: [[GEP_2_0:%.*]] = getelementptr inbounds double, double* [[ARRAY2:%.*]], i64 0 +; CHECK-NEXT: [[GEP_2_1:%.*]] = getelementptr inbounds double, double* [[ARRAY2]], i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[GEP_2_0]] to <2 x double>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> 
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP1]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x double> [[TMP4]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP10]], i32 1 +; CHECK-NEXT: [[ADD3:%.*]] = fadd double [[TMP11]], [[TMP12]] +; CHECK-NEXT: ret double [[ADD3]] ; entry: %gep_1_0 = getelementptr inbounds double, double* %array1, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll index 4877518..51d564f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll @@ -253,16 +253,13 @@ define void @vecload_vs_broadcast4(double * noalias %from, double * noalias %to, ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr double, double* [[FROM:%.*]], i32 1 -; CHECK-NEXT: [[V0_1:%.*]] = load double, double* [[FROM]], align 4 -; CHECK-NEXT: [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: 
[[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; CHECK: ext: ; CHECK-NEXT: ret void @@ -309,16 +306,13 @@ define void @shuffle_nodes_match2(double * noalias %from, double * noalias %to, ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr double, double* [[FROM:%.*]], i32 1 -; CHECK-NEXT: [[V0_1:%.*]] = load double, double* [[FROM]], align 4 -; CHECK-NEXT: [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[P]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* 
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[SHUFFLE]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; CHECK: ext: ; CHECK-NEXT: ret void