From e463bd53c03ff9183bd30030477dfe6f3b2fdd0c Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 19 Jan 2021 11:19:09 -0800 Subject: [PATCH] Revert "[SLP]Merge reorder and reuse shuffles." This reverts commit 438682de6a38ac97f89fa38faf5c8dc9b09cd9ad to fix the bug with the reducing size of the resulting vector for the entry node with multiple users. --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 168 +++++++++------------ .../Transforms/SLPVectorizer/AArch64/PR38339.ll | 3 +- llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll | 3 +- .../SLPVectorizer/X86/shrink_after_reorder.ll | 5 +- 4 files changed, 77 insertions(+), 102 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 0fee52d..24885e4 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3500,7 +3500,6 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::ExtractValue: case Instruction::ExtractElement: { - InstructionCost DeadCost = 0; if (NeedToShuffleReuses) { unsigned Idx = 0; for (unsigned I : E->ReuseShuffleIndices) { @@ -3528,10 +3527,12 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { ReuseShuffleCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); } - DeadCost = ReuseShuffleCost; - } else if (!E->ReorderIndices.empty()) { - DeadCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, - VecTy); + } + InstructionCost DeadCost = ReuseShuffleCost; + if (!E->ReorderIndices.empty()) { + // TODO: Merge this shuffle with the ReuseShuffleCost. + DeadCost += TTI->getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } for (unsigned I = 0, E = VL.size(); I < E; ++I) { Instruction *EI = cast(VL[I]); @@ -3755,9 +3756,11 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { Instruction::Load, VecTy, cast(VL0)->getPointerOperand(), /*VariableMask=*/false, alignment, CostKind, VL0); } - if (!NeedToShuffleReuses && !E->ReorderIndices.empty()) + if (!E->ReorderIndices.empty()) { + // TODO: Merge this shuffle with the ReuseShuffleCost. VecLdCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + } LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost)); return ReuseShuffleCost + VecLdCost - ScalarLdCost; } @@ -3769,14 +3772,18 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { Align Alignment = SI->getAlign(); InstructionCost ScalarEltCost = TTI->getMemoryOpCost( Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0); + if (NeedToShuffleReuses) + ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost; InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost; InstructionCost VecStCost = TTI->getMemoryOpCost( Instruction::Store, VecTy, Alignment, 0, CostKind, VL0); - if (IsReorder) + if (IsReorder) { + // TODO: Merge this shuffle with the ReuseShuffleCost. VecStCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + } LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost)); - return VecStCost - ScalarStCost; + return ReuseShuffleCost + VecStCost - ScalarStCost; } case Instruction::Call: { CallInst *CI = cast(VL0); @@ -4323,64 +4330,6 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL) { return Vec; } -namespace { -/// Merges shuffle masks and emits final shuffle instruction, if required. -class ShuffleInstructionBuilder { - IRBuilderBase &Builder; - bool IsFinalized = false; - SmallVector Mask; - -public: - ShuffleInstructionBuilder(IRBuilderBase &Builder) : Builder(Builder) {} - - /// Adds a mask, inverting it before applying. - void addInversedMask(ArrayRef SubMask) { - if (SubMask.empty()) - return; - SmallVector NewMask; - inversePermutation(SubMask, NewMask); - addMask(NewMask); - } - - /// Functions adds masks, merging them into single one. - void addMask(ArrayRef SubMask) { - SmallVector NewMask(SubMask.begin(), SubMask.end()); - addMask(NewMask); - } - - void addMask(ArrayRef SubMask) { - if (SubMask.empty()) - return; - if (Mask.empty()) { - Mask.append(SubMask.begin(), SubMask.end()); - return; - } - SmallVector NewMask(SubMask.size(), SubMask.size()); - int TermValue = std::min(Mask.size(), SubMask.size()); - for (int I = 0, E = SubMask.size(); I < E; ++I) { - if (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue) { - NewMask[I] = E; - continue; - } - NewMask[I] = Mask[SubMask[I]]; - } - Mask.swap(NewMask); - } - - Value *finalize(Value *V) { - IsFinalized = true; - if (Mask.empty()) - return V; - return Builder.CreateShuffleVector(V, Mask, "shuffle"); - } - - ~ShuffleInstructionBuilder() { - assert((IsFinalized || Mask.empty()) && - "Must be finalized construction of the shuffles."); - } -}; -} // namespace - Value *BoUpSLP::vectorizeTree(TreeEntry *E) { IRBuilder<>::InsertPointGuard Guard(Builder); @@ -4389,14 +4338,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return E->VectorizedValue; } - ShuffleInstructionBuilder ShuffleBuilder(Builder); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->State == TreeEntry::NeedToGather) { setInsertPointAfterBundle(E); Value *Vec = gather(E->Scalars); if (NeedToShuffleReuses) { - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - Vec = ShuffleBuilder.finalize(Vec); + Vec = Builder.CreateShuffleVector(Vec, E->ReuseShuffleIndices, "shuffle"); if (auto *I = dyn_cast(Vec)) { GatherSeq.insert(I); CSEBlocks.insert(I->getParent()); @@ -4454,10 +4401,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::ExtractElement: { Value *V = E->getSingleOperand(0); - Builder.SetInsertPoint(VL0); - ShuffleBuilder.addInversedMask(E->ReorderIndices); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - V = ShuffleBuilder.finalize(V); + if (!E->ReorderIndices.empty()) { + SmallVector Mask; + inversePermutation(E->ReorderIndices, Mask); + Builder.SetInsertPoint(VL0); + V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle"); + } + if (NeedToShuffleReuses) { + // TODO: Merge this shuffle with the ReorderShuffleMask. + if (E->ReorderIndices.empty()) + Builder.SetInsertPoint(VL0); + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + } E->VectorizedValue = V; return V; } @@ -4468,9 +4423,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign()); Value *NewV = propagateMetadata(V, E->Scalars); - ShuffleBuilder.addInversedMask(E->ReorderIndices); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - NewV = ShuffleBuilder.finalize(NewV); + if (!E->ReorderIndices.empty()) { + SmallVector Mask; + inversePermutation(E->ReorderIndices, Mask); + NewV = Builder.CreateShuffleVector(NewV, Mask, "reorder_shuffle"); + } + if (NeedToShuffleReuses) { + // TODO: Merge this shuffle with the ReorderShuffleMask. + NewV = Builder.CreateShuffleVector(NewV, E->ReuseShuffleIndices, + "shuffle"); + } E->VectorizedValue = NewV; return NewV; } @@ -4497,8 +4459,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { auto *CI = cast(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - V = ShuffleBuilder.finalize(V); + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); E->VectorizedValue = V; ++NumVectorInstructions; @@ -4519,8 +4481,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Value *V = Builder.CreateCmp(P0, L, R); propagateIRFlags(V, E->Scalars, VL0); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - V = ShuffleBuilder.finalize(V); + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); E->VectorizedValue = V; ++NumVectorInstructions; @@ -4539,8 +4501,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateSelect(Cond, True, False); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - V = ShuffleBuilder.finalize(V); + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); E->VectorizedValue = V; ++NumVectorInstructions; @@ -4562,8 +4524,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - V = ShuffleBuilder.finalize(V); + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); E->VectorizedValue = V; ++NumVectorInstructions; @@ -4605,8 +4567,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - V = ShuffleBuilder.finalize(V); + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); E->VectorizedValue = V; ++NumVectorInstructions; @@ -4648,9 +4610,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = propagateMetadata(NewLI, E->Scalars); - ShuffleBuilder.addInversedMask(E->ReorderIndices); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - V = ShuffleBuilder.finalize(V); + if (IsReorder) { + SmallVector Mask; + inversePermutation(E->ReorderIndices, Mask); + V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle"); + } + if (NeedToShuffleReuses) { + // TODO: Merge this shuffle with the ReorderShuffleMask. + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + } E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -4664,9 +4632,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { setInsertPointAfterBundle(E); Value *VecValue = vectorizeTree(E->getOperand(0)); - ShuffleBuilder.addMask(E->ReorderIndices); - VecValue = ShuffleBuilder.finalize(VecValue); - + if (IsReorder) { + SmallVector Mask(E->ReorderIndices.begin(), + E->ReorderIndices.end()); + VecValue = Builder.CreateShuffleVector(VecValue, Mask, "reorder_shuf"); + } Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast( ScalarPtr, VecValue->getType()->getPointerTo(AS)); @@ -4680,6 +4650,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ExternalUses.push_back(ExternalUser(ScalarPtr, cast(VecPtr), 0)); Value *V = propagateMetadata(ST, E->Scalars); + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); E->VectorizedValue = V; ++NumVectorInstructions; @@ -4717,8 +4689,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (Instruction *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - V = ShuffleBuilder.finalize(V); + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); E->VectorizedValue = V; ++NumVectorInstructions; @@ -4780,8 +4752,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ExternalUses.push_back(ExternalUser(ScalarArg, cast(V), 0)); propagateIRFlags(V, E->Scalars, VL0); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - V = ShuffleBuilder.finalize(V); + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); E->VectorizedValue = V; ++NumVectorInstructions; @@ -4847,8 +4819,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *V = Builder.CreateShuffleVector(V0, V1, Mask); if (Instruction *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - V = ShuffleBuilder.finalize(V); + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); E->VectorizedValue = V; ++NumVectorInstructions; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll index d1754c0..1c8ddba 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll @@ -82,7 +82,8 @@ define void @f3(<2 x i16> %x, i16* %a) { ; CHECK: cont: ; CHECK-NEXT: [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ] ; CHECK-NEXT: [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> poison, <4 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> poison, <2 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[REORDER_SHUFFLE]], <2 x i16> poison, <4 x i32> ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll index 741dbce..f9e38ea 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll @@ -35,7 +35,8 @@ define void @i64_simplifiedi_reversed(i64* noalias %st, i64* noalias %ld) { ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <2 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[REORDER_SHUFFLE]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll index 1b89c9a..e7b2ce8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll @@ -8,9 +8,10 @@ define void @wombat(i32* %ptr, i32* %ptr1) { ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 0 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[REORDER_SHUFFLE]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[PTR1:%.*]], i32 3 -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[REORDER_SHUFFLE]], ; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 4 ; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 5 -- 2.7.4