From 845b73c06f533e623c8b0eae0b059aec1b5ff4a0 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Wed, 21 Nov 2012 08:16:30 +0000 Subject: [PATCH] PR14055: Implement support for sub-vector operations in SROA. Now if we can transform an alloca into a single vector value, but it has subvector, non-element accesses, we form the appropriate shufflevectors to allow SROA to proceed. This fixes PR14055 which pointed out a very common pattern that SROA couldn't handle -- mixed vec3 and vec4 operations on a single alloca. llvm-svn: 168418 --- llvm/lib/Transforms/Scalar/SROA.cpp | 96 +++++++++++++++++++++------ llvm/test/Transforms/SROA/vector-promotion.ll | 77 ++++++++++++++++++++- 2 files changed, 150 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 09dff27..8284e14 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -2116,12 +2116,11 @@ static bool isVectorPromotionViable(const DataLayout &TD, EndIndex > Ty->getNumElements()) return false; - // FIXME: We should build shuffle vector instructions to handle - // non-element-sized accesses. See PR14055 for an example of where this - // matters. - if ((EndOffset - BeginOffset) != ElementSize && - (EndOffset - BeginOffset) != VecSize) - return false; + assert(EndIndex > BeginIndex && "Empty vector!"); + uint64_t NumElements = EndIndex - BeginIndex; + Type *PartitionTy + = (NumElements == 1) ? Ty->getElementType() + : VectorType::get(Ty->getElementType(), NumElements); if (MemIntrinsic *MI = dyn_cast(I->U->getUser())) { if (MI->isVolatile()) @@ -2138,9 +2137,13 @@ static bool isVectorPromotionViable(const DataLayout &TD, } else if (LoadInst *LI = dyn_cast(I->U->getUser())) { if (LI->isVolatile()) return false; + if (!canConvertValue(TD, PartitionTy, LI->getType())) + return false; } else if (StoreInst *SI = dyn_cast(I->U->getUser())) { if (SI->isVolatile()) return false; + if (!canConvertValue(TD, SI->getValueOperand()->getType(), PartitionTy)) + return false; } else { return false; } @@ -2448,13 +2451,13 @@ private: return getOffsetTypeAlign(Ty, BeginOffset - NewAllocaBeginOffset); } - ConstantInt *getIndex(IRBuilder<> &IRB, uint64_t Offset) { + unsigned getIndex(uint64_t Offset) { assert(VecTy && "Can only call getIndex when rewriting a vector"); uint64_t RelOffset = Offset - NewAllocaBeginOffset; assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds"); uint32_t Index = RelOffset / ElementSize; assert(Index * ElementSize == RelOffset); - return IRB.getInt32(Index); + return Index; } void deleteIfTriviallyDead(Value *V) { @@ -2466,10 +2469,24 @@ private: Value *rewriteVectorizedLoadInst(IRBuilder<> &IRB, LoadInst &LI, Value *OldOp) { Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")); - if (LI.getType() == VecTy->getElementType() || - BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) { - V = IRB.CreateExtractElement(V, getIndex(IRB, BeginOffset), + unsigned BeginIndex = getIndex(BeginOffset); + unsigned EndIndex = getIndex(EndOffset); + assert(EndIndex > BeginIndex && "Empty vector!"); + unsigned NumElements = EndIndex - BeginIndex; + assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); + if (NumElements == 1) { + V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex), getName(".extract")); + DEBUG(dbgs() << " extract: " << *V << "\n"); + } else if (NumElements < VecTy->getNumElements()) { + SmallVector Mask; + Mask.reserve(NumElements); + for (unsigned i = BeginIndex; i != EndIndex; ++i) + Mask.push_back(IRB.getInt32(i)); + V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), + ConstantVector::get(Mask), + getName(".extract")); + DEBUG(dbgs() << " shuffle: " << *V << "\n"); } return V; } @@ -2569,15 +2586,52 @@ private: bool rewriteVectorizedStoreInst(IRBuilder<> &IRB, Value *V, StoreInst &SI, Value *OldOp) { - if (V->getType() == ElementTy || - BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) { - if (V->getType() != ElementTy) - V = convertValue(TD, IRB, V, ElementTy); + unsigned BeginIndex = getIndex(BeginOffset); + unsigned EndIndex = getIndex(EndOffset); + assert(EndIndex > BeginIndex && "Empty vector!"); + unsigned NumElements = EndIndex - BeginIndex; + assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); + Type *PartitionTy + = (NumElements == 1) ? ElementTy + : VectorType::get(ElementTy, NumElements); + if (V->getType() != PartitionTy) + V = convertValue(TD, IRB, V, PartitionTy); + if (NumElements < VecTy->getNumElements()) { + // We need to mix in the existing elements. LoadInst *LI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")); - V = IRB.CreateInsertElement(LI, V, getIndex(IRB, BeginOffset), - getName(".insert")); - } else if (V->getType() != VecTy) { + if (NumElements == 1) { + V = IRB.CreateInsertElement(LI, V, IRB.getInt32(BeginIndex), + getName(".insert")); + DEBUG(dbgs() << " insert: " << *V << "\n"); + } else { + // When inserting a smaller vector into the larger to store, we first + // use a shuffle vector to widen it with undef elements, and then + // a second shuffle vector to select between the loaded vector and the + // incoming vector. + SmallVector Mask; + Mask.reserve(VecTy->getNumElements()); + for (unsigned i = 0; i != VecTy->getNumElements(); ++i) + if (i >= BeginIndex && i < EndIndex) + Mask.push_back(IRB.getInt32(i - BeginIndex)); + else + Mask.push_back(UndefValue::get(IRB.getInt32Ty())); + V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), + ConstantVector::get(Mask), + getName(".expand")); + DEBUG(dbgs() << " shuffle1: " << *V << "\n"); + + Mask.clear(); + for (unsigned i = 0; i != VecTy->getNumElements(); ++i) + if (i >= BeginIndex && i < EndIndex) + Mask.push_back(IRB.getInt32(i)); + else + Mask.push_back(IRB.getInt32(i + VecTy->getNumElements())); + V = IRB.CreateShuffleVector(V, LI, ConstantVector::get(Mask), + getName("insert")); + DEBUG(dbgs() << " shuffle2: " << *V << "\n"); + } + } else { V = convertValue(TD, IRB, V, VecTy); } StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); @@ -2731,7 +2785,7 @@ private: IRB.CreateInsertElement(IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")), - V, getIndex(IRB, BeginOffset), + V, IRB.getInt32(getIndex(BeginOffset)), getName(".insert")), &NewAI, NewAI.getAlignment()); (void)Store; @@ -2899,7 +2953,7 @@ private: // We have to extract rather than load. Src = IRB.CreateExtractElement( IRB.CreateAlignedLoad(SrcPtr, Align, getName(".copyload")), - getIndex(IRB, BeginOffset), + IRB.getInt32(getIndex(BeginOffset)), getName(".copyextract")); } else if (IntTy && !IsWholeAlloca && !IsDest) { Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), @@ -2927,7 +2981,7 @@ private: // We have to insert into a loaded copy before storing. Src = IRB.CreateInsertElement( IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")), - Src, getIndex(IRB, BeginOffset), + Src, IRB.getInt32(getIndex(BeginOffset)), getName(".insert")); } diff --git a/llvm/test/Transforms/SROA/vector-promotion.ll b/llvm/test/Transforms/SROA/vector-promotion.ll index ea28f5d..f1e1189 100644 --- a/llvm/test/Transforms/SROA/vector-promotion.ll +++ b/llvm/test/Transforms/SROA/vector-promotion.ll @@ -36,15 +36,15 @@ entry: define i32 @test2(<4 x i32> %x, <4 x i32> %y) { ; CHECK: @test2 -; FIXME: This should be handled! entry: %a = alloca [2 x <4 x i32>] -; CHECK: alloca <4 x i32> +; CHECK-NOT: alloca %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0 store <4 x i32> %x, <4 x i32>* %a.x %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1 store <4 x i32> %y, <4 x i32>* %a.y +; CHECK-NOT: store %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2 %tmp1 = load i32* %a.tmp1 @@ -54,10 +54,18 @@ entry: %a.tmp3.cast = bitcast i32* %a.tmp3 to <2 x i32>* %tmp3.vec = load <2 x i32>* %a.tmp3.cast %tmp3 = extractelement <2 x i32> %tmp3.vec, i32 0 +; CHECK-NOT: load +; CHECK: %[[extract1:.*]] = extractelement <4 x i32> %x, i32 2 +; CHECK-NEXT: %[[extract2:.*]] = extractelement <4 x i32> %y, i32 3 +; CHECK-NEXT: %[[extract3:.*]] = shufflevector <4 x i32> %y, <4 x i32> undef, <2 x i32> +; CHECK-NEXT: %[[extract4:.*]] = extractelement <2 x i32> %[[extract3]], i32 0 %tmp4 = add i32 %tmp1, %tmp2 %tmp5 = add i32 %tmp3, %tmp4 ret i32 %tmp5 +; CHECK-NEXT: %[[sum1:.*]] = add i32 %[[extract1]], %[[extract2]] +; CHECK-NEXT: %[[sum2:.*]] = add i32 %[[extract4]], %[[sum1]] +; CHECK-NEXT: ret i32 %[[sum2]] } define i32 @test3(<4 x i32> %x, <4 x i32> %y) { @@ -206,6 +214,71 @@ define i64 @test6(<4 x i64> %x, <4 x i64> %y, i64 %n) { ret i64 %res } +define <4 x i32> @test_subvec_store() { +; CHECK: @test_subvec_store +entry: + %a = alloca <4 x i32> +; CHECK-NOT: alloca + + %a.gep0 = getelementptr <4 x i32>* %a, i32 0, i32 0 + %a.cast0 = bitcast i32* %a.gep0 to <2 x i32>* + store <2 x i32> , <2 x i32>* %a.cast0 +; CHECK-NOT: store +; CHECK: %[[insert1:.*]] = shufflevector <4 x i32> , <4 x i32> undef, <4 x i32> + + %a.gep1 = getelementptr <4 x i32>* %a, i32 0, i32 1 + %a.cast1 = bitcast i32* %a.gep1 to <2 x i32>* + store <2 x i32> , <2 x i32>* %a.cast1 +; CHECK-NEXT: %[[insert2:.*]] = shufflevector <4 x i32> , <4 x i32> %[[insert1]], <4 x i32> + + %a.gep2 = getelementptr <4 x i32>* %a, i32 0, i32 2 + %a.cast2 = bitcast i32* %a.gep2 to <2 x i32>* + store <2 x i32> , <2 x i32>* %a.cast2 +; CHECK-NEXT: %[[insert3:.*]] = shufflevector <4 x i32> , <4 x i32> %[[insert2]], <4 x i32> + + %a.gep3 = getelementptr <4 x i32>* %a, i32 0, i32 3 + store i32 3, i32* %a.gep3 +; CHECK-NEXT: %[[insert4:.*]] = insertelement <4 x i32> %[[insert3]], i32 3, i32 3 + + %ret = load <4 x i32>* %a + + ret <4 x i32> %ret +; CHECK-NEXT: ret <4 x i32> %[[insert4]] +} + +define <4 x i32> @test_subvec_load() { +; CHECK: @test_subvec_load +entry: + %a = alloca <4 x i32> +; CHECK-NOT: alloca + store <4 x i32> , <4 x i32>* %a +; CHECK-NOT: store + + %a.gep0 = getelementptr <4 x i32>* %a, i32 0, i32 0 + %a.cast0 = bitcast i32* %a.gep0 to <2 x i32>* + %first = load <2 x i32>* %a.cast0 +; CHECK-NOT: load +; CHECK: %[[extract1:.*]] = shufflevector <4 x i32> , <4 x i32> undef, <2 x i32> + + %a.gep1 = getelementptr <4 x i32>* %a, i32 0, i32 1 + %a.cast1 = bitcast i32* %a.gep1 to <2 x i32>* + %second = load <2 x i32>* %a.cast1 +; CHECK-NEXT: %[[extract2:.*]] = shufflevector <4 x i32> , <4 x i32> undef, <2 x i32> + + %a.gep2 = getelementptr <4 x i32>* %a, i32 0, i32 2 + %a.cast2 = bitcast i32* %a.gep2 to <2 x i32>* + %third = load <2 x i32>* %a.cast2 +; CHECK-NEXT: %[[extract3:.*]] = shufflevector <4 x i32> , <4 x i32> undef, <2 x i32> + + %tmp = shufflevector <2 x i32> %first, <2 x i32> %second, <2 x i32> + %ret = shufflevector <2 x i32> %tmp, <2 x i32> %third, <4 x i32> +; CHECK-NEXT: %[[tmp:.*]] = shufflevector <2 x i32> %[[extract1]], <2 x i32> %[[extract2]], <2 x i32> +; CHECK-NEXT: %[[ret:.*]] = shufflevector <2 x i32> %[[tmp]], <2 x i32> %[[extract3]], <4 x i32> + + ret <4 x i32> %ret +; CHECK-NEXT: ret <4 x i32> %[[ret]] +} + define i32 @PR14212() { ; CHECK: @PR14212 ; This caused a crash when "splitting" the load of the i32 in order to promote -- 2.7.4