From 4e7b71bc86e276bb87a4af7b457367f15eba00f2 Mon Sep 17 00:00:00 2001
From: Matthew Simpson
Date: Thu, 23 Mar 2017 16:29:58 +0000
Subject: [PATCH] [LV] Vectorize GEPs

This patch adds support for vectorizing GEPs. Previously, we only
generated vector GEPs on demand when creating gather or scatter
operations. All GEPs from the original loop were scalarized by default,
and if a pointer was to be stored to memory, we would have to build up
the pointer vector with insertelement instructions.

With this patch, we will vectorize all GEPs that haven't already been
marked for scalarization.

The patch refines collectLoopScalars to identify the scalar GEPs more
precisely; the function now more closely resembles collectLoopUniforms.
The patch also moves vector GEP creation out of
vectorizeMemoryInstruction and into the main vectorization loop. The
vector GEPs needed for gather and scatter operations will have already
been generated before we vectorize the memory accesses.

Differential Revision: https://reviews.llvm.org/D30710

llvm-svn: 298620
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp    | 184 +++++++++++++--------
 .../LoopVectorize/X86/consecutive-ptr-uniforms.ll  |  44 +++--
 .../Transforms/LoopVectorize/X86/scatter_crash.ll  | 106 ++----------
 llvm/test/Transforms/LoopVectorize/vector-geps.ll  |  61 +++++++
 4 files changed, 221 insertions(+), 174 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/vector-geps.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fa2c71d..e1e87f0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -277,32 +277,6 @@ static Type *ToVectorTy(Type *Scalar, unsigned VF) {
   return VectorType::get(Scalar, VF);
 }
 
-/// A helper function that returns GEP instruction and knows to skip a
-/// 'bitcast'. The 'bitcast' may be skipped if the source and the destination
-/// pointee types of the 'bitcast' have the same size.
-/// For example:
-///   bitcast double** %var to i64* - can be skipped
-///   bitcast double** %var to i8*  - can not
-static GetElementPtrInst *getGEPInstruction(Value *Ptr) {
-
-  if (isa<GetElementPtrInst>(Ptr))
-    return cast<GetElementPtrInst>(Ptr);
-
-  if (isa<BitCastInst>(Ptr) &&
-      isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) {
-    Type *BitcastTy = Ptr->getType();
-    Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy();
-    if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy))
-      return nullptr;
-    Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType();
-    Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType();
-    const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout();
-    if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty))
-      return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0));
-  }
-  return nullptr;
-}
-
 // FIXME: The following helper functions have multiple implementations
 // in the project. They can be effectively organized in a common Load/Store
 // utilities unit.
@@ -2997,40 +2971,12 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
   VectorParts VectorGep;
 
   // Handle consecutive loads/stores.
-  GetElementPtrInst *Gep = getGEPInstruction(Ptr);
   if (ConsecutiveStride) {
     Ptr = getScalarValue(Ptr, 0, 0);
   } else {
     // At this point we should vector version of GEP for Gather or Scatter
     assert(CreateGatherScatter && "The instruction should be scalarized");
-    if (Gep) {
-      // Vectorizing GEP, across UF parts. We want to get a vector value for base
-      // and each index that's defined inside the loop, even if it is
-      // loop-invariant but wasn't hoisted out. Otherwise we want to keep them
-      // scalar.
-      SmallVector<VectorParts, 4> OpsV;
-      for (Value *Op : Gep->operands()) {
-        Instruction *SrcInst = dyn_cast<Instruction>(Op);
-        if (SrcInst && OrigLoop->contains(SrcInst))
-          OpsV.push_back(getVectorValue(Op));
-        else
-          OpsV.push_back(VectorParts(UF, Op));
-      }
-      for (unsigned Part = 0; Part < UF; ++Part) {
-        SmallVector<Value *, 4> Ops;
-        Value *GEPBasePtr = OpsV[0][Part];
-        for (unsigned i = 1; i < Gep->getNumOperands(); i++)
-          Ops.push_back(OpsV[i][Part]);
-        Value *NewGep = Builder.CreateGEP(GEPBasePtr, Ops, "VectorGep");
-        cast<GetElementPtrInst>(NewGep)->setIsInBounds(Gep->isInBounds());
-        assert(NewGep->getType()->isVectorTy() && "Expected vector GEP");
-
-        NewGep =
-            Builder.CreateBitCast(NewGep, VectorType::get(Ptr->getType(), VF));
-        VectorGep.push_back(NewGep);
-      }
-    } else
-      VectorGep = getVectorValue(Ptr);
+    VectorGep = getVectorValue(Ptr);
   }
 
   VectorParts Mask = createBlockInMask(Instr->getParent());
@@ -4790,7 +4736,72 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB) {
       widenPHIInstruction(&I, UF, VF);
       continue;
     } // End of PHI.
 
+    case Instruction::GetElementPtr: {
+      // Construct a vector GEP by widening the operands of the scalar GEP as
+      // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
+      // results in a vector of pointers when at least one operand of the GEP
+      // is vector-typed. Thus, to keep the representation compact, we only use
+      // vector-typed operands for loop-varying values.
+      auto *GEP = cast<GetElementPtrInst>(&I);
+      VectorParts Entry(UF);
+
+      if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
+        // If we are vectorizing, but the GEP has only loop-invariant operands,
+        // the GEP we build (by only using vector-typed operands for
+        // loop-varying values) would be a scalar pointer. Thus, to ensure we
+        // produce a vector of pointers, we need to either arbitrarily pick an
+        // operand to broadcast, or broadcast a clone of the original GEP.
+        // Here, we broadcast a clone of the original.
+        //
+        // TODO: If at some point we decide to scalarize instructions having
+        //       loop-invariant operands, this special case will no longer be
+        //       required. We would add the scalarization decision to
+        //       collectLoopScalars() and teach getVectorValue() to broadcast
+        //       the lane-zero scalar value.
+        auto *Clone = Builder.Insert(GEP->clone());
+        for (unsigned Part = 0; Part < UF; ++Part)
+          Entry[Part] = Builder.CreateVectorSplat(VF, Clone);
+      } else {
+        // If the GEP has at least one loop-varying operand, we are sure to
+        // produce a vector of pointers. But if we are only unrolling, we want
+        // to produce a scalar GEP for each unroll part. Thus, the GEP we
+        // produce with the code below will be scalar (if VF == 1) or vector
+        // (otherwise). Note that for the unroll-only case, we still maintain
+        // values in the vector mapping with initVector, as we do for other
+        // instructions.
+        for (unsigned Part = 0; Part < UF; ++Part) {
+
+          // The pointer operand of the new GEP. If it's loop-invariant, we
+          // won't broadcast it.
+          auto *Ptr = OrigLoop->isLoopInvariant(GEP->getPointerOperand())
+                          ? GEP->getPointerOperand()
+                          : getVectorValue(GEP->getPointerOperand())[Part];
+
+          // Collect all the indices for the new GEP. If any index is
+          // loop-invariant, we won't broadcast it.
+          SmallVector<Value *, 4> Indices;
+          for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
+            if (OrigLoop->isLoopInvariant(U.get()))
+              Indices.push_back(U.get());
+            else
+              Indices.push_back(getVectorValue(U.get())[Part]);
+          }
+
+          // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+          // but it should be a vector, otherwise.
+          auto *NewGEP = GEP->isInBounds()
+                             ? Builder.CreateInBoundsGEP(Ptr, Indices)
+                             : Builder.CreateGEP(Ptr, Indices);
+          assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
+                 "NewGEP is not a pointer vector");
+          Entry[Part] = NewGEP;
+        }
+      }
+      VectorLoopValueMap.initVector(&I, Entry);
+      addMetadata(Entry, GEP);
+      break;
+    }
     case Instruction::UDiv:
     case Instruction::SDiv:
     case Instruction::SRem:
@@ -5481,21 +5492,58 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
   // If an instruction is uniform after vectorization, it will remain scalar.
   Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
 
-  // Collect the getelementptr instructions that will not be vectorized. A
-  // getelementptr instruction is only vectorized if it is used for a legal
-  // gather or scatter operation.
-  for (auto *BB : TheLoop->blocks())
+  // These sets are used to seed the analysis of loop scalars with memory
+  // access pointer operands that will remain scalar.
+  SmallSetVector<Instruction *, 8> ScalarPtrs;
+  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
+
+  // Returns true if the given instruction will not be a gather or scatter
+  // operation with vectorization factor VF.
+  auto isScalarDecision = [&](Instruction *I, unsigned VF) {
+    InstWidening WideningDecision = getWideningDecision(I, VF);
+    assert(WideningDecision != CM_Unknown &&
+           "Widening decision should be ready at this moment");
+    return WideningDecision != CM_GatherScatter;
+  };
+
+  // Collect the initial values that we know will not be vectorized. A value
+  // will remain scalar if it is only used as the pointer operand of memory
+  // accesses that are not gather or scatter operations.
+  for (auto *BB : TheLoop->blocks()) {
     for (auto &I : *BB) {
-      if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
-        Scalars[VF].insert(GEP);
-        continue;
-      }
-      auto *Ptr = getPointerOperand(&I);
+
+      // If there's no pointer operand or the pointer operand is not an
+      // instruction, there's nothing to do.
+      auto *Ptr = dyn_cast_or_null<Instruction>(getPointerOperand(&I));
       if (!Ptr)
         continue;
-      auto *GEP = getGEPInstruction(Ptr);
-      if (GEP && getWideningDecision(&I, VF) == CM_GatherScatter)
-        Scalars[VF].erase(GEP);
+
+      // If the pointer has already been identified as scalar (e.g., if it was
+      // also identified as uniform), there's nothing to do.
+      if (Scalars[VF].count(Ptr))
+        continue;
+
+      // True if all users of Ptr are memory accesses that have Ptr as their
+      // pointer operand.
+      auto UsersAreMemAccesses = all_of(Ptr->users(), [&](User *U) -> bool {
+        return getPointerOperand(U) == Ptr;
+      });
+
+      // If the pointer is used by an instruction other than a memory access,
+      // it may not remain scalar. If the memory access is a gather or scatter
+      // operation, the pointer will not remain scalar.
+      if (!UsersAreMemAccesses || !isScalarDecision(&I, VF))
+        PossibleNonScalarPtrs.insert(Ptr);
+      else
+        ScalarPtrs.insert(Ptr);
+    }
+  }
+
+  // Add to the set of scalars all the pointers we know will not be vectorized.
+  for (auto *I : ScalarPtrs)
+    if (!PossibleNonScalarPtrs.count(I)) {
+      DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
+      Scalars[VF].insert(I);
     }
 
   // An induction variable will remain scalar if all users of the induction
@@ -5526,6 +5574,8 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
 
     // The induction variable and its update instruction will remain scalar.
Scalars[VF].insert(Ind); Scalars[VF].insert(IndUpdate); + DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); + DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n"); } } diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll index 8203352..82f2e06 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll @@ -13,23 +13,33 @@ target triple = "x86_64-unknown-linux-gnu" ; scatter operation. %tmp3 (and the induction variable) should not be marked ; uniform-after-vectorization. ; -; CHECK: LV: Found uniform instruction: %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i -; CHECK-NOT: LV: Found uniform instruction: %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i -; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] -; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5 -; CHECK: vector.body: -; CHECK: %index = phi i64 -; CHECK: %vec.ind = phi <16 x i64> -; CHECK: %[[T0:.+]] = mul i64 %index, 5 -; CHECK: %[[T1:.+]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %[[T0]] -; CHECK: %[[T2:.+]] = bitcast float* %[[T1]] to <80 x float>* -; CHECK: load <80 x float>, <80 x float>* %[[T2]], align 4 -; CHECK: %[[T3:.+]] = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %[[T0]] -; CHECK: %[[T4:.+]] = bitcast float* %[[T3]] to <80 x float>* -; CHECK: load <80 x float>, <80 x float>* %[[T4]], align 4 -; CHECK: %VectorGep = getelementptr inbounds %data, %data* %d, i64 0, i32 0, <16 x i64> %vec.ind -; CHECK: call void @llvm.masked.scatter.v16f32({{.*}}, <16 x float*> %VectorGep, {{.*}}) -; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; CHECK: LV: Found uniform instruction: %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i +; CHECK-NOT: LV: Found uniform instruction: %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i +; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] +; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5 +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x float> undef, float %x, i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x float> [[BROADCAST_SPLATINSERT]], <16 x float> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: br label %vector.body +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <80 x float>* +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <80 x float>, <80 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <80 x float> [[WIDE_VEC]], <80 x float> undef, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fmul <16 x float> [[BROADCAST_SPLAT]], [[STRIDED_VEC]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds %data, %data* %d, i64 0, i32 0, <16 x i64> [[VEC_IND]] +; CHECK-NEXT: [[BC:%.*]] = bitcast <16 x float*> [[TMP3]] to <16 x <80 x float>*> +; CHECK-NEXT: [[TMP4:%.*]] 
= extractelement <16 x <80 x float>*> [[BC]], i32 0 +; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <80 x float>, <80 x float>* [[TMP4]], align 4 +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <80 x float> [[WIDE_VEC1]], <80 x float> undef, <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = fadd <16 x float> [[STRIDED_VEC2]], [[TMP2]] +; CHECK-NEXT: call void @llvm.masked.scatter.v16f32(<16 x float> [[TMP5]], <16 x float*> [[TMP3]], i32 4, <16 x i1> ) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body %data = type { [32000 x float], [3 x i32], [4 x i8], [32000 x float] } diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll index ec67e63..bda4b24 100755 --- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll @@ -16,97 +16,23 @@ target triple = "x86_64-apple-macosx10.11.0" define void @_Z3fn1v() #0 { ; CHECK-LABEL: @_Z3fn1v( ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX:%.*]].next, %vector.body ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ -; CHECK-NEXT: [[VEC_IND3:%.*]] = phi <16 x i64> [ -; CHECK-NEXT: [[SHL:%.*]] = shl i64 %index, 1 -; CHECK-NEXT: %offset.idx = add i64 [[SHL]], 8 -; CHECK-NEXT: [[IND00:%.*]] = add i64 %offset.idx, 0 -; CHECK-NEXT: [[IND02:%.*]] = add i64 %offset.idx, 2 -; CHECK-NEXT: [[IND04:%.*]] = add i64 %offset.idx, 4 -; CHECK-NEXT: [[IND06:%.*]] = add i64 %offset.idx, 6 -; CHECK-NEXT: [[IND08:%.*]] = add i64 %offset.idx, 8 -; CHECK-NEXT: [[IND10:%.*]] = add i64 %offset.idx, 10 -; CHECK-NEXT: [[IND12:%.*]] = add i64 %offset.idx, 12 -; CHECK-NEXT: [[IND14:%.*]] = add i64 %offset.idx, 14 -; CHECK-NEXT: [[IND16:%.*]] = add i64 %offset.idx, 16 -; CHECK-NEXT: [[IND18:%.*]] = add i64 %offset.idx, 18 -; CHECK-NEXT: [[IND20:%.*]] = add i64 %offset.idx, 20 -; CHECK-NEXT: [[IND22:%.*]] = add i64 %offset.idx, 22 -; CHECK-NEXT: [[IND24:%.*]] = add i64 %offset.idx, 24 -; CHECK-NEXT: [[IND26:%.*]] = add i64 %offset.idx, 26 -; CHECK-NEXT: [[IND28:%.*]] = add i64 %offset.idx, 28 -; CHECK-NEXT: [[IND30:%.*]] = add i64 %offset.idx, 30 +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_IND3:%.*]] = phi <16 x i64> [ , %vector.ph ], [ [[VEC_IND_NEXT4:%.*]], %vector.body ] ; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <16 x i64> , [[VEC_IND]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND00]] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND02]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND04]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND06]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND08]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND10]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND12]] -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x 
i32]]* @d, i64 0, i64 [[IND14]] -; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND16]] -; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND18]] -; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND20]] -; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND22]] -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND24]] -; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND26]] -; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND28]] -; CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND30]] -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x [10 x i32]*> undef, [10 x i32]* [[TMP12]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x [10 x i32]*> [[TMP13]], [10 x i32]* [[TMP15]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x [10 x i32]*> [[TMP16]], [10 x i32]* [[TMP18]], i32 2 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x [10 x i32]*> [[TMP19]], [10 x i32]* [[TMP21]], i32 3 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x [10 x i32]*> [[TMP22]], [10 x i32]* [[TMP24]], i32 4 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x [10 x i32]*> [[TMP25]], [10 x i32]* [[TMP27]], i32 5 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x [10 x i32]*> [[TMP28]], [10 x i32]* [[TMP30]], i32 6 -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x [10 x i32]*> [[TMP31]], [10 x i32]* [[TMP33]], i32 7 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x [10 x i32]*> [[TMP34]], [10 x i32]* [[TMP36]], i32 8 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x [10 x i32]*> [[TMP37]], [10 x i32]* [[TMP39]], i32 9 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <16 x [10 x i32]*> [[TMP40]], [10 x i32]* [[TMP42]], i32 10 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <16 x [10 x i32]*> [[TMP43]], [10 x i32]* [[TMP45]], i32 11 -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <16 x [10 x i32]*> [[TMP46]], [10 x i32]* [[TMP48]], i32 12 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <16 x [10 x i32]*> [[TMP49]], [10 x i32]* [[TMP51]], i32 13 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <16 x [10 x i32]*> [[TMP52]], [10 x i32]* [[TMP54]], i32 14 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <16 x [10 x i32]*> [[TMP55]], [10 x i32]* [[TMP57]], i32 15 -; CHECK-NEXT: [[TMP59:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]] -; CHECK-NEXT: [[TMP61:%.*]] = extractelement <16 x i64> [[TMP59]], i32 0 -; CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP12]], i64 [[TMP61]], i64 0 -; CHECK-NEXT: [[TMP65:%.*]] = extractelement <16 x i64> [[TMP59]], i32 1 -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP15]], i64 [[TMP65]], i64 0 -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <16 x i64> [[TMP59]], i32 2 -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP18]], i64 [[TMP69]], i64 0 -; CHECK-NEXT: [[TMP73:%.*]] = extractelement <16 x i64> [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP21]], i64 [[TMP73]], i64 0 -; CHECK-NEXT: [[TMP77:%.*]] = extractelement <16 x i64> [[TMP59]], i32 4 -; CHECK-NEXT: [[TMP78:%.*]] = 
getelementptr inbounds [10 x i32], [10 x i32]* [[TMP24]], i64 [[TMP77]], i64 0 -; CHECK-NEXT: [[TMP81:%.*]] = extractelement <16 x i64> [[TMP59]], i32 5 -; CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP27]], i64 [[TMP81]], i64 0 -; CHECK-NEXT: [[TMP85:%.*]] = extractelement <16 x i64> [[TMP59]], i32 6 -; CHECK-NEXT: [[TMP86:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP30]], i64 [[TMP85]], i64 0 -; CHECK-NEXT: [[TMP89:%.*]] = extractelement <16 x i64> [[TMP59]], i32 7 -; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP33]], i64 [[TMP89]], i64 0 -; CHECK-NEXT: [[TMP93:%.*]] = extractelement <16 x i64> [[TMP59]], i32 8 -; CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP36]], i64 [[TMP93]], i64 0 -; CHECK-NEXT: [[TMP97:%.*]] = extractelement <16 x i64> [[TMP59]], i32 9 -; CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP39]], i64 [[TMP97]], i64 0 -; CHECK-NEXT: [[TMP101:%.*]] = extractelement <16 x i64> [[TMP59]], i32 10 -; CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP42]], i64 [[TMP101]], i64 0 -; CHECK-NEXT: [[TMP105:%.*]] = extractelement <16 x i64> [[TMP59]], i32 11 -; CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP45]], i64 [[TMP105]], i64 0 -; CHECK-NEXT: [[TMP109:%.*]] = extractelement <16 x i64> [[TMP59]], i32 12 -; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP48]], i64 [[TMP109]], i64 0 -; CHECK-NEXT: [[TMP113:%.*]] = extractelement <16 x i64> [[TMP59]], i32 13 -; CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP51]], i64 [[TMP113]], i64 0 -; CHECK-NEXT: [[TMP117:%.*]] = extractelement <16 x i64> [[TMP59]], i32 14 -; CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP54]], i64 [[TMP117]], i64 0 -; CHECK-NEXT: [[TMP121:%.*]] = extractelement <16 x i64> [[TMP59]], i32 15 -; CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP57]], i64 [[TMP121]], i64 0 -; CHECK-NEXT: [[VECTORGEP:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP58]], <16 x i64> [[TMP59]], i64 0 -; CHECK-NEXT: call void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> [[VECTORGEP]], i32 16, <16 x i1> ) -; CHECK: [[STEP_ADD:%.*]] = add <16 x i64> [[VEC_IND]], -; CHECK: [[STEP_ADD4:%.*]] = add <16 x i64> [[VEC_IND3]], +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, <16 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP12:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP12]], i64 0 +; CHECK-NEXT: call void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> [[TMP13]], i32 16, <16 x i1> ) +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i64> [[VEC_IND3]], +; CHECK-NEXT: [[TMP15:%.*]] = add nsw <16 x i64> [[TMP10]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP15]], i64 0 +; CHECK-NEXT: call void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> [[TMP16]], i32 8, <16 x i1> ) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], +; CHECK-NEXT: [[VEC_IND_NEXT4]] = add <16 x i64> [[VEC_IND3]], +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; entry: %0 = load i32, i32* 
@c, align 4 %cmp34 = icmp sgt i32 %0, 8 diff --git a/llvm/test/Transforms/LoopVectorize/vector-geps.ll b/llvm/test/Transforms/LoopVectorize/vector-geps.ll new file mode 100644 index 0000000..bd79499 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/vector-geps.ll @@ -0,0 +1,61 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -instcombine -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; CHECK-LABEL: @vector_gep_stored( +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <4 x i32*>* +; CHECK-NEXT: store <4 x i32*> [[TMP1]], <4 x i32*>* [[TMP3]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; +define void @vector_gep_stored(i32** %a, i32 *%b, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i + %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i + store i32* %tmp0, i32** %tmp1, align 8 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +; CHECK-LABEL: @uniform_vector_gep_stored( +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, i64 1 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32*> undef, i32* [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32*> [[DOTSPLATINSERT]], <4 x i32*> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <4 x i32*>* +; CHECK-NEXT: store <4 x i32*> [[DOTSPLAT]], <4 x i32*>* [[TMP3]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; +define void @uniform_vector_gep_stored(i32** %a, i32 *%b, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %tmp0 = getelementptr inbounds i32, i32* %b, i64 1 + %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i + store i32* %tmp0, i32** %tmp1, align 8 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} -- 2.7.4
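
Illustrative note (not part of the patch itself): the practical effect of the
change can be seen in the vector_gep_stored test added above. Before this
patch, a pointer-producing GEP whose result is stored to memory was scalarized
and the pointer vector was rebuilt with insertelement instructions; with the
patch, the GEP is widened into a single vector GEP. Roughly, for VF = 4 (the
value names below are illustrative, not taken from actual vectorizer output):

  ; Before: one scalar GEP per lane, then a chain of insertelements.
  %g0 = getelementptr inbounds i32, i32* %b, i64 %i0
  %g1 = getelementptr inbounds i32, i32* %b, i64 %i1
  %g2 = getelementptr inbounds i32, i32* %b, i64 %i2
  %g3 = getelementptr inbounds i32, i32* %b, i64 %i3
  %p0 = insertelement <4 x i32*> undef, i32* %g0, i32 0
  %p1 = insertelement <4 x i32*> %p0, i32* %g1, i32 1
  %p2 = insertelement <4 x i32*> %p1, i32* %g2, i32 2
  %p3 = insertelement <4 x i32*> %p2, i32* %g3, i32 3
  store <4 x i32*> %p3, <4 x i32*>* %dst, align 8

  ; After: a single GEP with a vector index yields the pointer vector directly.
  %vgep = getelementptr inbounds i32, i32* %b, <4 x i64> %vec.ind
  store <4 x i32*> %vgep, <4 x i32*>* %dst, align 8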