From a08c7736a771d1be6e3168ff94aa8d7ff4479ef4 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Wed, 10 Mar 2021 17:06:47 +0000
Subject: [PATCH] [LoopVectorize] Add support for scalable vectorization of induction variables

This patch adds support for the vectorization of induction variables
when using scalable vectors, which required the following changes:

1. Removed assert from InnerLoopVectorizer::getStepVector.
2. Modified InnerLoopVectorizer::createVectorIntOrFpInductionPHI to use
   a runtime determined value for VF and removed an assert.
3. Modified InnerLoopVectorizer::buildScalarSteps to work for scalable
   vectors. I did this by calculating the full vector value for each
   Part of the unroll factor (UF) and caching this in the VP state.
   This means that we are always able to extract an arbitrary element
   from the vector if necessary. In addition to this, I also permitted
   the caching of the individual lane values themselves for the known
   minimum number of elements in the same way we do for fixed width
   vectors. This is a further optimisation that improves the code
   quality since it avoids unnecessary extractelement operations when
   extracting the first lane.
4. Added an assert to InnerLoopVectorizer::widenPHIInstruction, since
   while testing some code paths I noticed this is currently broken for
   scalable vectors.

Various tests to support different cases have been added here:

  Transforms/LoopVectorize/AArch64/sve-inductions.ll

Differential Revision: https://reviews.llvm.org/D98715
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp |  66 ++++--
 .../LoopVectorize/AArch64/sve-inductions.ll     | 233 +++++++++++++++++++++
 2 files changed, 279 insertions(+), 20 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 43443e4..be21f61 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2248,16 +2248,20 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
 
   // Multiply the vectorization factor by the step using integer or
   // floating-point arithmetic as appropriate.
-  Value *ConstVF =
-      getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
-  Value *Mul = Builder.CreateBinOp(MulOp, Step, ConstVF);
+  Type *StepType = Step->getType();
+  if (Step->getType()->isFloatingPointTy())
+    StepType = IntegerType::get(StepType->getContext(),
+                                StepType->getScalarSizeInBits());
+  Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF);
+  if (Step->getType()->isFloatingPointTy())
+    RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType());
+  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
 
   // Create a vector splat to use in the induction update.
   //
   // FIXME: If the step is non-constant, we create the vector splat with
   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
   //        handle a constant vector splat.
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   Value *SplatVF = isa<Constant>(Mul)
                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                        : Builder.CreateVectorSplat(VF, Mul);
@@ -2460,8 +2464,6 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
                                           Instruction::BinaryOps BinOp) {
   // Create and check the types.
-  assert(isa<FixedVectorType>(Val->getType()) &&
-         "Creation of scalable step vector not yet supported");
   auto *ValVTy = cast<VectorType>(Val->getType());
   ElementCount VLen = ValVTy->getElementCount();
@@ -2532,23 +2534,46 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
   // Determine the number of scalars we need to generate for each unroll
   // iteration. If EntryVal is uniform, we only need to generate the first
   // lane. Otherwise, we generate all VF values.
-  unsigned Lanes =
-      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
-          ? 1
-          : VF.getKnownMinValue();
-  assert((!VF.isScalable() || Lanes == 1) &&
-         "Should never scalarize a scalable vector");
+  bool IsUniform =
+      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF);
+  unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue();
 
   // Compute the scalar steps and save the results in State.
+  Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
+                                     ScalarIVTy->getScalarSizeInBits());
+  Type *VecIVTy = nullptr;
+  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
+  if (!IsUniform && VF.isScalable()) {
+    VecIVTy = VectorType::get(ScalarIVTy, VF);
+    UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF));
+    SplatStep = Builder.CreateVectorSplat(VF, Step);
+    SplatIV = Builder.CreateVectorSplat(VF, ScalarIV);
+  }
+
   for (unsigned Part = 0; Part < UF; ++Part) {
-    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
-      auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
-                                         ScalarIVTy->getScalarSizeInBits());
-      Value *StartIdx =
-          createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
+    Value *StartIdx0 =
+        createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
+
+    if (!IsUniform && VF.isScalable()) {
+      auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
+      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
       if (ScalarIVTy->isFloatingPointTy())
-        StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
-      StartIdx = Builder.CreateBinOp(
-          AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane));
+        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
+      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
+      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
+      State.set(Def, Add, Part);
+      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
+                                            Part);
+      // It's useful to record the lane values too for the known minimum number
+      // of elements so we do those below. This improves the code quality when
+      // trying to extract the first element, for example.
+    }
+
+    if (ScalarIVTy->isFloatingPointTy())
+      StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
+
+    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+      Value *StartIdx = Builder.CreateBinOp(
+          AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
       // The step returned by `createStepForVF` is a runtime-evaluated value
       // when VF is scalable. Otherwise, it should be folded into a Constant.
       assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
              "Expected StartIdx to be folded to a constant when VF is not "
              "scalable");
@@ -4723,6 +4748,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
   case InductionDescriptor::IK_PtrInduction: {
     // Handle the pointer induction variable case.
     assert(P->getType()->isPointerTy() && "Unexpected type.");
+    assert(!VF.isScalable() && "Currently unsupported for scalable vectors");
     if (Cost->isScalarAfterVectorization(P, State.VF)) {
       // This is the normalized GEP that starts counting at zero.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
new file mode 100644
index 0000000..324a2d1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
@@ -0,0 +1,233 @@
+; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -loop-vectorize -dce -instcombine < %s -S 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+; Test that we can add on the induction variable
+;   for (long long i = 0; i < n; i++) {
+;     a[i] = b[i] + i;
+;   }
+; with an unroll factor (interleave count) of 2.
+
+define void @add_ind64_unrolled(i64* noalias nocapture %a, i64* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @add_ind64_unrolled(
+; CHECK-NEXT:  entry:
+; CHECK:       vector.body:
+; CHECK-NEXT:    %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ]
+; CHECK-NEXT:    %[[STEPVEC:.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT:    %[[TMP1:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[INDEX]], i32 0
+; CHECK-NEXT:    %[[IDXSPLT:.*]] = shufflevector <vscale x 2 x i64> %[[TMP1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    %[[VECIND1:.*]] = add <vscale x 2 x i64> %[[IDXSPLT]], %[[STEPVEC]]
+; CHECK-NEXT:    %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    %[[EC:.*]] = shl i64 %[[VSCALE]], 1
+; CHECK-NEXT:    %[[TMP2:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[EC]], i32 0
+; CHECK-NEXT:    %[[ECSPLT:.*]] = shufflevector <vscale x 2 x i64> %[[TMP2]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    %[[TMP3:.*]] = add <vscale x 2 x i64> %[[ECSPLT]], %[[STEPVEC]]
+; CHECK-NEXT:    %[[VECIND2:.*]] = add <vscale x 2 x i64> %[[IDXSPLT]], %[[TMP3]]
+; CHECK:         %[[LOAD1:.*]] = load <vscale x 2 x i64>
+; CHECK:         %[[LOAD2:.*]] = load <vscale x 2 x i64>
+; CHECK:         %[[STOREVAL1:.*]] = add nsw <vscale x 2 x i64> %[[LOAD1]], %[[VECIND1]]
+; CHECK:         %[[STOREVAL2:.*]] = add nsw <vscale x 2 x i64> %[[LOAD2]], %[[VECIND2]]
+; CHECK:         store <vscale x 2 x i64> %[[STOREVAL1]]
+; CHECK:         store <vscale x 2 x i64> %[[STOREVAL2]]
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i64, i64* %b, i64 %i.08
+  %0 = load i64, i64* %arrayidx, align 8
+  %add = add nsw i64 %0, %i.08
+  %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 %i.08
+  store i64 %add, i64* %arrayidx1, align 8
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !0
+
+exit:                                             ; preds = %for.body
+  ret void
+}
+
+
+; Same as above, except we test with a vectorisation factor of (1, scalable)
+
+define void @add_ind64_unrolled_nxv1i64(i64* noalias nocapture %a, i64* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @add_ind64_unrolled_nxv1i64(
+; CHECK-NEXT:  entry:
+; CHECK:       vector.body:
+; CHECK-NEXT:    %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ]
+; CHECK-NEXT:    %[[STEPVEC:.*]] = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
+; CHECK-NEXT:    %[[TMP1:.*]] = insertelement <vscale x 1 x i64> poison, i64 %[[INDEX]], i32 0
+; CHECK-NEXT:    %[[IDXSPLT:.*]] = shufflevector <vscale x 1 x i64> %[[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT:    %[[VECIND1:.*]] = add <vscale x 1 x i64> %[[IDXSPLT]], %[[STEPVEC]]
+; CHECK-NEXT:    %[[EC:.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    %[[TMP2:.*]] = insertelement <vscale x 1 x i64> poison, i64 %[[EC]], i32 0
+; CHECK-NEXT:    %[[ECSPLT:.*]] = shufflevector <vscale x 1 x i64> %[[TMP2]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT:    %[[TMP3:.*]] = add <vscale x 1 x i64> %[[ECSPLT]], %[[STEPVEC]]
+; CHECK-NEXT:    %[[VECIND2:.*]] = add <vscale x 1 x i64> %[[IDXSPLT]], %[[TMP3]]
+; CHECK:         %[[LOAD1:.*]] = load <vscale x 1 x i64>
+; CHECK:         %[[LOAD2:.*]] = load <vscale x 1 x i64>
+; CHECK:         %[[STOREVAL1:.*]] = add nsw <vscale x 1 x i64> %[[LOAD1]], %[[VECIND1]]
+; CHECK:         %[[STOREVAL2:.*]] = add nsw <vscale x 1 x i64> %[[LOAD2]], %[[VECIND2]]
+; CHECK:         store <vscale x 1 x i64> %[[STOREVAL1]]
+; CHECK:         store <vscale x 1 x i64> %[[STOREVAL2]]
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i64, i64* %b, i64 %i.08
+  %0 = load i64, i64* %arrayidx, align 8
+  %add = add nsw i64 %0, %i.08
+  %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 %i.08
+  store i64 %add, i64* %arrayidx1, align 8
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !9
+
+exit:                                             ; preds = %for.body
+  ret void
+}
+
+
+; Test that we can vectorize a separate induction variable (not used for the branch)
+;   int r = 0;
+;   for (long long i = 0; i < n; i++) {
+;     a[i] = r;
+;     r += 2;
+;   }
+; with an unroll factor (interleave count) of 1.
+
+
+define void @add_unique_ind32(i32* noalias nocapture %a, i64 %n) {
+; CHECK-LABEL: @add_unique_ind32(
+; CHECK:       vector.ph:
+; CHECK:         %[[STEPVEC:.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK-NEXT:    %[[INDINIT:.*]] = shl <vscale x 4 x i32> %[[STEPVEC]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 1, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    %[[INC:.*]] = shl i32 %[[VSCALE]], 3
+; CHECK-NEXT:    %[[TMP:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[INC]], i32 0
+; CHECK-NEXT:    %[[VECINC:.*]] = shufflevector <vscale x 4 x i32> %[[TMP]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK:       vector.body:
+; CHECK:         %[[VECIND:.*]] = phi <vscale x 4 x i32> [ %[[INDINIT]], %vector.ph ], [ %[[VECINDNXT:.*]], %vector.body ]
+; CHECK:         store <vscale x 4 x i32> %[[VECIND]]
+; CHECK:         %[[VECINDNXT]] = add <vscale x 4 x i32> %[[VECIND]], %[[VECINC]]
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i.08
+  store i32 %r.07, i32* %arrayidx, align 4
+  %add = add nuw nsw i32 %r.07, 2
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6
+
+exit:                                             ; preds = %for.body
+  ret void
+}
+
+
+; Test that we can vectorize a separate FP induction variable (not used for the branch)
+;   float r = 0;
+;   for (long long i = 0; i < n; i++) {
+;     a[i] = r;
+;     r += 2;
+;   }
+; with an unroll factor (interleave count) of 1.
+
+define void @add_unique_indf32(float* noalias nocapture %a, i64 %n) {
+; CHECK-LABEL: @add_unique_indf32(
+; CHECK:       vector.ph:
+; CHECK:         %[[STEPVEC:.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK-NEXT:    %[[TMP1:.*]] = uitofp <vscale x 4 x i32> %[[STEPVEC]] to <vscale x 4 x float>
+; CHECK-NEXT:    %[[TMP2:.*]] = fmul <vscale x 4 x float> %[[TMP1]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    %[[INDINIT:.*]] = fadd <vscale x 4 x float> %[[TMP2]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    %[[TMP3:.*]] = shl i32 %[[VSCALE]], 2
+; CHECK-NEXT:    %[[TMP4:.*]] = sitofp i32 %[[TMP3]] to float
+; CHECK-NEXT:    %[[INC:.*]] = fmul float %[[TMP4]], 2.000000e+00
+; CHECK-NEXT:    %[[TMP5:.*]] = insertelement <vscale x 4 x float> poison, float %[[INC]], i32 0
+; CHECK-NEXT:    %[[VECINC:.*]] = shufflevector <vscale x 4 x float> %[[TMP5]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK:       vector.body:
+; CHECK:         %[[VECIND:.*]] = phi <vscale x 4 x float> [ %[[INDINIT]], %vector.ph ], [ %[[VECINDNXT:.*]], %vector.body ]
+; CHECK:         store <vscale x 4 x float> %[[VECIND]]
+; CHECK:         %[[VECINDNXT]] = fadd <vscale x 4 x float> %[[VECIND]], %[[VECINC]]
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %r.07 = phi float [ %add, %for.body ], [ 0.000000e+00, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %i.08
+  store float %r.07, float* %arrayidx, align 4
+  %add = fadd float %r.07, 2.000000e+00
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6
+
+exit:                                             ; preds = %for.body
+  ret void
+}
+
+; Test a case where the vectorised induction variable is used to
+; generate a mask:
+;   for (long long i = 0; i < n; i++) {
+;     if (i & 0x1)
+;       a[i] = b[i];
+;   }
+
+define void @cond_ind64(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @cond_ind64(
+; CHECK:       vector.body:
+; CHECK-NEXT:    %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ]
+; CHECK:         %[[STEPVEC:.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT:    %[[TMP1:.*]] = insertelement <vscale x 4 x i64> poison, i64 %[[INDEX]], i32 0
+; CHECK-NEXT:    %[[IDXSPLT:.*]] = shufflevector <vscale x 4 x i64> %[[TMP1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    %[[VECIND:.*]] = add <vscale x 4 x i64> %[[IDXSPLT]], %[[STEPVEC]]
+; CHECK-NEXT:    %[[MASK:.*]] = trunc <vscale x 4 x i64> %[[VECIND]] to <vscale x 4 x i1>
+; CHECK:         %[[LOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* %{{.*}}, i32 4, <vscale x 4 x i1> %[[MASK]], <vscale x 4 x i32> poison)
+; CHECK:         call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[LOAD]], <vscale x 4 x i32>* %{{.*}}, i32 4, <vscale x 4 x i1> %[[MASK]])
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc
+  %i.08 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+  %and = and i64 %i.08, 1
+  %tobool.not = icmp eq i64 %and, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %i.08
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %i.08
+  store i32 %0, i32* %arrayidx1, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6
+
+exit:                                             ; preds = %for.inc
+  ret void
+}
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 2}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.interleave.count", i32 2}
+!5 = !{!"llvm.loop.vectorize.enable", i1 true}
+!6 = distinct !{!6, !1, !7, !3, !8, !5}
+!7 = !{!"llvm.loop.vectorize.width", i32 4}
+!8 = !{!"llvm.loop.interleave.count", i32 1}
+!9 = distinct !{!9, !1, !10, !3, !4, !5}
+!10 = !{!"llvm.loop.vectorize.width", i32 1}
-- 
2.7.4