From 0e29a80fdc5e436f1ab23f9d08f3716467108cbe Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 9 Jun 2022 07:20:33 -0700 Subject: [PATCH] [RISCV] Add cost model for reverse shuffle The majority of the cost appears to be forming the indices vector. Differential Revision: https://reviews.llvm.org/D127141 --- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 13 +- llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll | 28 +- .../LoopVectorize/RISCV/riscv-vector-reverse.ll | 406 ++------------------- 3 files changed, 52 insertions(+), 395 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index a73bcf1..7479bd3 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -178,6 +178,7 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, int Index, VectorType *SubTp, ArrayRef Args) { if (isa(Tp)) { + std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); switch (Kind) { default: // Fallthrough to generic handling. @@ -185,11 +186,21 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, // must be implemented here. break; case TTI::SK_Broadcast: { - std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); return LT.first * 1; } case TTI::SK_Splice: return getSpliceCost(Tp, Index); + case TTI::SK_Reverse: + // Most of the cost here is producing the vrgather index register + // Example sequence: + // csrr a0, vlenb + // srli a0, a0, 3 + // addi a0, a0, -1 + // vsetvli a1, zero, e8, mf8, ta, mu (ignored) + // vid.v v9 + // vrsub.vx v10, v9, a0 + // vrgather.vv v9, v8, v10 + return LT.first * 6; } } diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll index b83ebf8..0a46092 100644 --- a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll @@ -55,20 +55,20 @@ declare @llvm.experimental.vector.insert.nxv16i32.nxv4i32( @llvm.experimental.vector.reverse.nxv16i8( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv32i8 = call @llvm.experimental.vector.reverse.nxv32i8( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2i16 = call @llvm.experimental.vector.reverse.nxv2i16( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i16 = call @llvm.experimental.vector.reverse.nxv4i16( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8i16 = call @llvm.experimental.vector.reverse.nxv8i16( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16i16 = call @llvm.experimental.vector.reverse.nxv16i16( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i32 = call @llvm.experimental.vector.reverse.nxv4i32( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8i32 = call @llvm.experimental.vector.reverse.nxv8i32( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2i64 = call @llvm.experimental.vector.reverse.nxv2i64( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i64 = call @llvm.experimental.vector.reverse.nxv4i64( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16i1 = call @llvm.experimental.vector.reverse.nxv16i1( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8i1 = call @llvm.experimental.vector.reverse.nxv8i1( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i1 = call @llvm.experimental.vector.reverse.nxv4i1( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2i1 = call @llvm.experimental.vector.reverse.nxv2i1( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i8 = call @llvm.experimental.vector.reverse.nxv16i8( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv32i8 = call @llvm.experimental.vector.reverse.nxv32i8( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i16 = call @llvm.experimental.vector.reverse.nxv2i16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i16 = call @llvm.experimental.vector.reverse.nxv4i16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i16 = call @llvm.experimental.vector.reverse.nxv8i16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i16 = call @llvm.experimental.vector.reverse.nxv16i16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i32 = call @llvm.experimental.vector.reverse.nxv4i32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i32 = call @llvm.experimental.vector.reverse.nxv8i32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i64 = call @llvm.experimental.vector.reverse.nxv2i64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i64 = call @llvm.experimental.vector.reverse.nxv4i64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i1 = call @llvm.experimental.vector.reverse.nxv16i1( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i1 = call @llvm.experimental.vector.reverse.nxv8i1( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i1 = call @llvm.experimental.vector.reverse.nxv4i1( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i1 = call @llvm.experimental.vector.reverse.nxv2i1( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %reverse_nxv16i8 = call @llvm.experimental.vector.reverse.nxv16i8( undef) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 13cb6cc..326e4a0 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -36,122 +36,19 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of Invalid for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 7 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of Invalid for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 7 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 -; CHECK-NEXT: LV: UserVF ignored because of invalid costs.. -; CHECK-NEXT: LV: Interleaving disabled by the pass manager -; CHECK-NEXT: remark: :0:0: UserVF ignored because of invalid costs. -; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found scalar instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found scalar instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found scalar instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found scalar instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found scalar instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found scalar instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found scalar instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found scalar instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Scalarizing: %1 = load i32, ptr %arrayidx, align 4 -; CHECK-NEXT: LV: Scalarizing: %add9 = add i32 %1, 1 -; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Scalarizing: store i32 %add9, ptr %arrayidx3, align 4 -; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom +; CHECK-NEXT: LV: Using user VF vscale x 4. ; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: VPlan 'Initial VPlan for VF={1},UF>=1' { -; CHECK-NEXT: Live-in vp<%2> = vector-trip-count -; CHECK: vector.ph: -; CHECK-NEXT: Successor(s): vector loop -; CHECK: vector loop: { -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION -; CHECK-NEXT: vp<%4> = SCALAR-STEPS vp<%3>, ir<%n>, ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add vp<%4>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%B>, ir<%idxprom> -; CHECK-NEXT: CLONE ir<%1> = load ir<%arrayidx> -; CHECK-NEXT: CLONE ir<%add9> = add ir<%1>, ir<1> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr ir<%A>, ir<%idxprom> -; CHECK-NEXT: CLONE store ir<%add9>, ir<%arrayidx3> -; CHECK-NEXT: EMIT vp<%12> = VF * UF +(nuw) vp<%3> -; CHECK-NEXT: EMIT branch-on-count vp<%12> vp<%2> -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): middle.block -; CHECK: middle.block: -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: VPlan 'Initial VPlan for VF={2,4},UF>=1' { -; CHECK-NEXT: Live-in vp<%2> = vector-trip-count -; CHECK: vector.ph: -; CHECK-NEXT: Successor(s): vector loop -; CHECK: vector loop: { -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION -; CHECK-NEXT: vp<%4> = SCALAR-STEPS vp<%3>, ir<%n>, ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add vp<%4>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%B>, ir<%idxprom> -; CHECK-NEXT: WIDEN ir<%1> = load ir<%arrayidx> -; CHECK-NEXT: WIDEN ir<%add9> = add ir<%1>, ir<1> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr ir<%A>, ir<%idxprom> -; CHECK-NEXT: WIDEN store ir<%arrayidx3>, ir<%add9> -; CHECK-NEXT: EMIT vp<%11> = VF * UF +(nuw) vp<%3> -; CHECK-NEXT: EMIT branch-on-count vp<%11> vp<%2> -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): middle.block -; CHECK: middle.block: -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' { ; CHECK-NEXT: Live-in vp<%2> = vector-trip-count ; CHECK: vector.ph: ; CHECK-NEXT: Successor(s): vector loop @@ -174,92 +71,18 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK: middle.block: ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %1 = load i32, ptr %arrayidx, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %add9 = add i32 %1, 1 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add9, ptr %arrayidx3, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 -; CHECK-NEXT: LV: Scalar loop costs: 10. -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction: %1 = load i32, ptr %arrayidx, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 2 for VF 2 For instruction: %add9 = add i32 %1, 1 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction: store i32 %add9, ptr %arrayidx3, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 -; CHECK-NEXT: LV: Vector loop of width 2 costs: 9. -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 9 for VF 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 2 for VF 4 For instruction: %add9 = add i32 %1, 1 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 9 for VF 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 -; CHECK-NEXT: LV: Vector loop of width 4 costs: 6. -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 1 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 1 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 1 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 1 For instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 1 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %1 = load i32, ptr %arrayidx, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 1 For instruction: %add9 = add i32 %1, 1 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 1 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: store i32 %add9, ptr %arrayidx3, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 1 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 1 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 1 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 -; CHECK-NEXT: LV: Vector loop of width vscale x 1 costs: Invalid (assuming a minimum vscale of 1). -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 2 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %1 = load i32, ptr %arrayidx, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction: %add9 = add i32 %1, 1 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 2 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: store i32 %add9, ptr %arrayidx3, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 -; CHECK-NEXT: LV: Vector loop of width vscale x 2 costs: Invalid (assuming a minimum vscale of 1). ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of Invalid for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 7 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of Invalid for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 7 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 -; CHECK-NEXT: LV: Vector loop of width vscale x 4 costs: Invalid (assuming a minimum vscale of 1). -; CHECK-NEXT: LV: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2, vscale x 4): load %1 = load i32, ptr %arrayidx, align 4 -; CHECK-NEXT: LV: Interleaving disabled by the pass manager -; CHECK-NEXT: remark: :0:0: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2, vscale x 4): load -; CHECK-NEXT: LV: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2, vscale x 4): store store i32 %add9, ptr %arrayidx3, align 4 -; CHECK-NEXT: LV: Interleaving disabled by the pass manager -; CHECK-NEXT: remark: :0:0: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2, vscale x 4): store -; CHECK-NEXT: LV: Vectorization seems to be not beneficial, but was forced by a user. -; CHECK-NEXT: LV: Selecting VF: 4. ; CHECK-NEXT: LV(REG): Calculating max register usage: ; CHECK-NEXT: LV(REG): At #0 Interval # 0 ; CHECK-NEXT: LV(REG): At #1 Interval # 1 @@ -271,22 +94,22 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV(REG): At #7 Interval # 3 ; CHECK-NEXT: LV(REG): At #9 Interval # 1 ; CHECK-NEXT: LV(REG): At #10 Interval # 2 -; CHECK-NEXT: LV(REG): VF = 4 +; CHECK-NEXT: LV(REG): VF = vscale x 4 ; CHECK-NEXT: LV(REG): Found max usage: 2 item ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 1 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 1 item +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class -; CHECK-NEXT: LV: Loop cost is 27 +; CHECK-NEXT: LV: Loop cost is 22 ; CHECK-NEXT: LV: IC is 2 -; CHECK-NEXT: LV: VF is 4 +; CHECK-NEXT: LV: VF is vscale x 4 ; CHECK-NEXT: LV: Not Interleaving. ; CHECK-NEXT: LV: Interleaving is not beneficial. -; CHECK-NEXT: LV: Found a vectorizable loop (4) in +; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop -; CHECK-NEXT: Executing best plan with VF=4, UF=1 +; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 ; CHECK-NEXT: LV: Interleaving disabled by the pass manager ; entry: @@ -342,122 +165,19 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of Invalid for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 7 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of Invalid for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 7 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 -; CHECK-NEXT: LV: UserVF ignored because of invalid costs.. -; CHECK-NEXT: LV: Interleaving disabled by the pass manager -; CHECK-NEXT: remark: :0:0: UserVF ignored because of invalid costs. -; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found scalar instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found scalar instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found scalar instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found scalar instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found scalar instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found scalar instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found scalar instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found scalar instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Scalarizing: %1 = load float, ptr %arrayidx, align 4 -; CHECK-NEXT: LV: Scalarizing: %conv1 = fadd float %1, 1.000000e+00 -; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Scalarizing: store float %conv1, ptr %arrayidx3, align 4 -; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom +; CHECK-NEXT: LV: Using user VF vscale x 4. ; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: VPlan 'Initial VPlan for VF={1},UF>=1' { -; CHECK-NEXT: Live-in vp<%2> = vector-trip-count -; CHECK: vector.ph: -; CHECK-NEXT: Successor(s): vector loop -; CHECK: vector loop: { -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION -; CHECK-NEXT: vp<%4> = SCALAR-STEPS vp<%3>, ir<%n>, ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add vp<%4>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%B>, ir<%idxprom> -; CHECK-NEXT: CLONE ir<%1> = load ir<%arrayidx> -; CHECK-NEXT: CLONE ir<%conv1> = fadd ir<%1>, ir<1.000000e+00> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr ir<%A>, ir<%idxprom> -; CHECK-NEXT: CLONE store ir<%conv1>, ir<%arrayidx3> -; CHECK-NEXT: EMIT vp<%12> = VF * UF +(nuw) vp<%3> -; CHECK-NEXT: EMIT branch-on-count vp<%12> vp<%2> -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): middle.block -; CHECK: middle.block: -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: VPlan 'Initial VPlan for VF={2,4},UF>=1' { -; CHECK-NEXT: Live-in vp<%2> = vector-trip-count -; CHECK: vector.ph: -; CHECK-NEXT: Successor(s): vector loop -; CHECK: vector loop: { -; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION -; CHECK-NEXT: vp<%4> = SCALAR-STEPS vp<%3>, ir<%n>, ir<-1> -; CHECK-NEXT: CLONE ir<%i.0> = add vp<%4>, ir<-1> -; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%B>, ir<%idxprom> -; CHECK-NEXT: WIDEN ir<%1> = load ir<%arrayidx> -; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00> -; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr ir<%A>, ir<%idxprom> -; CHECK-NEXT: WIDEN store ir<%arrayidx3>, ir<%conv1> -; CHECK-NEXT: EMIT vp<%11> = VF * UF +(nuw) vp<%3> -; CHECK-NEXT: EMIT branch-on-count vp<%11> vp<%2> -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): middle.block -; CHECK: middle.block: -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' { ; CHECK-NEXT: Live-in vp<%2> = vector-trip-count ; CHECK: vector.ph: ; CHECK-NEXT: Successor(s): vector loop @@ -480,92 +200,18 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK: middle.block: ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %1 = load float, ptr %arrayidx, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 2 for VF 1 For instruction: %conv1 = fadd float %1, 1.000000e+00 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 1 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %conv1, ptr %arrayidx3, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 1 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 -; CHECK-NEXT: LV: Scalar loop costs: 11. -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction: %1 = load float, ptr %arrayidx, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 4 for VF 2 For instruction: %conv1 = fadd float %1, 1.000000e+00 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction: store float %conv1, ptr %arrayidx3, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 -; CHECK-NEXT: LV: Vector loop of width 2 costs: 10. -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 9 for VF 4 For instruction: %1 = load float, ptr %arrayidx, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 4 for VF 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 9 for VF 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 -; CHECK-NEXT: LV: Vector loop of width 4 costs: 7. -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 1 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 1 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 1 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 1 For instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 1 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %1 = load float, ptr %arrayidx, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 1 For instruction: %conv1 = fadd float %1, 1.000000e+00 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 1 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: store float %conv1, ptr %arrayidx3, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 1 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 1 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 1 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 -; CHECK-NEXT: LV: Vector loop of width vscale x 1 costs: Invalid (assuming a minimum vscale of 1). -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction: %idxprom = zext i32 %i.0 to i64 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 2 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %1 = load float, ptr %arrayidx, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 2 For instruction: %conv1 = fadd float %1, 1.000000e+00 -; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 2 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: store float %conv1, ptr %arrayidx3, align 4 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 -; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 -; CHECK-NEXT: LV: Vector loop of width vscale x 2 costs: Invalid (assuming a minimum vscale of 1). ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of Invalid for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 7 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of Invalid for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 7 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 -; CHECK-NEXT: LV: Vector loop of width vscale x 4 costs: Invalid (assuming a minimum vscale of 1). -; CHECK-NEXT: LV: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2, vscale x 4): load %1 = load float, ptr %arrayidx, align 4 -; CHECK-NEXT: LV: Interleaving disabled by the pass manager -; CHECK-NEXT: remark: :0:0: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2, vscale x 4): load -; CHECK-NEXT: LV: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2, vscale x 4): store store float %conv1, ptr %arrayidx3, align 4 -; CHECK-NEXT: LV: Interleaving disabled by the pass manager -; CHECK-NEXT: remark: :0:0: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2, vscale x 4): store -; CHECK-NEXT: LV: Vectorization seems to be not beneficial, but was forced by a user. -; CHECK-NEXT: LV: Selecting VF: 4. ; CHECK-NEXT: LV(REG): Calculating max register usage: ; CHECK-NEXT: LV(REG): At #0 Interval # 0 ; CHECK-NEXT: LV(REG): At #1 Interval # 1 @@ -577,22 +223,22 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV(REG): At #7 Interval # 3 ; CHECK-NEXT: LV(REG): At #9 Interval # 1 ; CHECK-NEXT: LV(REG): At #10 Interval # 2 -; CHECK-NEXT: LV(REG): VF = 4 +; CHECK-NEXT: LV(REG): VF = vscale x 4 ; CHECK-NEXT: LV(REG): Found max usage: 2 item ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 1 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 1 item +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class -; CHECK-NEXT: LV: Loop cost is 29 +; CHECK-NEXT: LV: Loop cost is 23 ; CHECK-NEXT: LV: IC is 2 -; CHECK-NEXT: LV: VF is 4 +; CHECK-NEXT: LV: VF is vscale x 4 ; CHECK-NEXT: LV: Not Interleaving. ; CHECK-NEXT: LV: Interleaving is not beneficial. -; CHECK-NEXT: LV: Found a vectorizable loop (4) in +; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop -; CHECK-NEXT: Executing best plan with VF=4, UF=1 +; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 ; CHECK-NEXT: LV: Interleaving disabled by the pass manager ; entry: -- 2.7.4