[LoopVectorize] Prevent multiple Phis being generated with in-order reductions

author Kerry McLaughlin <kerry.mclaughlin@arm.com>

Tue, 27 Apr 2021 13:50:27 +0000 (14:50 +0100)

committer Kerry McLaughlin <kerry.mclaughlin@arm.com>

Wed, 28 Apr 2021 10:29:01 +0000 (11:29 +0100)
author Kerry McLaughlin <kerry.mclaughlin@arm.com>
Tue, 27 Apr 2021 13:50:27 +0000 (14:50 +0100)
committer Kerry McLaughlin <kerry.mclaughlin@arm.com>
Wed, 28 Apr 2021 10:29:01 +0000 (11:29 +0100)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

index 17e9947..27161ea 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4292,11 +4292,15 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) {
    Value *OrigLoopVal = Phi->getIncomingValueForBlock(OrigLatch);
    BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  
+  bool IsOrdered = State.VF.isVector() && IsInLoopReductionPhi &&
+                   useOrderedReductions(RdxDesc);
+
    for (unsigned Part = 0; Part < UF; ++Part) {
+    if (IsOrdered && Part > 0)
+      break;
      Value *VecRdxPhi = State.get(State.Plan->getVPValue(Phi), Part);
      Value *Val = State.get(State.Plan->getVPValue(OrigLoopVal), Part);
-    if (IsInLoopReductionPhi && useOrderedReductions(RdxDesc) &&
-        State.VF.isVector())
+    if (IsOrdered)
        Val = State.get(State.Plan->getVPValue(OrigLoopVal), UF - 1);
      cast<PHINode>(VecRdxPhi)->addIncoming(Val, VectorLoopLatch);
    }
@@ -4388,7 +4392,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) {
    // terminate on this line. This is the easiest way to ensure we don't
    // accidentally cause an extra step back into the loop while debugging.
    setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
-  if (IsInLoopReductionPhi && useOrderedReductions(RdxDesc))
+  if (IsOrdered)
      ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
    else {
      // Floating-point operations should have some FMF to enable the reduction.
@@ -4722,8 +4726,14 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
        }
      }
  
+    bool IsOrdered = State.VF.isVector() &&
+                     Cost->isInLoopReduction(cast<PHINode>(PN)) &&
+                     useOrderedReductions(*RdxDesc);
+
      for (unsigned Part = 0; Part < State.UF; ++Part) {
        // This is phase one of vectorizing PHIs.
+      if (Part > 0 && IsOrdered)
+        return;
        Value *EntryPart = PHINode::Create(
            VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
        State.set(PhiR, EntryPart, Part);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll

index 98445a8..69748f0 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -30,9 +30,7 @@ define float @fadd_strict_unroll(float* noalias nocapture readonly %a, i64 %n) {
  ; CHECK-LABEL: @fadd_strict_unroll
  ; CHECK: vector.body:
  ; CHECK: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ]
-; CHECK: %[[VEC_PHI2:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
-; CHECK: %[[VEC_PHI3:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
-; CHECK: %[[VEC_PHI4:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
+; CHECK-NOT: phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
  ; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
  ; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
  ; CHECK: %[[LOAD3:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll

index 5b865f1..14f1dd0 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
@@ -30,9 +30,7 @@ define float @fadd_strict_unroll(float* noalias nocapture readonly %a, i64 %n) {
  ; CHECK-LABEL: @fadd_strict_unroll
  ; CHECK: vector.body:
  ; CHECK: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ]
-; CHECK: %[[VEC_PHI2:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
-; CHECK: %[[VEC_PHI3:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
-; CHECK: %[[VEC_PHI4:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
+; CHECK-NOT: phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
  ; CHECK: %[[LOAD1:.*]] = load <8 x float>, <8 x float>*
  ; CHECK: %[[LOAD2:.*]] = load <8 x float>, <8 x float>*
  ; CHECK: %[[LOAD3:.*]] = load <8 x float>, <8 x float>*
@@ -61,6 +59,63 @@ for.end:
    ret float %add
  }
  
+; An additional test for unrolling where we need the last value of the reduction, i.e:
+; float sum = 0, sum2;
+; for(int i=0; i<N; ++i) {
+;   sum += ptr[i];
+;   *ptr2 = sum + 42;
+; }
+; return sum;
+
+define float @fadd_strict_unroll_last_val(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @fadd_strict_unroll_last_val
+; CHECK: vector.body
+; CHECK: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ]
+; CHECK-NOT: phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
+; CHECK: %[[LOAD1:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[LOAD2:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[LOAD3:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[LOAD4:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI1]], <8 x float> %[[LOAD1]])
+; CHECK: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX1]], <8 x float> %[[LOAD2]])
+; CHECK: %[[RDX3:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX2]], <8 x float> %[[LOAD3]])
+; CHECK: %[[RDX4]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX3]], <8 x float> %[[LOAD4]])
+; CHECK: for.body
+; CHECK: %[[SUM_PHI:.*]] = phi float [ %[[FADD:.*]], %for.body ], [ {{.*}}, %scalar.ph ]
+; CHECK: %[[LOAD5:.*]] = load float, float*
+; CHECK: %[[FADD]] =  fadd float %[[SUM_PHI]], %[[LOAD5]]
+; CHECK: for.cond.cleanup
+; CHECK: %[[FADD_LCSSA:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX4]], %middle.block ]
+; CHECK: %[[FADD_42:.*]] = fadd float %[[FADD_LCSSA]], 4.200000e+01
+; CHECK: store float %[[FADD_42]], float* %b
+; CHECK: for.end
+; CHECK: %[[SUM_LCSSA:.*]] = phi float [ %[[FADD_LCSSA]], %for.cond.cleanup ], [ 0.000000e+00, %entry ]
+; CHECK: ret float %[[SUM_LCSSA]]
+entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum = phi float [ 0.000000e+00, %entry ], [ %fadd, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %fadd = fadd float %sum, %0
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
+
+for.cond.cleanup:
+  %fadd.lcssa = phi float [ %fadd, %for.body ]
+  %fadd2 = fadd float %fadd.lcssa, 4.200000e+01
+  store float %fadd2, float* %b, align 4
+  br label %for.end
+
+for.end:
+  %sum.lcssa = phi float [ %fadd.lcssa, %for.cond.cleanup ], [ 0.000000e+00, %entry ]
+  ret float %sum.lcssa
+}
+
  define void @fadd_strict_interleave(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
  ; CHECK-LABEL: @fadd_strict_interleave
  ; CHECK: entry
author	Kerry McLaughlin <kerry.mclaughlin@arm.com>
	Tue, 27 Apr 2021 13:50:27 +0000 (14:50 +0100)
committer	Kerry McLaughlin <kerry.mclaughlin@arm.com>
	Wed, 28 Apr 2021 10:29:01 +0000 (11:29 +0100)
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll		patch \| blob \| history