public:
InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
DominatorTree *DT)
- : PSE(PSE), TheLoop(L), DT(DT) {}
+ : PSE(PSE), TheLoop(L), DT(DT), RequiresScalarEpilogue(false) {}
~InterleavedAccessInfo() {
SmallSet<InterleaveGroup *, 4> DelSet;
// Avoid releasing a pointer twice.
for (auto &I : InterleaveGroupMap)
DelSet.insert(I.second);
for (auto *Ptr : DelSet)
delete Ptr;
}
+ /// \brief Returns true if an interleaved group that may access memory
+ /// out-of-bounds requires a scalar epilogue iteration for correctness.
+ bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
+
private:
/// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
/// Simplifies SCEV expressions in the context of existing SCEV assumptions.
PredicatedScalarEvolution &PSE;
Loop *TheLoop;
DominatorTree *DT;
+ /// True if the loop may contain non-reversed interleaved groups with
+ /// out-of-bounds accesses. We ensure we don't speculatively access memory
+ /// out-of-bounds by executing at least one scalar epilogue iteration.
+ bool RequiresScalarEpilogue;
+
/// Holds the relationships between the members and the interleave group.
DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
/// \brief Get the interleaved access group that \p Instr belongs to.
const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
return InterleaveInfo.getInterleaveGroup(Instr);
}
+ /// \brief Returns true if an interleaved group requires a scalar iteration
+ /// to handle accesses with gaps.
+ bool requiresScalarEpilogue() const {
+ return InterleaveInfo.requiresScalarEpilogue();
+ }
+
unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
bool hasStride(Value *V) { return StrideSet.count(V); }
Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
if (VectorTripCount)
return VectorTripCount;

Value *TC = getOrCreateTripCount(L);
IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
- // Now we need to generate the expression for N - (N % VF), which is
- // the part that the vectorized body will execute.
- // The loop step is equal to the vectorization factor (num of SIMD elements)
- // times the unroll factor (num of SIMD instructions).
+ // Now we need to generate the expression for the part of the loop that the
+ // vectorized body will execute. This is equal to N - (N % Step) if scalar
+ // iterations are not required for correctness, or N - Step, otherwise. Step
+ // is equal to the vectorization factor (number of SIMD elements) times the
+ // unroll factor (number of SIMD instructions).
Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
+
+ // If there is a non-reversed interleaved group that may speculatively access
+ // memory out-of-bounds, we need to ensure that there will be at least one
+ // iteration of the scalar epilogue loop. Thus, if the step evenly divides
+ // the trip count, we set the remainder to be equal to the step. If the step
+ // does not evenly divide the trip count, no adjustment is necessary since
+ // there will already be scalar iterations. Note that the minimum iterations
+ // check ensures that N >= Step.
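+ // For example, if the trip count N is 8 and Step is 4 (VF = 4, UF = 1),
+ // the remainder would be 0; we instead set it to 4, so the vector loop
+ // executes 4 iterations and the scalar epilogue executes the other 4.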
+ if (VF > 1 && Legal->requiresScalarEpilogue()) {
+ auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
+ R = Builder.CreateSelect(IsZero, Step, R);
+ }
+
VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
return VectorTripCount;
// Remove interleaved store groups with gaps.
for (InterleaveGroup *Group : StoreGroups)
if (Group->getNumMembers() != Group->getFactor())
releaseGroup(Group);

- // Remove interleaved load groups that don't have the first and last member.
- // This guarantees that we won't do speculative out of bounds loads.
+ // If there is a non-reversed interleaved load group with gaps, we will need
+ // to execute at least one scalar epilogue iteration. This will ensure that
+ // we don't speculatively access memory out-of-bounds. Note that we only need
+ // to look for a member at index factor - 1, since every group must have a
+ // member at index zero.
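+ // For example, a factor-2 load group that accesses only even elements has
+ // no member at index 1, and its wide load would read past the end of the
+ // underlying array on the final iteration.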
for (InterleaveGroup *Group : LoadGroups)
- if (!Group->getMember(0) || !Group->getMember(Group->getFactor() - 1))
- releaseGroup(Group);
+ if (!Group->getMember(Group->getFactor() - 1)) {
+ if (Group->isReverse()) {
+ releaseGroup(Group);
+ } else {
+ DEBUG(dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
+ RequiresScalarEpilogue = true;
+ }
+ }
}
; Check vectorization on an interleaved load group of factor 2 with 1 gap
-; (missing the load of odd elements).
+; (missing the load of odd elements). Because the vectorized loop would
+; speculatively access memory out-of-bounds, we must execute at least one
+; iteration of the scalar loop.
-; void even_load(int *A, int *B) {
+; void even_load_static_tc(int *A, int *B) {
; for (unsigned i = 0; i < 1024; i+=2)
; B[i/2] = A[i] * 2;
; }
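+;
+; With a trip count of 512 and a step of 4 (VF = 4, UF = 1), the step evenly
+; divides the trip count, so the remainder is forced to 4 and the vector trip
+; count becomes 508, as the checks below verify.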
-; CHECK-LABEL: @even_load(
-; CHECK-NOT: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
-; CHECK-NOT: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-LABEL: @even_load_static_tc(
+; CHECK: vector.body:
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
+; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: icmp eq i64 %index.next, 508
+; CHECK: middle.block:
+; CHECK: br i1 false, label %for.cond.cleanup, label %scalar.ph
-define void @even_load(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
+define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
entry:
br label %for.body

for.cond.cleanup: ; preds = %for.body
ret void

for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
%tmp = load i32, i32* %arrayidx, align 4
%mul = shl nsw i32 %tmp, 1
%tmp1 = lshr exact i64 %indvars.iv, 1
%arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
store i32 %mul, i32* %arrayidx2, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
%cmp = icmp ult i64 %indvars.iv.next, 1024
br i1 %cmp, label %for.body, label %for.cond.cleanup
}
+; Check vectorization on an interleaved load group of factor 2 with 1 gap
+; (missing the load of odd elements). Because the vectorized loop would
+; speculatively access memory out-of-bounds, we must execute at least one
+; iteration of the scalar loop. Unlike the test above, the trip count here
+; is not known at compile time.
+
+; void even_load_dynamic_tc(int *A, int *B, unsigned N) {
+; for (unsigned i = 0; i < N; i+=2)
+; B[i/2] = A[i] * 2;
+; }
+
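+; Since the trip count is not known at compile time, the remainder is forced
+; to the step (4) with a select whenever the step evenly divides the trip
+; count, guaranteeing at least one scalar iteration.
+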
+; CHECK-LABEL: @even_load_dynamic_tc(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:[a-zA-Z0-9]+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
+; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: icmp eq i64 %index.next, %n.vec
+; CHECK: middle.block:
+; CHECK: br i1 false, label %for.cond.cleanup, label %scalar.ph
+
+define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i64 %N) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %tmp = load i32, i32* %arrayidx, align 4
+ %mul = shl nsw i32 %tmp, 1
+ %tmp1 = lshr exact i64 %indvars.iv, 1
+ %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
+ store i32 %mul, i32* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+ %cmp = icmp ult i64 %indvars.iv.next, %N
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; Check vectorization on a reverse interleaved load group of factor 2 with 1
+; gap and a reverse interleaved store group of factor 2. The interleaved load
+; group should be removed since it has a gap and is reverse.
+
+; struct pair {
+; int x;
+; int y;
+; };
+;
+; void load_gap_reverse(struct pair *P1, struct pair *P2, int X) {
+; for (int i = 1023; i >= 0; i--) {
+; int a = X + i;
+; int b = P2[i].y - i;
+; P1[i].x = a;
+; P2[i].y = b;
+; }
+; }
+
+; CHECK-LABEL: @load_gap_reverse(
+; CHECK-NOT: %wide.vec = load <8 x i64>, <8 x i64>* %{{.*}}, align 8
+; CHECK-NOT: %strided.vec = shufflevector <8 x i64> %wide.vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+
+%pair = type { i64, i64 }
+define void @load_gap_reverse(%pair* noalias nocapture %P1, %pair* noalias nocapture %P2, i64 %X) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ]
+ %0 = add nsw i64 %X, %i
+ %1 = getelementptr inbounds %pair, %pair* %P1, i64 %i, i32 0
+ %2 = getelementptr inbounds %pair, %pair* %P2, i64 %i, i32 1
+ %3 = load i64, i64* %2, align 8
+ %4 = sub nsw i64 %3, %i
+ store i64 %0, i64* %1, align 8
+ store i64 %4, i64* %2, align 8
+ %i.next = add nsw i64 %i, -1
+ %cond = icmp sgt i64 %i, 0
+ br i1 %cond, label %for.body, label %for.exit
+
+for.exit:
+ ret void
+}
+
; Check vectorization on interleaved access groups identified from mixed
; loads/stores.
; void mixed_load2_store2(int *A, int *B) {