// Returns true if \p I is an instruction that will be predicated either
// through scalar predication or masked load/store or masked gather/scatter.
// Superset of instructions that return true for isScalarWithPredication.
- bool isPredicatedInst(Instruction *I) {
+ bool isPredicatedInst(Instruction *I, ElementCount VF) {
if (!blockNeedsPredication(I->getParent()))
return false;
// Loads and stores that need some form of masked operation are predicated
// instructions.
if (isa<LoadInst>(I) || isa<StoreInst>(I))
return Legal->isMaskRequired(I);
- return isScalarWithPredication(I);
+ return isScalarWithPredication(I, VF);
}
/// Returns true if \p I is a memory instruction with consecutive memory
// from moving "masked load/store" check from legality to cost model.
// Masked Load/Gather emulation was previously never allowed.
// Limited number of Masked Store/Scatter emulation was allowed.
- assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
+ assert(isPredicatedInst(I, ElementCount::getFixed(1)) &&
+ "Expecting a scalar emulated instruction");
return isa<LoadInst>(I) ||
(isa<StoreInst>(I) &&
NumPredStores > NumberOfStoresToPredicate);
// If we have a predicated load/store, it will need extra i1 extracts and
// conditional branches, but may not be executed for each vector lane. Scale
// the cost by the probability of executing the predicated block.
- if (isPredicatedInst(I)) {
+ if (isPredicatedInst(I, ElementCount::getFixed(1))) {
Cost /= getReciprocalPredBlockProb();
// Add the cost of an i1 extract and a branch
Range);
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
- Range);
+ [&](ElementCount VF) { return CM.isPredicatedInst(I, VF); }, Range);
auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
IsUniform, IsPredicated);
--- /dev/null
+; RUN: opt -loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S -o - < %s | FileCheck %s
+; RUN: opt -mattr=+sve -loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S -o - < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; This test is defending against a bug that appeared when we have a target
+; configuration where masked loads/stores are legal -- e.g. AArch64 with SVE.
+; Predication would not be applied during interleaving, allowing superfluous
+; loads/stores to be generated and potentially causing miscompiles.
+; This test checks that, when we disable vectorisation and force interleaving,
+; stores are predicated properly.
+;
+; This is _not_ an SVE-specific test. The same bug could manifest on any
+; architecture with masked loads/stores, but we use SVE for testing purposes
+; here.
+
+define void @foo(i32* %data1, i32* %data2) {
+; CHECK-LABEL: @foo(
+; CHECK: vector.body:
+; CHECK: br i1 {{%.*}}, label %pred.store.if, label %pred.store.continue
+; CHECK: pred.store.if:
+; CHECK-NEXT: store i32 {{%.*}}, i32* {{%.*}}
+; CHECK-NEXT: br label %pred.store.continue
+; CHECK: pred.store.continue:
+; CHECK-NEXT: br i1 {{%.*}}, label %pred.store.if2, label %pred.store.continue3
+; CHECK: pred.store.if2:
+; CHECK-NEXT: store i32 {{%.*}}, i32* {{%.*}}
+; CHECK-NEXT: br label %pred.store.continue3
+; CHECK: pred.store.continue3:
+
+entry:
+ br label %while.body
+
+while.body:
+ %i = phi i64 [ 1023, %entry ], [ %i.next, %if.end ]
+ %arrayidx = getelementptr inbounds i32, i32* %data1, i64 %i
+ %ld = load i32, i32* %arrayidx, align 4
+ %cmp = icmp sgt i32 %ld, %ld
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ store i32 %ld, i32* %arrayidx, align 4
+ br label %if.end
+
+if.end:
+ %i.next = add nsw i64 %i, -1
+ %tobool.not = icmp eq i64 %i, 0
+ br i1 %tobool.not, label %while.end, label %while.body
+
+while.end:
+ ret void
+}
; sink-scalar-operands optimization for predicated instructions.
;
; SINK-GATHER: vector.body:
-; SINK-GATHER: pred.udiv.if:
+; SINK-GATHER: pred.load.if:
; SINK-GATHER: %[[T0:.+]] = load i32, i32* %{{.*}}, align 4
-; SINK-GATHER: %{{.*}} = udiv i32 %[[T0]], %{{.*}}
+; SINK-GATHER: pred.load.continue:
+; SINK-GATHER: %[[T1:.+]] = phi i32 [ poison, %vector.body ], [ %[[T0]], %pred.load.if ]
+; SINK-GATHER: pred.udiv.if:
+; SINK-GATHER: %{{.*}} = udiv i32 %[[T1]], %{{.*}}
; SINK-GATHER: pred.udiv.continue:
define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
entry: