Recommit "[LV] Emit @llvm.get.active.lane.mask for tail-folded loops"

author Sjoerd Meijer <sjoerd.meijer@arm.com>

Wed, 17 Jun 2020 09:48:20 +0000 (10:48 +0100)

committer Sjoerd Meijer <sjoerd.meijer@arm.com>

Wed, 17 Jun 2020 12:12:15 +0000 (13:12 +0100)
author Sjoerd Meijer <sjoerd.meijer@arm.com>
Wed, 17 Jun 2020 09:48:20 +0000 (10:48 +0100)
committer Sjoerd Meijer <sjoerd.meijer@arm.com>
Wed, 17 Jun 2020 12:12:15 +0000 (13:12 +0100)
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

index 6d7a190..82849dc 100644 (file)
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1413,14 +1413,14 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
  }
  
  bool ARMTTIImpl::emitGetActiveLaneMask() const {
-  if (!ST->hasMVEIntegerOps())
+  if (!ST->hasMVEIntegerOps() || DisableTailPredication)
      return false;
  
-  // TODO: Intrinsic @llvm.get.active.lane.mask is supported.
+  // Intrinsic @llvm.get.active.lane.mask is supported.
    // It is used in the MVETailPredication pass, which requires the number of
    // elements processed by this vector loop to setup the tail-predicated
    // loop.
-  return false;
+  return true;
  }
  void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

index 3591f6b..ec75667 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6829,7 +6829,11 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
        IV = IVRecipe->getVPValue();
      }
      VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
-    BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
+    bool TailFolded = !CM.isScalarEpilogueAllowed();
+    if (TailFolded && CM.TTI.emitGetActiveLaneMask())
+      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC});
+    else
+      BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
      return BlockMaskCache[BB] = BlockMask;
    }
  
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp

index a015550..1508b2e 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -380,6 +380,20 @@ void VPInstruction::generateInstruction(VPTransformState &State,
      State.set(this, V, Part);
      break;
    }
+  case VPInstruction::ActiveLaneMask: {
+    // Get first lane of vector induction variable.
+    Value *VIVElem0 = State.get(getOperand(0), {Part, 0});
+    // Get first lane of backedge-taken-count.
+    Value *ScalarBTC = State.get(getOperand(1), {Part, 0});
+
+    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
+    auto *PredTy = VectorType::get(Int1Ty, State.VF);
+    Instruction *Call = Builder.CreateIntrinsic(
+        Intrinsic::get_active_lane_mask, {PredTy, ScalarBTC->getType()},
+        {VIVElem0, ScalarBTC}, nullptr, "active.lane.mask");
+    State.set(this, Call, Part);
+    break;
+  }
    default:
      llvm_unreachable("Unsupported opcode for instruction");
    }
@@ -421,6 +435,10 @@ void VPInstruction::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
    case VPInstruction::SLPStore:
      O << "combined store";
      break;
+  case VPInstruction::ActiveLaneMask:
+    O << "active lane mask";
+    break;
+
    default:
      O << Instruction::getOpcodeName(getOpcode());
    }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h

index c3097e1..914cb03 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -685,6 +685,7 @@ public:
      ICmpULE,
      SLPLoad,
      SLPStore,
+    ActiveLaneMask,
    };
  
  private:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll b/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll

index 9939c21..0c2502d 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
@@ -45,9 +45,12 @@
  define void @prefer_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
  ; CHECK-LABEL:    prefer_folding(
  ; PREFER-FOLDING: vector.body:
-; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
-; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
-; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
+; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
+; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 430)
+; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
+; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
+; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
  ; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
  ;
  ; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
@@ -507,9 +510,13 @@ for.body:
  define void @float(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
  ; CHECK-LABEL:    float(
  ; PREFER-FOLDING: vector.body:
-; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32
-; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32
-; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32
+; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
+; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 430)
+; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
+; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
+; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32({{.*}}%active.lane.mask
+; PREFER-FOLDING: %index.next = add i32 %index, 4
  ; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
  entry:
    br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll

index 274b626..23ecf5b 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
@@ -1,5 +1,5 @@
  ; RUN: opt < %s -loop-vectorize -S | FileCheck %s --check-prefixes=COMMON,DEFAULT
-; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s --check-prefixes=COMMON,CHECK-TF,CHECK-PREFER
+; RUN: opt < %s -loop-vectorize -disable-mve-tail-predication=false  -prefer-predicate-over-epilog -S | FileCheck %s --check-prefixes=COMMON,CHECK-TF,CHECK-PREFER
  ; RUN: opt < %s -loop-vectorize -disable-mve-tail-predication=false -S | FileCheck %s --check-prefixes=COMMON,CHECK-TF,CHECK-ENABLE-TP
  
  target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
@@ -15,9 +15,13 @@ target triple = "thumbv8.1m.main-arm-unknown-eabihf"
  define dso_local void @sgt_loopguard(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
  ; COMMON-LABEL: @sgt_loopguard(
  ; COMMON:       vector.body:
-; CHECK-TF:     masked.load
-; CHECK-TF:     masked.load
-; CHECK-TF:     masked.store
+
+; CHECK-TF:     %[[VIVELEM0:.*]] = extractelement <16 x i32> %vec.iv, i32 0
+; CHECK-TF:     %[[SCALARBTC:.*]] = extractelement <16 x i32> %broadcast.splat, i32 0
+; CHECK-TF:     %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %[[VIVELEM0]], i32 %[[SCALARBTC]])
+; CHECK-TF:     llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask
+; CHECK-TF:     llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask
+; CHECK-TF:     llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask)
  entry:
    %cmp5 = icmp sgt i32 %N, 0
    br i1 %cmp5, label %while.body.preheader, label %while.end
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll

index 72321f0..f3e1af6 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
@@ -1,7 +1,7 @@
-; RUN: opt < %s -loop-vectorize -S | \
+; RUN: opt < %s -loop-vectorize -disable-mve-tail-predication=false -S | \
  ; RUN:  FileCheck %s -check-prefixes=COMMON,CHECK
  
-; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | \
+; RUN: opt < %s -loop-vectorize -disable-mve-tail-predication=false -prefer-predicate-over-epilog -S | \
  ; RUN:   FileCheck -check-prefixes=COMMON,PREDFLAG %s
  
  target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
@@ -41,11 +41,15 @@ for.body:
  define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
  ; COMMON-LABEL: tail_folding_enabled(
  ; COMMON: vector.body:
-; COMMON:   %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
-; COMMON:   %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; COMMON:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; COMMON:   %[[ELEM0:.*]] = add i64 %index, 0
+; COMMON:   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 429)
+; COMMON:   %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %active.lane.mask
+; COMMON:   %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %active.lane.mask
  ; COMMON:   %[[ADD:.*]] = add nsw <4 x i32> %[[WML2]], %[[WML1]]
-; COMMON:   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %[[ADD]]
-; COMMON:   br i1 %12, label %{{.*}}, label %vector.body
+; COMMON:   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %[[ADD]], {{.*}}<4 x i1> %active.lane.mask
+; COMMON:   %index.next = add i64 %index, 4
+; COMMON:   br i1 %{{.*}}, label %{{.*}}, label %vector.body
  entry:
    br label %for.body
  
@@ -75,13 +79,16 @@ define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noa
  
  ; PREDFLAG-LABEL: tail_folding_disabled(
  ; PREDFLAG:  vector.body:
-; PREDFLAG:  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
-; PREDFLAG:  %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; PREDFLAG:  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; PREDFLAG:  %[[ELEM0:.*]] = add i64 %index, 0
+; PREDFLAG:  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 429)
+; PREDFLAG:  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
+; PREDFLAG:  %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
  ; PREDFLAG:  %{{.*}} = add nsw <4 x i32> %wide.masked.load1, %wide.masked.load
-; PREDFLAG:  call void @llvm.masked.store.v4i32.p0v4i32(
+; PREDFLAG:  call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
  ; PREDFLAG:  %index.next = add i64 %index, 4
-; PREDFLAG:  %12 = icmp eq i64 %index.next, 432
-; PREDFLAG:  br i1 %{{.*}}, label %middle.block, label %vector.body, !llvm.loop !6
+; PREDFLAG:  %[[CMP:.*]] = icmp eq i64 %index.next, 432
+; PREDFLAG:  br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !6
  entry:
    br label %for.body
  
@@ -102,6 +109,59 @@ for.body:
    br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
  }
  
+define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+; PREDFLAG-LABEL: interleave4(
+; PREDFLAG:  %[[ADD1:.*]] = add i32 %index, 0
+; PREDFLAG:  %[[ADD2:.*]] = add i32 %index, 4
+; PREDFLAG:  %[[ADD3:.*]] = add i32 %index, 8
+; PREDFLAG:  %[[ADD4:.*]] = add i32 %index, 12
+; PREDFLAG:  %[[BTC:.*]] = extractelement <4 x i32> %broadcast.splat, i32 0
+; PREDFLAG:  %[[ALM1:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD1]], i32 %[[BTC]])
+; PREDFLAG:  %[[ALM2:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD2]], i32 %[[BTC]])
+; PREDFLAG:  %[[ALM3:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD3]], i32 %[[BTC]])
+; PREDFLAG:  %[[ALM4:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD4]], i32 %[[BTC]])
+;
+; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]],{{.*}}
+; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]],{{.*}}
+; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]],{{.*}}
+; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]],{{.*}}
+; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]],{{.*}}
+; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]],{{.*}}
+; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]],{{.*}}
+; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]],{{.*}}
+;
+; PREDFLAG:  call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]])
+; PREDFLAG:  call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]])
+; PREDFLAG:  call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]])
+; PREDFLAG:  call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]])
+;
+entry:
+  %cmp8 = icmp sgt i32 %N, 0
+  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
+  %1 = load i32, i32* %arrayidx1, align 4
+  %add = add nsw i32 %1, %0
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
+  store i32 %add, i32* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !14
+}
+
  ; CHECK:      !0 = distinct !{!0, !1}
  ; CHECK-NEXT: !1 = !{!"llvm.loop.isvectorized", i32 1}
  ; CHECK-NEXT: !2 = distinct !{!2, !3, !1}
@@ -109,6 +169,7 @@ for.body:
  ; CHECK-NEXT: !4 = distinct !{!4, !1}
  ; CHECK-NEXT: !5 = distinct !{!5, !3, !1}
  ; CHECK-NEXT: !6 = distinct !{!6, !1}
+
  attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }
  
  !6 = distinct !{!6, !7, !8}
@@ -118,3 +179,6 @@ attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+
  !10 = distinct !{!10, !11, !12}
  !11 = !{!"llvm.loop.vectorize.predicate.enable", i1 false}
  !12 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+!14 = distinct !{!14, !15}
+!15 = !{!"llvm.loop.interleave.count", i32 4}
author	Sjoerd Meijer <sjoerd.meijer@arm.com>
	Wed, 17 Jun 2020 09:48:20 +0000 (10:48 +0100)
committer	Sjoerd Meijer <sjoerd.meijer@arm.com>
	Wed, 17 Jun 2020 12:12:15 +0000 (13:12 +0100)
llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp		patch \| blob \| history
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
llvm/lib/Transforms/Vectorize/VPlan.cpp		patch \| blob \| history
llvm/lib/Transforms/Vectorize/VPlan.h		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll		patch \| blob \| history