CounterBitWidth("hardware-loop-counter-bitwidth", cl::Hidden, cl::init(32),
cl::desc("Set the loop counter bitwidth"));
+// Hidden boolean option, off by default. It is OR-ed into UseLoopGuard in
+// InitLoopCount, and only after ScalarEvolution has shown that entry to the
+// loop is guarded by an "exit count != 0" condition.
+static cl::opt<bool>
+ForceGuardLoopEntry(
+ "force-hardware-loop-guard", cl::Hidden, cl::init(false),
+ cl::desc("Force generation of loop guard intrinsic"));
+
STATISTIC(NumHWLoops, "Number of loops converted to hardware loops");
namespace {
class HardwareLoop {
// Expand the trip count scev into a value that we can use.
- Value *InitLoopCount(BasicBlock *BB);
+ // Also decides whether the guarded form can be used and records the
+ // insertion block in BeginBB. Returns nullptr if the count cannot be
+ // expanded safely.
+ Value *InitLoopCount();
// Insert the set_loop_iteration intrinsic.
- void InsertIterationSetup(Value *LoopCountInit, BasicBlock *BB);
+ // The intrinsic is placed in BeginBB; when UseLoopGuard is set the
+ // test_set_loop_iterations form is used instead.
+ void InsertIterationSetup(Value *LoopCountInit);
// Insert the loop_decrement intrinsic.
void InsertLoopDec();
CountType(Info.CountType),
ExitBranch(Info.ExitBranch),
LoopDecrement(Info.LoopDecrement),
- UsePHICounter(Info.CounterInReg) { }
+ UsePHICounter(Info.CounterInReg),
+ UseLoopGuard(Info.PerformEntryTest) { }
void Create();
const SCEV *ExitCount = nullptr;
Type *CountType = nullptr;
BranchInst *ExitBranch = nullptr;
- Value *LoopDecrement = nullptr;
+ Value *LoopDecrement = nullptr;
bool UsePHICounter = false;
+ // Seeded from Info.PerformEntryTest; InitLoopCount may set or clear it
+ // depending on whether the entry guard can actually be replaced.
+ bool UseLoopGuard = false;
+ // Block whose terminator the counter-setting intrinsic is inserted before.
+ // Assigned by InitLoopCount and read by InsertIterationSetup.
+ BasicBlock *BeginBB = nullptr;
};
}
void HardwareLoop::Create() {
LLVM_DEBUG(dbgs() << "HWLoops: Converting loop..\n");
- BasicBlock *BeginBB = L->getLoopPreheader();
- Value *LoopCountInit = InitLoopCount(BeginBB);
+
+ Value *LoopCountInit = InitLoopCount();
if (!LoopCountInit)
return;
- InsertIterationSetup(LoopCountInit, BeginBB);
+ InsertIterationSetup(LoopCountInit);
if (UsePHICounter || ForceHardwareLoopPHI) {
Instruction *LoopDec = InsertLoopRegDec(LoopCountInit);
DeleteDeadPHIs(I);
}
-Value *HardwareLoop::InitLoopCount(BasicBlock *BB) {
+/// Decide whether the branch that guards entry to \p L can be rewritten to use
+/// the result of a 'test and set' loop-iterations intrinsic.
+///
+/// This holds only when all of the following are true:
+///  - the preheader of \p L has exactly one predecessor;
+///  - that predecessor ends in a conditional branch on an integer compare;
+///  - the compare is an eq/ne of \p Count (the very same Value) against a
+///    constant zero, with the operands in either order;
+///  - the successor taken when Count is non-zero is the preheader.
+///
+/// \param L     Loop being converted; its preheader is dereferenced without a
+///              null check.
+/// \param Count Expanded trip count that the guard must be comparing.
+/// \returns true if the guarding branch has the shape described above.
+static bool CanGenerateTest(Loop *L, Value *Count) {
+  BasicBlock *Preheader = L->getLoopPreheader();
+  BasicBlock *Pred = Preheader->getSinglePredecessor();
+  if (!Pred)
+    return false;
+
+  // dyn_cast<> rather than isa<> followed by cast<>: one checked conversion.
+  auto *BI = dyn_cast<BranchInst>(Pred->getTerminator());
+  if (!BI || BI->isUnconditional())
+    return false;
+
+  // Check that the icmp is checking for equality of Count and zero and that
+  // a non-zero value results in entering the loop.
+  auto *ICmp = dyn_cast<ICmpInst>(BI->getCondition());
+  if (!ICmp || !ICmp->isEquality())
+    return false;
+
+  // True when operand OpIdx is the constant zero and the other one is Count.
+  auto IsCompareZero = [](ICmpInst *ICmp, Value *Count, unsigned OpIdx) {
+    if (auto *Const = dyn_cast<ConstantInt>(ICmp->getOperand(OpIdx)))
+      return Const->isZero() && ICmp->getOperand(OpIdx ^ 1) == Count;
+    return false;
+  };
+
+  if (!IsCompareZero(ICmp, Count, 0) && !IsCompareZero(ICmp, Count, 1))
+    return false;
+
+  // 'ne' enters the loop through the true edge (successor 0), 'eq' through
+  // the false edge (successor 1).
+  unsigned SuccIdx = ICmp->getPredicate() == ICmpInst::ICMP_NE ? 0 : 1;
+  return BI->getSuccessor(SuccIdx) == Preheader;
+}
+
+// Expand ExitCount into an IR value of type CountType and decide where the
+// counter-setting intrinsic will go. On return UseLoopGuard says whether the
+// guarded form will be used and BeginBB names the insertion block. Returns
+// nullptr when ExitCount is not safe to expand at the chosen point.
+Value *HardwareLoop::InitLoopCount() {
+ LLVM_DEBUG(dbgs() << "HWLoops: Initialising loop counter value:\n");
+ // Can we replace a conditional branch with an intrinsic that sets the
+ // loop counter and tests that it is not zero?
+
SCEVExpander SCEVE(SE, DL, "loopcnt");
if (!ExitCount->getType()->isPointerTy() &&
ExitCount->getType() != CountType)
ExitCount = SE.getAddExpr(ExitCount, SE.getOne(CountType));
+ // If we're trying to use the 'test and set' form of the intrinsic, we need
+ // to replace a conditional branch that is controlling entry to the loop. It
+ // is likely (guaranteed?) that the preheader has an unconditional branch to
+ // the loop header, so also check if it has a single predecessor.
+ if (SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, ExitCount,
+ SE.getZero(ExitCount->getType()))) {
+ LLVM_DEBUG(dbgs() << " - Attempting to use test.set counter.\n");
+ UseLoopGuard |= ForceGuardLoopEntry;
+ } else
+ UseLoopGuard = false;
+
+ // For the guarded form, expand the count in the block that branches into
+ // the preheader, since that is where the guarding branch lives.
+ BasicBlock *BB = L->getLoopPreheader();
+ if (UseLoopGuard && BB->getSinglePredecessor() &&
+ cast<BranchInst>(BB->getTerminator())->isUnconditional())
+ BB = BB->getSinglePredecessor();
+
if (!isSafeToExpandAt(ExitCount, BB->getTerminator(), SE)) {
- LLVM_DEBUG(dbgs() << "HWLoops: Bailing, unsafe to expand ExitCount "
+ LLVM_DEBUG(dbgs() << "- Bailing, unsafe to expand ExitCount "
<< *ExitCount << "\n");
return nullptr;
}
Value *Count = SCEVE.expandCodeFor(ExitCount, CountType,
BB->getTerminator());
- LLVM_DEBUG(dbgs() << "HWLoops: Loop Count: " << *Count << "\n");
+
+ // FIXME: We've expanded Count where we hope to insert the counter setting
+ // intrinsic. But, in the case of the 'test and set' form, we may fall back
+ // to just the 'set' form, in which case the insertion block is most likely
+ // different. It means there will be instruction(s) in a block that possibly
+ // aren't needed. The isLoopEntryGuardedByCond is trying to avoid this issue,
+ // but it doesn't appear to work in all cases.
+
+ // Keep the guarded form only if the guarding branch is literally an eq/ne
+ // compare of Count with zero; otherwise insert in the preheader as before.
+ UseLoopGuard = UseLoopGuard && CanGenerateTest(L, Count);
+ BeginBB = UseLoopGuard ? BB : L->getLoopPreheader();
+ LLVM_DEBUG(dbgs() << " - Loop Count: " << *Count << "\n"
+ << " - Expanded Count in " << BB->getName() << "\n"
+ << " - Will insert set counter intrinsic into: "
+ << BeginBB->getName() << "\n");
return Count;
}
-void HardwareLoop::InsertIterationSetup(Value *LoopCountInit,
- BasicBlock *BB) {
- IRBuilder<> Builder(BB->getTerminator());
+// Emit the intrinsic that loads the loop counter with LoopCountInit, placed
+// before the terminator of BeginBB. With UseLoopGuard set, the
+// test_set_loop_iterations form is emitted and its result becomes the
+// condition of BeginBB's conditional branch.
+void HardwareLoop::InsertIterationSetup(Value *LoopCountInit) {
+ IRBuilder<> Builder(BeginBB->getTerminator());
Type *Ty = LoopCountInit->getType();
- Function *LoopIter =
- Intrinsic::getDeclaration(M, Intrinsic::set_loop_iterations, Ty);
- Builder.CreateCall(LoopIter, LoopCountInit);
+ Intrinsic::ID ID = UseLoopGuard ?
+ Intrinsic::test_set_loop_iterations : Intrinsic::set_loop_iterations;
+ Function *LoopIter = Intrinsic::getDeclaration(M, ID, Ty);
+ Value *SetCount = Builder.CreateCall(LoopIter, LoopCountInit);
+
+ // Use the return value of the intrinsic to control the entry of the loop.
+ if (UseLoopGuard) {
+ assert((isa<BranchInst>(BeginBB->getTerminator()) &&
+ cast<BranchInst>(BeginBB->getTerminator())->isConditional()) &&
+ "Expected conditional branch");
+ auto *LoopGuard = cast<BranchInst>(BeginBB->getTerminator());
+ LoopGuard->setCondition(SetCount);
+ // Make the preheader the first (taken-on-true) successor; this covers
+ // guards that were written as an 'eq' compare with the loop on the false
+ // edge.
+ if (LoopGuard->getSuccessor(0) != L->getLoopPreheader())
+ LoopGuard->swapSuccessors();
+ }
+ LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop counter: "
+ << *SetCount << "\n");
}
void HardwareLoop::InsertLoopDec() {
--- /dev/null
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-guard=true -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EXIT
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-guard=true -force-hardware-loop-phi=true -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LATCH
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-guard=false -S %s -o - | FileCheck %s --check-prefix=NO-GUARD
+
+; NO-GUARD-NOT: @llvm.test.set.loop.iterations
+
+; CHECK-LABEL: test1
+; CHECK: entry:
+; CHECK: [[CMP:%[^ ]+]] = icmp ugt i32 %N, 2
+; CHECK: [[MAX:%[^ ]+]] = select i1 [[CMP]], i32 %N, i32 2
+; CHECK: [[COUNT:%[^ ]+]] = add i32 [[MAX]], -1
+; CHECK: br i1 %t1, label %do.body.preheader
+; CHECK: do.body.preheader:
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK: br label %do.body
+; do-while loop counting from 1 whose entry branch tests %t1 rather than
+; comparing the trip count with zero. The checks above expect the unguarded
+; set.loop.iterations form.
+define void @test1(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+ br i1 %t1, label %do.body, label %if.end
+
+do.body: ; preds = %do.body, %entry
+ %b.addr.0 = phi i32* [ %incdec.ptr, %do.body ], [ %b, %entry ]
+ %a.addr.0 = phi i32* [ %incdec.ptr1, %do.body ], [ %a, %entry ]
+ %i.0 = phi i32 [ %inc, %do.body ], [ 1, %entry ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.0, i32 1
+ %tmp = load i32, i32* %b.addr.0, align 4
+ %incdec.ptr1 = getelementptr inbounds i32, i32* %a.addr.0, i32 1
+ store i32 %tmp, i32* %a.addr.0, align 4
+ %inc = add nuw i32 %i.0, 1
+ %cmp = icmp ult i32 %inc, %N
+ br i1 %cmp, label %do.body, label %if.end
+
+if.end: ; preds = %do.body, %entry
+ ret void
+}
+
+; CHECK-LABEL: test2
+; CHECK-NOT: call i1 @llvm.test.set.loop.iterations
+; CHECK-NOT: call void @llvm.set.loop.iterations
+; Same control flow as test1, but the induction variable steps by 2 using an
+; add without nuw. The checks above expect no loop-iterations intrinsic to be
+; emitted at all.
+define void @test2(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+ br i1 %t1, label %do.body, label %if.end
+
+do.body: ; preds = %do.body, %entry
+ %b.addr.0 = phi i32* [ %incdec.ptr, %do.body ], [ %b, %entry ]
+ %a.addr.0 = phi i32* [ %incdec.ptr1, %do.body ], [ %a, %entry ]
+ %i.0 = phi i32 [ %add, %do.body ], [ 1, %entry ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.0, i32 1
+ %tmp = load i32, i32* %b.addr.0, align 4
+ %incdec.ptr1 = getelementptr inbounds i32, i32* %a.addr.0, i32 1
+ store i32 %tmp, i32* %a.addr.0, align 4
+ %add = add i32 %i.0, 2
+ %cmp = icmp ult i32 %add, %N
+ br i1 %cmp, label %do.body, label %if.end
+
+if.end: ; preds = %do.body, %entry
+ ret void
+}
+
+; CHECK-LABEL: test3
+; CHECK: entry:
+; CHECK: [[CMP:%[^ ]+]] = icmp ugt i32 %N, 1
+; CHECK: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %N, i32 1
+; CHECK: br i1 %brmerge.demorgan, label %do.body.preheader
+; CHECK: do.body.preheader:
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK: br label %do.body
+; do-while loop counting from 0 whose entry branch tests (%t1 & %t2). The
+; checks above expect the unguarded set.loop.iterations form.
+define void @test3(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+ %brmerge.demorgan = and i1 %t1, %t2
+ br i1 %brmerge.demorgan, label %do.body, label %if.end
+
+do.body: ; preds = %do.body, %entry
+ %b.addr.0 = phi i32* [ %incdec.ptr, %do.body ], [ %b, %entry ]
+ %a.addr.0 = phi i32* [ %incdec.ptr3, %do.body ], [ %a, %entry ]
+ %i.0 = phi i32 [ %inc, %do.body ], [ 0, %entry ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.0, i32 1
+ %tmp = load i32, i32* %b.addr.0, align 4
+ %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.0, i32 1
+ store i32 %tmp, i32* %a.addr.0, align 4
+ %inc = add nuw i32 %i.0, 1
+ %cmp = icmp ult i32 %inc, %N
+ br i1 %cmp, label %do.body, label %if.end
+
+if.end: ; preds = %do.body, %entry
+ ret void
+}
+
+; CHECK-LABEL: test4
+; CHECK: entry:
+; CHECK-LATCH: br i1 %brmerge.demorgan, label %while.cond
+; CHECK-LATCH-NOT: call void @llvm{{.*}}loop.iterations
+; CHECK-EXIT: br i1 %brmerge.demorgan, label %while.cond.preheader
+; CHECK-EXIT: while.cond.preheader:
+; CHECK-EXIT: [[COUNT:%[^ ]+]] = add i32 %N, 1
+; CHECK-EXIT: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-EXIT: br label %while.cond
+; while loop (exit test in the header, "icmp eq" against %N) entered on
+; (%t1 & %t2). The expected output differs per RUN line: the default run
+; inserts set.loop.iterations, the -force-hardware-loop-phi run inserts no
+; loop-iterations intrinsic.
+define void @test4(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+ %brmerge.demorgan = and i1 %t1, %t2
+ br i1 %brmerge.demorgan, label %while.cond, label %if.end
+
+while.cond: ; preds = %while.body, %entry
+ %b.addr.0 = phi i32* [ %incdec.ptr, %while.body ], [ %b, %entry ]
+ %a.addr.0 = phi i32* [ %incdec.ptr3, %while.body ], [ %a, %entry ]
+ %i.0 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+ %exitcond = icmp eq i32 %i.0, %N
+ br i1 %exitcond, label %if.end, label %while.body
+
+while.body: ; preds = %while.cond
+ %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.0, i32 1
+ %tmp = load i32, i32* %b.addr.0, align 4
+ %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.0, i32 1
+ store i32 %tmp, i32* %a.addr.0, align 4
+ %inc = add i32 %i.0, 1
+ br label %while.cond
+
+if.end: ; preds = %while.cond, %entry
+ ret void
+}
+
+; CHECK-LABEL: test5
+; CHECK: entry:
+; CHECK: br i1 %or.cond, label %while.body.preheader
+; CHECK: while.body.preheader:
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK: br label %while.body
+; The "N != 0" test is and-ed with (%t1 & %t2) into one branch condition, so
+; the entry branch is not directly on the compare. The checks above expect the
+; unguarded set.loop.iterations form.
+define void @test5(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+ %brmerge.demorgan = and i1 %t1, %t2
+ %cmp6 = icmp ne i32 %N, 0
+ %or.cond = and i1 %brmerge.demorgan, %cmp6
+ br i1 %or.cond, label %while.body, label %if.end
+
+while.body: ; preds = %while.body, %entry
+ %i.09 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+ %a.addr.08 = phi i32* [ %incdec.ptr3, %while.body ], [ %a, %entry ]
+ %b.addr.07 = phi i32* [ %incdec.ptr, %while.body ], [ %b, %entry ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.07, i32 1
+ %tmp = load i32, i32* %b.addr.07, align 4
+ %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.08, i32 1
+ store i32 %tmp, i32* %a.addr.08, align 4
+ %inc = add nuw i32 %i.09, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %if.end, label %while.body
+
+if.end: ; preds = %while.body, %entry
+ ret void
+}
+
+; CHECK-LABEL: test6
+; CHECK: entry:
+; CHECK: br i1 %brmerge.demorgan, label %while.preheader
+; CHECK: while.preheader:
+; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
+; CHECK: br i1 [[TEST]], label %while.body.preheader, label %if.end
+; CHECK: while.body.preheader:
+; CHECK: br label %while.body
+; A dedicated block (while.preheader) branches on "icmp ne %N, 0" straight
+; into the loop. The checks above expect test.set.loop.iterations to be
+; inserted there and its result to drive that branch.
+define void @test6(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+ %brmerge.demorgan = and i1 %t1, %t2
+ br i1 %brmerge.demorgan, label %while.preheader, label %if.end
+
+while.preheader: ; preds = %entry
+ %cmp = icmp ne i32 %N, 0
+ br i1 %cmp, label %while.body, label %if.end
+
+while.body: ; preds = %while.body, %while.preheader
+ %i.09 = phi i32 [ %inc, %while.body ], [ 0, %while.preheader ]
+ %a.addr.08 = phi i32* [ %incdec.ptr3, %while.body ], [ %a, %while.preheader ]
+ %b.addr.07 = phi i32* [ %incdec.ptr, %while.body ], [ %b, %while.preheader ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.07, i32 1
+ %tmp = load i32, i32* %b.addr.07, align 4
+ %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.08, i32 1
+ store i32 %tmp, i32* %a.addr.08, align 4
+ %inc = add nuw i32 %i.09, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %if.end, label %while.body
+
+if.end: ; preds = %while.body, %while.preheader, %entry
+ ret void
+}
+
+; CHECK-LABEL: test7
+; CHECK: entry:
+; CHECK: br i1 %brmerge.demorgan, label %while.preheader
+; CHECK: while.preheader:
+; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
+; CHECK: br i1 [[TEST]], label %while.body.preheader, label %if.end
+; CHECK: while.body.preheader:
+; CHECK: br label %while.body
+; As test6, but the guard is "icmp eq %N, 0" with the loop as the false
+; successor. The checks above expect the rewritten branch to list the loop
+; preheader first.
+define void @test7(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+ %brmerge.demorgan = and i1 %t1, %t2
+ br i1 %brmerge.demorgan, label %while.preheader, label %if.end
+
+while.preheader: ; preds = %entry
+ %cmp = icmp eq i32 %N, 0
+ br i1 %cmp, label %if.end, label %while.body
+
+while.body: ; preds = %while.body, %while.preheader
+ %i.09 = phi i32 [ %inc, %while.body ], [ 0, %while.preheader ]
+ %a.addr.08 = phi i32* [ %incdec.ptr3, %while.body ], [ %a, %while.preheader ]
+ %b.addr.07 = phi i32* [ %incdec.ptr, %while.body ], [ %b, %while.preheader ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.07, i32 1
+ %tmp = load i32, i32* %b.addr.07, align 4
+ %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.08, i32 1
+ store i32 %tmp, i32* %a.addr.08, align 4
+ %inc = add nuw i32 %i.09, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %if.end, label %while.body
+
+if.end: ; preds = %while.body, %while.preheader, %entry
+ ret void
+}
+
+; TODO: Can we rearrange the conditional blocks so that we can use the test form?
+; CHECK-LABEL: test8
+; CHECK: entry:
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 %N, 0
+; CHECK: br i1 [[CMP]], label %while.preheader
+; CHECK: while.preheader:
+; CHECK: br i1 %brmerge.demorgan, label %while.body.preheader
+; CHECK: while.body.preheader:
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK: br label %while.body
+; The "N != 0" test sits in entry, but the block that branches into the loop
+; (while.preheader) tests (%t1 & %t2). The checks above expect the unguarded
+; set.loop.iterations form.
+define void @test8(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+ %cmp = icmp ne i32 %N, 0
+ br i1 %cmp, label %while.preheader, label %if.end
+
+while.preheader: ; preds = %entry
+ %brmerge.demorgan = and i1 %t1, %t2
+ br i1 %brmerge.demorgan, label %while.body, label %if.end
+
+while.body: ; preds = %while.body, %while.preheader
+ %i.09 = phi i32 [ %inc, %while.body ], [ 0, %while.preheader ]
+ %a.addr.08 = phi i32* [ %incdec.ptr3, %while.body ], [ %a, %while.preheader ]
+ %b.addr.07 = phi i32* [ %incdec.ptr, %while.body ], [ %b, %while.preheader ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.07, i32 1
+ %tmp = load i32, i32* %b.addr.07, align 4
+ %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.08, i32 1
+ store i32 %tmp, i32* %a.addr.08, align 4
+ %inc = add nuw i32 %i.09, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %if.end, label %while.body
+
+if.end: ; preds = %while.body, %while.preheader, %entry
+ ret void
+}
+
+; CHECK-LABEL: test9
+; CHECK: entry:
+; CHECK: br i1 %brmerge.demorgan, label %do.body.preheader
+; CHECK: do.body.preheader:
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK: br label %do.body
+; do-while loop whose entry condition is (%t1 & (N != 0)), i.e. the compare is
+; combined with another condition. The checks above expect the unguarded
+; set.loop.iterations form.
+define void @test9(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+ %cmp = icmp ne i32 %N, 0
+ %brmerge.demorgan = and i1 %t1, %cmp
+ br i1 %brmerge.demorgan, label %do.body, label %if.end
+
+do.body: ; preds = %do.body, %entry
+ %b.addr.0 = phi i32* [ %incdec.ptr, %do.body ], [ %b, %entry ]
+ %a.addr.0 = phi i32* [ %incdec.ptr3, %do.body ], [ %a, %entry ]
+ %i.0 = phi i32 [ %inc, %do.body ], [ 0, %entry ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.0, i32 1
+ %tmp = load i32, i32* %b.addr.0, align 4
+ %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.0, i32 1
+ store i32 %tmp, i32* %a.addr.0, align 4
+ %inc = add nuw i32 %i.0, 1
+ %cmp.1 = icmp ult i32 %inc, %N
+ br i1 %cmp.1, label %do.body, label %if.end
+
+if.end: ; preds = %do.body, %entry
+ ret void
+}
+
+; CHECK-LABEL: test10
+; CHECK: entry:
+; CHECK: br i1 %cmp.1, label %do.body.preheader
+; CHECK: do.body.preheader:
+; CHECK: call void @llvm.set.loop.iterations.i32(i32
+; CHECK: br label %do.body
+; The entry branch is an "icmp ne ..., 0", but on %be (a select of 0 and
+; %N - 1) rather than on %N. The checks above expect the unguarded
+; set.loop.iterations form.
+define void @test10(i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+ %cmp = icmp ne i32 %N, 0
+ %sub = sub i32 %N, 1
+ %be = select i1 %cmp, i32 0, i32 %sub
+ %cmp.1 = icmp ne i32 %be, 0
+ br i1 %cmp.1, label %do.body, label %if.end
+
+do.body: ; preds = %do.body, %entry
+ %b.addr.0 = phi i32* [ %incdec.ptr, %do.body ], [ %b, %entry ]
+ %a.addr.0 = phi i32* [ %incdec.ptr3, %do.body ], [ %a, %entry ]
+ %i.0 = phi i32 [ %inc, %do.body ], [ 0, %entry ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.0, i32 1
+ %tmp = load i32, i32* %b.addr.0, align 4
+ %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.0, i32 1
+ store i32 %tmp, i32* %a.addr.0, align 4
+ %inc = add nuw i32 %i.0, 1
+ %cmp.2 = icmp ult i32 %inc, %N
+ br i1 %cmp.2, label %do.body, label %if.end
+
+if.end: ; preds = %do.body, %entry
+ ret void
+}
+
+; CHECK-LABEL: test11
+; CHECK: entry:
+; CHECK: br label %do.body.preheader
+; CHECK: do.body.preheader:
+; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
+; CHECK: br i1 [[TEST]], label %do.body.preheader1, label %if.end
+; CHECK: do.body.preheader1:
+; CHECK: br label %do.body
+; entry branches unconditionally to do.body.preheader, which holds the
+; "icmp ne %N, 0" guard. The checks above expect test.set.loop.iterations to
+; be inserted in that block.
+define void @test11(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+ br label %do.body.preheader
+
+do.body.preheader:
+ %cmp = icmp ne i32 %N, 0
+ br i1 %cmp, label %do.body, label %if.end
+
+do.body:
+ %b.addr.0 = phi i32* [ %incdec.ptr, %do.body ], [ %b, %do.body.preheader ]
+ %a.addr.0 = phi i32* [ %incdec.ptr3, %do.body ], [ %a, %do.body.preheader ]
+ %i.0 = phi i32 [ %inc, %do.body ], [ 0, %do.body.preheader ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.0, i32 1
+ %tmp = load i32, i32* %b.addr.0, align 4
+ %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.0, i32 1
+ store i32 %tmp, i32* %a.addr.0, align 4
+ %inc = add nuw i32 %i.0, 1
+ %cmp.1 = icmp ult i32 %inc, %N
+ br i1 %cmp.1, label %do.body, label %if.end
+
+if.end: ; preds = %do.body, %do.body.preheader
+ ret void
+}
; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEC
; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-phi=true -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-REGDEC
; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-nested-hardware-loop=true -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEC --check-prefix=CHECK-NESTED
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-guard=true -S %s -o - | FileCheck %s --check-prefix=CHECK-GUARD
+; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-phi=true -force-hardware-loop-guard=true -S %s -o - | FileCheck %s --check-prefix=CHECK-GUARD
; CHECK-LABEL: while_lt
define void @while_lt(i32 %i, i32 %N, i32* nocapture %A) {
%cmp4 = icmp ult i32 %i, %N
br i1 %cmp4, label %while.body, label %while.end
+; CHECK-GUARD-LABEL: while_lt
+; CHECK-GUARD: [[COUNT:%[^ ]+]] = sub i32 %N, %i
+; CHECK-GUARD: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-GUARD: br label %while.body
+
; CHECK: while.body.preheader:
; CHECK: [[COUNT:%[^ ]+]] = sub i32 %N, %i
; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
ret void
}
+; CHECK-GUARD-LABEL: while_gte
+; CHECK-GUARD: entry:
+; CHECK-GUARD: br i1 %cmp4, label %while.end, label %while.body.preheader
+; CHECK-GUARD: while.body.preheader:
+; CHECK-GUARD: [[ADD:%[^ ]+]] = add i32 %i, 1
+; CHECK-GUARD: [[SEL:%[^ ]+]] = icmp slt i32 %N, %i
+; CHECK-GUARD: [[MIN:%[^ ]+]] = select i1 [[SEL]], i32 %N, i32 %i
+; CHECK-GUARD: [[COUNT:%[^ ]+]] = sub i32 [[ADD]], [[MIN]]
+; CHECK-GUARD: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK-GUARD: br label %while.body
+
; CHECK-LABEL: while_gte
; CHECK: while.body.preheader:
; CHECK: [[ADD:%[^ ]+]] = add i32 %i, 1
ret void
}
+; CHECK-GUARD-LABEL: while_ne
+; CHECK-GUARD: entry:
+; CHECK-GUARD: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
+; CHECK-GUARD: br i1 [[TEST]], label %while.body.preheader, label %while.end
+; CHECK-GUARD: while.body.preheader:
+; CHECK-GUARD: br label %while.body
+; Guard "icmp ne %N, 0" in entry with the loop as the true successor. With
+; -force-hardware-loop-guard=true the checks above expect
+; test.set.loop.iterations in entry.
+define void @while_ne(i32 %N, i32* nocapture %A) {
+entry:
+ %cmp = icmp ne i32 %N, 0
+ br i1 %cmp, label %while.body, label %while.end
+
+while.body:
+ %i.addr.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.addr.05
+ store i32 %i.addr.05, i32* %arrayidx, align 4
+ %inc = add nuw i32 %i.addr.05, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %while.end, label %while.body
+
+while.end:
+ ret void
+}
+
+; CHECK-GUARD-LABEL: while_eq
+; CHECK-GUARD: entry:
+; CHECK-GUARD: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
+; CHECK-GUARD: br i1 [[TEST]], label %while.body.preheader, label %while.end
+; CHECK-GUARD: while.body.preheader:
+; CHECK-GUARD: br label %while.body
+; Guard "icmp eq %N, 0" in entry with the loop as the false successor. With
+; -force-hardware-loop-guard=true the checks above expect
+; test.set.loop.iterations in entry, with the loop preheader listed first.
+define void @while_eq(i32 %N, i32* nocapture %A) {
+entry:
+ %cmp = icmp eq i32 %N, 0
+ br i1 %cmp, label %while.end, label %while.body
+
+while.body:
+ %i.addr.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.addr.05
+ store i32 %i.addr.05, i32* %arrayidx, align 4
+ %inc = add nuw i32 %i.addr.05, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %while.end, label %while.body
+
+while.end:
+ ret void
+}
+
+; CHECK-GUARD-LABEL: while_preheader_eq
+; CHECK-GUARD: entry:
+; CHECK-GUARD: br label %preheader
+; CHECK-GUARD: preheader:
+; CHECK-GUARD: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
+; CHECK-GUARD: br i1 [[TEST]], label %while.body.preheader, label %while.end
+; CHECK-GUARD: while.body.preheader:
+; CHECK-GUARD: br label %while.body
+; Like while_eq, but the "icmp eq %N, 0" guard sits in a separate block
+; (preheader) reached by an unconditional branch from entry. The checks above
+; expect test.set.loop.iterations in that block.
+define void @while_preheader_eq(i32 %N, i32* nocapture %A) {
+entry:
+ br label %preheader
+
+preheader:
+ %cmp = icmp eq i32 %N, 0
+ br i1 %cmp, label %while.end, label %while.body
+
+while.body:
+ %i.addr.05 = phi i32 [ %inc, %while.body ], [ 0, %preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.addr.05
+ store i32 %i.addr.05, i32* %arrayidx, align 4
+ %inc = add nuw i32 %i.addr.05, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %while.end, label %while.body
+
+while.end:
+ ret void
+}
+
; CHECK-LABEL: nested
; CHECK-NESTED: call void @llvm.set.loop.iterations.i32(i32 %N)
; CHECK-NESTED: br label %while.cond1.preheader.us
; CHECK-NESTED: [[LOOP_DEC1:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
; CHECK-NESTED: br i1 [[LOOP_DEC1]], label %while.cond1.preheader.us, label %while.end7
+; CHECK-GUARD: while.cond1.preheader.us:
+; CHECK-GUARD: call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK-GUARD: br label %while.body3.us
+
define void @nested(i32* nocapture %A, i32 %N) {
entry:
%cmp20 = icmp eq i32 %N, 0