From 22ee3f63b9400786a17439a166fbfc31a7c5338d Mon Sep 17 00:00:00 2001 From: Andrea Di Biagio Date: Sun, 28 Dec 2014 11:07:35 +0000 Subject: [PATCH] [CodeGenPrepare] Teach when it is profitable to speculate calls to @llvm.cttz/ctlz. If the control flow is modelling an if-statement where the only instruction in the 'then' basic block (excluding the terminator) is a call to cttz/ctlz, CodeGenPrepare can try to speculate the cttz/ctlz call and simplify the control flow graph. Example: \code entry: %cmp = icmp eq i64 %val, 0 br i1 %cmp, label %end.bb, label %then.bb then.bb: %c = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) br label %end.bb end.bb: %cond = phi i64 [ %c, %then.bb ], [ 64, %entry] \code In this example, basic block %then.bb is taken if value %val is not zero. Also, the phi node in %end.bb would propagate the size-of in bits of %val only if %val is equal to zero. With this patch, CodeGenPrepare will try to hoist the call to cttz from %then.bb into basic block %entry only if cttz is cheap to speculate for the target. Added two new hooks in TargetLowering.h to let targets customize the behavior (i.e. decide whether it is cheap or not to speculate calls to cttz/ctlz). The two new methods are 'isCheapToSpeculateCtlz' and 'isCheapToSpeculateCttz'. By default, both methods return 'false'. On X86, method 'isCheapToSpeculateCtlz' returns true only if the target has LZCNT. Method 'isCheapToSpeculateCttz' only returns true if the target has BMI. Differential Revision: http://reviews.llvm.org/D6728 llvm-svn: 224899 --- llvm/include/llvm/Target/TargetLowering.h | 10 ++ llvm/lib/CodeGen/CodeGenPrepare.cpp | 141 +++++++++++++++++ llvm/lib/Target/X86/X86ISelLowering.cpp | 10 ++ llvm/lib/Target/X86/X86ISelLowering.h | 4 + llvm/test/CodeGen/X86/cttz-ctlz.ll | 250 ++++++++++++++++++++++++++++++ 5 files changed, 415 insertions(+) create mode 100644 llvm/test/CodeGen/X86/cttz-ctlz.ll diff --git a/llvm/include/llvm/Target/TargetLowering.h b/llvm/include/llvm/Target/TargetLowering.h index 9c01897..9a04d3c 100644 --- a/llvm/include/llvm/Target/TargetLowering.h +++ b/llvm/include/llvm/Target/TargetLowering.h @@ -249,6 +249,16 @@ public: return true; } + /// \brief Return true if it is cheap to speculate a call to intrinsic cttz. + virtual bool isCheapToSpeculateCttz() const { + return false; + } + + /// \brief Return true if it is cheap to speculate a call to intrinsic ctlz. + virtual bool isCheapToSpeculateCtlz() const { + return false; + } + /// \brief Return if the target supports combining a /// chain like: /// \code diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index aba0d58..55a4637 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -3978,6 +3978,119 @@ void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) { Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted); } +// See if we can speculate calls to intrinsic cttz/ctlz. +// +// Example: +// entry: +// ... +// %cmp = icmp eq i64 %val, 0 +// br i1 %cmp, label %end.bb, label %then.bb +// +// then.bb: +// %c = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) +// br label %EndBB +// +// end.bb: +// %cond = phi i64 [ %c, %then.bb ], [ 64, %entry ] +// +// ==> +// +// entry: +// ... +// %c = tail call i64 @llvm.cttz.i64(i64 %val, i1 false) +// +static bool OptimizeBranchInst(BranchInst *BrInst, const TargetLowering &TLI) { + assert(BrInst->isConditional() && "Expected a conditional branch!"); + BasicBlock *ThenBB = BrInst->getSuccessor(1); + BasicBlock *EndBB = BrInst->getSuccessor(0); + + // See if ThenBB contains only one instruction (excluding the + // terminator and DbgInfoIntrinsic calls). + IntrinsicInst *II = nullptr; + for (BasicBlock::iterator I = ThenBB->begin(), + E = std::prev(ThenBB->end()); I != E; ++I) { + // Skip debug info. + if (isa(I)) + continue; + + if (II) + // Avoid speculating more than one instruction. + return false; + + // See if this is a call to intrinsic cttz/ctlz. + if (match(cast(I), m_Intrinsic())) { + // Avoid speculating expensive intrinsic calls. + if (!TLI.isCheapToSpeculateCttz()) + return false; + } + else if (match(cast(I), m_Intrinsic())) { + // Avoid speculating expensive intrinsic calls. + if (!TLI.isCheapToSpeculateCtlz()) + return false; + } else + return false; + + II = cast(I); + } + + // Look for PHI nodes with 'II' as the incoming value from 'ThenBB'. + BasicBlock *EntryBB = BrInst->getParent(); + for (BasicBlock::iterator I = EndBB->begin(); + PHINode *PN = dyn_cast(I); ++I) { + Value *ThenV = PN->getIncomingValueForBlock(ThenBB); + Value *OrigV = PN->getIncomingValueForBlock(EntryBB); + + if (!OrigV || ThenV != II) + return false; + + if (ConstantInt *CInt = dyn_cast(OrigV)) { + unsigned BitWidth = ThenV->getType()->getIntegerBitWidth(); + + // Don't try to simplify this phi node if 'ThenV' is a cttz/ctlz + // intrinsic call, but 'OrigV' is not equal to the 'size-of' in bits + // of the value in input to the cttz/ctlz. + if (CInt->getValue() != BitWidth) + return false; + + // Hoist the call to cttz/ctlz from ThenBB into EntryBB. + EntryBB->getInstList().splice(BrInst, ThenBB->getInstList(), + ThenBB->begin(), std::prev(ThenBB->end())); + + // Update PN setting ThenV as the incoming value from both 'EntryBB' + // and 'ThenBB'. Eventually, method 'OptimizeInst' will fold this + // phi node if all the incoming values are the same. + PN->setIncomingValue(PN->getBasicBlockIndex(EntryBB), ThenV); + PN->setIncomingValue(PN->getBasicBlockIndex(ThenBB), ThenV); + + // Clear the 'undef on zero' flag of the cttz/ctlz intrinsic call. + if (cast(II->getArgOperand(1))->isOne()) { + Type *Ty = II->getArgOperand(0)->getType(); + Value *Args[] = { II->getArgOperand(0), + ConstantInt::getFalse(II->getContext()) }; + Module *M = EntryBB->getParent()->getParent(); + Value *IF = Intrinsic::getDeclaration(M, II->getIntrinsicID(), Ty); + IRBuilder<> Builder(BrInst); + Instruction *NewI = Builder.CreateCall(IF, Args); + + // Replace the old call to cttz/ctlz. + II->replaceAllUsesWith(NewI); + II->eraseFromParent(); + } + + // Update BrInst condition so that the branch to EndBB is always taken. + // Later on, method 'ConstantFoldTerminator' will simplify this branch + // replacing it with a direct branch to 'EndBB'. + // As a side effect, CodeGenPrepare will attempt to simplify the control + // flow graph by deleting basic block 'ThenBB' and merging 'EntryBB' into + // 'EndBB' (calling method 'EliminateFallThrough'). + BrInst->setCondition(ConstantInt::getTrue(BrInst->getContext())); + return true; + } + } + + return false; +} + /// Some targets can do store(extractelement) with one instruction. /// Try to push the extractelement towards the stores when the target /// has this feature and this is profitable. @@ -4130,6 +4243,34 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) { if (isa(I)) return OptimizeExtractElementInst(I); + if (BranchInst *BI = dyn_cast(I)) { + if (TLI && BI->isConditional() && BI->getCondition()->hasOneUse()) { + // Check if the branch condition compares a value agaist zero. + if (ICmpInst *ICI = dyn_cast(BI->getCondition())) { + if (ICI->getPredicate() == ICmpInst::ICMP_EQ && + match(ICI->getOperand(1), m_Zero())) { + BasicBlock *ThenBB = BI->getSuccessor(1); + BasicBlock *EndBB = BI->getSuccessor(0); + + // Check if ThenBB is only reachable from this basic block; also, + // check if EndBB has more than one predecessor. + if (ThenBB->getSinglePredecessor() && + !EndBB->getSinglePredecessor()) { + TerminatorInst *TI = ThenBB->getTerminator(); + + if (TI->getNumSuccessors() == 1 && TI->getSuccessor(0) == EndBB && + // Try to speculate calls to intrinsic cttz/ctlz from 'ThenBB'. + OptimizeBranchInst(BI, *TLI)) { + ModifiedDT = true; + return true; + } + } + } + } + } + return false; + } + return false; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 77a1d6e..c966642 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3882,6 +3882,16 @@ bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, return (Index == 0 || Index == ResVT.getVectorNumElements()); } +bool X86TargetLowering::isCheapToSpeculateCttz() const { + // Don't try to speculate cttz if we can't directly use TZCNT. + return Subtarget->hasBMI(); +} + +bool X86TargetLowering::isCheapToSpeculateCtlz() const { + // Don't try to speculate ctlz if we can't directly use LZCNT. + return Subtarget->hasLZCNT(); +} + /// isUndefOrInRange - Return true if Val is undef or if its value falls within /// the specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index c5e3e14..0b01bae 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -633,6 +633,10 @@ namespace llvm { /// This method returns the name of a target specific DAG node. const char *getTargetNodeName(unsigned Opcode) const override; + bool isCheapToSpeculateCttz() const override; + + bool isCheapToSpeculateCtlz() const override; + /// Return the value type to use for ISD::SETCC. EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; diff --git a/llvm/test/CodeGen/X86/cttz-ctlz.ll b/llvm/test/CodeGen/X86/cttz-ctlz.ll new file mode 100644 index 0000000..88dcf65 --- /dev/null +++ b/llvm/test/CodeGen/X86/cttz-ctlz.ll @@ -0,0 +1,250 @@ +; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -mattr=+bmi < %s | FileCheck %s --check-prefix=ALL --check-prefix=BMI +; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -mattr=+lzcnt < %s | FileCheck %s --check-prefix=ALL --check-prefix=LZCNT +; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefix=ALL --check-prefix=GENERIC + + +define i64 @test1(i64 %A) { +; ALL-LABEL: @test1( +; LZCNT: [[CTLZ:%[A-Za-z0-9]+]] = call i64 @llvm.ctlz.i64(i64 %A, i1 false) +; LZCNT-NEXT: ret i64 [[CTLZ]] +; BMI: icmp eq i64 %A, 0 +; BMI: call i64 @llvm.ctlz.i64(i64 %A, i1 true) +; GENERIC: icmp eq i64 %A, 0 +; GENERIC: call i64 @llvm.ctlz.i64(i64 %A, i1 true) +entry: + %tobool = icmp eq i64 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i64 @llvm.ctlz.i64(i64 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i64 [ %0, %cond.true ], [ 64, %entry ] + ret i64 %cond +} + + +define i32 @test2(i32 %A) { +; ALL-LABEL: @test2( +; LZCNT: [[CTLZ:%[A-Za-z0-9]+]] = call i32 @llvm.ctlz.i32(i32 %A, i1 false) +; LZCNT-NEXT: ret i32 [[CTLZ]] +; BMI: icmp eq i32 %A, 0 +; BMI: call i32 @llvm.ctlz.i32(i32 %A, i1 true) +; GENERIC: icmp eq i32 %A, 0 +; GENERIC: call i32 @llvm.ctlz.i32(i32 %A, i1 true) +entry: + %tobool = icmp eq i32 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i32 @llvm.ctlz.i32(i32 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i32 [ %0, %cond.true ], [ 32, %entry ] + ret i32 %cond +} + + +define signext i16 @test3(i16 signext %A) { +; ALL-LABEL: @test3( +; LZCNT: [[CTLZ:%[A-Za-z0-9]+]] = call i16 @llvm.ctlz.i16(i16 %A, i1 false) +; LZCNT-NEXT: ret i16 [[CTLZ]] +; BMI: icmp eq i16 %A, 0 +; BMI: call i16 @llvm.ctlz.i16(i16 %A, i1 true) +; GENERIC: icmp eq i16 %A, 0 +; GENERIC: call i16 @llvm.ctlz.i16(i16 %A, i1 true) +entry: + %tobool = icmp eq i16 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i16 @llvm.ctlz.i16(i16 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i16 [ %0, %cond.true ], [ 16, %entry ] + ret i16 %cond +} + + +define i64 @test1b(i64 %A) { +; ALL-LABEL: @test1b( +; LZCNT: icmp eq i64 %A, 0 +; LZCNT: call i64 @llvm.cttz.i64(i64 %A, i1 true) +; BMI: [[CTTZ:%[A-Za-z0-9]+]] = call i64 @llvm.cttz.i64(i64 %A, i1 false) +; BMI-NEXT: ret i64 [[CTTZ]] +; GENERIC: icmp eq i64 %A, 0 +; GENERIC: call i64 @llvm.cttz.i64(i64 %A, i1 true) +entry: + %tobool = icmp eq i64 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i64 [ %0, %cond.true ], [ 64, %entry ] + ret i64 %cond +} + + +define i32 @test2b(i32 %A) { +; ALL-LABEL: @test2b( +; LZCNT: icmp eq i32 %A, 0 +; LZCNT: call i32 @llvm.cttz.i32(i32 %A, i1 true) +; BMI: [[CTTZ:%[A-Za-z0-9]+]] = call i32 @llvm.cttz.i32(i32 %A, i1 false) +; BMI-NEXT: ret i32 [[CTTZ]] +; GENERIC: icmp eq i32 %A, 0 +; GENERIC: call i32 @llvm.cttz.i32(i32 %A, i1 true) +entry: + %tobool = icmp eq i32 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i32 @llvm.cttz.i32(i32 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i32 [ %0, %cond.true ], [ 32, %entry ] + ret i32 %cond +} + + +define signext i16 @test3b(i16 signext %A) { +; ALL-LABEL: @test3b( +; LZCNT: icmp eq i16 %A, 0 +; LZCNT: call i16 @llvm.cttz.i16(i16 %A, i1 true) +; BMI: [[CTTZ:%[A-Za-z0-9]+]] = call i16 @llvm.cttz.i16(i16 %A, i1 false) +; BMI-NEXT: ret i16 [[CTTZ]] +; GENERIC: icmp eq i16 %A, 0 +; GENERIC: call i16 @llvm.cttz.i16(i16 %A, i1 true) +entry: + %tobool = icmp eq i16 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i16 @llvm.cttz.i16(i16 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i16 [ %0, %cond.true ], [ 16, %entry ] + ret i16 %cond +} + + +define i64 @test1c(i64 %A) { +; ALL-LABEL: @test1c( +; ALL: icmp eq i64 %A, 0 +; ALL: call i64 @llvm.ctlz.i64(i64 %A, i1 true) +entry: + %tobool = icmp eq i64 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i64 @llvm.ctlz.i64(i64 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i64 [ %0, %cond.true ], [ 63, %entry ] + ret i64 %cond +} + +define i32 @test2c(i32 %A) { +; ALL-LABEL: @test2c( +; ALL: icmp eq i32 %A, 0 +; ALL: call i32 @llvm.ctlz.i32(i32 %A, i1 true) +entry: + %tobool = icmp eq i32 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i32 @llvm.ctlz.i32(i32 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i32 [ %0, %cond.true ], [ 31, %entry ] + ret i32 %cond +} + + +define signext i16 @test3c(i16 signext %A) { +; ALL-LABEL: @test3c( +; ALL: icmp eq i16 %A, 0 +; ALL: call i16 @llvm.ctlz.i16(i16 %A, i1 true) +entry: + %tobool = icmp eq i16 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i16 @llvm.ctlz.i16(i16 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i16 [ %0, %cond.true ], [ 15, %entry ] + ret i16 %cond +} + + +define i64 @test1d(i64 %A) { +; ALL-LABEL: @test1d( +; ALL: icmp eq i64 %A, 0 +; ALL: call i64 @llvm.cttz.i64(i64 %A, i1 true) +entry: + %tobool = icmp eq i64 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i64 [ %0, %cond.true ], [ 63, %entry ] + ret i64 %cond +} + + +define i32 @test2d(i32 %A) { +; ALL-LABEL: @test2d( +; ALL: icmp eq i32 %A, 0 +; ALL: call i32 @llvm.cttz.i32(i32 %A, i1 true) +entry: + %tobool = icmp eq i32 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i32 @llvm.cttz.i32(i32 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i32 [ %0, %cond.true ], [ 31, %entry ] + ret i32 %cond +} + + +define signext i16 @test3d(i16 signext %A) { +; ALL-LABEL: @test3d( +; ALL: icmp eq i16 %A, 0 +; ALL: call i16 @llvm.cttz.i16(i16 %A, i1 true) +entry: + %tobool = icmp eq i16 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i16 @llvm.cttz.i16(i16 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i16 [ %0, %cond.true ], [ 15, %entry ] + ret i16 %cond +} + + +declare i64 @llvm.ctlz.i64(i64, i1) +declare i32 @llvm.ctlz.i32(i32, i1) +declare i16 @llvm.ctlz.i16(i16, i1) +declare i64 @llvm.cttz.i64(i64, i1) +declare i32 @llvm.cttz.i32(i32, i1) +declare i16 @llvm.cttz.i16(i16, i1) -- 2.7.4