From 9af38fc247a34bb2a9b69f3653041ebc9d1cea30 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Fri, 26 Jul 2013 08:20:39 +0000
Subject: [PATCH] Re-implement the analysis of uses in mem2reg to be
 significantly more robust. It now uses an InstVisitor and worklist to
 actually walk the uses of the Alloca transitively and detect the pattern
 which we can directly promote: loads & stores of the whole alloca and
 instructions we can completely ignore.

Also, with this new implementation teach both the predicate for testing
whether we can promote and the promotion engine itself to use the same
code so we no longer have strange divergence between the two code paths.

I've added some silly test cases to demonstrate that we can handle
slightly more degenerate code patterns now. See below for why this is
even interesting.

Performance impact: roughly 1% regression in the performance of SROA or
ScalarRepl on a large C++-ish test case where most of the allocas are
basically ready for promotion. The reason is the silly redundant work
that I've left FIXMEs for and which I'll address in the next commit. I
wanted to separate this commit as it changes the behavior. Once the
redundant work in removing the dead uses of the alloca is fixed, this
code appears to be faster than the old version. =]

So why is this useful? Because the previous requirement for promotion
required a *specific* visit pattern of the uses of the alloca to verify:
we *had* to look for no more than 1 intervening use.

The end goal is to have SROA automatically detect when an alloca is
already promotable and directly hand it to the mem2reg machinery rather
than trying to partition and rewrite it. This is a 25% or more
performance improvement for SROA, and a significant chunk of the delta
between it and ScalarRepl. To get there, we need to make mem2reg
actually capable of promoting allocas which *look* promotable to SROA
without having SROA do tons of work to massage the code into just the
right form.

This is actually the tip of the iceberg. There are tremendous potential
savings we can realize here by de-duplicating work between mem2reg and
SROA.

llvm-svn: 187191
---
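A note on the shape of the new analysis: the snippet below is a minimal,
self-contained sketch of the worklist-driven transitive use walk that
analyzeAlloca adopts in this patch, not part of the patch itself. The
Node and analyzeUses names are hypothetical stand-ins rather than LLVM's
Use/User machinery, and the switch stands in for the InstVisitor
dispatch; it only illustrates the visit pattern: classify each user,
look through "transparent" instructions by enqueueing their users, and
fail fast on anything unrecognized.

#include <cstdio>
#include <set>
#include <vector>

// Hypothetical stand-in for an SSA value and its use list; LLVM's real
// Use/User iterators are replaced by a plain vector of users here.
struct Node {
  enum Kind { Load, Store, BitCast, Other } K;
  std::vector<Node *> Users;
};

// Walk all transitive users of Root with an explicit worklist, mirroring
// the pattern analyzeAlloca uses below: classify each user, look
// *through* transparent instructions by enqueueing their users, and
// reject promotion on anything unrecognized.
static bool analyzeUses(Node &Root) {
  std::vector<Node *> Worklist(Root.Users.begin(), Root.Users.end());
  std::set<Node *> Visited(Worklist.begin(), Worklist.end());
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    switch (N->K) {
    case Node::Load:
    case Node::Store:
      break; // direct loads/stores of the whole value are promotable
    case Node::BitCast:
      // A bitcast doesn't touch memory; its users are uses of the
      // original value too, so queue them (deduplicated) for a visit.
      for (Node *U : N->Users)
        if (Visited.insert(U).second)
          Worklist.push_back(U);
      break;
    case Node::Other:
      return false; // unknown user: cannot promote
    }
  }
  return true;
}

int main() {
  Node L = {Node::Load, {}};
  Node BC = {Node::BitCast, {&L}}; // a load reached through a bitcast
  Node A = {Node::Other, {&BC}};   // the "alloca" being analyzed
  std::printf("promotable: %s\n", analyzeUses(A) ? "yes" : "no");
}

The deduplicating Visited set is what lets this pattern tolerate
repeated and cyclic uses without recursing, which is the robustness the
commit message refers to.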
 .../Transforms/Utils/PromoteMemoryToRegister.cpp | 244 +++++++++++++--------
 llvm/test/Transforms/Mem2Reg/ignore-lifetime.ll  |  26 ---
 llvm/test/Transforms/Mem2Reg/use-analysis.ll     |  70 ++++++
 3 files changed, 227 insertions(+), 113 deletions(-)
 delete mode 100644 llvm/test/Transforms/Mem2Reg/ignore-lifetime.ll
 create mode 100644 llvm/test/Transforms/Mem2Reg/use-analysis.ll

diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 1b51255..7afca82 100644
--- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -30,6 +30,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -45,6 +46,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Metadata.h"
+#include "llvm/InstVisitor.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
@@ -56,56 +58,13 @@
 STATISTIC(NumSingleStore, "Number of alloca's promoted with a single store");
 STATISTIC(NumDeadAlloca,  "Number of dead alloca's removed");
 STATISTIC(NumPHIInsert,   "Number of PHI nodes inserted");
 
-bool llvm::isAllocaPromotable(const AllocaInst *AI) {
-  // FIXME: If the memory unit is of pointer or integer type, we can permit
-  // assignments to subsections of the memory unit.
-
-  // Only allow direct and non-volatile loads and stores...
-  for (Value::const_use_iterator UI = AI->use_begin(), UE = AI->use_end();
-       UI != UE; ++UI) { // Loop over all of the uses of the alloca
-    const User *U = *UI;
-    if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
-      // Note that atomic loads can be transformed; atomic semantics do
-      // not have any meaning for a local alloca.
-      if (LI->isVolatile())
-        return false;
-    } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
-      if (SI->getOperand(0) == AI)
-        return false; // Don't allow a store OF the AI, only INTO the AI.
-      // Note that atomic stores can be transformed; atomic semantics do
-      // not have any meaning for a local alloca.
-      if (SI->isVolatile())
-        return false;
-    } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
-      if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
-          II->getIntrinsicID() != Intrinsic::lifetime_end)
-        return false;
-    } else if (const BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
-      if (BCI->getType() != Type::getInt8PtrTy(U->getContext()))
-        return false;
-      if (!onlyUsedByLifetimeMarkers(BCI))
-        return false;
-    } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
-      if (GEPI->getType() != Type::getInt8PtrTy(U->getContext()))
-        return false;
-      if (!GEPI->hasAllZeroIndices())
-        return false;
-      if (!onlyUsedByLifetimeMarkers(GEPI))
-        return false;
-    } else {
-      return false;
-    }
-  }
-
-  return true;
-}
-
 namespace {
 
-struct AllocaInfo {
+struct AllocaInfo : private InstVisitor<AllocaInfo, bool> {
   SmallVector<BasicBlock *, 32> DefiningBlocks;
   SmallVector<BasicBlock *, 32> UsingBlocks;
 
+  Type *AllocaTy;
   StoreInst *OnlyStore;
   BasicBlock *OnlyBlock;
   bool OnlyUsedInOneBlock;
@@ -116,6 +75,7 @@ struct AllocaInfo {
   void clear() {
     DefiningBlocks.clear();
     UsingBlocks.clear();
+    AllocaTy = 0;
    OnlyStore = 0;
     OnlyBlock = 0;
     OnlyUsedInOneBlock = true;
@@ -125,39 +85,107 @@ struct AllocaInfo {
 
   /// Scan the uses of the specified alloca, filling in the AllocaInfo used
   /// by the rest of the pass to reason about the uses of this alloca.
-  void AnalyzeAlloca(AllocaInst *AI) {
+  bool analyzeAlloca(AllocaInst &AI) {
     clear();
 
-    // As we scan the uses of the alloca instruction, keep track of stores,
-    // and decide whether all of the loads and stores to the alloca are within
-    // the same basic block.
-    for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
-         UI != E;) {
-      Instruction *User = cast<Instruction>(*UI++);
-
-      if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
-        // Remember the basic blocks which define new values for the alloca
-        DefiningBlocks.push_back(SI->getParent());
-        AllocaPointerVal = SI->getOperand(0);
-        OnlyStore = SI;
-      } else {
-        LoadInst *LI = cast<LoadInst>(User);
-        // Otherwise it must be a load instruction, keep track of variable
-        // reads.
-        UsingBlocks.push_back(LI->getParent());
-        AllocaPointerVal = LI;
-      }
+    AllocaTy = AI.getAllocatedType();
+    enqueueUsers(AI);
+
+    // Walk queued up uses in the worklist to handle nested uses.
+    while (!UseWorklist.empty()) {
+      U = UseWorklist.pop_back_val();
+      Instruction &I = *cast<Instruction>(U->getUser());
+      if (!visit(I))
+        return false; // Propagate failure to promote up.
 
       if (OnlyUsedInOneBlock) {
         if (OnlyBlock == 0)
-          OnlyBlock = User->getParent();
-        else if (OnlyBlock != User->getParent())
+          OnlyBlock = I.getParent();
+        else if (OnlyBlock != I.getParent())
           OnlyUsedInOneBlock = false;
       }
     }
 
-    DbgDeclare = FindAllocaDbgDeclare(AI);
+    DbgDeclare = FindAllocaDbgDeclare(&AI);
+    return true;
+  }
+
+private:
+  // Befriend the base class so it can call through private visitor methods.
+  friend class InstVisitor<AllocaInfo, bool>;
+
+  /// \brief A use pointer that is non-null when visiting uses.
+  Use *U;
+
+  /// \brief A worklist for recursively visiting all uses of an alloca.
+  SmallVector<Use *, 8> UseWorklist;
+
+  /// \brief A set for preventing cyclic visitation.
+  SmallPtrSet<Use *, 8> VisitedUses;
+
+  void enqueueUsers(Instruction &I) {
+    for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;
+         ++UI)
+      if (VisitedUses.insert(&UI.getUse()))
+        UseWorklist.push_back(&UI.getUse());
+  }
+
+  bool visitLoadInst(LoadInst &LI) {
+    if (LI.isVolatile() || LI.getType() != AllocaTy)
+      return false;
+
+    // Keep track of variable reads.
+    UsingBlocks.push_back(LI.getParent());
+    AllocaPointerVal = &LI;
+    return true;
+  }
+
+  bool visitStoreInst(StoreInst &SI) {
+    if (SI.isVolatile() || SI.getValueOperand() == U->get() ||
+        SI.getValueOperand()->getType() != AllocaTy)
+      return false;
+
+    // Remember the basic blocks which define new values for the alloca
+    DefiningBlocks.push_back(SI.getParent());
+    AllocaPointerVal = SI.getOperand(0);
+    OnlyStore = &SI;
+    return true;
+  }
+
+  bool visitBitCastInst(BitCastInst &BC) {
+    enqueueUsers(BC);
+    return true;
   }
+
+  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+    if (GEPI.use_empty())
+      return true;
+
+    enqueueUsers(GEPI);
+
+    return GEPI.hasAllZeroIndices();
+  }
+
+  // We can promote through debug info intrinsics as they don't alter the
+  // value stored in memory.
+  bool visitDbgInfoIntrinsic(DbgInfoIntrinsic &I) { return true; }
+
+  bool visitIntrinsicInst(IntrinsicInst &II) {
+    switch (II.getIntrinsicID()) {
+    default:
+      return false;
+
+    // Lifetime intrinsics don't preclude promoting the memory to a register.
+    // FIXME: We should use these to promote to undef when outside of a valid
+    // lifetime.
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+      return true;
+    }
+  }
+
+  // The fallback is that the alloca cannot be promoted.
+  bool visitInstruction(Instruction &I) { return false; }
 };
 
 // Data package used by RenamePass()
@@ -316,24 +344,56 @@ private:
 static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
   // Knowing that this alloca is promotable, we know that it's safe to kill all
   // instructions except for load and store.
+  // FIXME: This function isn't actually specific to lifetime intrinsics. It
+  // also is redundant with the actual analysis of the loads and stores and
+  // should be folded into that.
+
+  SmallVector<Instruction *, 8> Worklist;
+  SmallPtrSet<Instruction *, 8> Visited;
+  SmallVector<Instruction *, 8> Dead;
+  Worklist.push_back(AI);
+
+  do {
+    Instruction *I = Worklist.pop_back_val();
 
-  for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end();
-       UI != UE;) {
-    Instruction *I = cast<Instruction>(*UI);
-    ++UI;
-    if (isa<LoadInst>(I) || isa<StoreInst>(I))
+    if (I->use_empty()) {
+      Dead.push_back(I);
       continue;
+    }
 
-    if (!I->getType()->isVoidTy()) {
-      // The only users of this bitcast/GEP instruction are lifetime intrinsics.
-      // Follow the use/def chain to erase them now instead of leaving it for
-      // dead code elimination later.
-      for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
-           UI != UE;) {
-        Instruction *Inst = cast<Instruction>(*UI);
-        ++UI;
-        Inst->eraseFromParent();
-      }
+    for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+         UI != UE; ++UI)
+      if (!isa<LoadInst>(*UI) && !isa<StoreInst>(*UI) &&
+          Visited.insert(cast<Instruction>(*UI)))
+        Worklist.push_back(cast<Instruction>(*UI));
+  } while (!Worklist.empty());
+
+  while (!Dead.empty()) {
+    Instruction *I = Dead.pop_back_val();
+
+    // Don't delete the alloca itself.
+    if (I == AI)
+      continue;
+
+    // Note that we open code the deletion algorithm here because we know
+    // a priori that all of the instructions using an alloca that reaches here
+    // are trivially dead when their use list becomes empty (the only risk is
+    // lifetime markers, which we specifically want to nuke). By coding it
+    // here we can skip the triviality test and be more efficient.
+    //
+    // Null out all of the instruction's operands to see if any operand
+    // becomes dead as we go.
+    for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE;
+         ++OI) {
+      Instruction *Op = dyn_cast<Instruction>(*OI);
+      if (!Op)
+        continue;
+
+      OI->set(0);
+      if (!Op->use_empty())
+        continue;
+
+      Dead.push_back(Op);
     }
     I->eraseFromParent();
   }
@@ -540,7 +600,7 @@ void PromoteMem2Reg::run() {
   for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) {
     AllocaInst *AI = Allocas[AllocaNum];
 
-    assert(isAllocaPromotable(AI) && "Cannot promote non-promotable alloca!");
+    //assert(isAllocaPromotable(AI) && "Cannot promote non-promotable alloca!");
     assert(AI->getParent()->getParent() == &F &&
            "All allocas should be in the same function, which is same as DF!");
@@ -560,7 +620,9 @@ void PromoteMem2Reg::run() {
 
     // Calculate the set of read and write-locations for each alloca.  This is
     // analogous to finding the 'uses' and 'definitions' of each variable.
-    Info.AnalyzeAlloca(AI);
+    bool Good = Info.analyzeAlloca(*AI);
+    (void)Good;
+    assert(Good && "Cannot promote non-promotable alloca!");
 
     // If there is only a single store to this value, replace any loads of
     // it that are directly dominated by the definition with the value stored.
@@ -1087,8 +1149,16 @@ NextIteration:
   goto NextIteration;
 }
 
-void llvm::PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
-                           AliasSetTracker *AST) {
+bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+  // We cast away constness because we re-use the non-const analysis that the
+  // actual promotion routine uses. While it is non-const, it doesn't actually
+  // mutate anything at this phase, and we discard the non-const results that
+  // promotion uses to mutate the alloca.
+  return AllocaInfo().analyzeAlloca(*const_cast<AllocaInst *>(AI));
+}
+
+void llvm::PromoteMemToReg(ArrayRef<AllocaInst *> Allocas,
+                           DominatorTree &DT, AliasSetTracker *AST) {
   // If there is nothing to do, bail out...
   if (Allocas.empty())
     return;
diff --git a/llvm/test/Transforms/Mem2Reg/ignore-lifetime.ll b/llvm/test/Transforms/Mem2Reg/ignore-lifetime.ll
deleted file mode 100644
index 5e4f9bf..0000000
--- a/llvm/test/Transforms/Mem2Reg/ignore-lifetime.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; RUN: opt -mem2reg -S -o - < %s | FileCheck %s
-
-declare void @llvm.lifetime.start(i64 %size, i8* nocapture %ptr)
-declare void @llvm.lifetime.end(i64 %size, i8* nocapture %ptr)
-
-define void @test1() {
-; CHECK: test1
-; CHECK-NOT: alloca
-  %A = alloca i32
-  %B = bitcast i32* %A to i8*
-  call void @llvm.lifetime.start(i64 2, i8* %B)
-  store i32 1, i32* %A
-  call void @llvm.lifetime.end(i64 2, i8* %B)
-  ret void
-}
-
-define void @test2() {
-; CHECK: test2
-; CHECK-NOT: alloca
-  %A = alloca {i8, i16}
-  %B = getelementptr {i8, i16}* %A, i32 0, i32 0
-  call void @llvm.lifetime.start(i64 2, i8* %B)
-  store {i8, i16} zeroinitializer, {i8, i16}* %A
-  call void @llvm.lifetime.end(i64 2, i8* %B)
-  ret void
-}
diff --git a/llvm/test/Transforms/Mem2Reg/use-analysis.ll b/llvm/test/Transforms/Mem2Reg/use-analysis.ll
new file mode 100644
index 0000000..b08b1f1
--- /dev/null
+++ b/llvm/test/Transforms/Mem2Reg/use-analysis.ll
@@ -0,0 +1,70 @@
+; RUN: opt -mem2reg -S -o - < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+declare void @llvm.lifetime.start(i64 %size, i8* nocapture %ptr)
+declare void @llvm.lifetime.end(i64 %size, i8* nocapture %ptr)
+
+define void @test1() {
+; Ensure we can look through a bitcast to i8* and the addition of lifetime
+; markers.
+;
+; CHECK-LABEL: @test1(
+; CHECK-NOT: alloca
+; CHECK: ret void
+
+  %A = alloca i32
+  %B = bitcast i32* %A to i8*
+  call void @llvm.lifetime.start(i64 2, i8* %B)
+  store i32 1, i32* %A
+  call void @llvm.lifetime.end(i64 2, i8* %B)
+  ret void
+}
+
+define void @test2() {
+; Ensure we can look through a GEP to i8* and the addition of lifetime
+; markers.
+;
+; CHECK-LABEL: @test2(
+; CHECK-NOT: alloca
+; CHECK: ret void
+
+  %A = alloca {i8, i16}
+  %B = getelementptr {i8, i16}* %A, i32 0, i32 0
+  call void @llvm.lifetime.start(i64 2, i8* %B)
+  store {i8, i16} zeroinitializer, {i8, i16}* %A
+  call void @llvm.lifetime.end(i64 2, i8* %B)
+  ret void
+}
+
+define i32 @test3(i32 %x) {
+; CHECK-LABEL: @test3(
+;
+; Check that we recursively walk the uses of the alloca and thus can see
+; through round trip bitcasts, dead bitcasts, GEPs, multiple GEPs, and lifetime
+; markers.
entry:
+  %a = alloca i32
+; CHECK-NOT: alloca
+
+  %b = bitcast i32* %a to i8*
+  %b2 = getelementptr inbounds i8* %b, i32 0
+  %b3 = getelementptr inbounds i8* %b2, i32 0
+  call void @llvm.lifetime.start(i64 -1, i8* %b3)
+; CHECK-NOT: call void @llvm.lifetime.start
+
+  store i32 %x, i32* %a
+; CHECK-NOT: store
+
+  %dead = bitcast i32* %a to i4096*
+  %dead1 = bitcast i4096* %dead to i42*
+  %dead2 = getelementptr inbounds i32* %a, i32 %x
+; CHECK-NOT: bitcast
+; CHECK-NOT: getelementptr
+
+  %ret = load i32* %a
+; CHECK-NOT: load
+
+  ret i32 %ret
+; CHECK: ret i32 %x
+}
-- 
2.7.4