From: Luo, Yuanke Date: Wed, 17 Mar 2021 11:17:18 +0000 (+0800) Subject: [X86] Fix compile time regression of D93594. X-Git-Tag: llvmorg-14-init~12053 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=e64adc0b88c2705425a9fe2345729e2688a4e4c6;p=platform%2Fupstream%2Fllvm.git [X86] Fix compile time regression of D93594. D93594 depends on the dominator tree and loop information, which increased the compile time when building with -O0. However, the pass only amends the dominator tree and loop information so that it is unnecessary to re-analyze them later. When the dominator tree or loop information is absent in this pass, we can avoid amending them. Differential Revision: https://reviews.llvm.org/D98773 --- diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp index 9b6e546..134df5d 100644 --- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp @@ -50,10 +50,38 @@ static bool isV256I32Ty(Type *Ty) { return false; } -static BasicBlock *createLoop(BasicBlock *Preheader, BasicBlock *Exit, - Value *Bound, Value *Step, StringRef Name, - IRBuilderBase &B, DomTreeUpdater &DTU, Loop *L, - LoopInfo &LI) { +namespace { +class X86LowerAMXIntrinsics { + Function &Func; + +public: + X86LowerAMXIntrinsics(Function &F, DomTreeUpdater &DomTU, LoopInfo *LoopI) + : Func(F), DTU(DomTU), LI(LoopI) {} + bool visit(); + +private: + DomTreeUpdater &DTU; + LoopInfo *LI; + BasicBlock *createLoop(BasicBlock *Preheader, BasicBlock *Exit, Value *Bound, + Value *Step, StringRef Name, IRBuilderBase &B, + Loop *L); + template + Value *createTileLoadStoreLoops(BasicBlock *Start, BasicBlock *End, + IRBuilderBase &B, Value *Row, Value *Col, + Value *Ptr, Value *Stride, Value *Tile); + Value *createTileDPBSSDLoops(BasicBlock *Start, BasicBlock *End, + IRBuilderBase &B, Value *Row, Value *Col, + Value *K, Value *Acc, Value *LHS, Value *RHS); + template + bool lowerTileLoadStore(Instruction 
*TileLoadStore); + bool lowerTileDPBSSD(Instruction *TileDPBSSD); + bool lowerTileZero(Instruction *TileZero); +}; + +BasicBlock *X86LowerAMXIntrinsics::createLoop(BasicBlock *Preheader, + BasicBlock *Exit, Value *Bound, + Value *Step, StringRef Name, + IRBuilderBase &B, Loop *L) { LLVMContext &Ctx = Preheader->getContext(); BasicBlock *Header = BasicBlock::Create(Ctx, Name + ".header", Preheader->getParent(), Exit); @@ -86,35 +114,37 @@ static BasicBlock *createLoop(BasicBlock *Preheader, BasicBlock *Exit, {DominatorTree::Insert, Latch, Exit}, {DominatorTree::Insert, Preheader, Header}, }); - - L->addBasicBlockToLoop(Header, LI); - L->addBasicBlockToLoop(Body, LI); - L->addBasicBlockToLoop(Latch, LI); + if (LI) { + L->addBasicBlockToLoop(Header, *LI); + L->addBasicBlockToLoop(Body, *LI); + L->addBasicBlockToLoop(Latch, *LI); + } return Body; } template -static Value *createTileLoadStoreLoops(BasicBlock *Start, BasicBlock *End, - IRBuilderBase &B, DomTreeUpdater &DTU, - LoopInfo &LI, Value *Row, Value *Col, - Value *Ptr, Value *Stride, Value *Tile) { +Value *X86LowerAMXIntrinsics::createTileLoadStoreLoops( + BasicBlock *Start, BasicBlock *End, IRBuilderBase &B, Value *Row, + Value *Col, Value *Ptr, Value *Stride, Value *Tile) { std::string IntrinName = IsTileLoad ? 
"tileload" : "tilestore"; - Loop *RowLoop = LI.AllocateLoop(); - Loop *ColLoop = LI.AllocateLoop(); - RowLoop->addChildLoop(ColLoop); - if (Loop *ParentL = LI.getLoopFor(Start)) - ParentL->addChildLoop(RowLoop); - else - LI.addTopLevelLoop(RowLoop); + Loop *RowLoop = nullptr; + Loop *ColLoop = nullptr; + if (LI) { + RowLoop = LI->AllocateLoop(); + ColLoop = LI->AllocateLoop(); + RowLoop->addChildLoop(ColLoop); + if (Loop *ParentL = LI->getLoopFor(Start)) + ParentL->addChildLoop(RowLoop); + else + LI->addTopLevelLoop(RowLoop); + } - BasicBlock *RowBody = - createLoop(Start, End, Row, B.getInt16(1), IntrinName + ".scalarize.rows", - B, DTU, RowLoop, LI); + BasicBlock *RowBody = createLoop(Start, End, Row, B.getInt16(1), + IntrinName + ".scalarize.rows", B, RowLoop); BasicBlock *RowLatch = RowBody->getSingleSuccessor(); - BasicBlock *ColBody = - createLoop(RowBody, RowLatch, Col, B.getInt16(1), - IntrinName + ".scalarize.cols", B, DTU, ColLoop, LI); + BasicBlock *ColBody = createLoop(RowBody, RowLatch, Col, B.getInt16(1), + IntrinName + ".scalarize.cols", B, ColLoop); BasicBlock *ColLoopLatch = ColBody->getSingleSuccessor(); BasicBlock *ColLoopHeader = ColBody->getSinglePredecessor(); @@ -181,35 +211,36 @@ static Value *createTileLoadStoreLoops(BasicBlock *Start, BasicBlock *End, } } -static Value *createTileDPBSSDLoops(BasicBlock *Start, BasicBlock *End, - IRBuilderBase &B, DomTreeUpdater &DTU, - LoopInfo &LI, Value *Row, Value *Col, - Value *K, Value *Acc, Value *LHS, - Value *RHS) { - Loop *RowLoop = LI.AllocateLoop(); - Loop *ColLoop = LI.AllocateLoop(); - Loop *InnerLoop = LI.AllocateLoop(); - ColLoop->addChildLoop(InnerLoop); - RowLoop->addChildLoop(ColLoop); - if (Loop *ParentL = LI.getLoopFor(Start)) - ParentL->addChildLoop(RowLoop); - else - LI.addTopLevelLoop(RowLoop); +Value *X86LowerAMXIntrinsics::createTileDPBSSDLoops( + BasicBlock *Start, BasicBlock *End, IRBuilderBase &B, Value *Row, + Value *Col, Value *K, Value *Acc, Value *LHS, Value *RHS) { + Loop 
*RowLoop = nullptr; + Loop *ColLoop = nullptr; + Loop *InnerLoop = nullptr; + if (LI) { + RowLoop = LI->AllocateLoop(); + ColLoop = LI->AllocateLoop(); + InnerLoop = LI->AllocateLoop(); + ColLoop->addChildLoop(InnerLoop); + RowLoop->addChildLoop(ColLoop); + if (Loop *ParentL = LI->getLoopFor(Start)) + ParentL->addChildLoop(RowLoop); + else + LI->addTopLevelLoop(RowLoop); + } - BasicBlock *RowBody = - createLoop(Start, End, Row, B.getInt16(1), "tiledpbssd.scalarize.rows", B, - DTU, RowLoop, LI); + BasicBlock *RowBody = createLoop(Start, End, Row, B.getInt16(1), + "tiledpbssd.scalarize.rows", B, RowLoop); BasicBlock *RowLatch = RowBody->getSingleSuccessor(); - BasicBlock *ColBody = - createLoop(RowBody, RowLatch, Col, B.getInt16(1), - "tiledpbssd.scalarize.cols", B, DTU, ColLoop, LI); + BasicBlock *ColBody = createLoop(RowBody, RowLatch, Col, B.getInt16(1), + "tiledpbssd.scalarize.cols", B, ColLoop); BasicBlock *ColLoopLatch = ColBody->getSingleSuccessor(); B.SetInsertPoint(ColBody->getTerminator()); BasicBlock *InnerBody = createLoop(ColBody, ColLoopLatch, K, B.getInt16(1), - "tiledpbssd.scalarize.inner", B, DTU, InnerLoop, LI); + "tiledpbssd.scalarize.inner", B, InnerLoop); BasicBlock *ColLoopHeader = ColBody->getSinglePredecessor(); BasicBlock *RowLoopHeader = RowBody->getSinglePredecessor(); @@ -324,30 +355,11 @@ static Value *createTileDPBSSDLoops(BasicBlock *Start, BasicBlock *End, return NewVecD; } -namespace { -class X86LowerAMXIntrinsics { - Function &Func; - -public: - X86LowerAMXIntrinsics(Function &F, DominatorTree *DT, LoopInfo *LI) - : Func(F), DT(DT), LI(LI) {} - bool visit(); - -private: - DominatorTree *DT; - LoopInfo *LI; - template - bool lowerTileLoadStore(Instruction *TileLoadStore); - bool lowerTileDPBSSD(Instruction *TileDPBSSD); - bool lowerTileZero(Instruction *TileZero); -}; - bool X86LowerAMXIntrinsics::lowerTileDPBSSD(Instruction *TileDPBSSD) { Value *M, *N, *K, *C, *A, *B; match(TileDPBSSD, m_Intrinsic( m_Value(M), m_Value(N), m_Value(K), 
m_Value(C), m_Value(A), m_Value(B))); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); Instruction *InsertI = TileDPBSSD; IRBuilder<> PreBuilder(TileDPBSSD); PreBuilder.SetInsertPoint(TileDPBSSD); @@ -358,10 +370,10 @@ bool X86LowerAMXIntrinsics::lowerTileDPBSSD(Instruction *TileDPBSSD) { Value *KDWord = PreBuilder.CreateLShr(K, PreBuilder.getInt16(2)); BasicBlock *Start = InsertI->getParent(); BasicBlock *End = - SplitBlock(InsertI->getParent(), InsertI, DT, LI, nullptr, "continue"); + SplitBlock(InsertI->getParent(), InsertI, &DTU, LI, nullptr, "continue"); IRBuilder<> Builder(TileDPBSSD); - Value *ResVec = createTileDPBSSDLoops(Start, End, Builder, DTU, *LI, M, - NDWord, KDWord, C, A, B); + Value *ResVec = + createTileDPBSSDLoops(Start, End, Builder, M, NDWord, KDWord, C, A, B); // we cannot assume there always be bitcast after tiledpbssd. So we need to // insert one bitcast as required Builder.SetInsertPoint(End->getFirstNonPHI()); @@ -394,7 +406,6 @@ bool X86LowerAMXIntrinsics::lowerTileLoadStore(Instruction *TileLoadStore) { m_Value(M), m_Value(N), m_Value(Ptr), m_Value(Stride), m_Value(Tile))); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); Instruction *InsertI = TileLoadStore; IRBuilder<> PreBuilder(TileLoadStore); PreBuilder.SetInsertPoint(TileLoadStore); @@ -402,10 +413,10 @@ bool X86LowerAMXIntrinsics::lowerTileLoadStore(Instruction *TileLoadStore) { Value *StrideDWord = PreBuilder.CreateLShr(Stride, PreBuilder.getInt64(2)); BasicBlock *Start = InsertI->getParent(); BasicBlock *End = - SplitBlock(InsertI->getParent(), InsertI, DT, LI, nullptr, "continue"); + SplitBlock(InsertI->getParent(), InsertI, &DTU, LI, nullptr, "continue"); IRBuilder<> Builder(TileLoadStore); Value *ResVec = createTileLoadStoreLoops( - Start, End, Builder, DTU, *LI, M, NDWord, Ptr, StrideDWord, + Start, End, Builder, M, NDWord, Ptr, StrideDWord, IsTileLoad ? 
nullptr : Tile); if (IsTileLoad) { // we cannot assume there always be bitcast after tileload. So we need to @@ -505,18 +516,19 @@ public: TM->getOptLevel() != CodeGenOpt::None) return false; - auto &DT = getAnalysis().getDomTree(); - auto &LI = getAnalysis().getLoopInfo(); + auto *DTWP = getAnalysisIfAvailable(); + auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; + auto *LIWP = getAnalysisIfAvailable(); + auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - X86LowerAMXIntrinsics LAT(F, &DT, &LI); + X86LowerAMXIntrinsics LAT(F, DTU, LI); return LAT.visit(); } StringRef getPassName() const override { return "Lower AMX intrinsics"; } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); AU.addPreserved(); - AU.addRequired(); AU.addPreserved(); AU.addRequired(); } @@ -528,8 +540,6 @@ static const char PassName[] = "Lower AMX intrinsics"; char X86LowerAMXIntrinsicsLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName, false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName, false, false) diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll index 2e1cbac..e5b3584 100644 --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -18,8 +18,6 @@ ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Expand Atomic instructions -; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Lower AMX intrinsics ; CHECK-NEXT: Lower AMX type for load/store ; CHECK-NEXT: Module Verifier diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll index 0f92e5a..9df12b7 100644 --- 
a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -24,12 +24,12 @@ ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Expand Atomic instructions -; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Lower AMX intrinsics ; CHECK-NEXT: Lower AMX type for load/store ; CHECK-NEXT: Module Verifier +; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Pass Manager