OptimizePhiTypes("cgp-optimize-phi-types", cl::Hidden, cl::init(false),
cl::desc("Enable converting phi types in CodeGenPrepare"));
+static cl::opt<unsigned>
+ HugeFuncThresholdInCGPP("cgpp-huge-func", cl::init(10000), cl::Hidden,
+ cl::desc("Least BB number of huge function."));
+
namespace {
enum ExtType {
// information of a promoted instruction invalid.
};
+// Describes how (if at all) an optimization modified dominance relations, so
+// the driver loop knows whether and where iteration must be restarted.
+enum ModifyDT {
+ NotModifyDT, // The dominator tree was not modified.
+ ModifyBBDT, // The basic-block-level dominator tree was modified
+ // (e.g. blocks were split, created, or removed).
+ ModifyInstDT // Only instruction-level dominance within a basic block
+ // changed. This usually means we moved/deleted/inserted an
+ // instruction in a basic block, so we should re-iterate the
+ // instructions in that basic block.
+};
+
using SetOfInstrs = SmallPtrSet<Instruction *, 16>;
using TypeIsSExt = PointerIntPair<Type *, 2, ExtType>;
using InstrToOrigTy = DenseMap<Instruction *, TypeIsSExt>;
std::unique_ptr<DominatorTree> DT;
public:
+ /// If we encounter a huge function, we need to limit the build time.
+ bool IsHugeFunc = false;
+
+ /// FreshBBs is like a worklist; it collects the updated BBs which need
+ /// to be optimized again.
+ /// Note: To limit build time in this pass, when a BB is updated, we need
+ /// to insert it into FreshBBs for huge functions.
+ SmallSet<BasicBlock *, 32> FreshBBs;
+
static char ID; // Pass identification, replacement for typeid
CodeGenPrepare() : FunctionPass(ID) {
bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB,
bool isPreheader);
bool makeBitReverse(Instruction &I);
- bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
- bool optimizeInst(Instruction *I, bool &ModifiedDT);
+ bool optimizeBlock(BasicBlock &BB, ModifyDT &ModifiedDT);
+ bool optimizeInst(Instruction *I, ModifyDT &ModifiedDT);
bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *AccessTy,
unsigned AddrSpace);
bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr);
bool optimizeInlineAsmInst(CallInst *CS);
- bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
+ bool optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT);
bool optimizeExt(Instruction *&I);
bool optimizeExtUses(Instruction *I);
bool optimizeLoadExt(LoadInst *Load);
bool optimizeSwitchPhiConstants(SwitchInst *SI);
bool optimizeSwitchInst(SwitchInst *SI);
bool optimizeExtractElementInst(Instruction *Inst);
- bool dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT);
+ bool dupRetToEnableTailCallOpts(BasicBlock *BB, ModifyDT &ModifiedDT);
bool fixupDbgValue(Instruction *I);
bool placeDbgValues(Function &F);
bool placePseudoProbes(Function &F);
Instruction *&Inst, bool AllowPromotionWithoutCommonHeader,
bool HasPromoted, TypePromotionTransaction &TPT,
SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);
- bool splitBranchCondition(Function &F, bool &ModifiedDT);
+ bool splitBranchCondition(Function &F, ModifyDT &ModifiedDT);
bool simplifyOffsetableRelocate(GCStatepointInst &I);
bool tryToSinkFreeOperands(Instruction *I);
bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, Value *Arg0, Value *Arg1,
CmpInst *Cmp, Intrinsic::ID IID);
- bool optimizeCmp(CmpInst *Cmp, bool &ModifiedDT);
- bool combineToUSubWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
- bool combineToUAddWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
+ bool optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT);
+ bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
+ bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
void verifyBFIUpdates(Function &F);
};
// Clear per function information.
InsertedInsts.clear();
PromotedInsts.clear();
+ FreshBBs.clear();
TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
SubtargetInfo = TM->getSubtargetImpl(F);
// unconditional branch.
EverMadeChange |= eliminateMostlyEmptyBlocks(F);
- bool ModifiedDT = false;
+ ModifyDT ModifiedDT = ModifyDT::NotModifyDT;
if (!DisableBranchOpts)
EverMadeChange |= splitBranchCondition(F, ModifiedDT);
EverMadeChange |=
SplitIndirectBrCriticalEdges(F, /*IgnoreBlocksWithoutPHI=*/true);
+ // If we are optimizing a huge function, we need to consider the build time.
+ // The basic algorithm's complexity is near O(N!).
+ IsHugeFunc = F.size() > HugeFuncThresholdInCGPP;
+
bool MadeChange = true;
+ bool FuncIterated = false;
while (MadeChange) {
MadeChange = false;
DT.reset();
+
for (BasicBlock &BB : llvm::make_early_inc_range(F)) {
- bool ModifiedDTOnIteration = false;
- MadeChange |= optimizeBlock(BB, ModifiedDTOnIteration);
+ if (FuncIterated && !FreshBBs.contains(&BB))
+ continue;
- // Restart BB iteration if the dominator tree of the Function was changed
- if (ModifiedDTOnIteration)
- break;
+ ModifyDT ModifiedDTOnIteration = ModifyDT::NotModifyDT;
+ bool Changed = optimizeBlock(BB, ModifiedDTOnIteration);
+
+ MadeChange |= Changed;
+ if (IsHugeFunc) {
+ // If the BB is updated, it may still have a chance to be optimized.
+ // This usually happens during sink optimization.
+ // For example:
+ //
+ // bb0:
+ // %and = and i32 %a, 4
+ // %cmp = icmp eq i32 %and, 0
+ //
+ // If %cmp sinks to another BB, %and will have a chance to sink too.
+ if (Changed)
+ FreshBBs.insert(&BB);
+ else if (FuncIterated)
+ FreshBBs.erase(&BB);
+
+ if (ModifiedDTOnIteration == ModifyDT::ModifyBBDT)
+ DT.reset();
+ } else {
+ // For small/normal functions, we restart BB iteration if the dominator
+ // tree of the Function was changed.
+ if (ModifiedDTOnIteration != ModifyDT::NotModifyDT)
+ break;
+ }
}
+ // We have iterated over all the BBs in the function (only relevant for
+ // huge functions).
+ FuncIterated = IsHugeFunc;
+
if (EnableTypePromotionMerge && !ValToSExtendedUses.empty())
MadeChange |= mergeSExts(F);
if (!LargeOffsetGEPMap.empty())
// Merge BB into SinglePred and delete it.
MergeBlockIntoPredecessor(BB);
Preds.insert(SinglePred);
+
+ if (IsHugeFunc) {
+ // Update FreshBBs to optimize the merged BB.
+ FreshBBs.insert(SinglePred);
+ FreshBBs.erase(BB);
+ }
}
}
return true;
}
+/// Replace all uses of \p Old with \p New and, for huge functions, record the
+/// parent blocks of \p Old's users in \p FreshBBs so they are re-optimized.
+static void replaceAllUsesWith(Value *Old, Value *New,
+                               SmallSet<BasicBlock *, 32> &FreshBBs,
+                               bool IsHuge) {
+  // Only huge functions maintain the FreshBBs worklist. The IsHuge test is
+  // loop-invariant, so hoist it and skip walking the user list entirely when
+  // it is false.
+  if (IsHuge) {
+    if (auto *OldI = dyn_cast<Instruction>(Old))
+      for (User *U : OldI->users())
+        FreshBBs.insert(cast<Instruction>(U)->getParent());
+  }
+  Old->replaceAllUsesWith(New);
+}
+
/// Eliminate a basic block that has only phi's and an unconditional branch in
/// it.
void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) {
// Note: BB(=SinglePred) will not be deleted on this path.
// DestBB(=its single successor) is the one that was deleted.
LLVM_DEBUG(dbgs() << "AFTER:\n" << *SinglePred << "\n\n\n");
+
+ if (IsHugeFunc) {
+ // Update FreshBBs to optimize the merged BB.
+ FreshBBs.insert(SinglePred);
+ FreshBBs.erase(DestBB);
+ }
return;
}
}
Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1);
if (BO->getOpcode() != Instruction::Xor) {
Value *Math = Builder.CreateExtractValue(MathOV, 0, "math");
- BO->replaceAllUsesWith(Math);
+ replaceAllUsesWith(BO, Math, FreshBBs, IsHugeFunc);
} else
assert(BO->hasOneUse() &&
"Patterns with XOr should use the BO only in the compare");
Value *OV = Builder.CreateExtractValue(MathOV, 1, "ov");
- Cmp->replaceAllUsesWith(OV);
+ replaceAllUsesWith(Cmp, OV, FreshBBs, IsHugeFunc);
Cmp->eraseFromParent();
BO->eraseFromParent();
return true;
/// Try to combine the compare into a call to the llvm.uadd.with.overflow
/// intrinsic. Return true if any changes were made.
-bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp, bool &ModifiedDT) {
+bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp,
+ ModifyDT &ModifiedDT) {
Value *A, *B;
BinaryOperator *Add;
if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add)))) {
return false;
// Reset callers - do not crash by iterating over a dead instruction.
- ModifiedDT = true;
+ ModifiedDT = ModifyDT::ModifyInstDT;
return true;
}
-bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp, bool &ModifiedDT) {
+bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
+ ModifyDT &ModifiedDT) {
// We are not expecting non-canonical/degenerate code. Just bail out.
Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1);
if (isa<Constant>(A) && isa<Constant>(B))
return false;
// Reset callers - do not crash by iterating over a dead instruction.
- ModifiedDT = true;
+ ModifiedDT = ModifyDT::ModifyInstDT;
return true;
}
return true;
}
-bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, bool &ModifiedDT) {
+bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
if (sinkCmpExpression(Cmp, *TLI))
return true;
/// If the transform is performed, return true and set ModifiedDT to true.
static bool despeculateCountZeros(IntrinsicInst *CountZeros,
const TargetLowering *TLI,
- const DataLayout *DL, bool &ModifiedDT) {
+ const DataLayout *DL, ModifyDT &ModifiedDT,
+ SmallSet<BasicBlock *, 32> &FreshBBs,
+ bool IsHugeFunc) {
// If a zero input is undefined, it doesn't make sense to despeculate that.
if (match(CountZeros->getOperand(1), m_One()))
return false;
// The intrinsic will be sunk behind a compare against zero and branch.
BasicBlock *StartBlock = CountZeros->getParent();
BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false");
+ if (IsHugeFunc)
+ FreshBBs.insert(CallBlock);
// Create another block after the count zero intrinsic. A PHI will be added
// in this block to select the result of the intrinsic or the bit-width
// constant if the input to the intrinsic is zero.
BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros));
BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end");
+ if (IsHugeFunc)
+ FreshBBs.insert(EndBlock);
// Set up a builder to create a compare, conditional branch, and PHI.
IRBuilder<> Builder(CountZeros->getContext());
// or the bit width of the operand.
Builder.SetInsertPoint(&EndBlock->front());
PHINode *PN = Builder.CreatePHI(Ty, 2, "ctz");
- CountZeros->replaceAllUsesWith(PN);
+ replaceAllUsesWith(CountZeros, PN, FreshBBs, IsHugeFunc);
Value *BitWidth = Builder.getInt(APInt(SizeInBits, SizeInBits));
PN->addIncoming(BitWidth, StartBlock);
PN->addIncoming(CountZeros, CallBlock);
// undefined zero argument to 'true'. This will also prevent reprocessing the
// intrinsic; we only despeculate when a zero input is defined.
CountZeros->setArgOperand(1, Builder.getTrue());
- ModifiedDT = true;
+ ModifiedDT = ModifyDT::ModifyBBDT;
return true;
}
-bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
+bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
BasicBlock *BB = CI->getParent();
// Lower inline assembly if we can.
LargeOffsetGEPMap.erase(II);
}
- II->replaceAllUsesWith(ArgVal);
+ replaceAllUsesWith(II, ArgVal, FreshBBs, IsHugeFunc);
II->eraseFromParent();
return true;
}
case Intrinsic::cttz:
case Intrinsic::ctlz:
// If counting zeros is expensive, try to avoid it.
- return despeculateCountZeros(II, TLI, DL, ModifiedDT);
+ return despeculateCountZeros(II, TLI, DL, ModifiedDT, FreshBBs,
+ IsHugeFunc);
case Intrinsic::fshl:
case Intrinsic::fshr:
return optimizeFunnelShift(II);
auto *One = ConstantInt::getSigned(II->getType(), 1);
auto *CGep =
ConstantExpr::getGetElementPtr(ScalableVectorTy, Null, One);
- II->replaceAllUsesWith(ConstantExpr::getPtrToInt(CGep, II->getType()));
+ replaceAllUsesWith(II, ConstantExpr::getPtrToInt(CGep, II->getType()),
+ FreshBBs, IsHugeFunc);
II->eraseFromParent();
return true;
}
FortifiedLibCallSimplifier Simplifier(TLInfo, true);
IRBuilder<> Builder(CI);
if (Value *V = Simplifier.optimizeCall(CI, Builder)) {
- CI->replaceAllUsesWith(V);
+ replaceAllUsesWith(CI, V, FreshBBs, IsHugeFunc);
CI->eraseFromParent();
return true;
}
/// ret i32 %tmp2
/// @endcode
bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB,
- bool &ModifiedDT) {
+ ModifyDT &ModifiedDT) {
+ if (!BB->getTerminator())
+ return false;
+
ReturnInst *RetI = dyn_cast<ReturnInst>(BB->getTerminator());
if (!RetI)
return false;
BFI->setBlockFreq(
BB,
(BFI->getBlockFreq(BB) - BFI->getBlockFreq(TailCallBB)).getFrequency());
- ModifiedDT = Changed = true;
+ ModifiedDT = ModifyDT::ModifyBBDT;
+ Changed = true;
++NumRetsDup;
}
bool inserted = false;
for (auto &Pt : CurPts) {
if (getDT(F).dominates(Inst, Pt)) {
- Pt->replaceAllUsesWith(Inst);
+ replaceAllUsesWith(Pt, Inst, FreshBBs, IsHugeFunc);
RemovedInsts.insert(Pt);
Pt->removeFromParent();
Pt = Inst;
// Give up if we need to merge in a common dominator as the
// experiments show it is not profitable.
continue;
- Inst->replaceAllUsesWith(Pt);
+ replaceAllUsesWith(Inst, Pt, FreshBBs, IsHugeFunc);
RemovedInsts.insert(Inst);
Inst->removeFromParent();
inserted = true;
if (GEP->getType() != I8PtrTy)
NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
}
- GEP->replaceAllUsesWith(NewGEP);
+ replaceAllUsesWith(GEP, NewGEP, FreshBBs, IsHugeFunc);
LargeOffsetGEPID.erase(GEP);
LargeOffsetGEP = LargeOffsetGEPs.erase(LargeOffsetGEP);
GEP->eraseFromParent();
for (Instruction *U : Uses) {
if (isa<BitCastInst>(U)) {
DeletedInstrs.insert(U);
- U->replaceAllUsesWith(ValMap[U->getOperand(0)]);
+ replaceAllUsesWith(U, ValMap[U->getOperand(0)], FreshBBs, IsHugeFunc);
} else {
U->setOperand(0,
new BitCastInst(ValMap[U->getOperand(0)], PhiTy, "bc", U));
// Remove any old phi's that have been converted.
for (auto *I : DeletedInstrs) {
- I->replaceAllUsesWith(PoisonValue::get(I->getType()));
+ replaceAllUsesWith(I, PoisonValue::get(I->getType()), FreshBBs, IsHugeFunc);
I->eraseFromParent();
}
// Replace all uses of load with new and (except for the use of load in the
// new and itself).
- Load->replaceAllUsesWith(NewAnd);
+ replaceAllUsesWith(Load, NewAnd, FreshBBs, IsHugeFunc);
NewAnd->setOperand(0, Load);
// Remove any and instructions that are now redundant.
// Check that the and mask is the same as the one we decided to put on the
// new and.
if (cast<ConstantInt>(And->getOperand(1))->getValue() == DemandBits) {
- And->replaceAllUsesWith(NewAnd);
+ replaceAllUsesWith(And, NewAnd, FreshBBs, IsHugeFunc);
if (&*CurInstIterator == And)
CurInstIterator = std::next(And->getIterator());
And->eraseFromParent();
Value *NewTVal = Builder.CreateBinOp(Opcode, Shift->getOperand(0), TVal);
Value *NewFVal = Builder.CreateBinOp(Opcode, Shift->getOperand(0), FVal);
Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal);
- Shift->replaceAllUsesWith(NewSel);
+ replaceAllUsesWith(Shift, NewSel, FreshBBs, IsHugeFunc);
Shift->eraseFromParent();
return true;
}
Value *NewTVal = Builder.CreateIntrinsic(Opcode, Ty, {X, Y, TVal});
Value *NewFVal = Builder.CreateIntrinsic(Opcode, Ty, {X, Y, FVal});
Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal);
- Fsh->replaceAllUsesWith(NewSel);
+ replaceAllUsesWith(Fsh, NewSel, FreshBBs, IsHugeFunc);
Fsh->eraseFromParent();
return true;
}
BasicBlock *StartBlock = SI->getParent();
BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI));
BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
+ if (IsHugeFunc)
+ FreshBBs.insert(EndBlock);
BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency());
// Delete the unconditional branch that was just created by the split.
TrueBlock = BasicBlock::Create(SI->getContext(), "select.true.sink",
EndBlock->getParent(), EndBlock);
TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
+ if (IsHugeFunc)
+ FreshBBs.insert(TrueBlock);
TrueBranch->setDebugLoc(SI->getDebugLoc());
}
auto *TrueInst = cast<Instruction>(SI->getTrueValue());
if (FalseBlock == nullptr) {
FalseBlock = BasicBlock::Create(SI->getContext(), "select.false.sink",
EndBlock->getParent(), EndBlock);
+ if (IsHugeFunc)
+ FreshBBs.insert(FalseBlock);
FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
FalseBranch->setDebugLoc(SI->getDebugLoc());
}
FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
EndBlock->getParent(), EndBlock);
+ if (IsHugeFunc)
+ FreshBBs.insert(FalseBlock);
auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
FalseBranch->setDebugLoc(SI->getDebugLoc());
}
PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock);
PN->setDebugLoc(SI->getDebugLoc());
- SI->replaceAllUsesWith(PN);
+ replaceAllUsesWith(SI, PN, FreshBBs, IsHugeFunc);
SI->eraseFromParent();
INS.erase(SI);
++NumSelectsExpanded;
Value *Shuffle = Builder.CreateVectorSplat(NewVecType->getNumElements(), BC1);
Value *BC2 = Builder.CreateBitCast(Shuffle, SVIVecType);
- SVI->replaceAllUsesWith(BC2);
+ replaceAllUsesWith(SVI, BC2, FreshBBs, IsHugeFunc);
RecursivelyDeleteTriviallyDeadInstructions(
SVI, TLInfo, nullptr,
[&](Value *V) { removeAllAssertingVHReferences(V); });
for (Use *U : ToReplace) {
auto *UI = cast<Instruction>(U->get());
Instruction *NI = UI->clone();
+
+ if (IsHugeFunc) {
+ // Since we cloned an instruction, the defs of its operands may now be able
+ // to sink into this BB. So put the operand defs' BBs into FreshBBs for
+ // optimization.
+ for (unsigned I = 0; I < NI->getNumOperands(); ++I) {
+ auto *OpDef = dyn_cast<Instruction>(NI->getOperand(I));
+ if (!OpDef)
+ continue;
+ FreshBBs.insert(OpDef->getParent());
+ }
+ }
+
NewInstructions[UI] = NI;
MaybeDead.insert(UI);
LLVM_DEBUG(dbgs() << "Sinking " << *UI << " to user " << *I << "\n");
return true;
}
-static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI) {
+static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI,
+ SmallSet<BasicBlock *, 32> &FreshBBs,
+ bool IsHugeFunc) {
// Try and convert
// %c = icmp ult %x, 8
// br %c, bla, blb
ConstantInt::get(UI->getType(), 0));
LLVM_DEBUG(dbgs() << "Converting " << *Cmp << "\n");
LLVM_DEBUG(dbgs() << " to compare on zero: " << *NewCmp << "\n");
- Cmp->replaceAllUsesWith(NewCmp);
+ replaceAllUsesWith(Cmp, NewCmp, FreshBBs, IsHugeFunc);
return true;
}
if (Cmp->isEquality() &&
ConstantInt::get(UI->getType(), 0));
LLVM_DEBUG(dbgs() << "Converting " << *Cmp << "\n");
LLVM_DEBUG(dbgs() << " to compare on zero: " << *NewCmp << "\n");
- Cmp->replaceAllUsesWith(NewCmp);
+ replaceAllUsesWith(Cmp, NewCmp, FreshBBs, IsHugeFunc);
return true;
}
}
return false;
}
-bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
+bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
// Bail out if we inserted the instruction to prevent optimizations from
// stepping on each other's toes.
if (InsertedInsts.count(I))
// trivial PHI, go ahead and zap it here.
if (Value *V = simplifyInstruction(P, {*DL, TLInfo})) {
LargeOffsetGEPMap.erase(P);
- P->replaceAllUsesWith(V);
+ replaceAllUsesWith(P, V, FreshBBs, IsHugeFunc);
P->eraseFromParent();
++NumPHIsElim;
return true;
Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(),
GEPI->getName(), GEPI);
NC->setDebugLoc(GEPI->getDebugLoc());
- GEPI->replaceAllUsesWith(NC);
+ replaceAllUsesWith(GEPI, NC, FreshBBs, IsHugeFunc);
GEPI->eraseFromParent();
++NumGEPsElim;
optimizeInst(NC, ModifiedDT);
F->takeName(FI);
CmpI->setOperand(Const0 ? 1 : 0, F);
}
- FI->replaceAllUsesWith(CmpI);
+ replaceAllUsesWith(FI, CmpI, FreshBBs, IsHugeFunc);
FI->eraseFromParent();
return true;
}
case Instruction::ExtractElement:
return optimizeExtractElementInst(cast<ExtractElementInst>(I));
case Instruction::Br:
- return optimizeBranch(cast<BranchInst>(I), *TLI);
+ return optimizeBranch(cast<BranchInst>(I), *TLI, FreshBBs, IsHugeFunc);
}
return false;
if (!recognizeBSwapOrBitReverseIdiom(&I, false, true, Insts))
return false;
Instruction *LastInst = Insts.back();
- I.replaceAllUsesWith(LastInst);
+ replaceAllUsesWith(&I, LastInst, FreshBBs, IsHugeFunc);
RecursivelyDeleteTriviallyDeadInstructions(
&I, TLInfo, nullptr,
[&](Value *V) { removeAllAssertingVHReferences(V); });
// In this pass we look for GEP and cast instructions that are used
// across basic blocks and rewrite them to improve basic-block-at-a-time
// selection.
-bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {
+bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, ModifyDT &ModifiedDT) {
SunkAddrs.clear();
bool MadeChange = false;
- CurInstIterator = BB.begin();
- while (CurInstIterator != BB.end()) {
- MadeChange |= optimizeInst(&*CurInstIterator++, ModifiedDT);
- if (ModifiedDT)
- return true;
- }
+ do {
+ CurInstIterator = BB.begin();
+ ModifiedDT = ModifyDT::NotModifyDT;
+ while (CurInstIterator != BB.end()) {
+ MadeChange |= optimizeInst(&*CurInstIterator++, ModifiedDT);
+ if (ModifiedDT != ModifyDT::NotModifyDT) {
+ // For huge functions we tend to quickly go through the inner optimization
+ // opportunities in the BB. So we go back to the BB head to re-optimize
+ // each instruction instead of going back to the function head.
+ if (IsHugeFunc) {
+ DT.reset();
+ getDT(*BB.getParent());
+ break;
+ } else {
+ return true;
+ }
+ }
+ }
+ } while (ModifiedDT == ModifyDT::ModifyInstDT);
bool MadeBitReverse = true;
while (MadeBitReverse) {
///
/// FIXME: Remove the (equivalent?) implementation in SelectionDAG.
///
-bool CodeGenPrepare::splitBranchCondition(Function &F, bool &ModifiedDT) {
+bool CodeGenPrepare::splitBranchCondition(Function &F, ModifyDT &ModifiedDT) {
if (!TM->Options.EnableFastISel || TLI->isJumpExpensive())
return false;
auto *TmpBB =
BasicBlock::Create(BB.getContext(), BB.getName() + ".cond.split",
BB.getParent(), BB.getNextNode());
+ if (IsHugeFunc)
+ FreshBBs.insert(TmpBB);
// Update original basic block by using the first condition directly by the
// branch instruction and removing the no longer needed and/or instruction.
}
}
- ModifiedDT = true;
+ ModifiedDT = ModifyDT::ModifyBBDT;
MadeChange = true;
LLVM_DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump();