From 46ec57e57605e92121d972438bff04bd694f74d5 Mon Sep 17 00:00:00 2001
From: Evgeniy Stepanov
Date: Fri, 3 May 2019 17:31:49 +0000
Subject: [PATCH] Revert "[CodeGenPrepare] limit overflow intrinsic matching
 to a single basic block"

This reverts commit r359879, which introduced a compiler crash.

llvm-svn: 359908
---
 llvm/lib/CodeGen/CodeGenPrepare.cpp           | 89 ++++++++++++----------
 llvm/test/CodeGen/X86/cgp-usubo.ll            | 15 ++--
 .../CodeGenPrepare/X86/optimizeSelect-DT.ll   |  9 ++-
 .../CodeGenPrepare/X86/overflow-intrinsics.ll | 20 ++---
 4 files changed, 68 insertions(+), 65 deletions(-)

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 30b09d3..4335f10 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -388,9 +388,9 @@ class TypePromotionTransaction;
     bool tryToSinkFreeOperands(Instruction *I);
     bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, CmpInst *Cmp,
                                      Intrinsic::ID IID);
-    bool optimizeCmp(CmpInst *Cmp);
-    bool combineToUSubWithOverflow(CmpInst *Cmp);
-    bool combineToUAddWithOverflow(CmpInst *Cmp);
+    bool optimizeCmp(CmpInst *Cmp, bool &ModifiedDT);
+    bool combineToUSubWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
+    bool combineToUAddWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
   };
 
 } // end anonymous namespace
@@ -484,13 +484,9 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
     if (!LargeOffsetGEPMap.empty())
      MadeChange |= splitLargeGEPOffsets();
 
-    // Really free instructions removed during promotion or kept around to
-    // improve efficiency (see comments in overflow intrinsic transforms).
-    for (Instruction *I : RemovedInsts) {
-      if (I->getParent())
-        I->removeFromParent();
+    // Really free removed instructions during promotion.
+    for (Instruction *I : RemovedInsts)
       I->deleteValue();
-    }
 
     EverMadeChange |= MadeChange;
     SeenChainsForSExt.clear();
@@ -1181,20 +1177,6 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,
 bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
                                                  CmpInst *Cmp,
                                                  Intrinsic::ID IID) {
-  if (BO->getParent() != Cmp->getParent()) {
-    // We used to use a dominator tree here to allow multi-block optimization.
-    // But that was problematic because:
-    // 1. It could cause a perf regression by hoisting the math op into the
-    //    critical path.
-    // 2. It could cause a perf regression by creating a value that was live
-    //    across multiple blocks and increasing register pressure.
-    // 3. Use of a dominator tree could cause large compile-time regression.
-    //    This is because we recompute the DT on every change in the main CGP
-    //    run-loop. The recomputing is probably unnecessary in many cases, so if
-    //    that was fixed, using a DT here would be ok.
-    return false;
-  }
-
   // We allow matching the canonical IR (add X, C) back to (usubo X, -C).
   Value *Arg0 = BO->getOperand(0);
   Value *Arg1 = BO->getOperand(1);
@@ -1204,28 +1186,45 @@ bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
     Arg1 = ConstantExpr::getNeg(cast<Constant>(Arg1));
   }
 
-  // Insert at the first instruction of the pair.
-  Instruction *InsertPt = nullptr;
-  for (Instruction &Iter : *Cmp->getParent()) {
-    if (&Iter == BO || &Iter == Cmp) {
-      InsertPt = &Iter;
-      break;
+  Instruction *InsertPt;
+  if (BO->hasOneUse() && BO->user_back() == Cmp) {
+    // If the math is only used by the compare, insert at the compare to keep
+    // the condition in the same block as its users. (CGP aggressively sinks
+    // compares to help out SDAG.)
+    InsertPt = Cmp;
+  } else {
+    // The math and compare may be independent instructions. Check dominance to
+    // determine the insertion point for the intrinsic.
+    bool MathDominates = getDT(*BO->getFunction()).dominates(BO, Cmp);
+    if (!MathDominates && !getDT(*BO->getFunction()).dominates(Cmp, BO))
+      return false;
+
+    BasicBlock *MathBB = BO->getParent(), *CmpBB = Cmp->getParent();
+    if (MathBB != CmpBB) {
+      // Avoid hoisting an extra op into a dominating block and creating a
+      // potentially longer critical path.
+      if (!MathDominates)
+        return false;
+      // Check that the insertion doesn't create a value that is live across
+      // more than two blocks, so to minimise the increase in register pressure.
+      BasicBlock *Dominator = MathDominates ? MathBB : CmpBB;
+      BasicBlock *Dominated = MathDominates ? CmpBB : MathBB;
+      auto Successors = successors(Dominator);
+      if (llvm::find(Successors, Dominated) == Successors.end())
+        return false;
     }
+
+    InsertPt = MathDominates ? cast<Instruction>(BO) : cast<Instruction>(Cmp);
   }
-  assert(InsertPt != nullptr && "Parent block did not contain cmp or binop");
 
   IRBuilder<> Builder(InsertPt);
   Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1);
   Value *Math = Builder.CreateExtractValue(MathOV, 0, "math");
   Value *OV = Builder.CreateExtractValue(MathOV, 1, "ov");
-
-  // Delay the actual removal/deletion of the binop and compare for efficiency.
-  // If we delete those now, we would invalidate the instruction iterator and
-  // trigger dominator tree updates.
   BO->replaceAllUsesWith(Math);
   Cmp->replaceAllUsesWith(OV);
-  RemovedInsts.insert(BO);
-  RemovedInsts.insert(Cmp);
+  BO->eraseFromParent();
+  Cmp->eraseFromParent();
   return true;
 }
 
@@ -1261,7 +1260,8 @@ static bool matchUAddWithOverflowConstantEdgeCases(CmpInst *Cmp,
 /// Try to combine the compare into a call to the llvm.uadd.with.overflow
 /// intrinsic. Return true if any changes were made.
-bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp) {
+bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp,
+                                               bool &ModifiedDT) {
   Value *A, *B;
   BinaryOperator *Add;
   if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add))))
     return false;
@@ -1281,10 +1281,13 @@ bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp) {
   if (!replaceMathCmpWithIntrinsic(Add, Cmp, Intrinsic::uadd_with_overflow))
     return false;
 
+  // Reset callers - do not crash by iterating over a dead instruction.
+  ModifiedDT = true;
   return true;
 }
 
-bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp) {
+bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
+                                               bool &ModifiedDT) {
   // We are not expecting non-canonical/degenerate code. Just bail out.
   Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1);
   if (isa<Constant>(A) && isa<Constant>(B))
@@ -1339,6 +1342,8 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp) {
   if (!replaceMathCmpWithIntrinsic(Sub, Cmp, Intrinsic::usub_with_overflow))
     return false;
 
+  // Reset callers - do not crash by iterating over a dead instruction.
+  ModifiedDT = true;
   return true;
 }
 
@@ -1408,14 +1413,14 @@ static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
   return MadeChange;
 }
 
-bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp) {
+bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, bool &ModifiedDT) {
   if (sinkCmpExpression(Cmp, *TLI))
     return true;
 
-  if (combineToUAddWithOverflow(Cmp))
+  if (combineToUAddWithOverflow(Cmp, ModifiedDT))
     return true;
 
-  if (combineToUSubWithOverflow(Cmp))
+  if (combineToUSubWithOverflow(Cmp, ModifiedDT))
     return true;
 
   return false;
@@ -6940,7 +6945,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
   }
 
   if (auto *Cmp = dyn_cast<CmpInst>(I))
-    if (TLI && optimizeCmp(Cmp))
+    if (TLI && optimizeCmp(Cmp, ModifiedDT))
       return true;
 
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
diff --git a/llvm/test/CodeGen/X86/cgp-usubo.ll b/llvm/test/CodeGen/X86/cgp-usubo.ll
index 9bc32e3..2bdf7ce 100644
--- a/llvm/test/CodeGen/X86/cgp-usubo.ll
+++ b/llvm/test/CodeGen/X86/cgp-usubo.ll
@@ -121,7 +121,7 @@ define i1 @usubo_ne_constant0_op1_i32(i32 %x, i32* %p) {
   ret i1 %ov
 }
 
-; This used to verify insertion point for multi-BB, but now we just bail out.
+; Verify insertion point for multi-BB.
 
 declare void @call(i1)
 
@@ -131,17 +131,14 @@ define i1 @usubo_ult_sub_dominates_i64(i64 %x, i64 %y, i64* %p, i1 %cond) nounwi
 ; CHECK-NEXT:    testb $1, %cl
 ; CHECK-NEXT:    je .LBB8_2
 ; CHECK-NEXT:  # %bb.1: # %t
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    subq %rsi, %rax
-; CHECK-NEXT:    movq %rax, (%rdx)
-; CHECK-NEXT:    testb $1, %cl
-; CHECK-NEXT:    je .LBB8_2
-; CHECK-NEXT:  # %bb.3: # %end
-; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    subq %rsi, %rdi
 ; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    movq %rdi, (%rdx)
+; CHECK-NEXT:    testb $1, %cl
+; CHECK-NEXT:    jne .LBB8_3
 ; CHECK-NEXT:  .LBB8_2: # %f
 ; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:  .LBB8_3: # %end
 ; CHECK-NEXT:    retq
 entry:
   br i1 %cond, label %t, label %f
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/optimizeSelect-DT.ll b/llvm/test/Transforms/CodeGenPrepare/X86/optimizeSelect-DT.ll
index 05389bf..dc63842 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/optimizeSelect-DT.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/optimizeSelect-DT.ll
@@ -14,10 +14,11 @@ define i1 @PR41004(i32 %x, i32 %y, i32 %t1) {
 ; CHECK-NEXT:    br label [[SELECT_END]]
 ; CHECK:       select.end:
 ; CHECK-NEXT:    [[MUL:%.*]] = phi i32 [ [[REM]], [[SELECT_TRUE_SINK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[NEG:%.*]] = add i32 [[T1:%.*]], -1
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[NEG]], [[MUL]]
-; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[T1]], 0
-; CHECK-NEXT:    ret i1 [[TOBOOL]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[T1:%.*]], i32 1)
+; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i32, i1 } [[TMP0]], 0
+; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MATH]], [[MUL]]
+; CHECK-NEXT:    ret i1 [[OV]]
 ;
 entry:
   %rem = srem i32 %x, 2
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll b/llvm/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll
index 9ba1d7d..ab636c3 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll
@@ -47,16 +47,15 @@ define i64 @uaddo3(i64 %a, i64 %b) nounwind ssp {
   ret i64 %Q
 }
 
-; TODO? CGP sinks the compare before we have a chance to form the overflow intrinsic.
-
 define i64 @uaddo4(i64 %a, i64 %b, i1 %c) nounwind ssp {
 ; CHECK-LABEL: @uaddo4(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]]
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[NEXT:%.*]], label [[EXIT:%.*]]
 ; CHECK:       next:
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[B]], [[ADD]]
-; CHECK-NEXT:    [[Q:%.*]] = select i1 [[TMP0]], i64 [[B]], i64 42
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]])
+; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
+; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
+; CHECK-NEXT:    [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
 ; CHECK-NEXT:    ret i64 [[Q]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret i64 0
@@ -363,7 +362,7 @@ define i1 @usubo_ne_constant0_op1_i32(i32 %x, i32* %p) {
   ret i1 %ov
 }
 
-; This used to verify insertion point for multi-BB, but now we just bail out.
+; Verify insertion point for multi-BB.
 
 declare void @call(i1)
 
@@ -372,14 +371,15 @@ define i1 @usubo_ult_sub_dominates_i64(i64 %x, i64 %y, i64* %p, i1 %cond) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 [[COND:%.*]], label [[T:%.*]], label [[F:%.*]]
 ; CHECK:       t:
-; CHECK-NEXT:    [[S:%.*]] = sub i64 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    store i64 [[S]], i64* [[P:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]])
+; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
+; CHECK-NEXT:    [[OV1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
+; CHECK-NEXT:    store i64 [[MATH]], i64* [[P:%.*]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[END:%.*]], label [[F]]
 ; CHECK:       f:
 ; CHECK-NEXT:    ret i1 [[COND]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[OV:%.*]] = icmp ult i64 [[X]], [[Y]]
-; CHECK-NEXT:    ret i1 [[OV]]
+; CHECK-NEXT:    ret i1 [[OV1]]
 ;
 entry:
   br i1 %cond, label %t, label %f
-- 
2.7.4
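For context, a minimal sketch (not part of the patch; the function name and the opt
invocation are assumptions, modeled on the usubo_ult_sub_dominates_i64 test above) of
the multi-block pattern that the restored code path can again turn into an overflow
intrinsic. With this revert applied, running something like
"opt -codegenprepare -S -mtriple=x86_64-unknown-unknown" on the IR below should rewrite
the sub/icmp pair into @llvm.usub.with.overflow.i64 plus extractvalue, as the updated
CHECK lines verify:

; Sketch only - mirrors the usubo_ult_sub_dominates_i64 test, names are illustrative.
define i1 @usubo_multiblock_sketch(i64 %x, i64 %y, i64* %p, i1 %cond) {
entry:
  br i1 %cond, label %t, label %f
t:                                      ; math in the dominating block
  %s = sub i64 %x, %y
  store i64 %s, i64* %p
  br i1 %cond, label %end, label %f
f:
  ret i1 %cond
end:                                    ; compare in a direct successor of %t
  %ov = icmp ult i64 %x, %y
  ret i1 %ov
}

The branch from %t to %end matters here: in the restored replaceMathCmpWithIntrinsic,
when the math and compare sit in different blocks and the math has other uses, the
intrinsic is only formed if the math dominates the compare and the compare's block is a
direct successor of the math's block.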