From 46ec57e57605e92121d972438bff04bd694f74d5 Mon Sep 17 00:00:00 2001
From: Evgeniy Stepanov
Date: Fri, 3 May 2019 17:31:49 +0000
Subject: [PATCH] Revert "[CodeGenPrepare] limit overflow intrinsic matching
 to a single basic block"

This reverts commit r359879, which introduced a compiler crash.

llvm-svn: 359908
---
 llvm/lib/CodeGen/CodeGenPrepare.cpp           | 89 ++++++++++++----------
 llvm/test/CodeGen/X86/cgp-usubo.ll            | 15 ++--
 .../CodeGenPrepare/X86/optimizeSelect-DT.ll   |  9 ++-
 .../CodeGenPrepare/X86/overflow-intrinsics.ll | 20 ++---
 4 files changed, 68 insertions(+), 65 deletions(-)

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 30b09d3..4335f10 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -388,9 +388,9 @@ class TypePromotionTransaction;
     bool tryToSinkFreeOperands(Instruction *I);
     bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, CmpInst *Cmp,
                                      Intrinsic::ID IID);
-    bool optimizeCmp(CmpInst *Cmp);
-    bool combineToUSubWithOverflow(CmpInst *Cmp);
-    bool combineToUAddWithOverflow(CmpInst *Cmp);
+    bool optimizeCmp(CmpInst *Cmp, bool &ModifiedDT);
+    bool combineToUSubWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
+    bool combineToUAddWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
   };
 
 } // end anonymous namespace
@@ -484,13 +484,9 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
     if (!LargeOffsetGEPMap.empty())
      MadeChange |= splitLargeGEPOffsets();
 
-    // Really free instructions removed during promotion or kept around to
-    // improve efficiency (see comments in overflow intrinsic transforms).
-    for (Instruction *I : RemovedInsts) {
-      if (I->getParent())
-        I->removeFromParent();
+    // Really free removed instructions during promotion.
+    for (Instruction *I : RemovedInsts)
       I->deleteValue();
-    }
 
     EverMadeChange |= MadeChange;
     SeenChainsForSExt.clear();
@@ -1181,20 +1177,6 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,
 bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
                                                  CmpInst *Cmp,
                                                  Intrinsic::ID IID) {
-  if (BO->getParent() != Cmp->getParent()) {
-    // We used to use a dominator tree here to allow multi-block optimization.
-    // But that was problematic because:
-    // 1. It could cause a perf regression by hoisting the math op into the
-    //    critical path.
-    // 2. It could cause a perf regression by creating a value that was live
-    //    across multiple blocks and increasing register pressure.
-    // 3. Use of a dominator tree could cause large compile-time regression.
-    //    This is because we recompute the DT on every change in the main CGP
-    //    run-loop. The recomputing is probably unnecessary in many cases, so if
-    //    that was fixed, using a DT here would be ok.
-    return false;
-  }
-
   // We allow matching the canonical IR (add X, C) back to (usubo X, -C).
   Value *Arg0 = BO->getOperand(0);
   Value *Arg1 = BO->getOperand(1);
@@ -1204,28 +1186,45 @@ bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
     Arg1 = ConstantExpr::getNeg(cast<Constant>(Arg1));
   }
 
-  // Insert at the first instruction of the pair.
-  Instruction *InsertPt = nullptr;
-  for (Instruction &Iter : *Cmp->getParent()) {
-    if (&Iter == BO || &Iter == Cmp) {
-      InsertPt = &Iter;
-      break;
+  Instruction *InsertPt;
+  if (BO->hasOneUse() && BO->user_back() == Cmp) {
+    // If the math is only used by the compare, insert at the compare to keep
+    // the condition in the same block as its users. (CGP aggressively sinks
+    // compares to help out SDAG.)
+    InsertPt = Cmp;
+  } else {
+    // The math and compare may be independent instructions. Check dominance to
+    // determine the insertion point for the intrinsic.
+    bool MathDominates = getDT(*BO->getFunction()).dominates(BO, Cmp);
+    if (!MathDominates && !getDT(*BO->getFunction()).dominates(Cmp, BO))
+      return false;
+
+    BasicBlock *MathBB = BO->getParent(), *CmpBB = Cmp->getParent();
+    if (MathBB != CmpBB) {
+      // Avoid hoisting an extra op into a dominating block and creating a
+      // potentially longer critical path.
+      if (!MathDominates)
+        return false;
+      // Check that the insertion doesn't create a value that is live across
+      // more than two blocks, so to minimise the increase in register pressure.
+      BasicBlock *Dominator = MathDominates ? MathBB : CmpBB;
+      BasicBlock *Dominated = MathDominates ? CmpBB : MathBB;
+      auto Successors = successors(Dominator);
+      if (llvm::find(Successors, Dominated) == Successors.end())
+        return false;
     }
+
+    InsertPt = MathDominates ? cast<Instruction>(BO) : cast<Instruction>(Cmp);
   }
-  assert(InsertPt != nullptr && "Parent block did not contain cmp or binop");
 
   IRBuilder<> Builder(InsertPt);
   Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1);
   Value *Math = Builder.CreateExtractValue(MathOV, 0, "math");
   Value *OV = Builder.CreateExtractValue(MathOV, 1, "ov");
-
-  // Delay the actual removal/deletion of the binop and compare for efficiency.
-  // If we delete those now, we would invalidate the instruction iterator and
-  // trigger dominator tree updates.
   BO->replaceAllUsesWith(Math);
   Cmp->replaceAllUsesWith(OV);
-  RemovedInsts.insert(BO);
-  RemovedInsts.insert(Cmp);
+  BO->eraseFromParent();
+  Cmp->eraseFromParent();
   return true;
 }
 
@@ -1261,7 +1260,8 @@ static bool matchUAddWithOverflowConstantEdgeCases(CmpInst *Cmp,
 /// Try to combine the compare into a call to the llvm.uadd.with.overflow
 /// intrinsic. Return true if any changes were made.
-bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp) {
+bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp,
+                                               bool &ModifiedDT) {
   Value *A, *B;
   BinaryOperator *Add;
   if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add))))
     return false;
@@ -1281,10 +1281,13 @@ bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp) {
   if (!replaceMathCmpWithIntrinsic(Add, Cmp, Intrinsic::uadd_with_overflow))
     return false;
 
+  // Reset callers - do not crash by iterating over a dead instruction.
+  ModifiedDT = true;
   return true;
 }
 
-bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp) {
+bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
+                                               bool &ModifiedDT) {
   // We are not expecting non-canonical/degenerate code. Just bail out.
   Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1);
   if (isa<Constant>(A) && isa<Constant>(B))
@@ -1339,6 +1342,8 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp) {
   if (!replaceMathCmpWithIntrinsic(Sub, Cmp, Intrinsic::usub_with_overflow))
     return false;
 
+  // Reset callers - do not crash by iterating over a dead instruction.
+  ModifiedDT = true;
   return true;
 }
 
@@ -1408,14 +1413,14 @@ static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
   return MadeChange;
 }
 
-bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp) {
+bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, bool &ModifiedDT) {
   if (sinkCmpExpression(Cmp, *TLI))
     return true;
 
-  if (combineToUAddWithOverflow(Cmp))
+  if (combineToUAddWithOverflow(Cmp, ModifiedDT))
     return true;
 
-  if (combineToUSubWithOverflow(Cmp))
+  if (combineToUSubWithOverflow(Cmp, ModifiedDT))
     return true;
 
   return false;
@@ -6940,7 +6945,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
   }
 
   if (auto *Cmp = dyn_cast<CmpInst>(I))
-    if (TLI && optimizeCmp(Cmp))
+    if (TLI && optimizeCmp(Cmp, ModifiedDT))
       return true;
 
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
diff --git a/llvm/test/CodeGen/X86/cgp-usubo.ll b/llvm/test/CodeGen/X86/cgp-usubo.ll
index 9bc32e3..2bdf7ce 100644
--- a/llvm/test/CodeGen/X86/cgp-usubo.ll
+++ b/llvm/test/CodeGen/X86/cgp-usubo.ll
@@ -121,7 +121,7 @@ define i1 @usubo_ne_constant0_op1_i32(i32 %x, i32* %p) {
   ret i1 %ov
 }
 
-; This used to verify insertion point for multi-BB, but now we just bail out.
+; Verify insertion point for multi-BB.
 
 declare void @call(i1)
 
@@ -131,17 +131,14 @@ define i1 @usubo_ult_sub_dominates_i64(i64 %x, i64 %y, i64* %p, i1 %cond) nounwi
 ; CHECK-NEXT:    testb $1, %cl
 ; CHECK-NEXT:    je .LBB8_2
 ; CHECK-NEXT:  # %bb.1: # %t
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    subq %rsi, %rax
-; CHECK-NEXT:    movq %rax, (%rdx)
-; CHECK-NEXT:    testb $1, %cl
-; CHECK-NEXT:    je .LBB8_2
-; CHECK-NEXT:  # %bb.3: # %end
-; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    subq %rsi, %rdi
 ; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    movq %rdi, (%rdx)
+; CHECK-NEXT:    testb $1, %cl
+; CHECK-NEXT:    jne .LBB8_3
 ; CHECK-NEXT:  .LBB8_2: # %f
 ; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:  .LBB8_3: # %end
 ; CHECK-NEXT:    retq
 entry:
   br i1 %cond, label %t, label %f
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/optimizeSelect-DT.ll b/llvm/test/Transforms/CodeGenPrepare/X86/optimizeSelect-DT.ll
index 05389bf..dc63842 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/optimizeSelect-DT.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/optimizeSelect-DT.ll
@@ -14,10 +14,11 @@ define i1 @PR41004(i32 %x, i32 %y, i32 %t1) {
 ; CHECK-NEXT:    br label [[SELECT_END]]
 ; CHECK:       select.end:
 ; CHECK-NEXT:    [[MUL:%.*]] = phi i32 [ [[REM]], [[SELECT_TRUE_SINK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[NEG:%.*]] = add i32 [[T1:%.*]], -1
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[NEG]], [[MUL]]
-; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[T1]], 0
-; CHECK-NEXT:    ret i1 [[TOBOOL]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[T1:%.*]], i32 1)
+; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i32, i1 } [[TMP0]], 0
+; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MATH]], [[MUL]]
+; CHECK-NEXT:    ret i1 [[OV]]
 ;
 entry:
   %rem = srem i32 %x, 2
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll b/llvm/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll
index 9ba1d7d..ab636c3 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/overflow-intrinsics.ll
@@ -47,16 +47,15 @@ define i64 @uaddo3(i64 %a, i64 %b) nounwind ssp {
   ret i64 %Q
 }
 
-; TODO? CGP sinks the compare before we have a chance to form the overflow intrinsic.
-
 define i64 @uaddo4(i64 %a, i64 %b, i1 %c) nounwind ssp {
 ; CHECK-LABEL: @uaddo4(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]]
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[NEXT:%.*]], label [[EXIT:%.*]]
 ; CHECK:       next:
-; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[B]], [[ADD]]
-; CHECK-NEXT:    [[Q:%.*]] = select i1 [[TMP0]], i64 [[B]], i64 42
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]])
+; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
+; CHECK-NEXT:    [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
+; CHECK-NEXT:    [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
 ; CHECK-NEXT:    ret i64 [[Q]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret i64 0
@@ -363,7 +362,7 @@ define i1 @usubo_ne_constant0_op1_i32(i32 %x, i32* %p) {
   ret i1 %ov
 }
 
-; This used to verify insertion point for multi-BB, but now we just bail out.
+; Verify insertion point for multi-BB.
 
 declare void @call(i1)
 
@@ -372,14 +371,15 @@ define i1 @usubo_ult_sub_dominates_i64(i64 %x, i64 %y, i64* %p, i1 %cond) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 [[COND:%.*]], label [[T:%.*]], label [[F:%.*]]
 ; CHECK:       t:
-; CHECK-NEXT:    [[S:%.*]] = sub i64 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    store i64 [[S]], i64* [[P:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]])
+; CHECK-NEXT:    [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
+; CHECK-NEXT:    [[OV1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
+; CHECK-NEXT:    store i64 [[MATH]], i64* [[P:%.*]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[END:%.*]], label [[F]]
 ; CHECK:       f:
 ; CHECK-NEXT:    ret i1 [[COND]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[OV:%.*]] = icmp ult i64 [[X]], [[Y]]
-; CHECK-NEXT:    ret i1 [[OV]]
+; CHECK-NEXT:    ret i1 [[OV1]]
 ;
 entry:
   br i1 %cond, label %t, label %f
-- 
2.7.4
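For context, a minimal sketch (not part of the patch; the function name and the opt
invocation are assumptions, modeled on the usubo_ult_sub_dominates_i64 test above) of
the multi-block pattern that the restored code path can again turn into an overflow
intrinsic. With this revert applied, running something like
"opt -codegenprepare -S -mtriple=x86_64-unknown-unknown" on the IR below should rewrite
the sub/icmp pair into @llvm.usub.with.overflow.i64 plus extractvalue, as the updated
CHECK lines verify:

; Sketch only - mirrors the usubo_ult_sub_dominates_i64 test, names are illustrative.
define i1 @usubo_multiblock_sketch(i64 %x, i64 %y, i64* %p, i1 %cond) {
entry:
  br i1 %cond, label %t, label %f
t:                                      ; math in the dominating block
  %s = sub i64 %x, %y
  store i64 %s, i64* %p
  br i1 %cond, label %end, label %f
f:
  ret i1 %cond
end:                                    ; compare in a direct successor of %t
  %ov = icmp ult i64 %x, %y
  ret i1 %ov
}

The branch from %t to %end matters here: in the restored replaceMathCmpWithIntrinsic,
when the math and compare sit in different blocks and the math has other uses, the
intrinsic is only formed if the math dominates the compare and the compare's block is a
direct successor of the math's block.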