From a96b287827dd98fab4d2549e8e7a588d945853e7 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Thu, 6 May 2021 18:35:24 -0700 Subject: [PATCH] Improve loop inversion (#52347) * Improve loop inversion When doing loop inversion, we duplicate the condition block to the top of the loop to create a fall-through zero trip test. Improve this by redirecting all incoming branches to the condition block that appear to be coming from outside the potential loop body to branch to the new duplicated condition block. This improves the ability of the loop recognizer to find loops, whereas before we would reject the loops as "multi-entry". There are good diffs where more loops are detected, leading to better optimization and more loop alignment. There are also asm diffs regressions. * Formatting * Updates 1. Allow scratch block to be the loop head, since we introduce a new block to duplicate the condition, so the scratch block becomes a BBJ_NONE, which is fine. 2. Fix issue on x86 where a catch return, which is a BBJ_ALWAYS on x86, can't be the "head" block of the loop. --- src/coreclr/jit/compiler.h | 2 +- src/coreclr/jit/optimizer.cpp | 127 +++++++++++++++++++++++----------- 2 files changed, 88 insertions(+), 41 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 9172a305757..92f599e710d 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6516,7 +6516,7 @@ protected: // loop nested in "loopInd" that shares the same head as "loopInd". void optUpdateLoopHead(unsigned loopInd, BasicBlock* from, BasicBlock* to); - void optRedirectBlock(BasicBlock* blk, BlockToBlockMap* redirectMap); + void optRedirectBlock(BasicBlock* blk, BlockToBlockMap* redirectMap, const bool updatePreds = false); // Marks the containsCall information to "lnum" and any parent loops. void AddContainsCallAllContainingLoops(unsigned lnum); diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 47ebe957c52..ca9591a5dff 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -2613,11 +2613,14 @@ void Compiler::optIdentifyLoopsForAlignment() // Updates the successors of `blk`: if `blk2` is a branch successor of `blk`, and there is a mapping // for `blk2->blk3` in `redirectMap`, change `blk` so that `blk3` is this branch successor. // +// Note that fall-through successors are not modified, including predecessor lists. +// // Arguments: // blk - block to redirect // redirectMap - block->block map specifying how the `blk` target will be redirected. +// updatePreds - if `true`, update the predecessor lists to match. // -void Compiler::optRedirectBlock(BasicBlock* blk, BlockToBlockMap* redirectMap) +void Compiler::optRedirectBlock(BasicBlock* blk, BlockToBlockMap* redirectMap, const bool updatePreds) { BasicBlock* newJumpDest = nullptr; switch (blk->bbJumpKind) @@ -2638,6 +2641,11 @@ void Compiler::optRedirectBlock(BasicBlock* blk, BlockToBlockMap* redirectMap) // All of these have a single jump destination to update. if (redirectMap->Lookup(blk->bbJumpDest, &newJumpDest)) { + if (updatePreds) + { + fgRemoveRefPred(blk->bbJumpDest, blk); + fgAddRefPred(newJumpDest, blk); + } blk->bbJumpDest = newJumpDest; } break; @@ -2650,7 +2658,11 @@ void Compiler::optRedirectBlock(BasicBlock* blk, BlockToBlockMap* redirectMap) BasicBlock* switchDest = blk->bbJumpSwt->bbsDstTab[i]; if (redirectMap->Lookup(switchDest, &newJumpDest)) { - switchDest = newJumpDest; + if (updatePreds) + { + fgRemoveRefPred(switchDest, blk); + fgAddRefPred(newJumpDest, blk); + } blk->bbJumpSwt->bbsDstTab[i] = newJumpDest; redirected = true; } @@ -4188,7 +4200,7 @@ void Compiler::optInvertWhileLoop(BasicBlock* block) assert(opts.OptimizationEnabled()); assert(compCodeOpt() != SMALL_CODE); - /* Does the BB end with an unconditional jump? */ + // Does the BB end with an unconditional jump? if (block->bbJumpKind != BBJ_ALWAYS || (block->bbFlags & BBF_KEEP_BBJ_ALWAYS)) { @@ -4196,13 +4208,6 @@ void Compiler::optInvertWhileLoop(BasicBlock* block) return; } - // block can't be the scratch bb, since we prefer to keep flow - // out of the scratch bb as BBJ_ALWAYS or BBJ_NONE. - if (fgBBisScratch(block)) - { - return; - } - // Get hold of the jump target BasicBlock* bTest = block->bbJumpDest; @@ -4221,14 +4226,16 @@ void Compiler::optInvertWhileLoop(BasicBlock* block) // Since test is a BBJ_COND it will have a bbNext noway_assert(bTest->bbNext != nullptr); - // 'block' must be in the same try region as the condition, since we're going to insert - // a duplicated condition in 'block', and the condition might include exception throwing code. - if (!BasicBlock::sameTryRegion(block, bTest)) + // 'block' must be in the same try region as the condition, since we're going to insert a duplicated condition + // in a new block after 'block', and the condition might include exception throwing code. + // On non-funclet platforms (x86), the catch exit is a BBJ_ALWAYS, but we don't want that to + // be considered as the head of a loop, so also disallow different handler regions. + if (!BasicBlock::sameEHRegion(block, bTest)) { return; } - // We're going to change 'block' to branch to bTest->bbNext, so that also better be in the + // The duplicated condition block will branch to bTest->bbNext, so that also better be in the // same try region (or no try region) to avoid generating illegal flow. BasicBlock* bTestNext = bTest->bbNext; if (bTestNext->hasTryIndex() && !BasicBlock::sameTryRegion(block, bTestNext)) @@ -4411,7 +4418,12 @@ void Compiler::optInvertWhileLoop(BasicBlock* block) bool foundCondTree = false; - // Clone each statement in bTest and append to block. + // Create a new block after `block` to put the copied condition code. + block->bbJumpKind = BBJ_NONE; + block->bbJumpDest = nullptr; + BasicBlock* bNewCond = fgNewBBafter(BBJ_COND, block, /*extendRegion*/ true); + + // Clone each statement in bTest and append to bNewCond. for (Statement* stmt : bTest->Statements()) { GenTree* originalTree = stmt->GetRootNode(); @@ -4439,7 +4451,7 @@ void Compiler::optInvertWhileLoop(BasicBlock* block) gtReverseCond(clonedCompareTree); } - Statement* clonedStmt = fgNewStmtAtEnd(block, clonedTree); + Statement* clonedStmt = fgNewStmtAtEnd(bNewCond, clonedTree); if (opts.compDbgInfo) { @@ -4456,24 +4468,59 @@ void Compiler::optInvertWhileLoop(BasicBlock* block) // this is a conservative guess. if (auto copyFlags = bTest->bbFlags & (BBF_HAS_IDX_LEN | BBF_HAS_NULLCHECK | BBF_HAS_NEWOBJ | BBF_HAS_NEWARRAY)) { - block->bbFlags |= copyFlags; + bNewCond->bbFlags |= copyFlags; } - /* Change the block to end with a conditional jump */ - - block->bbJumpKind = BBJ_COND; - block->bbJumpDest = bTest->bbNext; + bNewCond->bbJumpDest = bTest->bbNext; + bNewCond->inheritWeight(block); - /* Update bbRefs and bbPreds for 'block->bbNext' 'bTest' and 'bTest->bbNext' */ + // Update bbRefs and bbPreds for 'bNewCond', 'bNewCond->bbNext' 'bTest' and 'bTest->bbNext'. - fgAddRefPred(block->bbNext, block); + fgAddRefPred(bNewCond, block); + fgAddRefPred(bNewCond->bbNext, bNewCond); fgRemoveRefPred(bTest, block); - fgAddRefPred(bTest->bbNext, block); + fgAddRefPred(bTest->bbNext, bNewCond); + + // Move all predecessor edges that look like loop entry edges to point to the new cloned condition + // block, not the existing condition block. The idea is that if we only move `block` to point to + // `bNewCond`, but leave other `bTest` predecessors still pointing to `bTest`, when we eventually + // recognize loops, the loop will appear to have multiple entries, which will prevent optimization. + // We don't have loops yet, but blocks should be in increasing lexical numbered order, so use that + // as the proxy for predecessors that are "in" versus "out" of the potential loop. Note that correctness + // is maintained no matter which condition block we point to, but we'll lose optimization potential + // (and create spaghetti code) if we get it wrong. + + BlockToBlockMap blockMap(getAllocator(CMK_LoopOpt)); + bool blockMapInitialized = false; + + unsigned loopFirstNum = bNewCond->bbNext->bbNum; + unsigned loopBottomNum = bTest->bbNum; + for (flowList* pred = bTest->bbPreds; pred != nullptr; pred = pred->flNext) + { + BasicBlock* predBlock = pred->getBlock(); + unsigned bNum = predBlock->bbNum; + if ((loopFirstNum <= bNum) && (bNum <= loopBottomNum)) + { + // Looks like the predecessor is from within the potential loop; skip it. + continue; + } + + if (!blockMapInitialized) + { + blockMapInitialized = true; + blockMap.Set(bTest, bNewCond); + } + + // Redirect the predecessor to the new block. + JITDUMP("Redirecting non-loop " FMT_BB " -> " FMT_BB " to " FMT_BB " -> " FMT_BB "\n", predBlock->bbNum, + bTest->bbNum, predBlock->bbNum, bNewCond->bbNum); + optRedirectBlock(predBlock, &blockMap, /*updatePreds*/ true); + } // If we have profile data for all blocks and we know that we are cloning the - // bTest block into block and thus changing the control flow from block so - // that it no longer goes directly to bTest anymore, we have to adjust + // `bTest` block into `bNewCond` and thus changing the control flow from `block` so + // that it no longer goes directly to `bTest` anymore, we have to adjust // various weights. // if (allProfileWeightsAreValid) @@ -4521,33 +4568,33 @@ void Compiler::optInvertWhileLoop(BasicBlock* block) BasicBlock::weight_t const blockToNextWeight = weightBlock * blockToNextLikelihood; BasicBlock::weight_t const blockToAfterWeight = weightBlock * blockToAfterLikelihood; - flowList* const edgeBlockToNext = fgGetPredForBlock(block->bbNext, block); - flowList* const edgeBlockToAfter = fgGetPredForBlock(block->bbJumpDest, block); + flowList* const edgeBlockToNext = fgGetPredForBlock(bNewCond->bbNext, bNewCond); + flowList* const edgeBlockToAfter = fgGetPredForBlock(bNewCond->bbJumpDest, bNewCond); - JITDUMP("Setting weight of " FMT_BB " -> " FMT_BB " to " FMT_WT " (enter loop)\n", block->bbNum, - block->bbNext->bbNum, blockToNextWeight); - JITDUMP("Setting weight of " FMT_BB " -> " FMT_BB " to " FMT_WT " (avoid loop)\n", block->bbNum, - block->bbJumpDest->bbNum, blockToAfterWeight); + JITDUMP("Setting weight of " FMT_BB " -> " FMT_BB " to " FMT_WT " (enter loop)\n", bNewCond->bbNum, + bNewCond->bbNext->bbNum, blockToNextWeight); + JITDUMP("Setting weight of " FMT_BB " -> " FMT_BB " to " FMT_WT " (avoid loop)\n", bNewCond->bbNum, + bNewCond->bbJumpDest->bbNum, blockToAfterWeight); - edgeBlockToNext->setEdgeWeights(blockToNextWeight, blockToNextWeight, block->bbNext); - edgeBlockToAfter->setEdgeWeights(blockToAfterWeight, blockToAfterWeight, block->bbJumpDest); + edgeBlockToNext->setEdgeWeights(blockToNextWeight, blockToNextWeight, bNewCond->bbNext); + edgeBlockToAfter->setEdgeWeights(blockToAfterWeight, blockToAfterWeight, bNewCond->bbJumpDest); #ifdef DEBUG // Verify profile for the two target blocks is consistent. // - fgDebugCheckIncomingProfileData(block->bbNext); - fgDebugCheckIncomingProfileData(block->bbJumpDest); + fgDebugCheckIncomingProfileData(bNewCond->bbNext); + fgDebugCheckIncomingProfileData(bNewCond->bbJumpDest); #endif // DEBUG } #ifdef DEBUG if (verbose) { - printf("\nDuplicated loop exit block at " FMT_BB " for loop (" FMT_BB " - " FMT_BB ")\n", block->bbNum, - block->bbNext->bbNum, bTest->bbNum); + printf("\nDuplicated loop exit block at " FMT_BB " for loop (" FMT_BB " - " FMT_BB ")\n", bNewCond->bbNum, + bNewCond->bbNext->bbNum, bTest->bbNum); printf("Estimated code size expansion is %d\n", estDupCostSz); - fgDumpBlock(block); + fgDumpBlock(bNewCond); fgDumpBlock(bTest); } #endif // DEBUG @@ -4583,7 +4630,7 @@ PhaseStatus Compiler::optInvertLoops() // if (block->bbWeight == BB_ZERO_WEIGHT) { - /* Zero weighted block can't have a LOOP_HEAD flag */ + // Zero weighted block can't have a LOOP_HEAD flag noway_assert(block->isLoopHead() == false); continue; } -- 2.34.1