Improve reachability sets computation (#84204)
authorBruce Forstall <brucefo@microsoft.com>
Sat, 1 Apr 2023 18:20:33 +0000 (11:20 -0700)
committerGitHub <noreply@github.com>
Sat, 1 Apr 2023 18:20:33 +0000 (11:20 -0700)
* Improve reachability sets computation

Two changes to improve the throughput of computing `bbReach` sets:
1. In `fgComputeReachabilitySets()`, iterate over the blocks in reverse
post-order. This leads to fewer outer `do ... while(change)` iterations.
To do this, the `fgDfsReversePostorder()` function which creates the
reverse post-order numbers and ordering was hoisted out of dominator
creation and above reachability computation.
2. Create a `BlockSetOps::UnionDChanged()` function that does a `UnionD`
operation but also returns `true` if the target bitset changed value.

Some additional stats were added under `COUNT_BASIC_BLOCKS` regarding
how many iterations dominators and reachability computations take to
converge.

* Code review feedback

src/coreclr/jit/bitset.h
src/coreclr/jit/bitsetasshortlong.h
src/coreclr/jit/bitsetops.h
src/coreclr/jit/compiler.cpp
src/coreclr/jit/compiler.h
src/coreclr/jit/fgopt.cpp
src/coreclr/jit/fgprofilesynthesis.cpp

index 7360eb4424ac2f1c01952c42ae5d2d504510d356..0e090e5a45a791f3e73bae25bab4542ea111308b 100644 (file)
@@ -228,6 +228,8 @@ class BitSetOps
 
     // Destructively modify "bs1" to be the union of "bs1" and "bs2".
     static void UnionD(Env env, BitSetType& bs1, BitSetValueArgType bs2);
+    // Destructively modify "bs1" to be the union of "bs1" and "bs2"; return `true` if `bs1` changed.
+    static bool UnionDChanged(Env env, BitSetType& bs1, BitSetValueArgType bs2);
     // Returns a new BitSet that is the union of "bs1" and "bs2".
     static BitSetValueRetType Union(Env env, BitSetValueArgType bs1, BitSetValueArgType bs2);
 
@@ -374,6 +376,11 @@ public:
         BitSetTraits::GetOpCounter(env)->RecordOp(BitSetSupport::BSOP_UnionD);
         BSO::UnionD(env, bs1, bs2);
     }
+    static bool UnionDChanged(Env env, BitSetType& bs1, BitSetValueArgType bs2)
+    {
+        BitSetTraits::GetOpCounter(env)->RecordOp(BitSetSupport::BSOP_UnionDChanged);
+        return BSO::UnionDChanged(env, bs1, bs2);
+    }
     static BitSetValueRetType Union(Env env, BitSetValueArgType bs1, BitSetValueArgType bs2)
     {
         BitSetTraits::GetOpCounter(env)->RecordOp(BitSetSupport::BSOP_Union);
index 0aa33bd916ad7e1b0af8c3a17fb7092a2a1e7cc7..5a2e315673cbe006a88b5912ccc6100269862fc2 100644 (file)
@@ -39,6 +39,7 @@ private:
     static unsigned CountLong(Env env, BitSetShortLongRep bs);
     static bool IsEmptyUnionLong(Env env, BitSetShortLongRep bs1, BitSetShortLongRep bs2);
     static void UnionDLong(Env env, BitSetShortLongRep& bs1, BitSetShortLongRep bs2);
+    static bool UnionDLongChanged(Env env, BitSetShortLongRep& bs1, BitSetShortLongRep bs2);
     static void DiffDLong(Env env, BitSetShortLongRep& bs1, BitSetShortLongRep bs2);
     static void AddElemDLong(Env env, BitSetShortLongRep& bs, unsigned i);
     static bool TryAddElemDLong(Env env, BitSetShortLongRep& bs, unsigned i);
@@ -209,6 +210,23 @@ public:
             UnionDLong(env, bs1, bs2);
         }
     }
+
+    static bool UnionDChanged(Env env, BitSetShortLongRep& bs1, BitSetShortLongRep bs2)
+    {
+        if (IsShort(env))
+        {
+            size_t bsCurrent = (size_t)bs1;
+            size_t bsNew     = bsCurrent | ((size_t)bs2);
+            bool   changed   = bsNew != bsCurrent;
+            bs1              = (BitSetShortLongRep)bsNew;
+            return changed;
+        }
+        else
+        {
+            return UnionDLongChanged(env, bs1, bs2);
+        }
+    }
+
     static BitSetShortLongRep Union(Env env, BitSetShortLongRep bs1, BitSetShortLongRep bs2)
     {
         BitSetShortLongRep res = MakeCopy(env, bs1);
@@ -443,7 +461,7 @@ public:
     {
         if (IsShort(env))
         {
-            // Can't just shift by numBits+1, since that might be 32 (and (1 << 32( == 1, for an unsigned).
+            // Can't just shift by numBits+1, since that might be 32 (and (1 << 32) == 1, for an unsigned).
             unsigned numBits = BitSetTraits::GetSize(env);
             if (numBits == BitsInSizeT)
             {
@@ -638,6 +656,27 @@ void BitSetOps</*BitSetType*/ BitSetShortLongRep,
     }
 }
 
+template <typename Env, typename BitSetTraits>
+bool BitSetOps</*BitSetType*/ BitSetShortLongRep,
+               /*Brand*/ BSShortLong,
+               /*Env*/ Env,
+               /*BitSetTraits*/ BitSetTraits>::UnionDLongChanged(Env                 env,
+                                                                 BitSetShortLongRep& bs1,
+                                                                 BitSetShortLongRep  bs2)
+{
+    assert(!IsShort(env));
+    bool     changed = false;
+    unsigned len     = BitSetTraits::GetArrSize(env);
+    for (unsigned i = 0; i < len; i++)
+    {
+        size_t bsCurrent = bs1[i];
+        size_t bsNew     = bsCurrent | bs2[i];
+        changed |= bsNew != bsCurrent;
+        bs1[i] = bsNew;
+    }
+    return changed;
+}
+
 template <typename Env, typename BitSetTraits>
 void BitSetOps</*BitSetType*/ BitSetShortLongRep,
                /*Brand*/ BSShortLong,
index 932904bb71906bf6333267c454b6c3197311ad1f..a5eca5721286b1e2b9a289acc7f6ff4a9e69325b 100644 (file)
@@ -17,6 +17,7 @@ BSOPNAME(BSOP_RemoveElem)
 BSOPNAME(BSOP_AddElemD)
 BSOPNAME(BSOP_AddElem)
 BSOPNAME(BSOP_UnionD)
+BSOPNAME(BSOP_UnionDChanged)
 BSOPNAME(BSOP_Union)
 BSOPNAME(BSOP_IntersectionD)
 BSOPNAME(BSOP_Intersection)
index 06099db0e94cc3c0486dfa7aa98cc6fe76024364..32430a707f4288dbd490ab33690676c4ee428af6 100644 (file)
@@ -297,6 +297,15 @@ Histogram bbCntTable(bbCntBuckets);
 unsigned  bbSizeBuckets[] = {1, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 0};
 Histogram bbOneBBSizeTable(bbSizeBuckets);
 
+unsigned  domsChangedIterationBuckets[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0};
+Histogram domsChangedIterationTable(domsChangedIterationBuckets);
+
+unsigned  computeReachabilitySetsIterationBuckets[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0};
+Histogram computeReachabilitySetsIterationTable(computeReachabilitySetsIterationBuckets);
+
+unsigned  computeReachabilityIterationBuckets[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0};
+Histogram computeReachabilityIterationTable(computeReachabilityIterationBuckets);
+
 #endif // COUNT_BASIC_BLOCKS
 
 /*****************************************************************************
@@ -1560,6 +1569,25 @@ void Compiler::compShutdown()
     fprintf(fout, "--------------------------------------------------\n");
     bbOneBBSizeTable.dump(fout);
     fprintf(fout, "--------------------------------------------------\n");
+
+    fprintf(fout, "--------------------------------------------------\n");
+    fprintf(fout, "fgComputeDoms `while (change)` iterations:\n");
+    fprintf(fout, "--------------------------------------------------\n");
+    domsChangedIterationTable.dump(fout);
+    fprintf(fout, "--------------------------------------------------\n");
+
+    fprintf(fout, "--------------------------------------------------\n");
+    fprintf(fout, "fgComputeReachabilitySets `while (change)` iterations:\n");
+    fprintf(fout, "--------------------------------------------------\n");
+    computeReachabilitySetsIterationTable.dump(fout);
+    fprintf(fout, "--------------------------------------------------\n");
+
+    fprintf(fout, "--------------------------------------------------\n");
+    fprintf(fout, "fgComputeReachability `while (change)` iterations:\n");
+    fprintf(fout, "--------------------------------------------------\n");
+    computeReachabilityIterationTable.dump(fout);
+    fprintf(fout, "--------------------------------------------------\n");
+
 #endif // COUNT_BASIC_BLOCKS
 
 #if COUNT_LOOPS
index f92b51c542ca3d1d906ce989a663d233821b594b..b00e42a584d65be8d2250f5ba30ea1034729ba79 100644 (file)
@@ -11691,6 +11691,9 @@ extern size_t   gcPtrMapNSize;
 #if COUNT_BASIC_BLOCKS
 extern Histogram bbCntTable;
 extern Histogram bbOneBBSizeTable;
+extern Histogram domsChangedIterationTable;
+extern Histogram computeReachabilitySetsIterationTable;
+extern Histogram computeReachabilityIterationTable;
 #endif
 
 /*****************************************************************************
index 609d689edf1c1b4db22bbb1ecbe1acf828521862..2a917a1da3219f7e87c1507647a95f43e404b982 100644 (file)
@@ -196,6 +196,7 @@ void Compiler::fgUpdateChangedFlowGraph(FlowGraphUpdates updates)
     JITDUMP("\nRenumbering the basic blocks for fgUpdateChangeFlowGraph\n");
     fgRenumberBlocks();
     fgComputeEnterBlocksSet();
+    fgDfsReversePostorder();
     fgComputeReachabilitySets();
     if (computeDoms)
     {
@@ -219,18 +220,18 @@ void Compiler::fgUpdateChangedFlowGraph(FlowGraphUpdates updates)
 //
 // This also sets the BBF_GC_SAFE_POINT flag on blocks.
 //
+// This depends on `fgBBReversePostorder` being correct.
+//
 // TODO-Throughput: This algorithm consumes O(n^2) because we're using dense bitsets to
 // represent reachability. While this yields O(1) time queries, it bloats the memory usage
 // for large code.  We can do better if we try to approach reachability by
 // computing the strongly connected components of the flow graph.  That way we only need
 // linear memory to label every block with its SCC.
 //
-// Assumptions:
-//    Assumes the predecessor lists are correct.
-//
 void Compiler::fgComputeReachabilitySets()
 {
     assert(fgPredsComputed);
+    assert(fgBBReversePostorder != nullptr);
 
 #ifdef DEBUG
     fgReachabilitySetsValid = false;
@@ -250,21 +251,21 @@ void Compiler::fgComputeReachabilitySets()
     // Find the reachable blocks. Also, set BBF_GC_SAFE_POINT.
 
     bool     change;
-    BlockSet newReach(BlockSetOps::MakeEmpty(this));
+    unsigned changedIterCount = 1;
     do
     {
         change = false;
 
-        for (BasicBlock* const block : Blocks())
+        for (unsigned i = 1; i <= fgBBNumMax; ++i)
         {
-            BlockSetOps::Assign(this, newReach, block->bbReach);
+            BasicBlock* const block = fgBBReversePostorder[i];
 
             bool predGcSafe = (block->bbPreds != nullptr); // Do all of our predecessor blocks have a GC safe bit?
 
             for (BasicBlock* const predBlock : block->PredBlocks())
             {
                 /* Union the predecessor's reachability set into newReach */
-                BlockSetOps::UnionD(this, newReach, predBlock->bbReach);
+                change |= BlockSetOps::UnionDChanged(this, block->bbReach, predBlock->bbReach);
 
                 if (!(predBlock->bbFlags & BBF_GC_SAFE_POINT))
                 {
@@ -276,15 +277,15 @@ void Compiler::fgComputeReachabilitySets()
             {
                 block->bbFlags |= BBF_GC_SAFE_POINT;
             }
-
-            if (!BlockSetOps::Equal(this, newReach, block->bbReach))
-            {
-                BlockSetOps::Assign(this, block->bbReach, newReach);
-                change = true;
-            }
         }
+
+        ++changedIterCount;
     } while (change);
 
+#if COUNT_BASIC_BLOCKS
+    computeReachabilitySetsIterationTable.record(changedIterCount);
+#endif // COUNT_BASIC_BLOCKS
+
 #ifdef DEBUG
     if (verbose)
     {
@@ -598,15 +599,11 @@ PhaseStatus Compiler::fgComputeReachability()
         madeChanges |= fgRenumberBlocks();
 
         //
-        // Compute fgEnterBlks
+        // Compute fgEnterBlks, reverse post-order, and bbReach.
         //
 
         fgComputeEnterBlocksSet();
-
-        //
-        // Compute bbReach
-        //
-
+        fgDfsReversePostorder();
         fgComputeReachabilitySets();
 
         //
@@ -618,6 +615,10 @@ PhaseStatus Compiler::fgComputeReachability()
 
     } while (changed);
 
+#if COUNT_BASIC_BLOCKS
+    computeReachabilityIterationTable.record(passNum - 1);
+#endif // COUNT_BASIC_BLOCKS
+
     //
     // Now, compute the dominators
     //
@@ -752,19 +753,24 @@ bool Compiler::fgRemoveDeadBlocks()
 //   preorder and reverse postorder numbers, plus a reverse postorder for blocks.
 //
 // Notes:
-//   Assumes caller has computed the fgEnterBlkSet.
+//   Assumes caller has computed the fgEnterBlks set.
+//
+//   Each block's `bbPreorderNum` and `bbPostorderNum` is set.
 //
-//   Assumes caller has allocated the fgBBReversePostorder array.
-//   It will be filled in with blocks in reverse post order.
+//   The `fgBBReversePostorder` array is filled in with the `BasicBlock*` in reverse post-order.
 //
 //   This algorithm only pays attention to the actual blocks. It ignores any imaginary entry block.
 //
 void Compiler::fgDfsReversePostorder()
 {
+    assert(fgBBcount == fgBBNumMax);
+    assert(BasicBlockBitSetTraits::GetSize(this) == fgBBNumMax + 1);
+
     // Make sure fgEnterBlks are still there in startNodes, even if they participate in a loop (i.e., there is
     // an incoming edge into the block).
     assert(fgEnterBlksSetValid);
-    assert(fgBBReversePostorder != nullptr);
+
+    fgBBReversePostorder = new (this, CMK_DominatorMemory) BasicBlock*[fgBBNumMax + 1]{};
 
     // visited   :  Once we run the DFS post order sort recursive algorithm, we mark the nodes we visited to avoid
     //              backtracking.
@@ -989,13 +995,6 @@ void Compiler::fgComputeDoms()
     assert(BasicBlockBitSetTraits::GetSize(this) == fgBBNumMax + 1);
 #endif // DEBUG
 
-    BlockSet processedBlks(BlockSetOps::MakeEmpty(this));
-
-    fgBBReversePostorder = new (this, CMK_DominatorMemory) BasicBlock*[fgBBNumMax + 1]{};
-
-    fgDfsReversePostorder();
-    noway_assert(fgBBReversePostorder[0] == nullptr);
-
     // flRoot and bbRoot represent an imaginary unique entry point in the flow graph.
     // All the orphaned EH blocks and fgFirstBB will temporarily have its predecessors list
     // (with bbRoot as the only basic block in it) set as flRoot.
@@ -1012,9 +1011,11 @@ void Compiler::fgComputeDoms()
 
     FlowEdge flRoot(&bbRoot, nullptr);
 
+    noway_assert(fgBBReversePostorder[0] == nullptr);
     fgBBReversePostorder[0] = &bbRoot;
 
     // Mark both bbRoot and fgFirstBB processed
+    BlockSet processedBlks(BlockSetOps::MakeEmpty(this));
     BlockSetOps::AddElemD(this, processedBlks, 0); // bbRoot    == block #0
     BlockSetOps::AddElemD(this, processedBlks, 1); // fgFirstBB == block #1
     assert(fgFirstBB->bbNum == 1);
@@ -1057,7 +1058,8 @@ void Compiler::fgComputeDoms()
     }
 
     // Now proceed to compute the immediate dominators for each basic block.
-    bool changed = true;
+    bool     changed          = true;
+    unsigned changedIterCount = 1;
     while (changed)
     {
         changed = false;
@@ -1116,8 +1118,14 @@ void Compiler::fgComputeDoms()
             }
             BlockSetOps::AddElemD(this, processedBlks, block->bbNum);
         }
+
+        ++changedIterCount;
     }
 
+#if COUNT_BASIC_BLOCKS
+    domsChangedIterationTable.record(changedIterCount);
+#endif // COUNT_BASIC_BLOCKS
+
     // As stated before, once we have computed immediate dominance we need to clear
     // all the basic blocks whose predecessor list was set to flRoot.  This
     // reverts that and leaves the blocks the same as before.
@@ -1311,7 +1319,7 @@ void Compiler::fgNumberDomTree(DomTreeNode* domTree)
 // fgIntersectDom: Intersect two immediate dominator sets.
 //
 // Find the lowest common ancestor in the dominator tree between two basic blocks. The LCA in the dominance tree
-// represents the closest dominator between the two basic blocks. Used to adjust the IDom value in fgComputDoms.
+// represents the closest dominator between the two basic blocks. Used to adjust the IDom value in fgComputeDoms.
 //
 // Arguments:
 //    a, b - two blocks to intersect
index 17f976ead792789a0da4829277e685cb4286761b..2bab76036d482cf7f4de8f0b41dcc76c6c651894 100644 (file)
@@ -763,7 +763,6 @@ void ProfileSynthesis::RandomizeLikelihoods()
 void ProfileSynthesis::BuildReversePostorder()
 {
     m_comp->EnsureBasicBlockEpoch();
-    m_comp->fgBBReversePostorder = new (m_comp, CMK_Pgo) BasicBlock*[m_comp->fgBBNumMax + 1]{};
     m_comp->fgComputeEnterBlocksSet();
     m_comp->fgDfsReversePostorder();