Spanning tree instrumentation (#47959)
author    Andy Ayers <andya@microsoft.com>
Tue, 9 Feb 2021 01:59:12 +0000 (17:59 -0800)
committer GitHub <noreply@github.com>
Tue, 9 Feb 2021 01:59:12 +0000 (17:59 -0800)
Add a new instrumentation mode that instruments only a subset of the edges in
the control flow graph. This reduces the total number of counters, and so has
lower compile-time and runtime overhead than instrumenting every block.
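
To illustrate the idea, here is a minimal standalone sketch (not the JIT code in
this change; ToyGraph and ChooseInstrumentedEdges are made-up names): a spanning
tree is grown over a toy CFG by a stack-based walk, and only the non-tree
("chord") edges receive counters.

    #include <cstdio>
    #include <utility>
    #include <vector>

    struct ToyGraph
    {
        int                              numBlocks;
        std::vector<std::pair<int, int>> edges; // (source, target)
    };

    // Return one flag per edge: true if the edge needs a counter (non-tree
    // edge), false if it joins the spanning tree and its count is derived.
    std::vector<bool> ChooseInstrumentedEdges(const ToyGraph& g)
    {
        std::vector<bool> visited(g.numBlocks, false);
        std::vector<bool> needsCounter(g.edges.size(), true);

        // Stack-based walk from block 0 (the method entry).
        std::vector<int> stack{0};
        visited[0] = true;
        while (!stack.empty())
        {
            int block = stack.back();
            stack.pop_back();
            for (size_t i = 0; i < g.edges.size(); i++)
            {
                if (g.edges[i].first != block || visited[g.edges[i].second])
                    continue;
                // First time the target is reached: the edge joins the
                // spanning tree, so skip its probe.
                needsCounter[i]            = false;
                visited[g.edges[i].second] = true;
                stack.push_back(g.edges[i].second);
            }
        }
        return needsCounter;
    }

    int main()
    {
        // Diamond: 0 -> {1,2}, 1 -> 3, 2 -> 3. Four edges, but only one
        // ends up needing a counter (a 4-block spanning tree has 3 edges).
        ToyGraph          g{4, {{0, 1}, {0, 2}, {1, 3}, {2, 3}}};
        std::vector<bool> probes = ChooseInstrumentedEdges(g);
        for (size_t i = 0; i < probes.size(); i++)
            printf("edge %d -> %d : %s\n", g.edges[i].first, g.edges[i].second,
                   probes[i] ? "counted" : "tree (derived)");
        return 0;
    }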

Add a matching count reconstruction algorithm that recovers the missing edge
counts and all block counts.

See #46882 for more details on this approach.
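
For the reconstruction side, a standalone sketch of the flow-conservation idea
(again not the code in this change; Edge, SolveEdgeCounts and the sample counts
are made up). With an exit-to-entry pseudo-edge the counts form a circulation:
every block's incoming counts sum to its outgoing counts, so a worklist that
solves one unknown edge at a time recovers all tree-edge counts, and from them
all block counts, starting from the instrumented chord counts.

    #include <cstdio>
    #include <vector>

    struct Edge
    {
        int  source;
        int  target;
        long count;
        bool known; // true for instrumented (non-tree) edges
    };

    // Repeatedly find a block whose incident edges have exactly one unknown
    // and solve it from sum(incoming) == sum(outgoing).
    void SolveEdgeCounts(int numBlocks, std::vector<Edge>& edges)
    {
        bool progress = true;
        while (progress)
        {
            progress = false;
            for (int b = 0; b < numBlocks; b++)
            {
                long sumIn = 0, sumOut = 0;
                int  unknownIn = -1, unknownOut = -1;
                int  numUnknownIn = 0, numUnknownOut = 0;

                for (size_t i = 0; i < edges.size(); i++)
                {
                    if (edges[i].target == b)
                    {
                        if (edges[i].known)
                            sumIn += edges[i].count;
                        else
                        {
                            unknownIn = (int)i;
                            numUnknownIn++;
                        }
                    }
                    if (edges[i].source == b)
                    {
                        if (edges[i].known)
                            sumOut += edges[i].count;
                        else
                        {
                            unknownOut = (int)i;
                            numUnknownOut++;
                        }
                    }
                }

                if ((numUnknownIn == 1) && (numUnknownOut == 0))
                {
                    edges[unknownIn].count = sumOut - sumIn;
                    edges[unknownIn].known = true;
                    progress               = true;
                }
                else if ((numUnknownOut == 1) && (numUnknownIn == 0))
                {
                    edges[unknownOut].count = sumIn - sumOut;
                    edges[unknownOut].known = true;
                    progress                = true;
                }
            }
        }
    }

    int main()
    {
        // Diamond plus an exit->entry pseudo-edge. Only the two non-tree
        // edges carry counters; the other three counts are recovered.
        std::vector<Edge> edges = {
            {0, 1, 0, false}, // tree
            {0, 2, 0, false}, // tree
            {1, 3, 3, true},  // instrumented: taken 3 times
            {2, 3, 0, false}, // tree
            {3, 0, 10, true}, // pseudo-edge: 10 method invocations
        };
        SolveEdgeCounts(4, edges);
        for (const Edge& e : edges)
            printf("%d -> %d : %ld\n", e.source, e.target, e.count);
        return 0;
    }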

Also, in the runtime's PGO support, add the header offset to the copy source.
This fixes #47930.

13 files changed:
eng/pipelines/common/templates/runtimes/run-test-job.yml
src/coreclr/inc/corjit.h
src/coreclr/jit/block.h
src/coreclr/jit/compiler.cpp
src/coreclr/jit/compiler.h
src/coreclr/jit/compphases.h
src/coreclr/jit/fgbasic.cpp
src/coreclr/jit/fgprofile.cpp
src/coreclr/jit/jitconfigvalues.h
src/coreclr/jit/phase.cpp
src/coreclr/tools/Common/Pgo/PgoFormat.cs
src/coreclr/vm/pgo.cpp
src/tests/Common/testenvironment.proj

eng/pipelines/common/templates/runtimes/run-test-job.yml
index eac13a7..40da880 100644 (file)
@@ -477,6 +477,7 @@ jobs:
           - jitpgo
           - jitpgo_inline
           - jitpgo_classes
+          - jitpgo_edgeinstrumentation
         ${{ if in(parameters.testGroup, 'ilasm') }}:
           scenarios:
           - ilasmroundtrip
src/coreclr/inc/corjit.h
index 72df3af..a153564 100644 (file)
@@ -305,6 +305,7 @@ public:
         TypeHandleHistogramTypeHandle = (DescriptorMin * 3) | TypeHandle, // TypeHandle that is part of a type histogram
         Version = (DescriptorMin * 4) | None, // Version is encoded in the Other field of the schema
         NumRuns = (DescriptorMin * 5) | None, // Number of runs is encoded in the Other field of the schema
+        EdgeIntCount = (DescriptorMin * 6) | FourByte, // 4 byte edge counter, using unsigned 4 byte int
     };
 
     struct PgoInstrumentationSchema
src/coreclr/jit/block.h
index cfe2641..c21131e 100644 (file)
@@ -882,9 +882,14 @@ struct BasicBlock : private LIR::Range
     void ensurePredListOrder(Compiler* compiler);
     void reorderPredList(Compiler* compiler);
 
-    BlockSet    bbReach; // Set of all blocks that can reach this one
-    BasicBlock* bbIDom;  // Represent the closest dominator to this block (called the Immediate
-                         // Dominator) used to compute the dominance tree.
+    BlockSet bbReach; // Set of all blocks that can reach this one
+
+    union {
+        BasicBlock* bbIDom;      // Represent the closest dominator to this block (called the Immediate
+                                 // Dominator) used to compute the dominance tree.
+        void* bbSparseProbeList; // Used early on by fgInstrument
+        void* bbSparseCountInfo; // Used early on by fgIncorporateEdgeCounts
+    };
 
     unsigned bbPostOrderNum; // the block's post order number in the graph.
 
src/coreclr/jit/compiler.cpp
index fbc2482..e51a7d2 100644 (file)
@@ -4403,6 +4403,14 @@ void Compiler::compCompile(void** methodCodePtr, ULONG* methodCodeSize, JitFlags
     //
     DoPhase(this, PHASE_INCPROFILE, &Compiler::fgIncorporateProfileData);
 
+    // If we're going to instrument code, we may need to prepare before
+    // we import.
+    //
+    if (compileFlags->IsSet(JitFlags::JIT_FLAG_BBINSTR))
+    {
+        DoPhase(this, PHASE_IBCPREP, &Compiler::fgPrepareToInstrumentMethod);
+    }
+
     // Import: convert the instrs in each basic block to a tree based intermediate representation
     //
     DoPhase(this, PHASE_IMPORTATION, &Compiler::fgImport);
src/coreclr/jit/compiler.h
index 78c8a31..535eef6 100644 (file)
@@ -71,11 +71,13 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  */
 
 struct InfoHdr;            // defined in GCInfo.h
-struct escapeMapping_t;    // defined in flowgraph.cpp
+struct escapeMapping_t;    // defined in fgdiagnostic.cpp
 class emitter;             // defined in emit.h
 struct ShadowParamVarInfo; // defined in GSChecks.cpp
 struct InitVarDscInfo;     // defined in register_arg_convention.h
-class FgStack;             // defined in flowgraph.cpp
+class FgStack;             // defined in fgbasic.cpp
+class Instrumentor;        // defined in fgprofile.cpp
+class SpanningTreeVisitor; // defined in fgprofile.cpp
 #if FEATURE_ANYCSE
 class CSE_DataFlow; // defined in OptCSE.cpp
 #endif
@@ -5535,15 +5537,6 @@ protected:
 
     void fgAdjustForAddressExposedOrWrittenThis();
 
-    bool                                   fgProfileData_ILSizeMismatch;
-    ICorJitInfo::PgoInstrumentationSchema* fgPgoSchema;
-    BYTE*                                  fgPgoData;
-    UINT32                                 fgPgoSchemaCount;
-    HRESULT                                fgPgoQueryResult;
-    UINT32                                 fgNumProfileRuns;
-    UINT32                                 fgPgoBlockCounts;
-    UINT32                                 fgPgoClassProfiles;
-
     unsigned fgStressBBProf()
     {
 #ifdef DEBUG
@@ -5562,13 +5555,32 @@ protected:
     }
 
     bool fgHaveProfileData();
-    void fgComputeProfileScale();
     bool fgGetProfileWeightForBasicBlock(IL_OFFSET offset, BasicBlock::weight_t* weight);
+
+    Instrumentor* fgCountInstrumentor;
+    Instrumentor* fgClassInstrumentor;
+
+    PhaseStatus fgPrepareToInstrumentMethod();
     PhaseStatus fgInstrumentMethod();
     PhaseStatus fgIncorporateProfileData();
     void        fgIncorporateBlockCounts();
+    void        fgIncorporateEdgeCounts();
 
 public:
+    bool                                   fgProfileData_ILSizeMismatch;
+    ICorJitInfo::PgoInstrumentationSchema* fgPgoSchema;
+    BYTE*                                  fgPgoData;
+    UINT32                                 fgPgoSchemaCount;
+    HRESULT                                fgPgoQueryResult;
+    UINT32                                 fgNumProfileRuns;
+    UINT32                                 fgPgoBlockCounts;
+    UINT32                                 fgPgoEdgeCounts;
+    UINT32                                 fgPgoClassProfiles;
+
+    void WalkSpanningTree(SpanningTreeVisitor* visitor);
+    void fgSetProfileWeight(BasicBlock* block, BasicBlock::weight_t weight);
+    void fgComputeProfileScale();
+
     // fgIsUsingProfileWeights - returns true if we have real profile data for this method
     //                           or if we have some fake profile data for the stress mode
     bool fgIsUsingProfileWeights()
src/coreclr/jit/compphases.h
index 23dade2..fc6df58 100644 (file)
@@ -28,6 +28,7 @@ CompPhaseNameMacro(PHASE_IMPORTATION,            "Importation",
 CompPhaseNameMacro(PHASE_INDXCALL,               "Indirect call transform",        "INDXCALL", false, -1, true)
 CompPhaseNameMacro(PHASE_PATCHPOINTS,            "Expand patchpoints",             "PPOINT",   false, -1, true)
 CompPhaseNameMacro(PHASE_POST_IMPORT,            "Post-import",                    "POST-IMP", false, -1, false)
+CompPhaseNameMacro(PHASE_IBCPREP,                "Profile instrumentation prep",   "IBCPREP",  false, -1, false)
 CompPhaseNameMacro(PHASE_IBCINSTR,               "Profile instrumentation",        "IBCINSTR", false, -1, false)
 CompPhaseNameMacro(PHASE_INCPROFILE,             "Profile incorporation",          "INCPROF",  false, -1, false)
 CompPhaseNameMacro(PHASE_MORPH_INIT,             "Morph - Init",                   "MOR-INIT" ,false, -1, false)
src/coreclr/jit/fgbasic.cpp
index 37c06e0..8f07bfb 100644 (file)
@@ -170,7 +170,10 @@ void Compiler::fgInit()
     fgPgoSchemaCount     = 0;
     fgNumProfileRuns     = 0;
     fgPgoBlockCounts     = 0;
+    fgPgoEdgeCounts      = 0;
     fgPgoClassProfiles   = 0;
+    fgCountInstrumentor  = nullptr;
+    fgClassInstrumentor  = nullptr;
     fgPredListSortVector = nullptr;
 }
 
@@ -418,15 +421,20 @@ void Compiler::fgChangeSwitchBlock(BasicBlock* oldSwitchBlock, BasicBlock* newSw
     }
 }
 
-/*****************************************************************************
- * fgReplaceSwitchJumpTarget:
- *
- * We have a BBJ_SWITCH at 'blockSwitch' and we want to replace all entries
- * in the jumpTab[] such that so that jumps that previously went to
- * 'oldTarget' now go to 'newTarget'.
- * We also must update the predecessor lists for 'oldTarget' and 'newPred'.
- */
-
+//------------------------------------------------------------------------
+// fgReplaceSwitchJumpTarget: update BBJ_SWITCH block so that all control
+//   that previously flowed to oldTarget now flows to newTarget.
+//
+// Arguments:
+//   blockSwitch - block ending in a switch
+//   newTarget   - new branch target
+//   oldTarget   - old branch target
+//
+// Notes:
+//   Updates the jump table and the cached unique target set (if any).
+//   Can be called before or after pred lists are built.
+//   If pred lists are built, updates pred lists.
+//
 void Compiler::fgReplaceSwitchJumpTarget(BasicBlock* blockSwitch, BasicBlock* newTarget, BasicBlock* oldTarget)
 {
     noway_assert(blockSwitch != nullptr);
@@ -450,7 +458,10 @@ void Compiler::fgReplaceSwitchJumpTarget(BasicBlock* blockSwitch, BasicBlock* ne
         {
             // Remove the old edge [oldTarget from blockSwitch]
             //
-            fgRemoveAllRefPreds(oldTarget, blockSwitch);
+            if (fgComputePredsDone)
+            {
+                fgRemoveAllRefPreds(oldTarget, blockSwitch);
+            }
 
             //
             // Change the jumpTab entry to branch to the new location
@@ -460,7 +471,12 @@ void Compiler::fgReplaceSwitchJumpTarget(BasicBlock* blockSwitch, BasicBlock* ne
             //
             // Create the new edge [newTarget from blockSwitch]
             //
-            flowList* newEdge = fgAddRefPred(newTarget, blockSwitch);
+            flowList* newEdge = nullptr;
+
+            if (fgComputePredsDone)
+            {
+                newEdge = fgAddRefPred(newTarget, blockSwitch);
+            }
 
             // Now set the correct value of newEdge->flDupCount
             // and replace any other jumps in jumpTab[] that go to oldTarget.
@@ -479,7 +495,10 @@ void Compiler::fgReplaceSwitchJumpTarget(BasicBlock* blockSwitch, BasicBlock* ne
                     //
                     // Increment the flDupCount
                     //
-                    newEdge->flDupCount++;
+                    if (fgComputePredsDone)
+                    {
+                        newEdge->flDupCount++;
+                    }
                 }
                 i++; // Check the next entry in jumpTab[]
             }
@@ -1780,11 +1799,6 @@ unsigned Compiler::fgMakeBasicBlocks(const BYTE* codeAddr, IL_OFFSET codeSize, F
         }
     }
 
-    if (compIsForInlining())
-    {
-        fgComputeProfileScale();
-    }
-
     do
     {
         unsigned   jmpAddr = DUMMY_INIT(BAD_IL_OFFSET);
@@ -3591,22 +3605,27 @@ BasicBlock* Compiler::fgSplitBlockAtBeginning(BasicBlock* curr)
 //              'succ' might be the fall-through path or the branch path from 'curr'.
 //
 // Arguments:
-//    curr - A block which branches conditionally to 'succ'
+//    curr - A block which branches to 'succ'
 //    succ - The target block
 //
 // Return Value:
 //    Returns a new block, that is a successor of 'curr' and which branches unconditionally to 'succ'
 //
 // Assumptions:
-//    'curr' must have a bbJumpKind of BBJ_COND or BBJ_SWITCH
+//    'curr' must have a bbJumpKind of BBJ_COND, BBJ_ALWAYS, or BBJ_SWITCH
 //
 // Notes:
 //    The returned block is empty.
+//    Can be invoked before pred lists are built.
 
 BasicBlock* Compiler::fgSplitEdge(BasicBlock* curr, BasicBlock* succ)
 {
-    assert(curr->bbJumpKind == BBJ_COND || curr->bbJumpKind == BBJ_SWITCH);
-    assert(fgGetPredForBlock(succ, curr) != nullptr);
+    assert(curr->bbJumpKind == BBJ_COND || curr->bbJumpKind == BBJ_SWITCH || curr->bbJumpKind == BBJ_ALWAYS);
+
+    if (fgComputePredsDone)
+    {
+        assert(fgGetPredForBlock(succ, curr) != nullptr);
+    }
 
     BasicBlock* newBlock;
     if (succ == curr->bbNext)
@@ -3639,20 +3658,30 @@ BasicBlock* Compiler::fgSplitEdge(BasicBlock* curr, BasicBlock* succ)
         }
         fgAddRefPred(newBlock, curr);
     }
-    else
+    else if (curr->bbJumpKind == BBJ_SWITCH)
     {
-        assert(curr->bbJumpKind == BBJ_SWITCH);
-
         // newBlock replaces 'succ' in the switch.
         fgReplaceSwitchJumpTarget(curr, newBlock, succ);
 
         // And 'succ' has 'newBlock' as a new predecessor.
         fgAddRefPred(succ, newBlock);
     }
+    else
+    {
+        assert(curr->bbJumpKind == BBJ_ALWAYS);
+        fgReplacePred(succ, curr, newBlock);
+        curr->bbJumpDest = newBlock;
+        newBlock->bbFlags |= BBF_JMP_TARGET;
+        fgAddRefPred(newBlock, curr);
+    }
 
     // This isn't accurate, but it is complex to compute a reasonable number so just assume that we take the
     // branch 50% of the time.
-    newBlock->inheritWeightPercentage(curr, 50);
+    //
+    if (curr->bbJumpKind != BBJ_ALWAYS)
+    {
+        newBlock->inheritWeightPercentage(curr, 50);
+    }
 
     // The bbLiveIn and bbLiveOut are both equal to the bbLiveIn of 'succ'
     if (fgLocalVarLivenessDone)
src/coreclr/jit/fgprofile.cpp
index d0a6080..2980fc1 100644 (file)
@@ -111,21 +111,21 @@ void Compiler::fgComputeProfileScale()
     //
     // For most callees it will be the same as the entry block count.
     //
-    BasicBlock::weight_t calleeWeight = 0;
-
-    if (!fgGetProfileWeightForBasicBlock(0, &calleeWeight))
+    if (!fgFirstBB->hasProfileWeight())
     {
         JITDUMP("   ... no callee profile data for entry block\n");
         impInlineInfo->profileScaleState = InlineInfo::ProfileScaleState::UNAVAILABLE;
         return;
     }
 
+    // Note when/if we do normalization this may need to change.
+    //
+    BasicBlock::weight_t calleeWeight = fgFirstBB->bbWeight;
+
     // We should generally be able to assume calleeWeight >= callSiteWeight.
     // If this isn't so, perhaps something is wrong with the profile data
     // collection or retrieval.
     //
-    // For now, ignore callee data if we'd need to upscale.
-    //
     if (calleeWeight < callSiteWeight)
     {
         JITDUMP("   ... callee entry count %f is less than call site count %f\n", calleeWeight, callSiteWeight);
@@ -233,7 +233,7 @@ public:
     {
         return false;
     }
-    virtual void Prepare()
+    virtual void Prepare(bool preImport)
     {
     }
     virtual void BuildSchemaElements(BasicBlock* block, Schema& schema)
@@ -283,7 +283,7 @@ public:
     {
         return ((block->bbFlags & (BBF_INTERNAL | BBF_IMPORTED)) == BBF_IMPORTED);
     }
-    void Prepare() override;
+    void Prepare(bool isPreImport) override;
     void BuildSchemaElements(BasicBlock* block, Schema& schema) override;
     void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) override;
     void InstrumentMethodEntry(Schema& schema, BYTE* profileMemory) override;
@@ -292,8 +292,16 @@ public:
 //------------------------------------------------------------------------
 // BlockCountInstrumentor::Prepare: prepare for count instrumentation
 //
-void BlockCountInstrumentor::Prepare()
+// Arguments:
+//   preImport - true if this is the prepare call that happens before
+//      importation
+//
+void BlockCountInstrumentor::Prepare(bool preImport)
 {
+    if (preImport)
+    {
+        return;
+    }
 
 #ifdef DEBUG
     // Set schema index to invalid value
@@ -439,446 +447,1183 @@ void BlockCountInstrumentor::InstrumentMethodEntry(Schema& schema, BYTE* profile
 }
 
 //------------------------------------------------------------------------
-// ClassProbeVisitor: invoke functor on each virtual call in a tree
+// SpanningTreeVisitor: abstract class for computations done while
+//   evolving a spanning tree.
 //
-template <class TFunctor>
-class ClassProbeVisitor final : public GenTreeVisitor<ClassProbeVisitor<TFunctor>>
+class SpanningTreeVisitor
 {
 public:
-    enum
+    // To save visitors a bit of work, we also note
+    // for non-tree edges whether the edge postdominates
+    // the source, dominates the target, or is a critical edge.
+    //
+    enum class EdgeKind
     {
-        DoPreOrder = true
+        Unknown,
+        PostdominatesSource,
+        DominatesTarget,
+        CriticalEdge
     };
 
-    TFunctor& m_functor;
-    Compiler* m_compiler;
-
-    ClassProbeVisitor(Compiler* compiler, TFunctor& functor)
-        : GenTreeVisitor<ClassProbeVisitor>(compiler), m_functor(functor), m_compiler(compiler)
-    {
-    }
-    Compiler::fgWalkResult PreOrderVisit(GenTree** use, GenTree* user)
-    {
-        GenTree* const node = *use;
-        if (node->IsCall())
-        {
-            GenTreeCall* const call = node->AsCall();
-            if (call->IsVirtual() && (call->gtCallType != CT_INDIRECT))
-            {
-                m_functor(m_compiler, call);
-            }
-        }
-
-        return Compiler::WALK_CONTINUE;
-    }
+    virtual void Badcode()                     = 0;
+    virtual void VisitBlock(BasicBlock* block) = 0;
+    virtual void VisitTreeEdge(BasicBlock* source, BasicBlock* target) = 0;
+    virtual void VisitNonTreeEdge(BasicBlock* source, BasicBlock* target, EdgeKind kind) = 0;
 };
 
 //------------------------------------------------------------------------
-// BuildClassProbeSchemaGen: functor that creates class probe schema elements
+// WalkSpanningTree: evolve a "maximal cost" depth first spanning tree,
+//   invoking the visitor as each edge is classified, or each node is first
+//   discovered.
 //
-class BuildClassProbeSchemaGen
+// Arguments:
+//    visitor - visitor to notify
+//
+// Notes:
+//   We only have rudimentary weights at this stage, and so in practice
+//   we use a depth-first spanning tree (DFST) where we try to steer
+//   the DFS to preferentially visit "higher" cost edges.
+//
+//   Since instrumentation happens after profile incorporation
+//   we could in principle use profile weights to steer the DFS or to build
+//   a true maximum weight tree. However we are relying on being able to
+//   rebuild the exact same spanning tree "later on" when doing a subsequent
+//   profile reconstruction. So, we restrict ourselves to just using
+//   information apparent in the IL.
+//
+void Compiler::WalkSpanningTree(SpanningTreeVisitor* visitor)
 {
-private:
-    Schema&   m_schema;
-    unsigned& m_schemaCount;
+    // Inlinee compilers build their blocks in the root compiler's
+    // graph. So for BlockSets and NumSucc, we use the root compiler instance.
+    //
+    Compiler* const comp = impInlineRoot();
+    comp->NewBasicBlockEpoch();
 
-public:
-    BuildClassProbeSchemaGen(Schema& schema, unsigned& schemaCount) : m_schema(schema), m_schemaCount(schemaCount)
-    {
-    }
+    // We will track visited or queued nodes with a bit vector.
+    //
+    BlockSet marked = BlockSetOps::MakeEmpty(comp);
 
-    void operator()(Compiler* compiler, GenTreeCall* call)
+    // And nodes to visit with a bit vector and stack.
+    //
+    ArrayStack<BasicBlock*> stack(getAllocator(CMK_Pgo));
+
+    // Scratch vector for visiting successors of blocks with
+    // multiple successors.
+    //
+    // Bit vector to track progress through those successors.
+    //
+    ArrayStack<BasicBlock*> scratch(getAllocator(CMK_Pgo));
+    BlockSet                processed = BlockSetOps::MakeEmpty(comp);
+
+    // Push the method entry and all EH handler region entries on the stack.
+    // (push method entry last so it's visited first).
+    //
+    // Note inlinees are "contaminated" with root method EH structures.
+    // We know the inlinee itself doesn't have EH, so we only look at
+    // handlers for root methods.
+    //
+    // If we ever want to support inlining methods with EH, we'll
+    // have to revisit this.
+    //
+    if (!compIsForInlining())
     {
-        ICorJitInfo::PgoInstrumentationSchema schemaElem;
-        schemaElem.Count = 1;
-        schemaElem.Other = ICorJitInfo::ClassProfile::CLASS_FLAG;
-        if (call->IsVirtualStub())
+        EHblkDsc* HBtab = compHndBBtab;
+        unsigned  XTnum = 0;
+
+        for (; XTnum < compHndBBtabCount; XTnum++, HBtab++)
         {
-            schemaElem.Other |= ICorJitInfo::ClassProfile::INTERFACE_FLAG;
+            BasicBlock* hndBegBB = HBtab->ebdHndBeg;
+            stack.Push(hndBegBB);
+            BlockSetOps::AddElemD(comp, marked, hndBegBB->bbNum);
         }
-        else
+    }
+
+    stack.Push(fgFirstBB);
+    BlockSetOps::AddElemD(comp, marked, fgFirstBB->bbNum);
+
+    unsigned nBlocks = 0;
+
+    while (!stack.Empty())
+    {
+        BasicBlock* const block = stack.Pop();
+
+        // Visit the block.
+        //
+        assert(BlockSetOps::IsMember(comp, marked, block->bbNum));
+        visitor->VisitBlock(block);
+        nBlocks++;
+
+        switch (block->bbJumpKind)
         {
-            assert(call->IsVirtualVtable());
-        }
+            case BBJ_CALLFINALLY:
+            {
+                // Just queue up the continuation block,
+                // unless the finally doesn't return, in which
+                // case we really should treat this block as a throw,
+                // and so this block would get instrumented.
+                //
+                // Since our keying scheme is IL based and this
+                // block has no IL offset, we'd need to invent
+                // some new keying scheme. For now we just
+                // ignore this (rare) case.
+                //
+                if (block->isBBCallAlwaysPair())
+                {
+                    // This block should be the only pred of the continuation.
+                    //
+                    BasicBlock* const target = block->bbNext;
+                    assert(!BlockSetOps::IsMember(comp, marked, target->bbNum));
+                    visitor->VisitTreeEdge(block, target);
+                    stack.Push(target);
+                    BlockSetOps::AddElemD(comp, marked, target->bbNum);
+                }
+            }
+            break;
 
-        schemaElem.InstrumentationKind = ICorJitInfo::PgoInstrumentationKind::TypeHandleHistogramCount;
-        schemaElem.ILOffset            = jitGetILoffs(call->gtClassProfileCandidateInfo->ilOffset);
-        schemaElem.Offset              = 0;
+            case BBJ_RETURN:
+            case BBJ_THROW:
+            {
+                // Pseudo-edge back to method entry.
+                //
+                // Note if the throw is caught locally this will over-state the profile
+                // count for method entry. But we likely don't care too much about
+                // profiles for methods that throw lots of exceptions.
+                //
+                BasicBlock* const target = fgFirstBB;
+                assert(BlockSetOps::IsMember(comp, marked, target->bbNum));
+                visitor->VisitNonTreeEdge(block, target, SpanningTreeVisitor::EdgeKind::PostdominatesSource);
+            }
+            break;
 
-        m_schema.push_back(schemaElem);
+            case BBJ_EHFINALLYRET:
+            case BBJ_EHCATCHRET:
+            case BBJ_EHFILTERRET:
+            case BBJ_LEAVE:
+            {
+                // See if we're leaving an EH handler region.
+                //
+                bool           isInTry     = false;
+                unsigned const regionIndex = ehGetMostNestedRegionIndex(block, &isInTry);
 
-        // Re-using ILOffset and Other fields from schema item for TypeHandleHistogramCount
-        schemaElem.InstrumentationKind = ICorJitInfo::PgoInstrumentationKind::TypeHandleHistogramTypeHandle;
-        schemaElem.Count               = ICorJitInfo::ClassProfile::SIZE;
-        m_schema.push_back(schemaElem);
+                if (isInTry)
+                {
+                    // No, we're leaving a try or catch, not a handler.
+                    // Treat this as a normal edge.
+                    //
+                    BasicBlock* const target = block->bbJumpDest;
+
+                    // In some bad IL cases we may not have a target.
+                    // In others we may see something other than LEAVE be most-nested in a try.
+                    //
+                    if (target == nullptr)
+                    {
+                        JITDUMP("No jump dest for " FMT_BB ", suspect bad code\n", block->bbNum);
+                        visitor->Badcode();
+                    }
+                    else if (block->bbJumpKind != BBJ_LEAVE)
+                    {
+                        JITDUMP("EH RET in " FMT_BB " most-nested in try, suspect bad code\n", block->bbNum);
+                        visitor->Badcode();
+                    }
+                    else
+                    {
+                        if (BlockSetOps::IsMember(comp, marked, target->bbNum))
+                        {
+                            visitor->VisitNonTreeEdge(block, target,
+                                                      SpanningTreeVisitor::EdgeKind::PostdominatesSource);
+                        }
+                        else
+                        {
+                            visitor->VisitTreeEdge(block, target);
+                            stack.Push(target);
+                            BlockSetOps::AddElemD(comp, marked, target->bbNum);
+                        }
+                    }
+                }
+                else
+                {
+                    // Pseudo-edge back to handler entry.
+                    //
+                    EHblkDsc* const   dsc    = ehGetBlockHndDsc(block);
+                    BasicBlock* const target = dsc->ebdHndBeg;
+                    assert(BlockSetOps::IsMember(comp, marked, target->bbNum));
+                    visitor->VisitNonTreeEdge(block, target, SpanningTreeVisitor::EdgeKind::PostdominatesSource);
+                }
+            }
+            break;
 
-        m_schemaCount++;
-    }
-};
+            default:
+            {
+                // If this block is a control flow fork, we want to
+                // preferentially visit critical edges first; if these
+                // edges end up in the DFST then instrumentation will
+                // require edge splitting.
+                //
+                // We also want to preferentially visit edges to rare
+                // successors last, if this block is non-rare.
+                //
+                // It's not immediately clear if we should pass comp or this
+                // to NumSucc here (for inlinees).
+                //
+                // It matters for FINALLYRET and for SWITCHES. Currently
+                // we handle the first one specially, and it seems possible
+                // things will just work for switches either way, but it
+                // might work a bit better using the root compiler.
+                //
+                const unsigned numSucc = block->NumSucc(comp);
 
-//------------------------------------------------------------------------
-// ClassProbeInserter: functor that adds class probe instrumentation
-//
-class ClassProbeInserter
-{
-    Schema&   m_schema;
-    BYTE*     m_profileMemory;
-    int*      m_currentSchemaIndex;
-    unsigned& m_instrCount;
+                if (numSucc == 1)
+                {
+                    // Not a fork. Just visit the sole successor.
+                    //
+                    BasicBlock* const target = block->GetSucc(0, comp);
+                    if (BlockSetOps::IsMember(comp, marked, target->bbNum))
+                    {
+                        // We can't instrument in the call always pair tail block
+                        // so treat this as a critical edge.
+                        //
+                        visitor->VisitNonTreeEdge(block, target,
+                                                  block->isBBCallAlwaysPairTail()
+                                                      ? SpanningTreeVisitor::EdgeKind::CriticalEdge
+                                                      : SpanningTreeVisitor::EdgeKind::PostdominatesSource);
+                    }
+                    else
+                    {
+                        visitor->VisitTreeEdge(block, target);
+                        stack.Push(target);
+                        BlockSetOps::AddElemD(comp, marked, target->bbNum);
+                    }
+                }
+                else
+                {
+                    // A block with multiple successors.
+                    //
+                    // Because we're using a stack up above, we work in reverse
+                    // order of "cost" here --  so we first consider rare,
+                    // then normal, then critical.
+                    //
+                    // That is, all things being equal we'd prefer to
+                    // have critical edges be tree edges, and
+                    // edges from non-rare to rare be non-tree edges.
+                    //
+                    scratch.Reset();
+                    BlockSetOps::ClearD(comp, processed);
+
+                    for (unsigned i = 0; i < numSucc; i++)
+                    {
+                        BasicBlock* const succ = block->GetSucc(i, comp);
+                        scratch.Push(succ);
+                    }
 
-public:
-    ClassProbeInserter(Schema& schema, BYTE* profileMemory, int* pCurrentSchemaIndex, unsigned& instrCount)
-        : m_schema(schema)
-        , m_profileMemory(profileMemory)
-        , m_currentSchemaIndex(pCurrentSchemaIndex)
-        , m_instrCount(instrCount)
-    {
-    }
+                    // Rare successors of non-rare blocks
+                    //
+                    for (unsigned i = 0; i < numSucc; i++)
+                    {
+                        BasicBlock* const target = scratch.Top(i);
 
-    void operator()(Compiler* compiler, GenTreeCall* call)
-    {
-        JITDUMP("Found call [%06u] with probe index %d and ilOffset 0x%X\n", compiler->dspTreeID(call),
-                call->gtClassProfileCandidateInfo->probeIndex, call->gtClassProfileCandidateInfo->ilOffset);
+                        if (BlockSetOps::IsMember(comp, processed, i))
+                        {
+                            continue;
+                        }
 
-        // We transform the call from (CALLVIRT obj, ... args ...) to
-        // to
-        //      (CALLVIRT
-        //        (COMMA
-        //          (ASG tmp, obj)
-        //          (COMMA
-        //            (CALL probe_fn tmp, &probeEntry)
-        //            tmp)))
-        //         ... args ...)
-        //
+                        if (block->isRunRarely() || !target->isRunRarely())
+                        {
+                            continue;
+                        }
 
-        assert(call->gtCallThisArg->GetNode()->TypeGet() == TYP_REF);
+                        BlockSetOps::AddElemD(comp, processed, i);
 
-        // Sanity check that we're looking at the right schema entry
-        //
-        assert(m_schema[*m_currentSchemaIndex].ILOffset == (int32_t)call->gtClassProfileCandidateInfo->ilOffset);
-        assert(m_schema[*m_currentSchemaIndex].InstrumentationKind ==
-               ICorJitInfo::PgoInstrumentationKind::TypeHandleHistogramCount);
+                        if (BlockSetOps::IsMember(comp, marked, target->bbNum))
+                        {
+                            visitor->VisitNonTreeEdge(block, target,
+                                                      target->bbRefs > 1
+                                                          ? SpanningTreeVisitor::EdgeKind::CriticalEdge
+                                                          : SpanningTreeVisitor::EdgeKind::DominatesTarget);
+                        }
+                        else
+                        {
+                            visitor->VisitTreeEdge(block, target);
+                            stack.Push(target);
+                            BlockSetOps::AddElemD(comp, marked, target->bbNum);
+                        }
+                    }
 
-        // Figure out where the table is located.
-        //
-        BYTE* classProfile = m_schema[*m_currentSchemaIndex].Offset + m_profileMemory;
-        *m_currentSchemaIndex += 2; // There are 2 schema entries per class probe
+                    // Non-critical edges
+                    //
+                    for (unsigned i = 0; i < numSucc; i++)
+                    {
+                        BasicBlock* const target = scratch.Top(i);
 
-        // Grab a temp to hold the 'this' object as it will be used three times
-        //
-        unsigned const tmpNum             = compiler->lvaGrabTemp(true DEBUGARG("class profile tmp"));
-        compiler->lvaTable[tmpNum].lvType = TYP_REF;
+                        if (BlockSetOps::IsMember(comp, processed, i))
+                        {
+                            continue;
+                        }
 
-        // Generate the IR...
-        //
-        GenTree* const          classProfileNode = compiler->gtNewIconNode((ssize_t)classProfile, TYP_I_IMPL);
-        GenTree* const          tmpNode          = compiler->gtNewLclvNode(tmpNum, TYP_REF);
-        GenTreeCall::Use* const args             = compiler->gtNewCallArgs(tmpNode, classProfileNode);
-        GenTree* const helperCallNode = compiler->gtNewHelperCallNode(CORINFO_HELP_CLASSPROFILE, TYP_VOID, args);
-        GenTree* const tmpNode2       = compiler->gtNewLclvNode(tmpNum, TYP_REF);
-        GenTree* const callCommaNode  = compiler->gtNewOperNode(GT_COMMA, TYP_REF, helperCallNode, tmpNode2);
-        GenTree* const tmpNode3       = compiler->gtNewLclvNode(tmpNum, TYP_REF);
-        GenTree* const asgNode = compiler->gtNewOperNode(GT_ASG, TYP_REF, tmpNode3, call->gtCallThisArg->GetNode());
-        GenTree* const asgCommaNode = compiler->gtNewOperNode(GT_COMMA, TYP_REF, asgNode, callCommaNode);
+                        if (target->bbRefs != 1)
+                        {
+                            continue;
+                        }
 
-        // Update the call
-        //
-        call->gtCallThisArg->SetNode(asgCommaNode);
+                        BlockSetOps::AddElemD(comp, processed, i);
 
-        JITDUMP("Modified call is now\n");
-        DISPTREE(call);
+                        if (BlockSetOps::IsMember(comp, marked, target->bbNum))
+                        {
+                            visitor->VisitNonTreeEdge(block, target, SpanningTreeVisitor::EdgeKind::DominatesTarget);
+                        }
+                        else
+                        {
+                            visitor->VisitTreeEdge(block, target);
+                            stack.Push(target);
+                            BlockSetOps::AddElemD(comp, marked, target->bbNum);
+                        }
+                    }
 
-        // Restore the stub address on the call
-        //
-        call->gtStubCallStubAddr = call->gtClassProfileCandidateInfo->stubAddr;
+                    // Critical edges
+                    //
+                    for (unsigned i = 0; i < numSucc; i++)
+                    {
+                        BasicBlock* const target = scratch.Top(i);
 
-        m_instrCount++;
+                        if (BlockSetOps::IsMember(comp, processed, i))
+                        {
+                            continue;
+                        }
+
+                        BlockSetOps::AddElemD(comp, processed, i);
+
+                        if (BlockSetOps::IsMember(comp, marked, target->bbNum))
+                        {
+                            visitor->VisitNonTreeEdge(block, target, SpanningTreeVisitor::EdgeKind::CriticalEdge);
+                        }
+                        else
+                        {
+                            visitor->VisitTreeEdge(block, target);
+                            stack.Push(target);
+                            BlockSetOps::AddElemD(comp, marked, target->bbNum);
+                        }
+                    }
+
+                    // Verify we processed each successor.
+                    //
+                    assert(numSucc == BlockSetOps::Count(comp, processed));
+                }
+            }
+            break;
+        }
     }
-};
+}
 
 //------------------------------------------------------------------------
-// SuppressProbesFunctor: functor that resets IR back to the state
-//   it had if there were no class probes.
+// EfficientEdgeCountInstrumentor: instrumentor that adds a counter to
+//   selective edges.
 //
-class SuppressProbesFunctor
+// Based on "Optimally Profiling and Tracing Programs,"
+// Ball and Larus, POPL '92.
+//
+class EfficientEdgeCountInstrumentor : public Instrumentor, public SpanningTreeVisitor
 {
 private:
-    unsigned& m_cleanupCount;
+    // A particular edge probe. These are linked
+    // on the source block via bbSparseProbeList.
+    //
+    struct Probe
+    {
+        BasicBlock* target;
+        Probe*      next;
+        int         schemaIndex;
+        EdgeKind    kind;
+    };
 
-public:
-    SuppressProbesFunctor(unsigned& cleanupCount) : m_cleanupCount(cleanupCount)
+    Probe* NewProbe(BasicBlock* source, BasicBlock* target)
     {
+        Probe* p                  = new (m_comp, CMK_Pgo) Probe();
+        p->target                 = target;
+        p->kind                   = EdgeKind::Unknown;
+        p->schemaIndex            = -1;
+        p->next                   = (Probe*)source->bbSparseProbeList;
+        source->bbSparseProbeList = p;
+        m_probeCount++;
+
+        return p;
     }
 
-    void operator()(Compiler* compiler, GenTreeCall* call)
+    void NewSourceProbe(BasicBlock* source, BasicBlock* target)
     {
-        // Restore the stub address on the call
-        //
-        call->gtStubCallStubAddr = call->gtClassProfileCandidateInfo->stubAddr;
+        JITDUMP("[%u] New probe for " FMT_BB " -> " FMT_BB " [source]\n", m_probeCount, source->bbNum, target->bbNum);
+        Probe* p = NewProbe(source, target);
+        p->kind  = EdgeKind::PostdominatesSource;
+    }
 
-        m_cleanupCount++;
+    void NewTargetProbe(BasicBlock* source, BasicBlock* target)
+    {
+        JITDUMP("[%u] New probe for " FMT_BB " -> " FMT_BB " [target]\n", m_probeCount, source->bbNum, target->bbNum);
+
+        Probe* p = NewProbe(source, target);
+        p->kind  = EdgeKind::DominatesTarget;
     }
-};
 
-//------------------------------------------------------------------------
-// ClassProbeInstrumentor: instrumentor that adds a class probe to each
-//   virtual call in the basic block
-//
-class ClassProbeInstrumentor : public Instrumentor
-{
+    void NewEdgeProbe(BasicBlock* source, BasicBlock* target)
+    {
+        JITDUMP("[%u] New probe for " FMT_BB " -> " FMT_BB " [edge]\n", m_probeCount, source->bbNum, target->bbNum);
+
+        Probe* p = NewProbe(source, target);
+        p->kind  = EdgeKind::CriticalEdge;
+
+        m_edgeProbeCount++;
+    }
+
+    unsigned m_blockCount;
+    unsigned m_probeCount;
+    unsigned m_edgeProbeCount;
+    bool     m_badcode;
+
 public:
-    ClassProbeInstrumentor(Compiler* comp) : Instrumentor(comp)
+    EfficientEdgeCountInstrumentor(Compiler* comp)
+        : Instrumentor(comp)
+        , SpanningTreeVisitor()
+        , m_blockCount(0)
+        , m_probeCount(0)
+        , m_edgeProbeCount(0)
+        , m_badcode(false)
     {
     }
+    void Prepare(bool isPreImport) override;
     bool ShouldProcess(BasicBlock* block) override
     {
-        return ((block->bbFlags & (BBF_INTERNAL | BBF_IMPORTED)) == BBF_IMPORTED);
+        return ((block->bbFlags & BBF_IMPORTED) == BBF_IMPORTED);
     }
-    void Prepare() override;
     void BuildSchemaElements(BasicBlock* block, Schema& schema) override;
     void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) override;
-    void SuppressProbes() override;
-};
-
-//------------------------------------------------------------------------
-// ClassProbeInstrumentor::Prepare: prepare for class instrumentation
-//
-void ClassProbeInstrumentor::Prepare()
-{
 
-#ifdef DEBUG
-    // Set schema index to invalid value
-    //
-    for (BasicBlock* block = m_comp->fgFirstBB; (block != nullptr); block = block->bbNext)
+    void Badcode() override
     {
-        block->bbClassSchemaIndex = -1;
+        m_badcode = true;
     }
-#endif
-}
 
-//------------------------------------------------------------------------
-// ClassProbeInstrumentor::BuildSchemaElements: create schema elements for a class probe
-//
-// Arguments:
-//   block -- block to instrument
-//   schema -- schema that we're building
-//
-void ClassProbeInstrumentor::BuildSchemaElements(BasicBlock* block, Schema& schema)
-{
-    if ((block->bbFlags & BBF_HAS_CLASS_PROFILE) == 0)
+    void VisitBlock(BasicBlock* block) override
     {
-        return;
+        m_blockCount++;
+        block->bbSparseProbeList = nullptr;
+        JITDUMP("node " FMT_BB "\n", block->bbNum);
     }
 
-    // Remember the schema index for this block.
-    //
-    block->bbClassSchemaIndex = (int)schema.size();
+    void VisitTreeEdge(BasicBlock* source, BasicBlock* target) override
+    {
+        JITDUMP("tree " FMT_BB " -> " FMT_BB "\n", source->bbNum, target->bbNum);
+    }
 
-    // Scan the statements and identify the class probes
-    //
-    BuildClassProbeSchemaGen                    schemaGen(schema, m_schemaCount);
-    ClassProbeVisitor<BuildClassProbeSchemaGen> visitor(m_comp, schemaGen);
-    for (Statement* stmt : block->Statements())
+    void VisitNonTreeEdge(BasicBlock* source, BasicBlock* target, SpanningTreeVisitor::EdgeKind kind) override
     {
-        visitor.WalkTree(stmt->GetRootNodePointer(), nullptr);
+        JITDUMP("non-tree " FMT_BB " -> " FMT_BB "\n", source->bbNum, target->bbNum);
+        switch (kind)
+        {
+            case EdgeKind::PostdominatesSource:
+                NewSourceProbe(source, target);
+                break;
+            case EdgeKind::DominatesTarget:
+                NewTargetProbe(source, target);
+                break;
+            case EdgeKind::CriticalEdge:
+                NewEdgeProbe(source, target);
+                break;
+            default:
+                assert(!"unexpected kind");
+                break;
+        }
     }
-}
+};
 
 //------------------------------------------------------------------------
-// ClassProbeInstrumentor::Instrument: add class probes to block
+// EfficientEdgeCountInstrumentor::Prepare: analyze the flow graph to
+//   determine which edges should be instrumented.
 //
 // Arguments:
-//   block -- block of interest
-//   schema -- instrumentation schema
-//   profileMemory -- profile data slab
+//   preImport - true if this is the prepare call that happens before
+//      importation
 //
-void ClassProbeInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory)
+// Notes:
+//   Build a (maximum weight) spanning tree and designate the non-tree
+//   edges as the ones needing instrumentation.
+//
+//   For non-critical edges, instrumentation happens in either the
+//   predecessor or successor blocks.
+//
+//   Note we may only schematize and instrument a subset of the full
+//   set of instrumentation envisioned here, if the method is partially
+//   imported, as subsequent "passes" will bypass un-imported blocks.
+//
+//   It might be preferable to export the full schema but only
+//   selectively instrument; this would make merging and importing
+//   of data simpler, as all schemas for a method would agree, no
+//   matter what importer-level opts were applied.
+//
+void EfficientEdgeCountInstrumentor::Prepare(bool preImport)
 {
-    if ((block->bbFlags & BBF_HAS_CLASS_PROFILE) == 0)
+    if (!preImport)
     {
+        // If we saw badcode in the preimport prepare, we would expect
+        // compilation to blow up in the importer. So if we end up back
+        // here postimport with badcode set, something is wrong.
+        //
+        assert(!m_badcode);
         return;
     }
 
-    // Would be nice to avoid having to search here by tracking
-    // candidates more directly.
-    //
-    JITDUMP("Scanning for calls to profile in " FMT_BB "\n", block->bbNum);
-
-    // Scan the statements and add class probes
-    //
-    int classSchemaIndex = block->bbClassSchemaIndex;
-    assert((classSchemaIndex >= 0) && (classSchemaIndex < (int)schema.size()));
-
-    ClassProbeInserter                    insertProbes(schema, profileMemory, &classSchemaIndex, m_instrCount);
-    ClassProbeVisitor<ClassProbeInserter> visitor(m_comp, insertProbes);
-    for (Statement* stmt : block->Statements())
-    {
-        visitor.WalkTree(stmt->GetRootNodePointer(), nullptr);
-    }
+    JITDUMP("\nEfficientEdgeCountInstrumentor: preparing for instrumentation\n");
+    m_comp->WalkSpanningTree(this);
+    JITDUMP("%u blocks, %u probes (%u on critical edges)\n", m_blockCount, m_probeCount, m_edgeProbeCount);
 }
 
 //------------------------------------------------------------------------
-// ClassProbeInstrumentor::SuppressProbes: clean up if we're not instrumenting
+// EfficientEdgeCountInstrumentor::BuildSchemaElements: create schema
+//   elements for the probes
 //
-// Notes:
-//   Currently we're hijacking the gtCallStubAddr of the call node to hold
-//   a pointer to the profile candidate info.
+// Arguments:
+//   block -- block to instrument
+//   schema -- schema that we're building
 //
-//   We must undo this, if not instrumenting.
+// Todo: if a special entry probe is required, we must also
+//  instrument method entry with a block count.
 //
-void ClassProbeInstrumentor::SuppressProbes()
+void EfficientEdgeCountInstrumentor::BuildSchemaElements(BasicBlock* block, Schema& schema)
 {
-    unsigned                                 cleanupCount = 0;
-    SuppressProbesFunctor                    suppressProbes(cleanupCount);
-    ClassProbeVisitor<SuppressProbesFunctor> visitor(m_comp, suppressProbes);
-
-    for (BasicBlock* block = m_comp->fgFirstBB; (block != nullptr); block = block->bbNext)
+    // Walk the bbSparseProbeList, emitting one schema element per probe.
+    //
+    for (Probe* probe = (Probe*)block->bbSparseProbeList; probe != nullptr; probe = probe->next)
     {
-        if ((block->bbFlags & BBF_HAS_CLASS_PROFILE) == 0)
+        // Probe is for the edge from block to target.
+        //
+        BasicBlock* const target = probe->target;
+
+        // Remember the schema index for this probe
+        //
+        assert(probe->schemaIndex == -1);
+        probe->schemaIndex = (int)schema.size();
+
+        // Assign the current block's IL offset into the profile data.
+        // Use the "other" field to hold the target block IL offset.
+        //
+        int32_t sourceOffset = (int32_t)block->bbCodeOffs;
+        int32_t targetOffset = (int32_t)target->bbCodeOffs;
+
+        // We may see empty BBJ_NONE BBF_INTERNAL blocks that were added
+        // by fgNormalizeEH.
+        //
+        // We'll use their bbNum in place of IL offset, and set
+        // a high bit as a "flag"
+        //
+        if ((block->bbFlags & BBF_INTERNAL) == BBF_INTERNAL)
         {
-            continue;
+            sourceOffset = block->bbNum | IL_OFFSETX_CALLINSTRUCTIONBIT;
         }
 
-        for (Statement* stmt : block->Statements())
+        if ((target->bbFlags & BBF_INTERNAL) == BBF_INTERNAL)
         {
-            visitor.WalkTree(stmt->GetRootNodePointer(), nullptr);
+            targetOffset = target->bbNum | IL_OFFSETX_CALLINSTRUCTIONBIT;
         }
-    }
 
-    assert(cleanupCount == m_comp->info.compClassProbeCount);
+        ICorJitInfo::PgoInstrumentationSchema schemaElem;
+        schemaElem.Count               = 1;
+        schemaElem.Other               = targetOffset;
+        schemaElem.InstrumentationKind = ICorJitInfo::PgoInstrumentationKind::EdgeIntCount;
+        schemaElem.ILOffset            = sourceOffset;
+        schemaElem.Offset              = 0;
+
+        schema.push_back(schemaElem);
+
+        m_schemaCount++;
+    }
 }
 
 //------------------------------------------------------------------------
-// fgInstrumentMethod: add instrumentation probes to the method
-//
-// Returns:
-//   appropriate phase status
-//
-// Note:
-//
-//   By default this instruments each non-internal block with
-//   a counter probe.
-//
-//   Optionally adds class probes to virtual and interface calls.
+// EfficientEdgeCountInstrumentor::Instrument: add counter probes for edges
+//   originating from block
 //
-//   Probe structure is described by a schema array, which is created
-//   here based on flowgraph and IR structure.
+// Arguments:
+//   block -- block of interest
+//   schema -- instrumentation schema
+//   profileMemory -- profile data slab
 //
-PhaseStatus Compiler::fgInstrumentMethod()
+void EfficientEdgeCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory)
 {
-    noway_assert(!compIsForInlining());
-
-    // Choose instrumentation technology.
+    // Inlinee compilers build their blocks in the root compiler's
+    // graph. So for NumSucc, we use the root compiler instance.
     //
-    Instrumentor* countInst = new (this, CMK_Pgo) BlockCountInstrumentor(this);
-    Instrumentor* classInst = nullptr;
+    Compiler* const comp = m_comp->impInlineRoot();
 
-    if (JitConfig.JitClassProfiling() > 0)
-    {
-        classInst = new (this, CMK_Pgo) ClassProbeInstrumentor(this);
-    }
-    else
+    // Walk the bbSparseProbeList, adding instrumentation.
+    //
+    for (Probe* probe = (Probe*)block->bbSparseProbeList; probe != nullptr; probe = probe->next)
     {
-        classInst = new (this, CMK_Pgo) NonInstrumentor(this);
-    }
+        // Probe is for the edge from block to target.
+        //
+        BasicBlock* const target = probe->target;
 
-    // Do any up-front work.
-    //
-    countInst->Prepare();
-    classInst->Prepare();
+        // Retrieve the schema index for this probe
+        //
+        const int schemaIndex = probe->schemaIndex;
 
-    // Walk the flow graph to build up the instrumentation schema.
-    //
-    Schema schema(getAllocator(CMK_Pgo));
-    for (BasicBlock* block = fgFirstBB; (block != nullptr); block = block->bbNext)
-    {
-        if (countInst->ShouldProcess(block))
-        {
-            countInst->BuildSchemaElements(block, schema);
-        }
+        // Sanity checks.
+        //
+        assert((schemaIndex >= 0) && (schemaIndex < (int)schema.size()));
+        assert(schema[schemaIndex].InstrumentationKind == ICorJitInfo::PgoInstrumentationKind::EdgeIntCount);
 
-        if (classInst->ShouldProcess(block))
+        size_t addrOfCurrentExecutionCount = (size_t)(schema[schemaIndex].Offset + profileMemory);
+
+        // Determine where to place the probe.
+        //
+        BasicBlock* instrumentedBlock = nullptr;
+
+        switch (probe->kind)
         {
-            classInst->BuildSchemaElements(block, schema);
+            case EdgeKind::PostdominatesSource:
+                instrumentedBlock = block;
+                break;
+            case EdgeKind::DominatesTarget:
+                instrumentedBlock = probe->target;
+                break;
+            case EdgeKind::CriticalEdge:
+            {
+#ifdef DEBUG
+                // Verify the edge still exists.
+                //
+                const unsigned numSucc = block->NumSucc(comp);
+                bool           found   = false;
+                for (unsigned i = 0; i < numSucc && !found; i++)
+                {
+                    found = (target == block->GetSucc(i, comp));
+                }
+                assert(found);
+#endif
+                instrumentedBlock = m_comp->fgSplitEdge(block, probe->target);
+                instrumentedBlock->bbFlags |= BBF_IMPORTED;
+            }
+            break;
+
+            default:
+                unreached();
         }
-    }
 
-    // Verify we created schema for the calls needing class probes.
-    // (we counted those when importing)
-    //
-    assert(classInst->SchemaCount() == info.compClassProbeCount);
+        assert(instrumentedBlock != nullptr);
 
-    // Optionally, if there were no class probes and only one count probe,
-    // suppress instrumentation.
-    //
-    if ((JitConfig.JitMinimalProfiling() > 0) && (countInst->SchemaCount() == 1) && (classInst->SchemaCount() == 0))
-    {
-        JITDUMP("Not instrumenting method: only one counter, and no class probes\n");
-        return PhaseStatus::MODIFIED_NOTHING;
-    }
+        // Place the probe
 
-    JITDUMP("Instrumenting method: %d count probes and %d class probes\n", countInst->SchemaCount(),
-            classInst->SchemaCount());
+        // Read Basic-Block count value
+        GenTree* valueNode =
+            m_comp->gtNewIndOfIconHandleNode(TYP_INT, addrOfCurrentExecutionCount, GTF_ICON_BBC_PTR, false);
 
-    // Allocate the profile buffer
-    //
-    BYTE* profileMemory;
+        // Increment value by 1
+        GenTree* rhsNode = m_comp->gtNewOperNode(GT_ADD, TYP_INT, valueNode, m_comp->gtNewIconNode(1));
 
-    HRESULT res = info.compCompHnd->allocPgoInstrumentationBySchema(info.compMethodHnd, schema.data(),
-                                                                    (UINT32)schema.size(), &profileMemory);
+        // Write new Basic-Block count value
+        GenTree* lhsNode =
+            m_comp->gtNewIndOfIconHandleNode(TYP_INT, addrOfCurrentExecutionCount, GTF_ICON_BBC_PTR, false);
+        GenTree* asgNode = m_comp->gtNewAssignNode(lhsNode, rhsNode);
 
-    // Deal with allocation failures.
-    //
-    if (!SUCCEEDED(res))
+        m_comp->fgNewStmtAtBeg(instrumentedBlock, asgNode);
+
+        m_instrCount++;
+    }
+}
+
+//------------------------------------------------------------------------
+// ClassProbeVisitor: invoke functor on each virtual call in a tree
+//
+template <class TFunctor>
+class ClassProbeVisitor final : public GenTreeVisitor<ClassProbeVisitor<TFunctor>>
+{
+public:
+    enum
     {
-        JITDUMP("Unable to instrument: schema allocation failed: 0x%x\n", res);
+        DoPreOrder = true
+    };
 
-        // The E_NOTIMPL status is returned when we are profiling a generic method from a different assembly
-        //
-        if (res != E_NOTIMPL)
-        {
-            noway_assert(!"Error: unexpected hresult from allocPgoInstrumentationBySchema");
-            return PhaseStatus::MODIFIED_NOTHING;
-        }
+    TFunctor& m_functor;
+    Compiler* m_compiler;
 
-        // Do any cleanup we might need to do...
-        //
-        countInst->SuppressProbes();
-        classInst->SuppressProbes();
-        return PhaseStatus::MODIFIED_NOTHING;
+    ClassProbeVisitor(Compiler* compiler, TFunctor& functor)
+        : GenTreeVisitor<ClassProbeVisitor>(compiler), m_functor(functor), m_compiler(compiler)
+    {
     }
-
-    // Add the instrumentation code
-    //
-    for (BasicBlock* block = fgFirstBB; (block != nullptr); block = block->bbNext)
+    Compiler::fgWalkResult PreOrderVisit(GenTree** use, GenTree* user)
     {
-        if (countInst->ShouldProcess(block))
+        GenTree* const node = *use;
+        if (node->IsCall())
         {
-            countInst->Instrument(block, schema, profileMemory);
+            GenTreeCall* const call = node->AsCall();
+            if (call->IsVirtual() && (call->gtCallType != CT_INDIRECT))
+            {
+                m_functor(m_compiler, call);
+            }
         }
 
-        if (classInst->ShouldProcess(block))
-        {
-            classInst->Instrument(block, schema, profileMemory);
-        }
+        return Compiler::WALK_CONTINUE;
     }
-
-    // Verify we instrumented everthing we created schemas for.
-    //
-    assert(countInst->InstrCount() == countInst->SchemaCount());
-    assert(classInst->InstrCount() == classInst->SchemaCount());
-
-    // Add any special entry instrumentation. This does not
-    // use the schema mechanism.
-    //
-    countInst->InstrumentMethodEntry(schema, profileMemory);
-    classInst->InstrumentMethodEntry(schema, profileMemory);
-
-    return PhaseStatus::MODIFIED_EVERYTHING;
-}
+};
 
 //------------------------------------------------------------------------
-// fgIncorporateProfileData: add block/edge profile data to the flowgraph
+// BuildClassProbeSchemaGen: functor that creates class probe schema elements
+//
+class BuildClassProbeSchemaGen
+{
+private:
+    Schema&   m_schema;
+    unsigned& m_schemaCount;
+
+public:
+    BuildClassProbeSchemaGen(Schema& schema, unsigned& schemaCount) : m_schema(schema), m_schemaCount(schemaCount)
+    {
+    }
+
+    void operator()(Compiler* compiler, GenTreeCall* call)
+    {
+        ICorJitInfo::PgoInstrumentationSchema schemaElem;
+        schemaElem.Count = 1;
+        schemaElem.Other = ICorJitInfo::ClassProfile::CLASS_FLAG;
+        if (call->IsVirtualStub())
+        {
+            schemaElem.Other |= ICorJitInfo::ClassProfile::INTERFACE_FLAG;
+        }
+        else
+        {
+            assert(call->IsVirtualVtable());
+        }
+
+        schemaElem.InstrumentationKind = ICorJitInfo::PgoInstrumentationKind::TypeHandleHistogramCount;
+        schemaElem.ILOffset            = jitGetILoffs(call->gtClassProfileCandidateInfo->ilOffset);
+        schemaElem.Offset              = 0;
+
+        m_schema.push_back(schemaElem);
+
+        // Re-using ILOffset and Other fields from schema item for TypeHandleHistogramCount
+        schemaElem.InstrumentationKind = ICorJitInfo::PgoInstrumentationKind::TypeHandleHistogramTypeHandle;
+        schemaElem.Count               = ICorJitInfo::ClassProfile::SIZE;
+        m_schema.push_back(schemaElem);
+
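+        // Note: each class probe contributes two schema records (count plus type
+        // handle table) but is counted as a single probe.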
+        m_schemaCount++;
+    }
+};
+
+//------------------------------------------------------------------------
+// ClassProbeInserter: functor that adds class probe instrumentation
+//
+class ClassProbeInserter
+{
+    Schema&   m_schema;
+    BYTE*     m_profileMemory;
+    int*      m_currentSchemaIndex;
+    unsigned& m_instrCount;
+
+public:
+    ClassProbeInserter(Schema& schema, BYTE* profileMemory, int* pCurrentSchemaIndex, unsigned& instrCount)
+        : m_schema(schema)
+        , m_profileMemory(profileMemory)
+        , m_currentSchemaIndex(pCurrentSchemaIndex)
+        , m_instrCount(instrCount)
+    {
+    }
+
+    void operator()(Compiler* compiler, GenTreeCall* call)
+    {
+        JITDUMP("Found call [%06u] with probe index %d and ilOffset 0x%X\n", compiler->dspTreeID(call),
+                call->gtClassProfileCandidateInfo->probeIndex, call->gtClassProfileCandidateInfo->ilOffset);
+
+        // We transform the call from (CALLVIRT obj, ... args ...) to
+        // to
+        //      (CALLVIRT
+        //        (COMMA
+        //          (ASG tmp, obj)
+        //          (COMMA
+        //            (CALL probe_fn tmp, &probeEntry)
+        //            tmp)))
+        //         ... args ...)
+        //
+
+        assert(call->gtCallThisArg->GetNode()->TypeGet() == TYP_REF);
+
+        // Sanity check that we're looking at the right schema entry
+        //
+        assert(m_schema[*m_currentSchemaIndex].ILOffset == (int32_t)call->gtClassProfileCandidateInfo->ilOffset);
+        assert(m_schema[*m_currentSchemaIndex].InstrumentationKind ==
+               ICorJitInfo::PgoInstrumentationKind::TypeHandleHistogramCount);
+
+        // Figure out where the table is located.
+        //
+        BYTE* classProfile = m_schema[*m_currentSchemaIndex].Offset + m_profileMemory;
+        *m_currentSchemaIndex += 2; // There are 2 schema entries per class probe
+
+        // Grab a temp to hold the 'this' object as it will be used three times
+        //
+        unsigned const tmpNum             = compiler->lvaGrabTemp(true DEBUGARG("class profile tmp"));
+        compiler->lvaTable[tmpNum].lvType = TYP_REF;
+
+        // Generate the IR...
+        //
+        GenTree* const          classProfileNode = compiler->gtNewIconNode((ssize_t)classProfile, TYP_I_IMPL);
+        GenTree* const          tmpNode          = compiler->gtNewLclvNode(tmpNum, TYP_REF);
+        GenTreeCall::Use* const args             = compiler->gtNewCallArgs(tmpNode, classProfileNode);
+        GenTree* const helperCallNode = compiler->gtNewHelperCallNode(CORINFO_HELP_CLASSPROFILE, TYP_VOID, args);
+        GenTree* const tmpNode2       = compiler->gtNewLclvNode(tmpNum, TYP_REF);
+        GenTree* const callCommaNode  = compiler->gtNewOperNode(GT_COMMA, TYP_REF, helperCallNode, tmpNode2);
+        GenTree* const tmpNode3       = compiler->gtNewLclvNode(tmpNum, TYP_REF);
+        GenTree* const asgNode = compiler->gtNewOperNode(GT_ASG, TYP_REF, tmpNode3, call->gtCallThisArg->GetNode());
+        GenTree* const asgCommaNode = compiler->gtNewOperNode(GT_COMMA, TYP_REF, asgNode, callCommaNode);
+
+        // Update the call
+        //
+        call->gtCallThisArg->SetNode(asgCommaNode);
+
+        JITDUMP("Modified call is now\n");
+        DISPTREE(call);
+
+        // Restore the stub address on the call
+        //
+        call->gtStubCallStubAddr = call->gtClassProfileCandidateInfo->stubAddr;
+
+        m_instrCount++;
+    }
+};
+
+//------------------------------------------------------------------------
+// SuppressProbesFunctor: functor that resets IR back to the state
+//   it had if there were no class probes.
+//
+class SuppressProbesFunctor
+{
+private:
+    unsigned& m_cleanupCount;
+
+public:
+    SuppressProbesFunctor(unsigned& cleanupCount) : m_cleanupCount(cleanupCount)
+    {
+    }
+
+    void operator()(Compiler* compiler, GenTreeCall* call)
+    {
+        // Restore the stub address on the call
+        //
+        call->gtStubCallStubAddr = call->gtClassProfileCandidateInfo->stubAddr;
+
+        m_cleanupCount++;
+    }
+};
+
+//------------------------------------------------------------------------
+// ClassProbeInstrumentor: instrumentor that adds a class probe to each
+//   virtual call in the basic block
+//
+class ClassProbeInstrumentor : public Instrumentor
+{
+public:
+    ClassProbeInstrumentor(Compiler* comp) : Instrumentor(comp)
+    {
+    }
+    bool ShouldProcess(BasicBlock* block) override
+    {
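+        // Instrument only blocks that were imported and are not jit-inserted
+        // internal blocks.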
+        return ((block->bbFlags & (BBF_INTERNAL | BBF_IMPORTED)) == BBF_IMPORTED);
+    }
+    void Prepare(bool isPreImport) override;
+    void BuildSchemaElements(BasicBlock* block, Schema& schema) override;
+    void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) override;
+    void SuppressProbes() override;
+};
+
+//------------------------------------------------------------------------
+// ClassProbeInstrumentor::Prepare: prepare for class instrumentation
+//
+// Arguments:
+//   isPreImport - true if this is the prepare call that happens before
+//      importation
+//
+void ClassProbeInstrumentor::Prepare(bool isPreImport)
+{
+    if (isPreImport)
+    {
+        return;
+    }
+
+#ifdef DEBUG
+    // Set schema index to invalid value
+    //
+    for (BasicBlock* block = m_comp->fgFirstBB; (block != nullptr); block = block->bbNext)
+    {
+        block->bbClassSchemaIndex = -1;
+    }
+#endif
+}
+
+//------------------------------------------------------------------------
+// ClassProbeInstrumentor::BuildSchemaElements: create schema elements for a class probe
+//
+// Arguments:
+//   block -- block to instrument
+//   schema -- schema that we're building
+//
+void ClassProbeInstrumentor::BuildSchemaElements(BasicBlock* block, Schema& schema)
+{
+    if ((block->bbFlags & BBF_HAS_CLASS_PROFILE) == 0)
+    {
+        return;
+    }
+
+    // Remember the schema index for this block.
+    //
+    block->bbClassSchemaIndex = (int)schema.size();
+
+    // Scan the statements and identify the class probes
+    //
+    BuildClassProbeSchemaGen                    schemaGen(schema, m_schemaCount);
+    ClassProbeVisitor<BuildClassProbeSchemaGen> visitor(m_comp, schemaGen);
+    for (Statement* stmt : block->Statements())
+    {
+        visitor.WalkTree(stmt->GetRootNodePointer(), nullptr);
+    }
+}
+
+//------------------------------------------------------------------------
+// ClassProbeInstrumentor::Instrument: add class probes to block
+//
+// Arguments:
+//   block -- block of interest
+//   schema -- instrumentation schema
+//   profileMemory -- profile data slab
+//
+void ClassProbeInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory)
+{
+    if ((block->bbFlags & BBF_HAS_CLASS_PROFILE) == 0)
+    {
+        return;
+    }
+
+    // Would be nice to avoid having to search here by tracking
+    // candidates more directly.
+    //
+    JITDUMP("Scanning for calls to profile in " FMT_BB "\n", block->bbNum);
+
+    // Scan the statements and add class probes
+    //
+    int classSchemaIndex = block->bbClassSchemaIndex;
+    assert((classSchemaIndex >= 0) && (classSchemaIndex < (int)schema.size()));
+
+    ClassProbeInserter                    insertProbes(schema, profileMemory, &classSchemaIndex, m_instrCount);
+    ClassProbeVisitor<ClassProbeInserter> visitor(m_comp, insertProbes);
+    for (Statement* stmt : block->Statements())
+    {
+        visitor.WalkTree(stmt->GetRootNodePointer(), nullptr);
+    }
+}
+
+//------------------------------------------------------------------------
+// ClassProbeInstrumentor::SuppressProbes: clean up if we're not instrumenting
+//
+// Notes:
+//   Currently we're hijacking the gtStubCallStubAddr of the call node to hold
+//   a pointer to the profile candidate info.
+//
+//   We must undo this, if not instrumenting.
+//
+void ClassProbeInstrumentor::SuppressProbes()
+{
+    unsigned                                 cleanupCount = 0;
+    SuppressProbesFunctor                    suppressProbes(cleanupCount);
+    ClassProbeVisitor<SuppressProbesFunctor> visitor(m_comp, suppressProbes);
+
+    for (BasicBlock* block = m_comp->fgFirstBB; (block != nullptr); block = block->bbNext)
+    {
+        if ((block->bbFlags & BBF_HAS_CLASS_PROFILE) == 0)
+        {
+            continue;
+        }
+
+        for (Statement* stmt : block->Statements())
+        {
+            visitor.WalkTree(stmt->GetRootNodePointer(), nullptr);
+        }
+    }
+
+    assert(cleanupCount == m_comp->info.compClassProbeCount);
+}
+
+//------------------------------------------------------------------------
+// fgPrepareToInstrumentMethod: prepare for instrumentation
+//
+// Notes:
+//   Runs before importation, so instrumentation schemes can get a pure
+//   look at the flowgraph before any internal blocks are added.
+//
+// Returns:
+//   appropriate phase status
+//
+PhaseStatus Compiler::fgPrepareToInstrumentMethod()
+{
+    noway_assert(!compIsForInlining());
+
+    // Choose instrumentation technology.
+    //
+    // Currently, OSR is incompatible with edge profiling. So if OSR is enabled,
+    // always do block profiling.
+    //
+    // Note this incompatibility only exists for methods that actually have
+    // patchpoints, but we won't know that until we import.
+    //
+    const bool methodMayHavePatchpoints =
+        (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_TIER0) && (JitConfig.TC_OnStackReplacement() > 0));
+
+    if ((JitConfig.JitEdgeProfiling() > 0) && !methodMayHavePatchpoints)
+    {
+        fgCountInstrumentor = new (this, CMK_Pgo) EfficientEdgeCountInstrumentor(this);
+    }
+    else
+    {
+        if (JitConfig.JitEdgeProfiling() > 0)
+        {
+            JITDUMP("OSR and edge profiling not yet compatible; using block profiling\n");
+        }
+
+        fgCountInstrumentor = new (this, CMK_Pgo) BlockCountInstrumentor(this);
+    }
+
+    if (JitConfig.JitClassProfiling() > 0)
+    {
+        fgClassInstrumentor = new (this, CMK_Pgo) ClassProbeInstrumentor(this);
+    }
+    else
+    {
+        fgClassInstrumentor = new (this, CMK_Pgo) NonInstrumentor(this);
+    }
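+
+    // Note: JitEdgeProfiling and JitClassProfiling are the jit config knobs behind
+    // the COMPlus_JitEdgeProfiling and COMPlus_JitClassProfiling settings used by
+    // the jitpgo_edgeinstrumentation and jitpgo_classes test scenarios.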
+
+    // Make pre-import preparations.
+    //
+    const bool isPreImport = true;
+    fgCountInstrumentor->Prepare(isPreImport);
+    fgClassInstrumentor->Prepare(isPreImport);
+
+    return PhaseStatus::MODIFIED_NOTHING;
+}
+
+//------------------------------------------------------------------------
+// fgInstrumentMethod: add instrumentation probes to the method
+//
+// Returns:
+//   appropriate phase status
+//
+// Note:
+//
+//   By default this instruments each non-internal block with
+//   a counter probe.
+//
+//   Optionally adds class probes to virtual and interface calls.
+//
+//   Probe structure is described by a schema array, which is created
+//   here based on flowgraph and IR structure.
+//
+PhaseStatus Compiler::fgInstrumentMethod()
+{
+    noway_assert(!compIsForInlining());
+
+    // Make post-import preparations.
+    //
+    const bool isPreImport = false;
+    fgCountInstrumentor->Prepare(isPreImport);
+    fgClassInstrumentor->Prepare(isPreImport);
+
+    // Walk the flow graph to build up the instrumentation schema.
+    //
+    Schema schema(getAllocator(CMK_Pgo));
+    for (BasicBlock* block = fgFirstBB; (block != nullptr); block = block->bbNext)
+    {
+        if (fgCountInstrumentor->ShouldProcess(block))
+        {
+            fgCountInstrumentor->BuildSchemaElements(block, schema);
+        }
+
+        if (fgClassInstrumentor->ShouldProcess(block))
+        {
+            fgClassInstrumentor->BuildSchemaElements(block, schema);
+        }
+    }
+
+    // Verify we created schema for the calls needing class probes.
+    // (we counted those when importing)
+    //
+    assert(fgClassInstrumentor->SchemaCount() == info.compClassProbeCount);
+
+    // Optionally, if there were no class probes and only one count probe,
+    // suppress instrumentation.
+    //
+    if ((JitConfig.JitMinimalProfiling() > 0) && (fgCountInstrumentor->SchemaCount() == 1) &&
+        (fgClassInstrumentor->SchemaCount() == 0))
+    {
+        JITDUMP("Not instrumenting method: only one counter, and no class probes\n");
+        return PhaseStatus::MODIFIED_NOTHING;
+    }
+
+    JITDUMP("Instrumenting method: %d count probes and %d class probes\n", fgCountInstrumentor->SchemaCount(),
+            fgClassInstrumentor->SchemaCount());
+
+    assert(schema.size() > 0);
+
+    // Allocate the profile buffer
+    //
+    BYTE* profileMemory;
+
+    HRESULT res = info.compCompHnd->allocPgoInstrumentationBySchema(info.compMethodHnd, schema.data(),
+                                                                    (UINT32)schema.size(), &profileMemory);
+
+    JITDUMP("Instrumentation data base address is %p\n", dspPtr(profileMemory));
+
+    // Deal with allocation failures.
+    //
+    if (!SUCCEEDED(res))
+    {
+        JITDUMP("Unable to instrument: schema allocation failed: 0x%x\n", res);
+
+        // The E_NOTIMPL status is returned when we are profiling a generic method from a different assembly
+        //
+        if (res != E_NOTIMPL)
+        {
+            noway_assert(!"Error: unexpected hresult from allocPgoInstrumentationBySchema");
+            return PhaseStatus::MODIFIED_NOTHING;
+        }
+
+        // Do any cleanup we might need to do...
+        //
+        fgCountInstrumentor->SuppressProbes();
+        fgClassInstrumentor->SuppressProbes();
+        return PhaseStatus::MODIFIED_NOTHING;
+    }
+
+    // Add the instrumentation code
+    //
+    for (BasicBlock* block = fgFirstBB; (block != nullptr); block = block->bbNext)
+    {
+        if (fgCountInstrumentor->ShouldProcess(block))
+        {
+            fgCountInstrumentor->Instrument(block, schema, profileMemory);
+        }
+
+        if (fgClassInstrumentor->ShouldProcess(block))
+        {
+            fgClassInstrumentor->Instrument(block, schema, profileMemory);
+        }
+    }
+
+    // Verify we instrumented everything we created schemas for.
+    //
+    assert(fgCountInstrumentor->InstrCount() == fgCountInstrumentor->SchemaCount());
+    assert(fgClassInstrumentor->InstrCount() == fgClassInstrumentor->SchemaCount());
+
+    // Add any special entry instrumentation. This does not
+    // use the schema mechanism.
+    //
+    fgCountInstrumentor->InstrumentMethodEntry(schema, profileMemory);
+    fgClassInstrumentor->InstrumentMethodEntry(schema, profileMemory);
+
+    return PhaseStatus::MODIFIED_EVERYTHING;
+}
+
+//------------------------------------------------------------------------
+// fgIncorporateProfileData: add block/edge profile data to the flowgraph
 //
 // Returns:
 //   appropriate phase status
@@ -887,130 +1632,832 @@ PhaseStatus Compiler::fgIncorporateProfileData()
 {
     // Are we doing profile stress?
     //
-    if (fgStressBBProf() > 0)
+    if (fgStressBBProf() > 0)
+    {
+        JITDUMP("JitStress -- incorporating random profile data\n");
+        fgIncorporateBlockCounts();
+        return PhaseStatus::MODIFIED_EVERYTHING;
+    }
+
+    // Do we have profile data?
+    //
+    if (!fgHaveProfileData())
+    {
+        if (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_BBOPT))
+        {
+            JITDUMP("BBOPT set, but no profile data available (hr=%08x)\n", fgPgoQueryResult);
+        }
+        else
+        {
+            JITDUMP("BBOPT not set\n");
+        }
+        return PhaseStatus::MODIFIED_NOTHING;
+    }
+
+    // Summarize profile data
+    //
+    JITDUMP("Have profile data: %d schema records (schema at %p, data at %p)\n", fgPgoSchemaCount, dspPtr(fgPgoSchema),
+            dspPtr(fgPgoData));
+
+    fgNumProfileRuns      = 0;
+    unsigned otherRecords = 0;
+
+    for (UINT32 iSchema = 0; iSchema < fgPgoSchemaCount; iSchema++)
+    {
+        switch (fgPgoSchema[iSchema].InstrumentationKind)
+        {
+            case ICorJitInfo::PgoInstrumentationKind::NumRuns:
+                fgNumProfileRuns += fgPgoSchema[iSchema].Other;
+                break;
+
+            case ICorJitInfo::PgoInstrumentationKind::BasicBlockIntCount:
+                fgPgoBlockCounts++;
+                break;
+
+            case ICorJitInfo::PgoInstrumentationKind::EdgeIntCount:
+                fgPgoEdgeCounts++;
+                break;
+
+            case ICorJitInfo::PgoInstrumentationKind::TypeHandleHistogramCount:
+                fgPgoClassProfiles++;
+                break;
+
+            default:
+                otherRecords++;
+                break;
+        }
+    }
+
+    if (fgNumProfileRuns == 0)
+    {
+        fgNumProfileRuns = 1;
+    }
+
+    JITDUMP("Profile summary: %d runs, %d block probes, %d edge probes, %d class profiles, %d other records\n",
+            fgNumProfileRuns, fgPgoBlockCounts, fgPgoEdgeCounts, fgPgoClassProfiles, otherRecords);
+
+    const bool haveBlockCounts = fgPgoBlockCounts > 0;
+    const bool haveEdgeCounts  = fgPgoEdgeCounts > 0;
+
+    // We expect one or the other but not both.
+    //
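+    // (Instrumentation selects either block probes or edge probes for a method,
+    // never both; see fgPrepareToInstrumentMethod.)
+    //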
+    assert(haveBlockCounts != haveEdgeCounts);
+
+    if (haveBlockCounts)
+    {
+        fgIncorporateBlockCounts();
+    }
+    else if (haveEdgeCounts)
+    {
+        fgIncorporateEdgeCounts();
+    }
+
+    // Now that we have profile data, compute the profile scale for inlinees,
+    // if we haven't done so already.
+    //
+    if (compIsForInlining())
+    {
+        fgComputeProfileScale();
+    }
+
+    return PhaseStatus::MODIFIED_EVERYTHING;
+}
+
+//------------------------------------------------------------------------
+// fgSetProfileWeight: set profile weight for a block
+//
+// Arguments:
+//   block -- block in question
+//   profileWeight -- raw profile weight (not accounting for inlining)
+//
+// Notes:
+//   Does inlinee scaling.
+//   Handles handler entry special case.
+//
+void Compiler::fgSetProfileWeight(BasicBlock* block, BasicBlock::weight_t profileWeight)
+{
+    // Scale count appropriately for inlinees.
+    //
+    if (compIsForInlining())
+    {
+        if (impInlineInfo->profileScaleState == InlineInfo::ProfileScaleState::KNOWN)
+        {
+            double scaledWeight = impInlineInfo->profileScaleFactor * profileWeight;
+            profileWeight       = (BasicBlock::weight_t)scaledWeight;
+        }
+    }
+
+    block->setBBProfileWeight(profileWeight);
+
+    if (profileWeight == BB_ZERO_WEIGHT)
+    {
+        block->bbSetRunRarely();
+    }
+    else
+    {
+        block->bbFlags &= ~BBF_RUN_RARELY;
+    }
+
+#if HANDLER_ENTRY_MUST_BE_IN_HOT_SECTION
+    // Handle a special case -- some handler entries can't have zero profile count.
+    //
+    if (this->bbIsHandlerBeg(block) && block->isRunRarely())
+    {
+        JITDUMP("Suppressing zero count for " FMT_BB " as it is a handler entry\n", block->bbNum);
+        block->makeBlockHot();
+    }
+#endif
+}
+
+//------------------------------------------------------------------------
+// fgIncorporateBlockCounts: read block count based profile data
+//   and set block weights
+//
+// Notes:
+//   Count data for inlinees is scaled (usually down).
+//
+//   Since we are now running before the importer, we do not know which
+//   blocks will be imported, and we should not see any internal blocks.
+//
+// Todo:
+//   Normalize counts.
+//
+//   Take advantage of the (likely) correspondence between block order
+//   and schema order?
+//
+//   Find some other mechanism for handling cases where handler entry
+//   blocks must be in the hot section.
+//
+void Compiler::fgIncorporateBlockCounts()
+{
+    for (BasicBlock* block = fgFirstBB; block != nullptr; block = block->bbNext)
+    {
+        BasicBlock::weight_t profileWeight;
+
+        if (fgGetProfileWeightForBasicBlock(block->bbCodeOffs, &profileWeight))
+        {
+            fgSetProfileWeight(block, profileWeight);
+        }
+    }
+}
+
+//------------------------------------------------------------------------
+// EfficientEdgeCountReconstructor: reconstruct block counts from sparse
+//   edge counts.
+//
+// Notes:
+//    The algorithm is conceptually simple, but requires a bit of bookkeeping.
+//
+//    First, we should have a correspondence between the edge count schema
+//    entries and the non-tree edges of the spanning tree.
+//
+//    The instrumentation schema may be partial, if any importer folding was
+//    done. Say, for instance, we have a method with ISA-sensitive paths for
+//    x64 and arm64, we instrumented on x64, and are now jitting on arm64. If so
+//    there may be missing schema entries. If we are confident the IL and
+//    jit IL to block computations are the same, these missing entries can
+//    safely be presumed to be zero.
+//
+//    Second, we need to be able to reason about the sets of known and
+//    unknown edges that are incoming and outgoing from any block. These
+//    may not quite be the edges we'd see from iterating successors or
+//    building pred lists, because we create special pseudo-edges during
+//    instrumentation. So, we also need to build up data structures
+//    keeping track of those.
+//
+//    Solving is done in four steps:
+//    * Prepare
+//      * walk the blocks, setting up per-block info and a map
+//         from block schema keys to blocks.
+//      * walk the schema to create info for the known edges, and
+//         a map from edge schema keys to edges.
+//    * Evolve Spanning Tree
+//      * for non-tree edges, presume any missing edge is zero
+//        (and hence can be ignored during the solving process).
+//      * for tree edges, verify there is no schema entry, and
+//        add in an unknown count edge.
+//    * Solve
+//      * repeatedly walk blocks, looking for blocks where all
+//        incoming or outgoing edges are known. This determines
+//        the block counts.
+//      * for blocks with known counts, look for cases where just
+//        one incoming or outgoing edge is unknown, and solve for
+//        them.
+//    * Propagate
+//      * update block counts. bail if there were errors.
+//        * mark rare blocks, and special case handler entries
+//        * (eventually) try "fixing" counts
+//      * (eventually) set edge likelihoods
+//      * (eventually) normalize
+//
+//   If we've done everything right, the solving is guaranteed to
+//   converge.
+//
+//   Along the way we may find edges with negative counts; this
+//   is an indication that the count data is not self-consistent.
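+//
+//   As a small worked example (hypothetical counts): if a block's weight is
+//   known to be 100 and its two known outgoing edges have weights 60 and 15,
+//   the remaining unknown outgoing edge must have weight 25. If such a
+//   subtraction ever yields a negative value the counts are inconsistent,
+//   and the edge weight is clamped to zero.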
+//
+class EfficientEdgeCountReconstructor : public SpanningTreeVisitor
+{
+private:
+    Compiler*     m_comp;
+    CompAllocator m_allocator;
+    unsigned      m_blocks;
+    unsigned      m_edges;
+    unsigned      m_unknownBlocks;
+    unsigned      m_unknownEdges;
+    unsigned      m_zeroEdges;
+
+    // Map a block into its schema key.
+    //
+    static int32_t BlockToKey(BasicBlock* block)
+    {
+        int32_t key = (int32_t)block->bbCodeOffs;
+        if ((block->bbFlags & BBF_INTERNAL) == BBF_INTERNAL)
+        {
+            key = block->bbNum | IL_OFFSETX_CALLINSTRUCTIONBIT;
+        }
+
+        return key;
+    }
+
+    // Map correlating block keys to blocks.
+    //
+    typedef JitHashTable<int32_t, JitSmallPrimitiveKeyFuncs<int32_t>, BasicBlock*> KeyToBlockMap;
+    KeyToBlockMap m_keyToBlockMap;
+
+    // Key for finding an edge based on schema info.
+    //
+    struct EdgeKey
+    {
+        int32_t const m_sourceKey;
+        int32_t const m_targetKey;
+
+        EdgeKey(int32_t sourceKey, int32_t targetKey) : m_sourceKey(sourceKey), m_targetKey(targetKey)
+        {
+        }
+
+        EdgeKey(BasicBlock* sourceBlock, BasicBlock* targetBlock)
+            : m_sourceKey(BlockToKey(sourceBlock)), m_targetKey(BlockToKey(targetBlock))
+        {
+        }
+
+        static bool Equals(const EdgeKey& e1, const EdgeKey& e2)
+        {
+            return (e1.m_sourceKey == e2.m_sourceKey) && (e1.m_targetKey == e2.m_targetKey);
+        }
+
+        static unsigned GetHashCode(const EdgeKey& e)
+        {
+            return (unsigned)(e.m_sourceKey ^ (e.m_targetKey << 16));
+        }
+    };
+
+    // Per edge info
+    //
+    struct Edge
+    {
+        BasicBlock::weight_t m_weight;
+        BasicBlock*          m_sourceBlock;
+        BasicBlock*          m_targetBlock;
+        Edge*                m_nextOutgoingEdge;
+        Edge*                m_nextIncomingEdge;
+        bool                 m_weightKnown;
+
+        Edge(BasicBlock* source, BasicBlock* target)
+            : m_weight(BB_ZERO_WEIGHT)
+            , m_sourceBlock(source)
+            , m_targetBlock(target)
+            , m_nextOutgoingEdge(nullptr)
+            , m_nextIncomingEdge(nullptr)
+            , m_weightKnown(false)
+        {
+        }
+    };
+
+    // Map for correlating EdgeIntCount schema entries with edges
+    //
+    typedef JitHashTable<EdgeKey, EdgeKey, Edge*> EdgeKeyToEdgeMap;
+    EdgeKeyToEdgeMap m_edgeKeyToEdgeMap;
+
+    // Per block data
+    //
+    struct BlockInfo
+    {
+        BasicBlock::weight_t m_weight;
+        Edge*                m_incomingEdges;
+        Edge*                m_outgoingEdges;
+        int                  m_incomingUnknown;
+        int                  m_outgoingUnknown;
+        bool                 m_weightKnown;
+
+        BlockInfo()
+            : m_weight(BB_ZERO_WEIGHT)
+            , m_incomingEdges(nullptr)
+            , m_outgoingEdges(nullptr)
+            , m_incomingUnknown(0)
+            , m_outgoingUnknown(0)
+            , m_weightKnown(false)
+        {
+        }
+    };
+
+    // Map a block to its info
+    //
+    BlockInfo* BlockToInfo(BasicBlock* block)
     {
-        JITDUMP("JitStress -- incorporating random profile data\n");
-        fgIncorporateBlockCounts();
-        return PhaseStatus::MODIFIED_EVERYTHING;
+        assert(block->bbSparseCountInfo != nullptr);
+        return (BlockInfo*)block->bbSparseCountInfo;
     }
 
-    // Do we have profile data?
+    // Set up block info for a block.
     //
-    if (!fgHaveProfileData())
+    void SetBlockInfo(BasicBlock* block, BlockInfo* info)
     {
-        if (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_BBOPT))
+        assert(block->bbSparseCountInfo == nullptr);
+        block->bbSparseCountInfo = info;
+    }
+
+    // Flags for noting and handling various error cases.
+    //
+    bool m_badcode;
+    bool m_mismatch;
+    bool m_negativeCount;
+    bool m_failedToConverge;
+
+public:
+    EfficientEdgeCountReconstructor(Compiler* comp)
+        : SpanningTreeVisitor()
+        , m_comp(comp)
+        , m_allocator(comp->getAllocator(CMK_Pgo))
+        , m_blocks(0)
+        , m_edges(0)
+        , m_unknownBlocks(0)
+        , m_unknownEdges(0)
+        , m_zeroEdges(0)
+        , m_keyToBlockMap(m_allocator)
+        , m_edgeKeyToEdgeMap(m_allocator)
+        , m_badcode(false)
+        , m_mismatch(false)
+        , m_negativeCount(false)
+        , m_failedToConverge(false)
+    {
+    }
+
+    void Prepare();
+    void Solve();
+    void Propagate();
+
+    void Badcode() override
+    {
+        m_badcode = true;
+    }
+
+    void NegativeCount()
+    {
+        m_negativeCount = true;
+    }
+
+    void Mismatch()
+    {
+        m_mismatch = true;
+    }
+
+    void FailedToConverge()
+    {
+        m_failedToConverge = true;
+    }
+
+    void VisitBlock(BasicBlock*) override
+    {
+    }
+
+    void VisitTreeEdge(BasicBlock* source, BasicBlock* target) override
+    {
+        // Tree edges should not be in the schema.
+        //
+        // If they are, we have some kind of mismatch between instrumentation and
+        // reconstruction. Flag this.
+        //
+        EdgeKey key(source, target);
+
+        if (m_edgeKeyToEdgeMap.Lookup(key))
         {
-            JITDUMP("BBOPT set, but no profile data available (hr=%08x)\n", fgPgoQueryResult);
+            JITDUMP("Did not expect tree edge " FMT_BB " -> " FMT_BB " to be present in the schema (key %08x, %08x)\n",
+                    source->bbNum, target->bbNum, key.m_sourceKey, key.m_targetKey);
+
+            Mismatch();
+            return;
+        }
+
+        Edge* const edge = new (m_allocator) Edge(source, target);
+        m_edges++;
+        m_unknownEdges++;
+
+        BlockInfo* const sourceInfo = BlockToInfo(source);
+        edge->m_nextOutgoingEdge    = sourceInfo->m_outgoingEdges;
+        sourceInfo->m_outgoingEdges = edge;
+        sourceInfo->m_outgoingUnknown++;
+
+        BlockInfo* const targetInfo = BlockToInfo(target);
+        edge->m_nextIncomingEdge    = targetInfo->m_incomingEdges;
+        targetInfo->m_incomingEdges = edge;
+        targetInfo->m_incomingUnknown++;
+
+        JITDUMP(" ... unknown edge " FMT_BB " -> " FMT_BB "\n", source->bbNum, target->bbNum);
+    }
+
+    void VisitNonTreeEdge(BasicBlock* source, BasicBlock* target, SpanningTreeVisitor::EdgeKind kind) override
+    {
+        // We may have this edge in the schema, and so already added this edge to the map.
+        //
+        // If not, assume we have a partial schema. We could add a zero count edge,
+        // but such edges don't impact the solving algorithm, so we can omit them.
+        //
+        EdgeKey key(source, target);
+        Edge*   edge = nullptr;
+
+        if (m_edgeKeyToEdgeMap.Lookup(key, &edge))
+        {
+            BlockInfo* const sourceInfo = BlockToInfo(source);
+            edge->m_nextOutgoingEdge    = sourceInfo->m_outgoingEdges;
+            sourceInfo->m_outgoingEdges = edge;
+
+            BlockInfo* const targetInfo = BlockToInfo(target);
+            edge->m_nextIncomingEdge    = targetInfo->m_incomingEdges;
+            targetInfo->m_incomingEdges = edge;
         }
         else
         {
-            JITDUMP("BBOPT not set\n");
+            // Because the count is presumed zero, we can just pretend this edge doesn't exist.
+            //
+            JITDUMP("Schema is missing non-tree edge " FMT_BB " -> " FMT_BB ", will presume zero\n", source->bbNum,
+                    target->bbNum);
+            m_zeroEdges++;
         }
-        return PhaseStatus::MODIFIED_NOTHING;
     }
+};
 
-    // Summarize profile data
+//------------------------------------------------------------------------
+// EfficientEdgeCountReconstructor::Prepare: set up mapping information and
+//    prepare for spanning tree walk and solver
+//
+void EfficientEdgeCountReconstructor::Prepare()
+{
+    // Create per-block info, and set up the key to block map.
     //
-    JITDUMP("Have profile data: %d schema records (schema at %p, data at %p)\n", fgPgoSchemaCount, dspPtr(fgPgoSchema),
-            dspPtr(fgPgoData));
+    for (BasicBlock* block = m_comp->fgFirstBB; (block != nullptr); block = block->bbNext)
+    {
+        m_keyToBlockMap.Set(BlockToKey(block), block);
+        BlockInfo* const info = new (m_allocator) BlockInfo();
+        SetBlockInfo(block, info);
 
-    fgNumProfileRuns      = 0;
-    unsigned otherRecords = 0;
+        // No block counts are known, initially.
+        //
+        m_blocks++;
+        m_unknownBlocks++;
+    }
 
-    for (UINT32 iSchema = 0; iSchema < fgPgoSchemaCount; iSchema++)
+    // Create edges for schema entries with edge counts, and set them up in
+    // the edge key to edge map.
+    //
+    for (UINT32 iSchema = 0; iSchema < m_comp->fgPgoSchemaCount; iSchema++)
     {
-        switch (fgPgoSchema[iSchema].InstrumentationKind)
+        const ICorJitInfo::PgoInstrumentationSchema& schemaEntry = m_comp->fgPgoSchema[iSchema];
+        switch (schemaEntry.InstrumentationKind)
         {
-            case ICorJitInfo::PgoInstrumentationKind::NumRuns:
-                fgNumProfileRuns += fgPgoSchema[iSchema].Other;
-                break;
+            case ICorJitInfo::PgoInstrumentationKind::EdgeIntCount:
+            {
+                // Optimization TODO: if profileCount is zero, we can just ignore this edge
+                // and the right things will happen.
+                //
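+                // For edge count records, the schema ILOffset field holds the source
+                // block key and the Other field holds the target block key.
+                //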
+                uint32_t const profileCount = *(uint32_t*)(m_comp->fgPgoData + schemaEntry.Offset);
+                {
+                    BasicBlock::weight_t const weight = (BasicBlock::weight_t)profileCount;
 
-            case ICorJitInfo::PgoInstrumentationKind::BasicBlockIntCount:
-                fgPgoBlockCounts++;
-                break;
+                    // Find the blocks.
+                    //
+                    BasicBlock* sourceBlock = nullptr;
 
-            case ICorJitInfo::PgoInstrumentationKind::TypeHandleHistogramCount:
-                fgPgoClassProfiles++;
-                break;
+                    if (!m_keyToBlockMap.Lookup(schemaEntry.ILOffset, &sourceBlock))
+                    {
+                        JITDUMP("Could not find source block for schema entry %d (IL offset/key %08x\n", iSchema,
+                                schemaEntry.ILOffset);
+                    }
+
+                    BasicBlock* targetBlock = nullptr;
+
+                    if (!m_keyToBlockMap.Lookup(schemaEntry.Other, &targetBlock))
+                    {
+                        JITDUMP("Could not find target block for schema entry %d (IL offset/key %08x\n", iSchema,
+                                schemaEntry.ILOffset);
+                    }
+
+                    if ((sourceBlock == nullptr) || (targetBlock == nullptr))
+                    {
+                        // Looks like there is skew between schema and graph.
+                        //
+                        Mismatch();
+                        continue;
+                    }
+
+                    Edge* const edge = new (m_allocator) Edge(sourceBlock, targetBlock);
+
+                    JITDUMP("... adding known edge " FMT_BB " -> " FMT_BB ": weight %0f\n", edge->m_sourceBlock->bbNum,
+                            edge->m_targetBlock->bbNum, weight);
+
+                    edge->m_weightKnown = true;
+                    edge->m_weight      = weight;
+
+                    EdgeKey edgeKey(schemaEntry.ILOffset, schemaEntry.Other);
+                    m_edgeKeyToEdgeMap.Set(edgeKey, edge);
+
+                    m_edges++;
+                }
+            }
+            break;
 
             default:
-                otherRecords++;
                 break;
         }
     }
+}
 
-    if (fgNumProfileRuns == 0)
+//------------------------------------------------------------------------
+// EfficientEdgeCountReconstructor::Solve: solve for missing edge and block counts
+//
+void EfficientEdgeCountReconstructor::Solve()
+{
+    // If issues arose earlier, then don't try solving.
+    //
+    if (m_badcode || m_mismatch)
     {
-        fgNumProfileRuns = 1;
+        JITDUMP("... not solving because of the %s\n", m_badcode ? "badcode" : "mismatch")
+        return;
     }
 
-    JITDUMP("Profile summary: %d runs, %d block probes, %d class profiles, %d other records\n", fgNumProfileRuns,
-            fgPgoBlockCounts, fgPgoClassProfiles, otherRecords);
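+    // Cap the number of solver passes as a safeguard against non-convergence.
+    //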
+    unsigned       nPasses = 0;
+    unsigned const nLimit  = 10;
 
-    assert(fgPgoBlockCounts > 0);
+    JITDUMP("\nSolver: %u blocks, %u unknown; %u edges, %u unknown, %u zero (and so ignored)\n", m_blocks,
+            m_unknownBlocks, m_edges, m_unknownEdges, m_zeroEdges);
 
-    fgIncorporateBlockCounts();
-    return PhaseStatus::MODIFIED_EVERYTHING;
-}
-
-//------------------------------------------------------------------------
-// fgIncorporateBlockCounts: read block count based profile data
-//   and set block weights
-//
-// Notes:
-//   Count data for inlinees is scaled (usually down).
-//
-//   Since we are now running before the importer, we do not know which
-//   blocks will be imported, and we should not see any internal blocks.
-//
-// Todo:
-//   Normalize counts.
-//
-//   Take advantage of the (likely) correspondence between block order
-//   and schema order?
-//
-//   Find some other mechanism for handling cases where handler entry
-//   blocks must be in the hot section.
-//
-void Compiler::fgIncorporateBlockCounts()
-{
-    for (BasicBlock* block = fgFirstBB; block != nullptr; block = block->bbNext)
+    while ((m_unknownBlocks > 0) && (nPasses < nLimit))
     {
-        BasicBlock::weight_t profileWeight;
+        nPasses++;
+        JITDUMP("\nPass [%u]: %u unknown blocks, %u unknown edges\n", nPasses, m_unknownBlocks, m_unknownEdges);
 
-        if (fgGetProfileWeightForBasicBlock(block->bbCodeOffs, &profileWeight))
+        // TODO: no point walking all the blocks here, we should find a way to just walk
+        // the subset with unknown counts or edges.
+        //
+        // The ideal solver order is likely reverse postorder over the depth-first spanning tree.
+        // We approximate it here by running from last node to first.
+        //
+        for (BasicBlock* block = m_comp->fgLastBB; (block != nullptr); block = block->bbPrev)
         {
-            if (compIsForInlining())
+            BlockInfo* const info = BlockToInfo(block);
+
+            // Try and determine block weight.
+            //
+            if (!info->m_weightKnown)
             {
-                if (impInlineInfo->profileScaleState == InlineInfo::ProfileScaleState::KNOWN)
+                JITDUMP(FMT_BB ": %u incoming unknown, %u outgoing unknown\n", block->bbNum, info->m_incomingUnknown,
+                        info->m_outgoingUnknown);
+
+                BasicBlock::weight_t weight      = BB_ZERO_WEIGHT;
+                bool                 weightKnown = false;
+                if (info->m_incomingUnknown == 0)
+                {
+                    JITDUMP(FMT_BB ": all incoming edge weights known, summming...\n", block->bbNum);
+                    for (Edge* edge = info->m_incomingEdges; edge != nullptr; edge = edge->m_nextIncomingEdge)
+                    {
+                        if (!edge->m_weightKnown)
+                        {
+                            JITDUMP("... odd, expected " FMT_BB " -> " FMT_BB " to have known weight\n",
+                                    edge->m_sourceBlock->bbNum, edge->m_targetBlock->bbNum);
+                        }
+                        assert(edge->m_weightKnown);
+                        JITDUMP("  " FMT_BB " -> " FMT_BB " has weight %0f\n", edge->m_sourceBlock->bbNum,
+                                edge->m_targetBlock->bbNum, edge->m_weight);
+                        weight += edge->m_weight;
+                    }
+                    JITDUMP(FMT_BB ": all incoming edge weights known, sum is %0f\n", block->bbNum, weight);
+                    weightKnown = true;
+                }
+                else if (info->m_outgoingUnknown == 0)
                 {
-                    double scaledWeight = impInlineInfo->profileScaleFactor * profileWeight;
-                    profileWeight       = (BasicBlock::weight_t)scaledWeight;
+                    JITDUMP(FMT_BB ": all outgoing edge weights known, summming...\n", block->bbNum);
+                    for (Edge* edge = info->m_outgoingEdges; edge != nullptr; edge = edge->m_nextOutgoingEdge)
+                    {
+                        if (!edge->m_weightKnown)
+                        {
+                            JITDUMP("... odd, expected " FMT_BB " -> " FMT_BB " to have known weight\n",
+                                    edge->m_sourceBlock->bbNum, edge->m_targetBlock->bbNum);
+                        }
+                        assert(edge->m_weightKnown);
+                        JITDUMP("  " FMT_BB " -> " FMT_BB " has weight %0f\n", edge->m_sourceBlock->bbNum,
+                                edge->m_targetBlock->bbNum, edge->m_weight);
+                        weight += edge->m_weight;
+                    }
+                    JITDUMP(FMT_BB ": all outgoing edge weights known, sum is %0f\n", block->bbNum, weight);
+                    weightKnown = true;
                 }
-            }
 
-            block->setBBProfileWeight(profileWeight);
+                if (weightKnown)
+                {
+                    info->m_weight      = weight;
+                    info->m_weightKnown = true;
+                    assert(m_unknownBlocks > 0);
+                    m_unknownBlocks--;
+                }
+            }
 
-            if (profileWeight == BB_ZERO_WEIGHT)
+            // If we still don't know the block weight, move on to the next block.
+            //
+            if (!info->m_weightKnown)
             {
-                block->bbSetRunRarely();
+                continue;
             }
-            else
+
+            // If we know the block weight, see if we can resolve any edge weights.
+            //
+            if (info->m_incomingUnknown == 1)
             {
-                block->bbFlags &= ~BBF_RUN_RARELY;
+                BasicBlock::weight_t weight       = BB_ZERO_WEIGHT;
+                Edge*                resolvedEdge = nullptr;
+                for (Edge* edge = info->m_incomingEdges; edge != nullptr; edge = edge->m_nextIncomingEdge)
+                {
+                    if (edge->m_weightKnown)
+                    {
+                        weight += edge->m_weight;
+                    }
+                    else
+                    {
+                        assert(resolvedEdge == nullptr);
+                        resolvedEdge = edge;
+                    }
+                }
+
+                assert(resolvedEdge != nullptr);
+
+                weight = info->m_weight - weight;
+
+                JITDUMP(FMT_BB " -> " FMT_BB
+                               ": target block weight and all other incoming edge weights known, so weight is %0f\n",
+                        resolvedEdge->m_sourceBlock->bbNum, resolvedEdge->m_targetBlock->bbNum, weight);
+
+                // If we arrive at a negative count for this edge, set it to zero.
+                //
+                if (weight < 0)
+                {
+                    JITDUMP(" .... weight was negative, setting to zero\n");
+                    NegativeCount();
+                    weight = 0;
+                }
+
+                resolvedEdge->m_weight      = weight;
+                resolvedEdge->m_weightKnown = true;
+
+                // Update source and target info.
+                //
+                assert(BlockToInfo(resolvedEdge->m_sourceBlock)->m_outgoingUnknown > 0);
+                BlockToInfo(resolvedEdge->m_sourceBlock)->m_outgoingUnknown--;
+                info->m_incomingUnknown--;
+                assert(m_unknownEdges > 0);
+                m_unknownEdges--;
             }
 
-#if HANDLER_ENTRY_MUST_BE_IN_HOT_SECTION
-            // Handle a special case -- some handler entries can't have zero profile count.
-            //
-            if (this->bbIsHandlerBeg(block) && block->isRunRarely())
+            if (info->m_outgoingUnknown == 1)
             {
-                JITDUMP("Suppressing zero count for " FMT_BB " as it is a handler entry\n", block->bbNum);
-                block->makeBlockHot();
+                BasicBlock::weight_t weight       = BB_ZERO_WEIGHT;
+                Edge*                resolvedEdge = nullptr;
+                for (Edge* edge = info->m_outgoingEdges; edge != nullptr; edge = edge->m_nextOutgoingEdge)
+                {
+                    if (edge->m_weightKnown)
+                    {
+                        weight += edge->m_weight;
+                    }
+                    else
+                    {
+                        assert(resolvedEdge == nullptr);
+                        resolvedEdge = edge;
+                    }
+                }
+
+                assert(resolvedEdge != nullptr);
+
+                weight = info->m_weight - weight;
+
+                JITDUMP(FMT_BB " -> " FMT_BB
+                               ": source block weight and all other outgoing edge weights known, so weight is %0f\n",
+                        resolvedEdge->m_sourceBlock->bbNum, resolvedEdge->m_targetBlock->bbNum, weight);
+
+                // If we arrive at a negative count for this edge, set it to zero.
+                //
+                if (weight < 0)
+                {
+                    JITDUMP(" .... weight was negative, setting to zero\n");
+                    NegativeCount();
+                    weight = 0;
+                }
+
+                resolvedEdge->m_weight      = weight;
+                resolvedEdge->m_weightKnown = true;
+
+                // Update source and target info.
+                //
+                info->m_outgoingUnknown--;
+                assert(BlockToInfo(resolvedEdge->m_targetBlock)->m_incomingUnknown > 0);
+                BlockToInfo(resolvedEdge->m_targetBlock)->m_incomingUnknown--;
+                assert(m_unknownEdges > 0);
+                m_unknownEdges--;
             }
-#endif
         }
     }
+
+    if (m_unknownBlocks == 0)
+    {
+        JITDUMP("\nSolver: converged in %u passes\n", nPasses);
+    }
+    else
+    {
+        JITDUMP("\nSolver: failed to converge in %u passes, %u blocks and %u edges remain unsolved\n", nPasses,
+                m_unknownBlocks, m_unknownEdges);
+        FailedToConverge();
+    }
+}
+
+//------------------------------------------------------------------------
+// EfficientEdgeCountReconstructor::Propagate: actually set block weights.
+//
+// Notes:
+//    For inlinees, weights are scaled appropriately.
+//
+void EfficientEdgeCountReconstructor::Propagate()
+{
+    // We don't expect mismatches or convergence failures.
+    //
+    assert(!m_mismatch);
+    assert(!m_failedToConverge);
+
+    // If any issues arose during reconstruction, don't set weights.
+    //
+    if (m_badcode || m_mismatch || m_failedToConverge)
+    {
+        JITDUMP("... discarding profile data because of %s\n",
+                m_badcode ? "badcode" : m_mismatch ? "mismatch" : "failed to converge");
+
+        // Make sure nothing else in the jit looks at the profile data.
+        //
+        m_comp->fgPgoSchema = nullptr;
+
+        return;
+    }
+
+    if (m_comp->compIsForInlining())
+    {
+        // Tentatively set first block's profile to compute inlinee profile scale.
+        //
+        BlockInfo* const info = BlockToInfo(m_comp->fgFirstBB);
+        assert(info->m_weightKnown);
+
+        m_comp->fgSetProfileWeight(m_comp->fgFirstBB, info->m_weight);
+        m_comp->fgComputeProfileScale();
+    }
+
+    // Set weight on all blocks (will reset entry weight for inlinees based
+    // on the scale computed above).
+    //
+    for (BasicBlock* block = m_comp->fgFirstBB; (block != nullptr); block = block->bbNext)
+    {
+        BlockInfo* const info = BlockToInfo(block);
+        assert(info->m_weightKnown);
+
+        m_comp->fgSetProfileWeight(block, info->m_weight);
+    }
+}
+
+//------------------------------------------------------------------------
+// fgIncorporateEdgeCounts: read sparse edge count based profile data
+//   and set block weights
+//
+// Notes:
+//   Because edge counts are sparse, we need to solve for the missing
+//   edge counts; in the process, we also determine block counts.
+//
+// Todo:
+//   Normalize counts.
+//   Since we have edge weights here, we might as well set them
+//   (or likelihoods)
+//
+void Compiler::fgIncorporateEdgeCounts()
+{
+    JITDUMP("\nReconstructing block counts from sparse edge instrumentation\n");
+
+    EfficientEdgeCountReconstructor e(this);
+    e.Prepare();
+    WalkSpanningTree(&e);
+    e.Solve();
+    e.Propagate();
 }
 
 bool flowList::setEdgeWeightMinChecked(BasicBlock::weight_t newWeight, BasicBlock::weight_t slop, bool* wbUsedSlop)
index 5ffab7c..bb4ae67 100644 (file)
@@ -458,6 +458,7 @@ CONFIG_INTEGER(TC_OnStackReplacement_InitialCounter, W("TC_OnStackReplacement_In
 // Profile instrumentation options
 CONFIG_INTEGER(JitMinimalProfiling, W("JitMinimalProfiling"), 0)
 CONFIG_INTEGER(JitClassProfiling, W("JitClassProfiling"), 0)
+CONFIG_INTEGER(JitEdgeProfiling, W("JitEdgeProfiling"), 0)
 
 #if defined(DEBUG)
 // JitFunctionFile: Name of a file that contains a list of functions. If the currently compiled function is in the
index 1b1689e..860f7d6 100644 (file)
@@ -157,11 +157,14 @@ void Phase::PostPhase(PhaseStatus status)
     // well as the new-style phases that have been updated to return
     // PhaseStatus from their DoPhase methods.
     //
-    static Phases s_allowlist[] = {PHASE_IMPORTATION,       PHASE_IBCINSTR,      PHASE_INCPROFILE,
-                                   PHASE_INDXCALL,          PHASE_MORPH_INLINE,  PHASE_ALLOCATE_OBJECTS,
-                                   PHASE_EMPTY_TRY,         PHASE_EMPTY_FINALLY, PHASE_MERGE_FINALLY_CHAINS,
-                                   PHASE_CLONE_FINALLY,     PHASE_MERGE_THROWS,  PHASE_MORPH_GLOBAL,
-                                   PHASE_BUILD_SSA,         PHASE_RATIONALIZE,   PHASE_LOWERING,
+    static Phases s_allowlist[] = {PHASE_IMPORTATION,       PHASE_IBCINSTR,
+                                   PHASE_IBCPREP,           PHASE_INCPROFILE,
+                                   PHASE_INDXCALL,          PHASE_MORPH_INLINE,
+                                   PHASE_ALLOCATE_OBJECTS,  PHASE_EMPTY_TRY,
+                                   PHASE_EMPTY_FINALLY,     PHASE_MERGE_FINALLY_CHAINS,
+                                   PHASE_CLONE_FINALLY,     PHASE_MERGE_THROWS,
+                                   PHASE_MORPH_GLOBAL,      PHASE_BUILD_SSA,
+                                   PHASE_RATIONALIZE,       PHASE_LOWERING,
                                    PHASE_STACK_LEVEL_SETTER};
 
     if (madeChanges)
index 5e8a79a..c38dcdd 100644 (file)
@@ -41,6 +41,7 @@ namespace Internal.Pgo
         TypeHandleHistogramTypeHandle = (DescriptorMin * 3) | TypeHandle, // TypeHandle that is part of a type histogram
         Version = (DescriptorMin * 4) | None, // Version is encoded in the Other field of the schema
         NumRuns = (DescriptorMin * 5) | None, // Number of runs is encoded in the Other field of the schema
+        EdgeIntCount = (DescriptorMin * 6) | FourByte, // 4 byte edge counter, using unsigned 4 byte int
     }
 
     public interface IPgoSchemaDataLoader<TType>
@@ -550,7 +551,6 @@ namespace Internal.Pgo
 
             void MergeInSchemaElem(Dictionary<PgoSchemaElem, PgoSchemaElem> dataMerger, PgoSchemaElem schema)
             {
-                long sortKey = ((long)schema.ILOffset) << 32 | (long)schema.InstrumentationKind;
                 if (dataMerger.TryGetValue(schema, out var existingSchemaItem))
                 {
                     // Actually merge two schema items
@@ -559,6 +559,7 @@ namespace Internal.Pgo
                     switch (existingSchemaItem.InstrumentationKind)
                     {
                         case PgoInstrumentationKind.BasicBlockIntCount:
+                        case PgoInstrumentationKind.EdgeIntCount:
                         case PgoInstrumentationKind.TypeHandleHistogramCount:
                             if ((existingSchemaItem.Count != 1) || (schema.Count != 1))
                             {
index d85316d..15ff3ab 100644 (file)
@@ -1065,7 +1065,7 @@ HRESULT PgoManager::getPgoInstrumentationResultsInstance(MethodDesc* pMD, BYTE**
         size_t* pInstrumentationDataDst = (size_t*)((*pAllocatedData) + schemaDataSize);
         size_t* pInstrumentationDataDstEnd = (size_t*)((*pAllocatedData) + schemaDataSize + instrumentationDataSize);
         *pInstrumentationData = (BYTE*)pInstrumentationDataDst;
-        volatile size_t*pSrc = (volatile size_t*)found->header.GetData();
+        volatile size_t*pSrc = (volatile size_t*)(found->header.GetData() + found->header.countsOffset);
         // Use a volatile memcpy to copy the instrumentation data into a temporary buffer
         // This allows the instrumentation data to be made stable for reading during the execution of the jit
         // and since the copy moves through a volatile pointer, there will be no tearing of individual data elements
index 4371f64..a4460c5 100644 (file)
@@ -56,6 +56,7 @@
       COMPlus_JitObjectStackAllocation;
       COMPlus_JitInlinePolicyProfile;
       COMPlus_JitClassProfiling;
+      COMPlus_JitEdgeProfiling;
       RunningIlasmRoundTrip
     </COMPlusVariables>
   </PropertyGroup>
     <TestEnvironment Include="jitpgo" TieredPGO="1" TieredCompilation="1" />
     <TestEnvironment Include="jitpgo_inline" TieredPGO="1" TieredCompilation="1" JitInlinePolicyProfile="1"/>
     <TestEnvironment Include="jitpgo_classes" TieredPGO="1" TieredCompilation="1" JitEnableGuardedDevirtualization="1" JitClassProfiling="1"/>
+    <TestEnvironment Include="jitpgo_edgeinstrumentation" TieredPGO="1" TieredCompilation="1" JitEdgeProfiling="1"/>
     <TestEnvironment Include="jitguardeddevirtualization" JitEnableGuardedDevirtualization="1" TieredCompilation="0" />
     <TestEnvironment Include="jitehwritethru" EnableEhWriteThru="1" TieredCompilation="0" />
     <TestEnvironment Include="jitobjectstackallocation" JitObjectStackAllocation="1" TieredCompilation="0" />