JIT: refactor instrumentation code (#47509)
authorAndy Ayers <andya@microsoft.com>
Thu, 28 Jan 2021 02:28:22 +0000 (18:28 -0800)
committerGitHub <noreply@github.com>
Thu, 28 Jan 2021 02:28:22 +0000 (18:28 -0800)
Refactor `fgInstrument` and related methods and classes, so that the strategies
for block and class instrumentation are clearly separated out, and can be
varied.

src/coreclr/jit/block.h
src/coreclr/jit/compiler.h
src/coreclr/jit/compmemkind.h
src/coreclr/jit/fgprofile.cpp

index d92f5b2..cfe2641 100644 (file)
@@ -726,8 +726,16 @@ struct BasicBlock : private LIR::Range
     };
 
 #define NO_BASE_TMP UINT_MAX // base# to use when we have none
-    unsigned bbStkTempsIn;   // base# for input stack temps
-    unsigned bbStkTempsOut;  // base# for output stack temps
+
+    union {
+        unsigned bbStkTempsIn;       // base# for input stack temps
+        int      bbCountSchemaIndex; // schema index for count instrumentation
+    };
+
+    union {
+        unsigned bbStkTempsOut;      // base# for output stack temps
+        int      bbClassSchemaIndex; // schema index for class instrumentation
+    };
 
 #define MAX_XCPTN_INDEX (USHRT_MAX - 1)
 
index 3feabf5..ca3d5ac 100644 (file)
@@ -5586,6 +5586,7 @@ public:
 #endif
 
 public:
+    Statement* fgNewStmtAtBeg(BasicBlock* block, GenTree* tree);
     void fgInsertStmtAtEnd(BasicBlock* block, Statement* stmt);
     Statement* fgNewStmtAtEnd(BasicBlock* block, GenTree* tree);
     Statement* fgNewStmtNearEnd(BasicBlock* block, GenTree* tree);
@@ -5593,8 +5594,6 @@ public:
 private:
     void fgInsertStmtNearEnd(BasicBlock* block, Statement* stmt);
     void fgInsertStmtAtBeg(BasicBlock* block, Statement* stmt);
-    Statement* fgNewStmtAtBeg(BasicBlock* block, GenTree* tree);
-
     void fgInsertStmtAfter(BasicBlock* block, Statement* insertionPoint, Statement* stmt);
 
 public:
index 44b856e..1d8e0a8 100644 (file)
@@ -58,6 +58,7 @@ CompMemKindMacro(ClassLayout)
 CompMemKindMacro(TailMergeThrows)
 CompMemKindMacro(EarlyProp)
 CompMemKindMacro(ZeroInit)
+CompMemKindMacro(Pgo)
 //clang-format on
 
 #undef CompMemKindMacro
index ed20f93..d9a5bb1 100644 (file)
@@ -211,6 +211,227 @@ bool Compiler::fgGetProfileWeightForBasicBlock(IL_OFFSET offset, BasicBlock::wei
     return true;
 }
 
+typedef jitstd::vector<ICorJitInfo::PgoInstrumentationSchema> Schema;
+
+//------------------------------------------------------------------------
+// Instrumentor: base class for count and class instrumentation
+//
+class Instrumentor
+{
+protected:
+    Compiler* m_comp;
+    unsigned  m_schemaCount;
+    unsigned  m_instrCount;
+
+protected:
+    Instrumentor(Compiler* comp) : m_comp(comp), m_schemaCount(0), m_instrCount(0)
+    {
+    }
+
+public:
+    virtual void Prepare()
+    {
+    }
+    virtual void BuildSchemaElements(BasicBlock* block, Schema& schema)
+    {
+    }
+    virtual void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory)
+    {
+    }
+    virtual void InstrumentMethodEntry(Schema& schema, BYTE* profileMemory)
+    {
+    }
+    virtual void SuppressProbes()
+    {
+    }
+    unsigned SchemaCount()
+    {
+        return m_schemaCount;
+    }
+    unsigned InstrCount()
+    {
+        return m_instrCount;
+    }
+};
+
+//------------------------------------------------------------------------
+// NonInstrumentor: instrumentor that does not instrument anything
+//
+class NonInstrumentor : public Instrumentor
+{
+public:
+    NonInstrumentor(Compiler* comp) : Instrumentor(comp)
+    {
+    }
+};
+
+//------------------------------------------------------------------------
+// BlockCountInstrumentor: instrumentor that adds a counter to each
+//   basic block
+//
+class BlockCountInstrumentor : public Instrumentor
+{
+public:
+    BlockCountInstrumentor(Compiler* comp) : Instrumentor(comp)
+    {
+    }
+    void Prepare() override;
+    void BuildSchemaElements(BasicBlock* block, Schema& schema) override;
+    void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) override;
+    void InstrumentMethodEntry(Schema& schema, BYTE* profileMemory) override;
+};
+
+//------------------------------------------------------------------------
+// BlockCountInstrumentor::Prepare: prepare for count instrumentation
+//
+void BlockCountInstrumentor::Prepare()
+{
+
+#ifdef DEBUG
+    // Set schema index to invalid value
+    //
+    for (BasicBlock* block = m_comp->fgFirstBB; (block != nullptr); block = block->bbNext)
+    {
+        block->bbCountSchemaIndex = -1;
+    }
+#endif
+}
+
+//------------------------------------------------------------------------
+// BlockCountInstrumentor::BuildSchemaElements: create schema elements for a block counter
+//
+// Arguments:
+//   block -- block to instrument
+//
+void BlockCountInstrumentor::BuildSchemaElements(BasicBlock* block, Schema& schema)
+{
+    // Remember the schema index for this block.
+    //
+    assert(block->bbCountSchemaIndex == -1);
+    block->bbCountSchemaIndex = (int)schema.size();
+
+    // Assign the current block's IL offset into the profile data
+    // (make sure IL offset is sane)
+    //
+    IL_OFFSET offset = block->bbCodeOffs;
+    assert((int)offset >= 0);
+
+    ICorJitInfo::PgoInstrumentationSchema schemaElem;
+    schemaElem.Count               = 1;
+    schemaElem.Other               = 0;
+    schemaElem.InstrumentationKind = ICorJitInfo::PgoInstrumentationKind::BasicBlockIntCount;
+    schemaElem.ILOffset            = offset;
+    schemaElem.Offset              = 0;
+
+    schema.push_back(schemaElem);
+
+    m_schemaCount++;
+}
+
+//------------------------------------------------------------------------
+// BlockCountInstrumentor::Instrument: add counter probe to block
+//
+// Arguments:
+//   block -- block of interest
+//   schema -- instrumentation schema
+//   profileMemory -- profile data slab
+//
+void BlockCountInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory)
+{
+    const int schemaIndex = (int)block->bbCountSchemaIndex;
+
+    assert(block->bbCodeOffs == (IL_OFFSET)schema[schemaIndex].ILOffset);
+    assert(schema[schemaIndex].InstrumentationKind == ICorJitInfo::PgoInstrumentationKind::BasicBlockIntCount);
+    size_t addrOfCurrentExecutionCount = (size_t)(schema[schemaIndex].Offset + profileMemory);
+
+    // Read Basic-Block count value
+    GenTree* valueNode =
+        m_comp->gtNewIndOfIconHandleNode(TYP_INT, addrOfCurrentExecutionCount, GTF_ICON_BBC_PTR, false);
+
+    // Increment value by 1
+    GenTree* rhsNode = m_comp->gtNewOperNode(GT_ADD, TYP_INT, valueNode, m_comp->gtNewIconNode(1));
+
+    // Write new Basic-Block count value
+    GenTree* lhsNode = m_comp->gtNewIndOfIconHandleNode(TYP_INT, addrOfCurrentExecutionCount, GTF_ICON_BBC_PTR, false);
+    GenTree* asgNode = m_comp->gtNewAssignNode(lhsNode, rhsNode);
+
+    m_comp->fgNewStmtAtBeg(block, asgNode);
+
+    m_instrCount++;
+}
+
+//------------------------------------------------------------------------
+// BlockCountInstrumentor::InstrumentMethodEntry: add any special method entry instrumentation
+//
+// Arguments:
+//   schema -- instrumentation schema
+//   profileMemory -- profile data slab
+//
+// Notes:
+//   When prejitting, add the method entry callback node
+//
+void BlockCountInstrumentor::InstrumentMethodEntry(Schema& schema, BYTE* profileMemory)
+{
+    Compiler::Options& opts = m_comp->opts;
+    Compiler::Info&    info = m_comp->info;
+
+    // Nothing to do, if not prejitting.
+    //
+    if (!opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT))
+    {
+        return;
+    }
+
+    // Find the address of the entry block's counter.
+    //
+    BasicBlock* const block            = m_comp->fgFirstBB;
+    const int         firstSchemaIndex = block->bbCountSchemaIndex;
+    assert(block->bbCodeOffs == (IL_OFFSET)schema[firstSchemaIndex].ILOffset);
+    assert(schema[firstSchemaIndex].InstrumentationKind == ICorJitInfo::PgoInstrumentationKind::BasicBlockIntCount);
+    size_t addrOfFirstExecutionCount = (size_t)(schema[firstSchemaIndex].Offset + profileMemory);
+
+    GenTree* arg;
+
+#ifdef FEATURE_READYTORUN_COMPILER
+    if (opts.IsReadyToRun())
+    {
+        mdMethodDef currentMethodToken = info.compCompHnd->getMethodDefFromMethod(info.compMethodHnd);
+
+        CORINFO_RESOLVED_TOKEN resolvedToken;
+        resolvedToken.tokenContext = MAKE_METHODCONTEXT(info.compMethodHnd);
+        resolvedToken.tokenScope   = info.compScopeHnd;
+        resolvedToken.token        = currentMethodToken;
+        resolvedToken.tokenType    = CORINFO_TOKENKIND_Method;
+
+        info.compCompHnd->resolveToken(&resolvedToken);
+
+        arg = m_comp->impTokenToHandle(&resolvedToken);
+    }
+    else
+#endif
+    {
+        arg = m_comp->gtNewIconEmbMethHndNode(info.compMethodHnd);
+    }
+
+    GenTreeCall::Use* args = m_comp->gtNewCallArgs(arg);
+    GenTree*          call = m_comp->gtNewHelperCallNode(CORINFO_HELP_BBT_FCN_ENTER, TYP_VOID, args);
+
+    // Read Basic-Block count value
+    GenTree* valueNode = m_comp->gtNewIndOfIconHandleNode(TYP_INT, addrOfFirstExecutionCount, GTF_ICON_BBC_PTR, false);
+
+    // Compare Basic-Block count value against zero
+    GenTree*   relop = m_comp->gtNewOperNode(GT_NE, TYP_INT, valueNode, m_comp->gtNewIconNode(0, TYP_INT));
+    GenTree*   colon = new (m_comp, GT_COLON) GenTreeColon(TYP_VOID, m_comp->gtNewNothingNode(), call);
+    GenTree*   cond  = m_comp->gtNewQmarkNode(TYP_VOID, relop, colon);
+    Statement* stmt  = m_comp->gtNewStmt(cond);
+
+    m_comp->fgEnsureFirstBBisScratch();
+    m_comp->fgInsertStmtAtEnd(block, stmt);
+}
+
+//------------------------------------------------------------------------
+// ClassProbeVisitor: invoke functor on each virtual call in a tree
+//
 template <class TFunctor>
 class ClassProbeVisitor final : public GenTreeVisitor<ClassProbeVisitor<TFunctor>>
 {
@@ -244,387 +465,407 @@ public:
 };
 
 //------------------------------------------------------------------------
-// fgInstrumentMethod: add instrumentation probes to the method
-//
-// Note:
+// BuildClassProbeSchemaGen: functor that creates class probe schema elements
 //
-//   By default this instruments each non-internal block with
-//   a counter probe.
-//
-//   Probes data is held in a runtime-allocated slab of Entries, with
-//   each Entry an (IL offset, count) pair. This method determines
-//   the number of Entrys needed and initializes each entry's IL offset.
-//
-//   Options (many not yet implemented):
-//   * suppress count instrumentation for methods with
-//     a single block, or
-//   * instrument internal blocks (requires same internal expansions
-//     for BBOPT and BBINSTR, not yet guaranteed)
-//   * use spanning tree for minimal count probing
-//   * add class profile probes for virtual and interface call sites
-//   * record indirection cells for VSD calls
-//
-void Compiler::fgInstrumentMethod()
+class BuildClassProbeSchemaGen
 {
-    noway_assert(!compIsForInlining());
-    jitstd::vector<ICorJitInfo::PgoInstrumentationSchema> schema(getAllocator());
+private:
+    Schema&   m_schema;
+    unsigned& m_schemaCount;
 
-    // Count the number of basic blocks in the method
-    // that will get block count probes.
-    //
-    int         countOfBlocks = 0;
-    BasicBlock* block;
-    for (block = fgFirstBB; (block != nullptr); block = block->bbNext)
+public:
+    BuildClassProbeSchemaGen(Schema& schema, unsigned& schemaCount) : m_schema(schema), m_schemaCount(schemaCount)
     {
-        // We don't want to profile any un-imported blocks
-        //
-        if ((block->bbFlags & BBF_IMPORTED) == 0)
+    }
+
+    void operator()(Compiler* compiler, GenTreeCall* call)
+    {
+        ICorJitInfo::PgoInstrumentationSchema schemaElem;
+        schemaElem.Count = 1;
+        schemaElem.Other = ICorJitInfo::ClassProfile::CLASS_FLAG;
+        if (call->IsVirtualStub())
         {
-            continue;
+            schemaElem.Other |= ICorJitInfo::ClassProfile::INTERFACE_FLAG;
         }
-
-        if ((block->bbFlags & BBF_HAS_CLASS_PROFILE) != 0)
+        else
         {
-            class BuildClassProbeSchemaGen
-            {
-                jitstd::vector<ICorJitInfo::PgoInstrumentationSchema>* m_schema;
+            assert(call->IsVirtualVtable());
+        }
 
-            public:
-                BuildClassProbeSchemaGen(jitstd::vector<ICorJitInfo::PgoInstrumentationSchema>* schema)
-                    : m_schema(schema)
-                {
-                }
-                void operator()(Compiler* compiler, GenTreeCall* call)
-                {
-                    ICorJitInfo::PgoInstrumentationSchema schemaElem;
-                    schemaElem.Count = 1;
-                    schemaElem.Other = ICorJitInfo::ClassProfile::CLASS_FLAG;
-                    if (call->IsVirtualStub())
-                    {
-                        schemaElem.Other |= ICorJitInfo::ClassProfile::INTERFACE_FLAG;
-                    }
-                    else
-                    {
-                        assert(call->IsVirtualVtable());
-                    }
+        schemaElem.InstrumentationKind = ICorJitInfo::PgoInstrumentationKind::TypeHandleHistogramCount;
+        schemaElem.ILOffset            = jitGetILoffs(call->gtClassProfileCandidateInfo->ilOffset);
+        schemaElem.Offset              = 0;
 
-                    schemaElem.InstrumentationKind = ICorJitInfo::PgoInstrumentationKind::TypeHandleHistogramCount;
-                    schemaElem.ILOffset            = jitGetILoffs(call->gtClassProfileCandidateInfo->ilOffset);
-                    schemaElem.Offset              = 0;
+        m_schema.push_back(schemaElem);
 
-                    m_schema->push_back(schemaElem);
+        // Re-using ILOffset and Other fields from schema item for TypeHandleHistogramCount
+        schemaElem.InstrumentationKind = ICorJitInfo::PgoInstrumentationKind::TypeHandleHistogramTypeHandle;
+        schemaElem.Count               = ICorJitInfo::ClassProfile::SIZE;
+        m_schema.push_back(schemaElem);
 
-                    // Re-using ILOffset and Other fields from schema item for TypeHandleHistogramCount
-                    schemaElem.InstrumentationKind = ICorJitInfo::PgoInstrumentationKind::TypeHandleHistogramTypeHandle;
-                    schemaElem.Count               = ICorJitInfo::ClassProfile::SIZE;
-                    m_schema->push_back(schemaElem);
-                }
-            };
-            // Scan the statements and identify the class probes
-            //
-            BuildClassProbeSchemaGen                    schemaGen(&schema);
-            ClassProbeVisitor<BuildClassProbeSchemaGen> visitor(this, schemaGen);
-            for (Statement* stmt : block->Statements())
-            {
-                visitor.WalkTree(stmt->GetRootNodePointer(), nullptr);
-            }
-        }
+        m_schemaCount++;
+    }
+};
 
-        if (block->bbFlags & BBF_INTERNAL)
-        {
-            continue;
-        }
+//------------------------------------------------------------------------
+// ClassProbeInserter: functor that adds class probe instrumentation
+//
+class ClassProbeInserter
+{
+    Schema&   m_schema;
+    BYTE*     m_profileMemory;
+    int*      m_currentSchemaIndex;
+    unsigned& m_instrCount;
 
-        // Assign the current block's IL offset into the profile data
-        // (make sure IL offset is sane)
+public:
+    ClassProbeInserter(Schema& schema, BYTE* profileMemory, int* pCurrentSchemaIndex, unsigned& instrCount)
+        : m_schema(schema)
+        , m_profileMemory(profileMemory)
+        , m_currentSchemaIndex(pCurrentSchemaIndex)
+        , m_instrCount(instrCount)
+    {
+    }
+
+    void operator()(Compiler* compiler, GenTreeCall* call)
+    {
+        JITDUMP("Found call [%06u] with probe index %d and ilOffset 0x%X\n", compiler->dspTreeID(call),
+                call->gtClassProfileCandidateInfo->probeIndex, call->gtClassProfileCandidateInfo->ilOffset);
+
+        // We transform the call from (CALLVIRT obj, ... args ...) to
+        // to
+        //      (CALLVIRT
+        //        (COMMA
+        //          (ASG tmp, obj)
+        //          (COMMA
+        //            (CALL probe_fn tmp, &probeEntry)
+        //            tmp)))
+        //         ... args ...)
         //
-        IL_OFFSET offset = block->bbCodeOffs;
-        assert((int)offset >= 0);
 
-        ICorJitInfo::PgoInstrumentationSchema schemaElem;
-        schemaElem.Count               = 1;
-        schemaElem.Other               = 0;
-        schemaElem.InstrumentationKind = ICorJitInfo::PgoInstrumentationKind::BasicBlockIntCount;
-        schemaElem.ILOffset            = offset;
-        schemaElem.Offset              = 0;
+        assert(call->gtCallThisArg->GetNode()->TypeGet() == TYP_REF);
+
+        // Sanity check that we're looking at the right schema entry
+        //
+        assert(m_schema[*m_currentSchemaIndex].ILOffset == (int32_t)call->gtClassProfileCandidateInfo->ilOffset);
+        assert(m_schema[*m_currentSchemaIndex].InstrumentationKind ==
+               ICorJitInfo::PgoInstrumentationKind::TypeHandleHistogramCount);
+
+        // Figure out where the table is located.
+        //
+        BYTE* classProfile = m_schema[*m_currentSchemaIndex].Offset + m_profileMemory;
+        *m_currentSchemaIndex += 2; // There are 2 schema entries per class probe
+
+        // Grab a temp to hold the 'this' object as it will be used three times
+        //
+        unsigned const tmpNum             = compiler->lvaGrabTemp(true DEBUGARG("class profile tmp"));
+        compiler->lvaTable[tmpNum].lvType = TYP_REF;
+
+        // Generate the IR...
+        //
+        GenTree* const          classProfileNode = compiler->gtNewIconNode((ssize_t)classProfile, TYP_I_IMPL);
+        GenTree* const          tmpNode          = compiler->gtNewLclvNode(tmpNum, TYP_REF);
+        GenTreeCall::Use* const args             = compiler->gtNewCallArgs(tmpNode, classProfileNode);
+        GenTree* const helperCallNode = compiler->gtNewHelperCallNode(CORINFO_HELP_CLASSPROFILE, TYP_VOID, args);
+        GenTree* const tmpNode2       = compiler->gtNewLclvNode(tmpNum, TYP_REF);
+        GenTree* const callCommaNode  = compiler->gtNewOperNode(GT_COMMA, TYP_REF, helperCallNode, tmpNode2);
+        GenTree* const tmpNode3       = compiler->gtNewLclvNode(tmpNum, TYP_REF);
+        GenTree* const asgNode = compiler->gtNewOperNode(GT_ASG, TYP_REF, tmpNode3, call->gtCallThisArg->GetNode());
+        GenTree* const asgCommaNode = compiler->gtNewOperNode(GT_COMMA, TYP_REF, asgNode, callCommaNode);
+
+        // Update the call
+        //
+        call->gtCallThisArg->SetNode(asgCommaNode);
+
+        JITDUMP("Modified call is now\n");
+        DISPTREE(call);
 
-        schema.push_back(schemaElem);
+        // Restore the stub address on the call
+        //
+        call->gtStubCallStubAddr = call->gtClassProfileCandidateInfo->stubAddr;
 
-        countOfBlocks++;
+        m_instrCount++;
     }
+};
 
-    // We've already counted the number of class probes
-    // when importing.
-    //
-    int countOfCalls = info.compClassProbeCount;
-    assert(((countOfCalls * 2) + countOfBlocks) == (int)schema.size());
+//------------------------------------------------------------------------
+// SuppressProbesFunctor: functor that resets IR back to the state
+//   it had if there were no class probes.
+//
+class SuppressProbesFunctor
+{
+private:
+    unsigned& m_cleanupCount;
 
-    // Optionally bail out, if there are less than three blocks and no call sites to profile.
-    // One block is common. We don't expect to see zero or two blocks here.
-    //
-    // Note we have to at least visit all the profile call sites to properly restore their
-    // stub addresses. So we can't bail out early if there are any of these.
-    //
-    if ((JitConfig.JitMinimalProfiling() > 0) && (countOfBlocks < 3) && (countOfCalls == 0))
+public:
+    SuppressProbesFunctor(unsigned& cleanupCount) : m_cleanupCount(cleanupCount)
     {
-        JITDUMP("Not instrumenting method: %d blocks and %d calls\n", countOfBlocks, countOfCalls);
-        assert(countOfBlocks == 1);
-        return;
     }
 
-    JITDUMP("Instrumenting method, %d blocks and %d calls\n", countOfBlocks, countOfCalls);
+    void operator()(Compiler* compiler, GenTreeCall* call)
+    {
+        // Restore the stub address on the call
+        //
+        call->gtStubCallStubAddr = call->gtClassProfileCandidateInfo->stubAddr;
 
-    // Allocate the profile buffer
+        m_cleanupCount++;
+    }
+};
+
+//------------------------------------------------------------------------
+// ClassProbeInstrumentor: instrumentor that adds a class probe to each
+//   virtual call in the basic block
+//
+class ClassProbeInstrumentor : public Instrumentor
+{
+public:
+    ClassProbeInstrumentor(Compiler* comp) : Instrumentor(comp)
+    {
+    }
+    void Prepare() override;
+    void BuildSchemaElements(BasicBlock* block, Schema& schema) override;
+    void Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory) override;
+    void SuppressProbes() override;
+};
+
+//------------------------------------------------------------------------
+// ClassProbeInstrumentor::Prepare: prepare for class instrumentation
+//
+void ClassProbeInstrumentor::Prepare()
+{
+
+#ifdef DEBUG
+    // Set schema index to invalid value
     //
-    BYTE* profileMemory;
+    for (BasicBlock* block = m_comp->fgFirstBB; (block != nullptr); block = block->bbNext)
+    {
+        block->bbClassSchemaIndex = -1;
+    }
+#endif
+}
 
-    HRESULT res = info.compCompHnd->allocPgoInstrumentationBySchema(info.compMethodHnd, schema.data(),
-                                                                    (UINT32)schema.size(), &profileMemory);
+//------------------------------------------------------------------------
+// ClassProbeInstrumentor::BuildSchemaElements: create schema elements for a class probe
+//
+// Arguments:
+//   block -- block to instrument
+//
+void ClassProbeInstrumentor::BuildSchemaElements(BasicBlock* block, Schema& schema)
+{
+    if ((block->bbFlags & BBF_HAS_CLASS_PROFILE) == 0)
+    {
+        return;
+    }
 
-    // We may not be able to instrument, if so we'll set this false.
-    // We can't just early exit, because we have to clean up calls that we might have profiled.
+    // Remember the schema index for this block.
     //
-    bool instrument = true;
+    block->bbClassSchemaIndex = (int)schema.size();
 
-    if (!SUCCEEDED(res))
+    // Scan the statements and identify the class probes
+    //
+    BuildClassProbeSchemaGen                    schemaGen(schema, m_schemaCount);
+    ClassProbeVisitor<BuildClassProbeSchemaGen> visitor(m_comp, schemaGen);
+    for (Statement* stmt : block->Statements())
     {
-        JITDUMP("Unable to instrument -- block counter allocation failed: 0x%x\n", res);
-        instrument = false;
-        // The E_NOTIMPL status is returned when we are profiling a generic method from a different assembly
-        if (res != E_NOTIMPL)
-        {
-            noway_assert(!"Error: failed to allocate profileBlockCounts");
-            return;
-        }
+        visitor.WalkTree(stmt->GetRootNodePointer(), nullptr);
+    }
+}
+
+//------------------------------------------------------------------------
+// ClassProbeInstrumentor::Instrument: add class probes to block
+//
+// Arguments:
+//   block -- block of interest
+//   schema -- instrumentation schema
+//   profileMemory -- profile data slab
+//
+void ClassProbeInstrumentor::Instrument(BasicBlock* block, Schema& schema, BYTE* profileMemory)
+{
+    if ((block->bbFlags & BBF_HAS_CLASS_PROFILE) == 0)
+    {
+        return;
     }
 
-    // For each BasicBlock (non-Internal)
-    //  1. Assign the blocks bbCodeOffs to the ILOffset field of this blocks profile data.
-    //  2. Add an operation that increments the ExecutionCount field at the beginning of the block.
+    // Would be nice to avoid having to search here by tracking
+    // candidates more directly.
     //
-    // Each (non-Internal) block has it own BlockCounts tuple [ILOffset, ExecutionCount]
-    // To start we initialize our current one with the first one that we allocated
+    JITDUMP("Scanning for calls to profile in " FMT_BB "\n", block->bbNum);
+
+    // Scan the statements and add class probes
     //
-    int currentSchemaIndex = 0;
+    int classSchemaIndex = block->bbClassSchemaIndex;
+    assert((classSchemaIndex >= 0) && (classSchemaIndex < (int)schema.size()));
 
-    // Hold the address of the first blocks ExecutionCount
-    size_t addrOfFirstExecutionCount = 0;
+    ClassProbeInserter                    insertProbes(schema, profileMemory, &classSchemaIndex, m_instrCount);
+    ClassProbeVisitor<ClassProbeInserter> visitor(m_comp, insertProbes);
+    for (Statement* stmt : block->Statements())
+    {
+        visitor.WalkTree(stmt->GetRootNodePointer(), nullptr);
+    }
+}
 
-    for (block = fgFirstBB; (block != nullptr); block = block->bbNext)
+//------------------------------------------------------------------------
+// ClassProbeInstrumentor::SuppressProbes: clean up if we're not instrumenting
+//
+// Notes:
+//   Currently we're hijacking the gtCallStubAddre of the call node to hold
+//   a pointer to the profile candidate info.
+//
+//   We must undo this, if not instrumenting.
+//
+void ClassProbeInstrumentor::SuppressProbes()
+{
+    unsigned                                 cleanupCount = 0;
+    SuppressProbesFunctor                    suppressProbes(cleanupCount);
+    ClassProbeVisitor<SuppressProbesFunctor> visitor(m_comp, suppressProbes);
+
+    for (BasicBlock* block = m_comp->fgFirstBB; (block != nullptr); block = block->bbNext)
     {
-        // We don't want to profile any un-imported blocks
-        //
-        if ((block->bbFlags & BBF_IMPORTED) == 0)
+        if ((block->bbFlags & BBF_HAS_CLASS_PROFILE) == 0)
         {
             continue;
         }
 
-        // We may see class probes in internal blocks, thanks to the
-        // block splitting done by the indirect call transformer.
-        //
-        if (JitConfig.JitClassProfiling() > 0)
+        for (Statement* stmt : block->Statements())
         {
-            // Only works when jitting.
-            assert(!opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT));
-
-            if ((block->bbFlags & BBF_HAS_CLASS_PROFILE) != 0)
-            {
-                // Would be nice to avoid having to search here by tracking
-                // candidates more directly.
-                //
-                JITDUMP("Scanning for calls to profile in " FMT_BB "\n", block->bbNum);
+            visitor.WalkTree(stmt->GetRootNodePointer(), nullptr);
+        }
+    }
 
-                class ClassProbeInserter
-                {
-                    jitstd::vector<ICorJitInfo::PgoInstrumentationSchema>* m_schema;
-                    BYTE*                                                  m_profileMemory;
-                    int*                                                   m_currentSchemaIndex;
-                    bool                                                   m_instrument;
-
-                public:
-                    int m_count = 0;
-
-                    ClassProbeInserter(jitstd::vector<ICorJitInfo::PgoInstrumentationSchema>* schema,
-                                       BYTE*                                                  profileMemory,
-                                       int*                                                   pCurrentSchemaIndex,
-                                       bool                                                   instrument)
-                        : m_schema(schema)
-                        , m_profileMemory(profileMemory)
-                        , m_currentSchemaIndex(pCurrentSchemaIndex)
-                        , m_instrument(instrument)
-                    {
-                    }
-                    void operator()(Compiler* compiler, GenTreeCall* call)
-                    {
-                        JITDUMP("Found call [%06u] with probe index %d and ilOffset 0x%X\n", compiler->dspTreeID(call),
-                                call->gtClassProfileCandidateInfo->probeIndex,
-                                call->gtClassProfileCandidateInfo->ilOffset);
+    assert(cleanupCount == m_comp->info.compClassProbeCount);
+}
 
-                        m_count++;
-                        if (m_instrument)
-                        {
-                            // We transform the call from (CALLVIRT obj, ... args ...) to
-                            // to
-                            //      (CALLVIRT
-                            //        (COMMA
-                            //          (ASG tmp, obj)
-                            //          (COMMA
-                            //            (CALL probe_fn tmp, &probeEntry)
-                            //            tmp)))
-                            //         ... args ...)
-                            //
-
-                            assert(call->gtCallThisArg->GetNode()->TypeGet() == TYP_REF);
-
-                            // Figure out where the table is located.
-                            //
-                            BYTE* classProfile = (*m_schema)[*m_currentSchemaIndex].Offset + m_profileMemory;
-                            *m_currentSchemaIndex += 2; // There are 2 schema entries per class probe
-
-                            // Grab a temp to hold the 'this' object as it will be used three times
-                            //
-                            unsigned const tmpNum = compiler->lvaGrabTemp(true DEBUGARG("class profile tmp"));
-                            compiler->lvaTable[tmpNum].lvType = TYP_REF;
-
-                            // Generate the IR...
-                            //
-                            GenTree* const classProfileNode =
-                                compiler->gtNewIconNode((ssize_t)classProfile, TYP_I_IMPL);
-                            GenTree* const          tmpNode = compiler->gtNewLclvNode(tmpNum, TYP_REF);
-                            GenTreeCall::Use* const args    = compiler->gtNewCallArgs(tmpNode, classProfileNode);
-                            GenTree* const          helperCallNode =
-                                compiler->gtNewHelperCallNode(CORINFO_HELP_CLASSPROFILE, TYP_VOID, args);
-                            GenTree* const tmpNode2 = compiler->gtNewLclvNode(tmpNum, TYP_REF);
-                            GenTree* const callCommaNode =
-                                compiler->gtNewOperNode(GT_COMMA, TYP_REF, helperCallNode, tmpNode2);
-                            GenTree* const tmpNode3 = compiler->gtNewLclvNode(tmpNum, TYP_REF);
-                            GenTree* const asgNode =
-                                compiler->gtNewOperNode(GT_ASG, TYP_REF, tmpNode3, call->gtCallThisArg->GetNode());
-                            GenTree* const asgCommaNode =
-                                compiler->gtNewOperNode(GT_COMMA, TYP_REF, asgNode, callCommaNode);
-
-                            // Update the call
-                            //
-                            call->gtCallThisArg->SetNode(asgCommaNode);
-
-                            JITDUMP("Modified call is now\n");
-                            DISPTREE(call);
-                        }
+//------------------------------------------------------------------------
+// fgInstrumentMethod: add instrumentation probes to the method
+//
+// Note:
+//
+//   By default this instruments each non-internal block with
+//   a counter probe.
+//
+//   Optionally adds class probes to virtual and interface calls.
+//
+//   Probe structure is described by a schema array, which is created
+//   here based on flowgraph and IR structure.
+//
+void Compiler::fgInstrumentMethod()
+{
+    noway_assert(!compIsForInlining());
 
-                        // Restore the stub address on the call, whether instrumenting or not.
-                        //
-                        call->gtStubCallStubAddr = call->gtClassProfileCandidateInfo->stubAddr;
-                    }
-                };
+    // Choose instrumentation technology.
+    //
+    Instrumentor* countInst = new (this, CMK_Pgo) BlockCountInstrumentor(this);
+    Instrumentor* classInst = nullptr;
 
-                // Scan the statements and add class probes
-                //
-                ClassProbeInserter insertProbes(&schema, profileMemory, &currentSchemaIndex, instrument);
-                ClassProbeVisitor<ClassProbeInserter> visitor(this, insertProbes);
-                for (Statement* stmt : block->Statements())
-                {
-                    visitor.WalkTree(stmt->GetRootNodePointer(), nullptr);
-                }
+    if (JitConfig.JitClassProfiling() > 0)
+    {
+        classInst = new (this, CMK_Pgo) ClassProbeInstrumentor(this);
+    }
+    else
+    {
+        classInst = new (this, CMK_Pgo) NonInstrumentor(this);
+    }
 
-                // Bookkeeping
-                //
-                assert(insertProbes.m_count <= countOfCalls);
-                countOfCalls -= insertProbes.m_count;
-                JITDUMP("\n%d calls remain to be visited\n", countOfCalls);
-            }
-            else
-            {
-                JITDUMP("No calls to profile in " FMT_BB "\n", block->bbNum);
-            }
-        }
+    // Do any up-front work.
+    //
+    countInst->Prepare();
+    classInst->Prepare();
 
-        // We won't need count probes in internal blocks.
-        //
-        // TODO, perhaps: profile the flow early expansion ... we would need
-        // some non-il based keying scheme.
+    // Walk the flow graph to build up the instrumentation schema.
+    //
+    Schema schema(getAllocator(CMK_Pgo));
+    for (BasicBlock* block = fgFirstBB; (block != nullptr); block = block->bbNext)
+    {
+        // Skip internal and un-imported blocks.
         //
-        if ((block->bbFlags & BBF_INTERNAL) != 0)
+        if ((block->bbFlags & BBF_IMPORTED) == 0)
         {
             continue;
         }
 
-        // One less block
-        countOfBlocks--;
-
-        if (instrument)
+        if ((block->bbFlags & BBF_INTERNAL) == BBF_INTERNAL)
         {
-            assert(block->bbCodeOffs == (IL_OFFSET)schema[currentSchemaIndex].ILOffset);
-            size_t addrOfCurrentExecutionCount = (size_t)(schema[currentSchemaIndex].Offset + profileMemory);
-            if (addrOfFirstExecutionCount == 0)
-                addrOfFirstExecutionCount = addrOfCurrentExecutionCount;
-            currentSchemaIndex++;
-
-            // Read Basic-Block count value
-            GenTree* valueNode =
-                gtNewIndOfIconHandleNode(TYP_INT, addrOfCurrentExecutionCount, GTF_ICON_BBC_PTR, false);
-
-            // Increment value by 1
-            GenTree* rhsNode = gtNewOperNode(GT_ADD, TYP_INT, valueNode, gtNewIconNode(1));
-
-            // Write new Basic-Block count value
-            GenTree* lhsNode = gtNewIndOfIconHandleNode(TYP_INT, addrOfCurrentExecutionCount, GTF_ICON_BBC_PTR, false);
-            GenTree* asgNode = gtNewAssignNode(lhsNode, rhsNode);
-
-            fgNewStmtAtBeg(block, asgNode);
+            continue;
         }
+
+        countInst->BuildSchemaElements(block, schema);
+        classInst->BuildSchemaElements(block, schema);
     }
 
-    if (!instrument)
+    // Verify we created schema for the calls needing class probes.
+    // (we counted those when importing)
+    //
+    assert(classInst->SchemaCount() == info.compClassProbeCount);
+
+    // Optionally, if there were no class probes and only one count probe,
+    // suppress instrumentation.
+    //
+    if ((JitConfig.JitMinimalProfiling() > 0) && (countInst->SchemaCount() == 1) && (classInst->SchemaCount() == 0))
     {
+        JITDUMP("Not instrumenting method: only one counter, and no class probes\n");
         return;
     }
 
-    // Check that we allocated and initialized the same number of BlockCounts tuples
+    JITDUMP("Instrumenting method: %d count probes and %d class probes\n", countInst->SchemaCount(),
+            classInst->SchemaCount());
+
+    // Allocate the profile buffer
     //
-    noway_assert(countOfBlocks == 0);
-    noway_assert(countOfCalls == 0);
+    BYTE* profileMemory;
 
-    // When prejitting, add the method entry callback node
-    if (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT))
+    HRESULT res = info.compCompHnd->allocPgoInstrumentationBySchema(info.compMethodHnd, schema.data(),
+                                                                    (UINT32)schema.size(), &profileMemory);
+
+    // Deal with allocation failures.
+    //
+    if (!SUCCEEDED(res))
     {
-        GenTree* arg;
+        JITDUMP("Unable to instrument: schema allocation failed: 0x%x\n", res);
 
-#ifdef FEATURE_READYTORUN_COMPILER
-        if (opts.IsReadyToRun())
+        // The E_NOTIMPL status is returned when we are profiling a generic method from a different assembly
+        //
+        if (res != E_NOTIMPL)
         {
-            mdMethodDef currentMethodToken = info.compCompHnd->getMethodDefFromMethod(info.compMethodHnd);
-
-            CORINFO_RESOLVED_TOKEN resolvedToken;
-            resolvedToken.tokenContext = MAKE_METHODCONTEXT(info.compMethodHnd);
-            resolvedToken.tokenScope   = info.compScopeHnd;
-            resolvedToken.token        = currentMethodToken;
-            resolvedToken.tokenType    = CORINFO_TOKENKIND_Method;
+            noway_assert(!"Error: unexpected hresult from allocPgoInstrumentationBySchema");
+            return;
+        }
 
-            info.compCompHnd->resolveToken(&resolvedToken);
+        // Do any cleanup we might need to do...
+        //
+        countInst->SuppressProbes();
+        classInst->SuppressProbes();
+        return;
+    }
 
-            arg = impTokenToHandle(&resolvedToken);
-        }
-        else
-#endif
+    // Add the instrumentation code
+    //
+    for (BasicBlock* block = fgFirstBB; (block != nullptr); block = block->bbNext)
+    {
+        // Skip internal and un-imported blocks.
+        //
+        if ((block->bbFlags & BBF_IMPORTED) == 0)
         {
-            arg = gtNewIconEmbMethHndNode(info.compMethodHnd);
+            continue;
         }
 
-        GenTreeCall::Use* args = gtNewCallArgs(arg);
-        GenTree*          call = gtNewHelperCallNode(CORINFO_HELP_BBT_FCN_ENTER, TYP_VOID, args);
+        if ((block->bbFlags & BBF_INTERNAL) == BBF_INTERNAL)
+        {
+            continue;
+        }
 
-        // Read Basic-Block count value
-        GenTree* valueNode = gtNewIndOfIconHandleNode(TYP_INT, addrOfFirstExecutionCount, GTF_ICON_BBC_PTR, false);
+        countInst->Instrument(block, schema, profileMemory);
+        classInst->Instrument(block, schema, profileMemory);
+    }
 
-        // Compare Basic-Block count value against zero
-        GenTree*   relop = gtNewOperNode(GT_NE, TYP_INT, valueNode, gtNewIconNode(0, TYP_INT));
-        GenTree*   colon = new (this, GT_COLON) GenTreeColon(TYP_VOID, gtNewNothingNode(), call);
-        GenTree*   cond  = gtNewQmarkNode(TYP_VOID, relop, colon);
-        Statement* stmt  = gtNewStmt(cond);
+    // Verify we instrumented everthing we created schemas for.
+    //
+    assert(countInst->InstrCount() == countInst->SchemaCount());
+    assert(classInst->InstrCount() == classInst->SchemaCount());
 
-        fgEnsureFirstBBisScratch();
-        fgInsertStmtAtEnd(fgFirstBB, stmt);
-    }
+    // Add any special entry instrumentation. This does not
+    // use the schema mechanism.
+    //
+    countInst->InstrumentMethodEntry(schema, profileMemory);
+    classInst->InstrumentMethodEntry(schema, profileMemory);
 }
 
 bool flowList::setEdgeWeightMinChecked(BasicBlock::weight_t newWeight, BasicBlock::weight_t slop, bool* wbUsedSlop)