Add alternative CPU sampling mode
author    Gleb Balykov <g.balykov@samsung.com>
Wed, 31 May 2023 18:00:07 +0000 (21:00 +0300)
committer Gleb Balykov/Advanced System SW Lab /SRR/Staff Engineer/Samsung Electronics <g.balykov@samsung.com>
Wed, 21 Jun 2023 12:38:33 +0000 (15:38 +0300)
Use SuspendRuntime/ResumeRuntime together with asynchronous DoStackSnapshot ICorProfilerInfo API calls in the sampling thread to gather samples
for all managed threads, instead of sending a Linux signal to every managed thread and collecting the sample in its signal handler.

The alternative CPU sampling mode also disables the Enter/Leave hooks and allows usage of ni.dll (native images).
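
For reference, the core of the new mode as a minimal sketch (a simplified
view of DoSampleManually() in src/trace/commontrace.cpp below;
liveManagedThreads and RecordSample are placeholders for the thread-storage
iteration and the EventChannel update):

    // Pause the runtime so the stacks of all managed threads are stable.
    HRESULT hr = info10->SuspendRuntime();
    if (FAILED(hr))
        return false;              // runtime is busy (e.g. GC); retry at once
    for (ThreadID tid : liveManagedThreads)
    {
        std::vector< std::pair<FunctionID, UINT_PTR> > frames;
        // The cross-thread stack walk is safe while the runtime is suspended.
        info2->DoStackSnapshot(tid, StackCallback, COR_PRF_SNAPSHOT_DEFAULT,
                               &frames, nullptr, 0);
        RecordSample(tid, frames); // publish to the thread's EventChannel
    }
    info10->ResumeRuntime();

The mode is enabled with PROF_EXECUTION_TRACE=true and
PROF_USE_ALTERNATIVE_SAMPLING_EXECUTION_TRACING=true, and requires a runtime
that provides ICorProfilerInfo10 (SuspendRuntime/ResumeRuntime).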

README.md
src/config/configurationmanager.cpp
src/config/profilerconfig.cpp
src/config/profilerconfig.h
src/profilerinfo.cpp
src/profilerinfo.h
src/trace/commontrace.cpp
src/trace/commontrace.h
src/trace/eventchannel.h
src/trace/executiontrace.cpp
src/trace/executiontrace.h

index 5ae4e7139f8e258b7a8398e4f58614ada6fabf1d..bbd7cc6905a8bbb4cff893561ab8d9e9336da19d 100644 (file)
--- a/README.md
+++ b/README.md
@@ -120,6 +120,8 @@ dumped (only timestamp of execution and configuration parameters is stored).
     of this subsystem is to track changes of execution stack.
     `PROF_EXECUTION_TRACE` is defaulted to `false`.
 
+      - `PROF_USE_ALTERNATIVE_SAMPLING_EXECUTION_TRACING` - use real sampling profiling API (async `DoStackSnapshot`) instead of instrumentation (Enter/Leave hooks) for execution tracing. `PROF_USE_ALTERNATIVE_SAMPLING_EXECUTION_TRACING` is defaulted to `false` (alternative sampling is disabled).
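+        The option takes effect only when `PROF_EXECUTION_TRACE` is `true`, the collection method is sampling, and memory tracing is disabled.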
+
   - `PROF_MEMORY_TRACE` - enables Memory Trace subsystem. This subsystem
     collects data about allocations. `PROF_MEMORY_TRACE` is defaulted to
     `false`.
index 4c31e0c5628d91b0d58d5f135af82d31fcec0553..78836a1fdd2a7a0912936ce5eb61ea09929cfbbe 100644 (file)
@@ -174,6 +174,7 @@ void ConfigurationManager::FetchConfig(ProfilerConfig &config) const
     FetchValue("PROF_CPU_TRACE_TIMEOUT", new_config.CpuTraceTimeoutMs);
 
     FetchValue("PROF_EXECUTION_TRACE",  new_config.ExecutionTraceEnabled);
+    FetchValue("PROF_USE_ALTERNATIVE_SAMPLING_EXECUTION_TRACING", new_config.AlternativeSamplingExecutionTraceEnabled);
     FetchValue("PROF_MEMORY_TRACE",     new_config.MemoryTraceEnabled);
     if (new_config.MemoryTraceEnabled &&
         new_config.ExecutionTraceEnabled &&
index 4613d0246d1199fbb6f7d3818e09dd5d7435652b..35a24472d5a58a31239cf93c5937ed93ddd17259 100644 (file)
@@ -33,6 +33,7 @@ ProfilerConfig::ProfilerConfig()
     , CpuTraceThreadEnabled(false)
     , CpuTraceTimeoutMs(10)
     , ExecutionTraceEnabled(false)
+    , AlternativeSamplingExecutionTraceEnabled(false)
     , MemoryTraceEnabled(false)
     , StackTrackingEnabled(false)
     , MemoryMinSizeLimit(0)
@@ -120,6 +121,25 @@ std::vector<std::string> ProfilerConfig::Verify()
         }
     }
 
+    if (!ExecutionTraceEnabled)
+    {
+        if (AlternativeSamplingExecutionTraceEnabled)
+        {
+            warnings.push_back(
+                "real sampling requires execution tracing");
+            AlternativeSamplingExecutionTraceEnabled = false;
+        }
+    }
+    if (MemoryTraceEnabled || CollectionMethod != CollectionMethod::Sampling)
+    {
+        if (AlternativeSamplingExecutionTraceEnabled)
+        {
+            warnings.push_back(
+                "real sampling is disabled if memory tracing is required or instrumentation mode is used");
+            AlternativeSamplingExecutionTraceEnabled = false;
+        }
+    }
+
     if (!ExecutionTraceEnabled && !MemoryTraceEnabled)
     {
         // When all traces are disabled.
index d021611cfcfe256470aedcf2a0fd2edabe820a51..c1dfef3fecc05f2a098673c8cf6e5ffcc09a6987 100644 (file)
@@ -66,6 +66,7 @@ struct ProfilerConfig
     //
 
     bool             ExecutionTraceEnabled;
+    bool             AlternativeSamplingExecutionTraceEnabled;
 
     //
     // Memory Trace features.
index 65ad1bb59ba4d6a50ddaa1e71352e26a961dce56..4f68c7822ecca6d00e138266e6bd3c4b1ea017db 100644 (file)
@@ -26,7 +26,9 @@ ProfilerInfo::ProfilerInfo() noexcept
     , m_pProfilerInfo7(nullptr)
     , m_pProfilerInfo8(nullptr)
     , m_pProfilerInfo9(nullptr)
+#ifdef __ICorProfilerInfo10_INTERFACE_DEFINED__
     , m_pProfilerInfo10(nullptr)
+#endif
     , m_version(0)
 {
 }
@@ -43,6 +45,7 @@ HRESULT ProfilerInfo::Initialize(IUnknown *pICorProfilerInfoUnk) noexcept
 
     HRESULT hr = S_OK;  // for case when Initialize() called twice
 
+#ifdef __ICorProfilerInfo10_INTERFACE_DEFINED__
     if (m_pProfilerInfo10 == nullptr)
     {
         hr = pICorProfilerInfoUnk->QueryInterface(
@@ -72,6 +75,7 @@ HRESULT ProfilerInfo::Initialize(IUnknown *pICorProfilerInfoUnk) noexcept
                 m_pProfilerInfo10);
         }
     }
+#endif
 
     if (m_pProfilerInfo9 == nullptr)
     {
@@ -252,7 +256,10 @@ HRESULT ProfilerInfo::Initialize(IUnknown *pICorProfilerInfoUnk) noexcept
         }
     }
 
+#ifdef __ICorProfilerInfo10_INTERFACE_DEFINED__
     _ASSERTE(m_version < 10 || m_pProfilerInfo10 != nullptr);
+#endif
+
     _ASSERTE(m_version < 9 || m_pProfilerInfo9 != nullptr);
     _ASSERTE(m_version < 8 || m_pProfilerInfo8 != nullptr);
     _ASSERTE(m_version < 7 || m_pProfilerInfo7 != nullptr);
@@ -282,7 +289,9 @@ void ProfilerInfo::Reset() noexcept
         m_pProfilerInfo7 = nullptr;
         m_pProfilerInfo8 = nullptr;
         m_pProfilerInfo9 = nullptr;
+#ifdef __ICorProfilerInfo10_INTERFACE_DEFINED__
         m_pProfilerInfo10 = nullptr;
+#endif
         m_version = 0;
     }
 }
index d0e4fd7262bf7de41dceae307faa88495b6b5378..fe7b7d1db209923ea6c91b342e395b14463a8425 100644 (file)
@@ -100,11 +100,13 @@ public:
         return m_pProfilerInfo9;
     }
 
+#ifdef __ICorProfilerInfo10_INTERFACE_DEFINED__
     ICorProfilerInfo10 *v10() const noexcept
     {
         _ASSERTE(m_version >= 10);
         return m_pProfilerInfo10;
     }
+#endif
 
 private:
     // Pointers to the implementation of the ProfilerInfo interface(s).
@@ -117,7 +119,9 @@ private:
     ICorProfilerInfo7 *m_pProfilerInfo7;
     ICorProfilerInfo8 *m_pProfilerInfo8;
     ICorProfilerInfo9 *m_pProfilerInfo9;
+#ifdef __ICorProfilerInfo10_INTERFACE_DEFINED__
     ICorProfilerInfo10 *m_pProfilerInfo10;
+#endif
 
     // Version of the Profiler Info API.
     unsigned int m_version;
index 6a897935e94b08628c97208089f2d54e5abd716b..23decdff2081b58ceef15d76689b9c6ffbcad6c2 100644 (file)
@@ -201,6 +201,14 @@ void CommonTrace::ProcessConfig(ProfilerConfig &config)
         }
     }
 
+    if (config.AlternativeSamplingExecutionTraceEnabled)
+    {
+#ifndef __ICorProfilerInfo10_INTERFACE_DEFINED__
+        config.AlternativeSamplingExecutionTraceEnabled = false;
+        LOG().Warn() << "Alternative CPU sampling is not supported by this build (ICorProfilerInfo10 is unavailable); option is disabled";
+#endif
+    }
+
     //
     // Starting service threads.
     //
@@ -248,7 +256,7 @@ void CommonTrace::ProcessConfig(ProfilerConfig &config)
 
     // This events are required for tracing of call stack dynamics.
 
-    if (config.LineTraceEnabled)
+    if (config.LineTraceEnabled || config.AlternativeSamplingExecutionTraceEnabled)
     {
         events |= COR_PRF_ENABLE_STACK_SNAPSHOT;
     }
@@ -283,6 +291,131 @@ void CommonTrace::Shutdown() noexcept
     m_sampleAction . Release();
 }
 
+#ifdef __ICorProfilerInfo10_INTERFACE_DEFINED__
+static HRESULT __stdcall StackCallback(FunctionID funcId, UINT_PTR ip,
+                                       COR_PRF_FRAME_INFO frameInfo,
+                                       ULONG32 contextSize, BYTE context[],
+                                       void *clientData)
+{
+    std::vector< std::pair<FunctionID, UINT_PTR> > *frames =
+        reinterpret_cast<std::vector< std::pair<FunctionID, UINT_PTR> > *>(
+            clientData);
+
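+    // DoStackSnapshot reports frames leaf-first; funcId == 0 marks an unmanaged frame.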
+    frames->emplace_back(funcId, ip);
+
+    return S_OK;
+}
+#endif
+
+// Returns true when the sample pass completed, false if the runtime could not be suspended.
+bool CommonTrace::DoSampleManually() noexcept
+{
+#ifdef __ICorProfilerInfo10_INTERFACE_DEFINED__
+    // 1. suspend runtime
+    HRESULT hr = m_info.v10()->SuspendRuntime();
+    if (FAILED(hr))
+    {
+        // Could not suspend the runtime, so skip this sample
+        // (this can happen when the runtime is already suspended, e.g. by the GC).
+        // The caller does not sleep in this case and retries immediately
+        // to get the closest possible sample.
+        return false;
+    }
+
+    // 2. check all already registered threads
+    ExecutionTrace &executionTrace = m_profiler.GetExecutionTrace();
+
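+    // frames_reverse receives raw (FunctionID, ip) pairs leaf-first from
+    // DoStackSnapshot; frames holds the filtered, root-first stack that is
+    // published to the thread's EventChannel.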
+    std::vector< std::pair<FunctionID, UINT_PTR> > frames_reverse;
+    std::vector< std::pair<FunctionInfo*, UINT_PTR> > frames;
+
+    auto storage_lock = m_threadStorage.lock_shared();
+    for (ThreadInfo &threadInfo : storage_lock->GetLiveRange())
+    {
+        frames_reverse.clear();
+        frames.clear();
+
+        // 3. walk the thread's stack with DoStackSnapshot and collect its raw frames
+        hr = m_info.v2()->DoStackSnapshot(threadInfo.id, StackCallback, COR_PRF_SNAPSHOT_DEFAULT, &frames_reverse, nullptr, 0);
+        if (FAILED(hr))
+        {
+            // ignore the failure; a sample with zero frames is still recorded
+        }
+
+        // 4. Write the new frames to the thread's EventChannel
+        std::vector< std::pair<FunctionID, UINT_PTR> >::reverse_iterator it = frames_reverse.rbegin();
+        for (; it != frames_reverse.rend(); ++it)
+        {
+            FunctionInfo *funcInfo = nullptr;
+
+            if (it->first == 0)
+            {
+                // unmanaged frame; insert the special UnmanagedFunctionInfo if this is a managed->unmanaged transition
+                if (it != frames_reverse.rbegin())
+                {
+                    std::vector< std::pair<FunctionID, UINT_PTR> >::reverse_iterator prev = std::prev(it);
+
+                    if (prev->first)
+                    {
+                        // managed->unmanaged transition
+                        funcInfo = executionTrace.getUnmanagedFuncInfo();
+                    }
+                }
+            }
+            else
+            {
+                // managed frame
+                BOOL bHookFunction;
+                funcInfo = reinterpret_cast<FunctionInfo*>(executionTrace.FunctionIDMap(it->first, &bHookFunction));
+            }
+
+            if (funcInfo)
+            {
+                if (m_profiler.GetConfig().StackDepthLimit && // `StackDepthLimit` == 0 disables the limit.
+                    frames.size() > m_profiler.GetConfig().StackDepthLimit)
+                {
+                    break;
+                }
+
+                UINT_PTR ip = 0;
+                if (m_profiler.GetConfig().LineTraceEnabled)
+                {
+                    ip = it->second;
+                }
+                frames.emplace_back(funcInfo, ip);
+            }
+        }
+
+        // insert jit frames
+        size_t jitDepth = threadInfo.eventChannel.getJitDepth();
+        for (size_t i = 0; i < jitDepth; ++i)
+        {
+            if (m_profiler.GetConfig().StackDepthLimit && // `StackDepthLimit` == 0 disables the limit.
+                frames.size() > m_profiler.GetConfig().StackDepthLimit)
+            {
+                break;
+            }
+            frames.emplace_back(executionTrace.getJitFuncInfo(), 0);
+        }
+
+        threadInfo.eventChannel.UpdateFully(frames);
+
+        // 5. create new sample for thread
+        if (threadInfo.eventChannel.Sample(m_profiler.GetTickCountFromInit(), 1))
+        {
+            this->SendDoLog(threadInfo);
+        }
+    }
+
+    // 6. resume runtime
+    hr = m_info.v10()->ResumeRuntime();
+    if (FAILED(hr))
+    {
+        // Unable to resume the runtime for some reason.
+        // Still treat the pass as successful and let the next SuspendRuntime
+        // call surface the problem.
+        return true;
+    }
+
+    return true;
+#else
+    _ASSERTE(false);
+    return false;
+#endif
+}
+
 __forceinline void CommonTrace::SendDoSample(ThreadInfo &thrInfo) noexcept
 {
     union sigval val;
@@ -479,43 +612,59 @@ void CommonTrace::SamplingThread(binary_semaphore *pInitialized) noexcept
             {
                 struct timespec ts;
 
-                if (itThrInfo == endThrInfo)
+                if (!m_profiler.GetConfig().AlternativeSamplingExecutionTraceEnabled)
                 {
+                    if (itThrInfo == endThrInfo)
                     {
-                        auto storage_lock = this->GetThreadStorageShared();
-                        liveThreads = storage_lock->GetLiveContainer();
+                        {
+                            auto storage_lock = this->GetThreadStorageShared();
+                            liveThreads = storage_lock->GetLiveContainer();
+                        }
+                        itThrInfo  = liveThreads.begin();
+                        endThrInfo = liveThreads.end();
+                        splitter.Reset(liveThreads.size());
                     }
-                    itThrInfo  = liveThreads.begin();
-                    endThrInfo = liveThreads.end();
-                    splitter.Reset(liveThreads.size());
-                }
-
-                if (!liveThreads.empty())
-                {
-                    auto storage_lock = this->GetThreadStorageShared();
-                    ThreadInfo &thrInfo = *itThrInfo++;
 
-                    // We update all live threads if they are attached to OS
-                    // threads.
-                    if (thrInfo.id != 0 && thrInfo.nativeHandle != 0)
+                    if (!liveThreads.empty())
                     {
-                        thrInfo.genTicks++; // OK with unsigned overflows.
-                        if (m_profiler.GetConfig().HighGranularityEnabled)
+                        auto storage_lock = this->GetThreadStorageShared();
+                        ThreadInfo &thrInfo = *itThrInfo++;
+
+                        // We update all live threads if they are attached to OS
+                        // threads.
+                        if (thrInfo.id != 0 && thrInfo.nativeHandle != 0)
                         {
-                            this->SendDoSample(thrInfo);
+                            thrInfo.genTicks++; // OK with unsigned overflows.
+                            if (m_profiler.GetConfig().HighGranularityEnabled)
+                            {
+                                this->SendDoSample(thrInfo);
+                            }
                         }
-                    }
 
-                    ts = MsToTS(splitter.GetNext());
+                        ts = MsToTS(splitter.GetNext());
+                    }
+                    else
+                    {
+                        ts = MsToTS(m_profiler.GetConfig().SamplingTimeoutMs);
+                    }
                 }
                 else
                 {
-                    ts = MsToTS(m_profiler.GetConfig().SamplingTimeoutMs);
+                    bool successful = this->DoSampleManually();
+                    if (successful)
+                    {
+                        ts = MsToTS(m_profiler.GetConfig().SamplingTimeoutMs);
+                    }
+                    else
+                    {
+                        ts.tv_sec = 0;
+                        ts.tv_nsec = 0;
+                    }
                 }
 
                 // NOTE: Sleep() function has better precision so we use it
-                // for short pauses.
-                if (ts.tv_sec == 0)
+                // for short pauses (except zero-length pauses).
+                if (ts.tv_sec == 0 && ts.tv_nsec != 0)
                 {
                     Sleep(ts.tv_nsec / 1000000);
                     ts.tv_nsec = 0;
@@ -1042,6 +1191,15 @@ HRESULT CommonTrace::ThreadCreated(
         InternalID threadIid =
             m_threadStorage.lock()->Place(threadId).first.internalId;
         TRACE().DumpThreadCreated(threadId, threadIid);
+
+        ThreadID currentThreadId = 0;
+        m_info.v1()->GetCurrentThreadID(&currentThreadId);
+
+        if (currentThreadId == threadId)
+        {
+            // Initialize the thread-local cache; it is used by the JIT-related callbacks.
+            ThreadInfo *pThreadInfo = m_profiler.GetCommonTrace().GetThreadInfo();
+            (void)pThreadInfo; // the call is made for its caching side effect
+        }
     }
     catch (const std::exception &e)
     {
@@ -1120,6 +1278,15 @@ HRESULT CommonTrace::ThreadAssignedToOSThread(
             threadIid = thrInfo.internalId;
         }
         TRACE().DumpThreadAssignedToOSThread(threadIid, osThreadId);
+
+        ThreadID currentThreadId = 0;
+        m_info.v1()->GetCurrentThreadID(&currentThreadId);
+
+        if (currentThreadId == managedThreadId)
+        {
+            // Initialize the thread-local cache; it is used by the JIT-related callbacks.
+            ThreadInfo *pThreadInfo = m_profiler.GetCommonTrace().GetThreadInfo();
+            (void)pThreadInfo; // the call is made for its caching side effect
+        }
     }
     catch (const std::exception &e)
     {
index 7176a6fb4af50501baa4ddd36a5df85d39eb29d7..a0c7d7f1fc0439d3fe3bb25723e0ae289c3fef4d 100644 (file)
@@ -61,6 +61,8 @@ private:
         SAMPLING_EVENT_RESUME,
     };
 
+    bool DoSampleManually() noexcept;
+
     void SendDoSample(ThreadInfo &thrInfo) noexcept;
 
     void SendDoLog(ThreadInfo &thrInfo) noexcept;
index 1d2644dedfb9459ba1dc93a103cf51d0f54b62eb..d63783b88a5d4e7794ea2f6c4ab031d9288d79e2 100644 (file)
@@ -118,6 +118,7 @@ public:
         , m_buffer(EVENT_CHANNEL_START_CAP)
         , m_mutex()
         , m_bufferCapacityIncreaseIsPlanned(false)
+        , m_jitDepth(0)
     {}
 
 private:
@@ -169,6 +170,29 @@ public:
     // Writer methods.
     //
 
+    void UpdateFully(const std::vector< std::pair<FunctionInfo*, UINT_PTR> > &frames) noexcept
+    {
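+        // Replace the whole shadow stack with the freshly sampled frames:
+        // pop the old contents, then push the new stack root-first.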
+        size_t size = m_stack.size();
+        for (size_t i = 0; i < size; ++i)
+        {
+            Pop();
+        }
+
+        for (size_t i = 0; i < frames.size(); ++i)
+        {
+            Push(*frames[i].first, frames[i].second);
+        }
+    }
+
+    void Push(const FunctionInfo &funcInfo, UINT_PTR ip) noexcept
+    {
+        assert(m_currentState.matchPrefixSize <= m_stack.size());
+        m_stack.push_back(Frame{&funcInfo, ip});
+
+        // XXX: exception in this call will terminate process!
+        this->EnsureBufferCapacity(); // Perform planned reallocation.
+    }
+
     void Push(const FunctionInfo &funcInfo) noexcept
     {
         assert(m_currentState.matchPrefixSize <= m_stack.size());
@@ -304,6 +328,22 @@ public:
         m_mutex.unlock();
     }
 
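+    // In the alternative sampling mode the Enter/Leave hooks are disabled, so
+    // JIT compilation is not visible on the shadow stack; the JIT callbacks
+    // only count the nesting depth here, and DoSampleManually() re-inserts
+    // that many JIT pseudo-frames into each sample.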
+    void increaseJitDepth() noexcept
+    {
+        ++m_jitDepth;
+    }
+
+    void decreaseJitDepth() noexcept
+    {
+        assert(m_jitDepth != 0);
+        --m_jitDepth;
+    }
+
+    size_t getJitDepth() noexcept
+    {
+        return m_jitDepth;
+    }
+
 private:
     Stack m_stack;
     EventSummary m_currentState;
@@ -311,6 +351,8 @@ private:
     ring_buffer<EventSummary> m_buffer;
     std::mutex m_mutex;
     bool m_bufferCapacityIncreaseIsPlanned;
+
+    std::atomic<size_t> m_jitDepth;
 };
 
 #endif // _EVENT_CHANNEL_H_
index fe4e08c8e754161b6360ad3b3d7bb4862b397569..b128535ca44be552e3fabadefcfc2997e6aa151c 100644 (file)
@@ -148,22 +148,25 @@ void ExecutionTrace::ProcessConfig(ProfilerConfig &config)
 
         m_info.v3()->SetFunctionIDMapper2(FunctionIDMapStub, this);
 
-        hr = m_info.v3()->SetEnterLeaveFunctionHooks3(
-            EnterNaked3, LeaveNaked3, TailcallNaked3);
-        if (FAILED(hr))
+        if (!config.AlternativeSamplingExecutionTraceEnabled)
         {
-            m_profiler.HandleHresult(
-                "ExecutionTrace::ProcessConfig(): "
-                "SetEnterLeaveFunctionHooks3()", hr
-            );
-            goto next_stage;
-        }
+            hr = m_info.v3()->SetEnterLeaveFunctionHooks3(
+                EnterNaked3, LeaveNaked3, TailcallNaked3);
+            if (FAILED(hr))
+            {
+                m_profiler.HandleHresult(
+                    "ExecutionTrace::ProcessConfig(): "
+                    "SetEnterLeaveFunctionHooks3()", hr
+                );
+                goto next_stage;
+            }
 
-        // This events are required for tracing of call stack dynamics.
-        events = events
-            | COR_PRF_MONITOR_ENTERLEAVE
-            | COR_PRF_MONITOR_CODE_TRANSITIONS
-            | COR_PRF_MONITOR_EXCEPTIONS;
+            // These events are required for tracing of call stack dynamics.
+            events = events
+                | COR_PRF_MONITOR_ENTERLEAVE
+                | COR_PRF_MONITOR_CODE_TRANSITIONS
+                | COR_PRF_MONITOR_EXCEPTIONS;
+        }
     }
 
 next_stage:
@@ -485,13 +488,20 @@ HRESULT ExecutionTrace::JITStarted(
     ThreadInfo *pThreadInfo = m_profiler.GetCommonTrace().GetThreadInfoR();
     if (pThreadInfo != nullptr)
     {
-        if (m_profiler.GetConfig().LineTraceEnabled)
+        if (!m_profiler.GetConfig().AlternativeSamplingExecutionTraceEnabled)
         {
-            this->UpdateCallStackPush(*m_pJitFunctionInfo, 0);
+            if (m_profiler.GetConfig().LineTraceEnabled)
+            {
+                this->UpdateCallStackPush(*m_pJitFunctionInfo, 0);
+            }
+            else
+            {
+                this->UpdateCallStackPush(*m_pJitFunctionInfo);
+            }
         }
         else
         {
-            this->UpdateCallStackPush(*m_pJitFunctionInfo);
+            pThreadInfo->eventChannel.increaseJitDepth();
         }
     }
 
@@ -507,7 +517,14 @@ HRESULT ExecutionTrace::JITFinished(
     ThreadInfo *pThreadInfo = m_profiler.GetCommonTrace().GetThreadInfoR();
     if (pThreadInfo != nullptr)
     {
-        this->UpdateCallStackPop();
+        if (!m_profiler.GetConfig().AlternativeSamplingExecutionTraceEnabled)
+        {
+            this->UpdateCallStackPop();
+        }
+        else
+        {
+            pThreadInfo->eventChannel.decreaseJitDepth();
+        }
     }
 
 
index a6151b391143c0a78a3c02c9c2efa6031a482bf7..b59bab50f4abc4ed50643569f80f77e5beef7d60 100644 (file)
@@ -67,6 +67,16 @@ public:
     __forceinline void AfterSample(
         ThreadInfo &thrInfo, SamplingSharedState &state) noexcept;
 
+    FunctionInfo *getUnmanagedFuncInfo() noexcept
+    {
+        return m_pUnmanagedFunctionInfo;
+    }
+
+    FunctionInfo *getJitFuncInfo() noexcept
+    {
+        return m_pJitFunctionInfo;
+    }
+
 private:
     //
     // Various useful instance methods.