From 4d2890e8f769624882e8e19528761150369c0794 Mon Sep 17 00:00:00 2001 From: Jan Zielinski Date: Fri, 26 Jul 2019 09:37:12 +0200 Subject: [PATCH] swr/rasterizer: Add memory tracking support Reviewed-by: Bruce Cherniak --- .../drivers/swr/rasterizer/archrast/archrast.cpp | 104 +++++++++++++++++++++ .../drivers/swr/rasterizer/archrast/events.proto | 21 ++++- .../swr/rasterizer/archrast/events_private.proto | 15 +++ src/gallium/drivers/swr/rasterizer/core/api.cpp | 5 + src/gallium/drivers/swr/rasterizer/core/api.h | 7 ++ .../drivers/swr/rasterizer/core/frontend.cpp | 4 +- src/gallium/drivers/swr/rasterizer/core/state.h | 3 +- .../drivers/swr/rasterizer/core/threads.cpp | 40 ++++---- .../swr/rasterizer/jitter/builder_gfx_mem.cpp | 57 +++++++++++ .../swr/rasterizer/jitter/builder_gfx_mem.h | 8 +- .../drivers/swr/rasterizer/jitter/fetch_jit.cpp | 2 +- .../swr/rasterizer/jitter/streamout_jit.cpp | 12 ++- 12 files changed, 252 insertions(+), 26 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp b/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp index a454fc1..06e0f61 100644 --- a/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp +++ b/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp @@ -26,6 +26,7 @@ * ******************************************************************************/ #include +#include #include "common/os.h" #include "archrast/archrast.h" @@ -85,6 +86,74 @@ namespace ArchRast uint32_t alphaBlendCount = 0; }; + struct MemoryStats + { + struct MemoryTrackerKey + { + uint64_t address; + uint64_t mask; + }; + + struct MemoryTrackerData + { + uint32_t accessCountRead; + uint32_t accessCountWrite; + uint64_t tscMin; + uint64_t tscMax; + }; + + struct AddressRangeComparator + { + bool operator()(MemoryTrackerKey a, MemoryTrackerKey b) const + { + return (a.address & a.mask) < (b.address & b.mask); + } + }; + + typedef std::map MemoryTrackerMap; + MemoryTrackerMap trackedMemory = {}; + + void TrackMemoryAccess(uint64_t address, uint64_t addressMask, uint8_t isRead, uint64_t tsc) + { + MemoryTrackerKey key; + key.address = address; + key.mask = addressMask; + + MemoryTrackerMap::iterator i = trackedMemory.lower_bound(key); + if (i != trackedMemory.end() && !(trackedMemory.key_comp()(key, i->first))) + { + // already in map + if (isRead) + { + i->second.accessCountRead++; + } + else + { + i->second.accessCountWrite++; + } + i->second.tscMax = tsc; + } + else + { + // new entry + MemoryTrackerData data; + if (isRead) + { + data.accessCountRead = 1; + data.accessCountWrite = 0; + } + else + { + data.accessCountRead = 0; + data.accessCountWrite = 1; + } + data.tscMin = tsc; + data.tscMax = tsc; + trackedMemory.insert(i, MemoryTrackerMap::value_type(key, data)); + } + } + }; + ////////////////////////////////////////////////////////////////////////// /// @brief Event handler that handles API thread events. This is shared /// between the API and its caller (e.g. driver shim) but typically @@ -180,6 +249,16 @@ namespace ArchRast EventHandlerWorkerStats(uint32_t id) : EventHandlerFile(id), mNeedFlush(false) { memset(mShaderStats, 0, sizeof(mShaderStats)); + + // compute address mask for memory tracking + mAddressMask = 0; + uint64_t addressRangeBytes = 64; + while (addressRangeBytes > 0) + { + mAddressMask = (mAddressMask << 1) | 1; + addressRangeBytes = addressRangeBytes >> 1; + } + mAddressMask = ~mAddressMask; } virtual void Handle(const EarlyDepthStencilInfoSingleSample& event) @@ -585,6 +664,28 @@ namespace ArchRast mGS = {}; } + virtual void Handle(const MemoryAccessEvent& event) + { + mMemoryStats.TrackMemoryAccess(event.data.ptr, mAddressMask, event.data.isRead, event.data.tsc); + } + + virtual void Handle(const MemoryStatsEndEvent& event) + { + MemoryStats::MemoryTrackerMap::iterator i = mMemoryStats.trackedMemory.begin(); + while (i != mMemoryStats.trackedMemory.end()) + { + MemoryStatsEvent mse(event.data.drawId, + i->first.address & mAddressMask, + i->second.accessCountRead, + i->second.accessCountWrite, + i->second.tscMin, + i->second.tscMax); + EventHandlerFile::Handle(mse); + i++; + } + mMemoryStats.trackedMemory.clear(); + } + virtual void Handle(const GSPrimInfo& event) { mGS.inputPrimCount += event.data.inputPrimCount; @@ -631,6 +732,9 @@ namespace ArchRast SWR_SHADER_STATS mShaderStats[NUM_SHADER_TYPES]; + MemoryStats mMemoryStats = {}; + uint64_t mAddressMask = 0; + }; static EventManager* FromHandle(HANDLE hThreadContext) diff --git a/src/gallium/drivers/swr/rasterizer/archrast/events.proto b/src/gallium/drivers/swr/rasterizer/archrast/events.proto index 1057a94..1618e5f 100644 --- a/src/gallium/drivers/swr/rasterizer/archrast/events.proto +++ b/src/gallium/drivers/swr/rasterizer/archrast/events.proto @@ -463,4 +463,23 @@ event SWTagFlushEvent uint32_t swTagFlushCounter; char swTagFlushReason[256]; uint32_t swTagFlushType; -}; \ No newline at end of file +}; + +event SWTagApiCallEvent +{ + uint64_t swTagFrame; + uint32_t swTagDrawOrDispatch; + uint32_t swTagDraw; + uint32_t swTagDispatch; + char swTagApiCall[256]; +}; + +event MemoryStatsEvent +{ + uint32_t drawId; + uint64_t baseAddr; + uint32_t accessCountRead; + uint32_t accessCountWrite; + uint64_t tscMin; + uint64_t tscMax; +}; diff --git a/src/gallium/drivers/swr/rasterizer/archrast/events_private.proto b/src/gallium/drivers/swr/rasterizer/archrast/events_private.proto index b49d4bf..19fb582 100644 --- a/src/gallium/drivers/swr/rasterizer/archrast/events_private.proto +++ b/src/gallium/drivers/swr/rasterizer/archrast/events_private.proto @@ -90,6 +90,21 @@ event FrontendDrawEndEvent uint32_t drawId; }; +event MemoryAccessEvent +{ + uint32_t drawId; + uint64_t tsc; + uint64_t ptr; + uint32_t size; + uint8_t isRead; + uint8_t client; +}; + +event MemoryStatsEndEvent +{ + uint32_t drawId; +}; + event TessPrimCount { uint64_t primCount; diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index a043a34..20f1a34 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -181,7 +181,12 @@ HANDLE SwrCreateContext(SWR_CREATECONTEXT_INFO* pCreateInfo) #if defined(KNOB_ENABLE_AR) // Initialize worker thread context for ArchRast. pContext->pArContext[i] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::WORKER); + + SWR_WORKER_DATA* pWorkerData = (SWR_WORKER_DATA*)pContext->threadPool.pThreadData[i].pWorkerPrivateData; + pWorkerData->hArContext = pContext->pArContext[i]; #endif + + } #if defined(KNOB_ENABLE_AR) diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h index a3f065d..4d523bb 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.h +++ b/src/gallium/drivers/swr/rasterizer/core/api.h @@ -219,10 +219,17 @@ struct SWR_API_THREADING_INFO // Independent of KNOB_MAX_THREADS_PER_CORE. }; +struct SWR_WORKER_DATA +{ + HANDLE hArContext; // handle to the archrast context +}; + ////////////////////////////////////////////////////////////////////////// /// SWR_WORKER_PRIVATE_STATE /// Data used to allocate per-worker thread private data. A pointer /// to this data will be passed in to each shader function. +/// The first field of this private data must be SWR_WORKER_DATA +/// perWorkerPrivateStateSize must be >= sizeof SWR_WORKER_DATA ///////////////////////////////////////////////////////////////////////// struct SWR_WORKER_PRIVATE_STATE { diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index 816b84e..d8703e5 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -520,6 +520,8 @@ static void StreamOut( { RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEStreamout, pDC->drawId); + void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; + const API_STATE& state = GetApiState(pDC); const SWR_STREAMOUT_STATE& soState = state.soState; @@ -575,7 +577,7 @@ static void StreamOut( // Call SOS SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function."); - state.pfnSoFunc[streamIndex](GetPrivateState(pDC), soContext); + state.pfnSoFunc[streamIndex](GetPrivateState(pDC), pWorkerData, soContext); } // Update SO write offset. The driver provides memory for the update. diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index 8b24c43..5202e61 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -233,6 +233,7 @@ struct SWR_SHADER_STATS uint32_t numLodExecuted; }; + ////////////////////////////////////////////////////////////////////////// /// SWR_VS_CONTEXT /// @brief Input to vertex shader @@ -905,7 +906,7 @@ typedef void(__cdecl *PFN_HS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateDat typedef void(__cdecl *PFN_DS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_DS_CONTEXT* pDsContext); typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_GS_CONTEXT* pGsContext); typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_CS_CONTEXT* pCsContext); -typedef void(__cdecl *PFN_SO_FUNC)(HANDLE hPrivateData, SWR_STREAMOUT_CONTEXT& soContext); +typedef void(__cdecl *PFN_SO_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_STREAMOUT_CONTEXT& soContext); typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT* pContext); typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT* pContext); typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(SWR_BLEND_CONTEXT*); diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index 59e37a4..3090a24 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -458,6 +458,9 @@ INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId, { ExecuteCallbacks(pContext, workerId, pDC); + // Report accumulated memory access stats + AR_EVENT(MemoryStatsEndEvent(pDC->drawId)); + // Cleanup memory allocations pDC->pArena->Reset(true); if (!pDC->isCompute) @@ -1193,26 +1196,31 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) // Allocate worker private data pPool->pWorkerPrivateDataArray = nullptr; - if (pContext->workerPrivateState.perWorkerPrivateStateSize) + if (pContext->workerPrivateState.perWorkerPrivateStateSize == 0) { - size_t perWorkerSize = - AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64); - size_t totalSize = perWorkerSize * pPool->numThreads; - if (totalSize) - { - pPool->pWorkerPrivateDataArray = AlignedMalloc(totalSize, 64); - SWR_ASSERT(pPool->pWorkerPrivateDataArray); + pContext->workerPrivateState.perWorkerPrivateStateSize = sizeof(SWR_WORKER_DATA); + pContext->workerPrivateState.pfnInitWorkerData = nullptr; + pContext->workerPrivateState.pfnFinishWorkerData = nullptr; + } + + // initialize contents of SWR_WORKER_DATA + size_t perWorkerSize = + AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64); + size_t totalSize = perWorkerSize * pPool->numThreads; + if (totalSize) + { + pPool->pWorkerPrivateDataArray = AlignedMalloc(totalSize, 64); + SWR_ASSERT(pPool->pWorkerPrivateDataArray); - void* pWorkerData = pPool->pWorkerPrivateDataArray; - for (uint32_t i = 0; i < pPool->numThreads; ++i) + void* pWorkerData = pPool->pWorkerPrivateDataArray; + for (uint32_t i = 0; i < pPool->numThreads; ++i) + { + pPool->pThreadData[i].pWorkerPrivateData = pWorkerData; + if (pContext->workerPrivateState.pfnInitWorkerData) { - pPool->pThreadData[i].pWorkerPrivateData = pWorkerData; - if (pContext->workerPrivateState.pfnInitWorkerData) - { - pContext->workerPrivateState.pfnInitWorkerData(pWorkerData, i); - } - pWorkerData = PtrAdd(pWorkerData, perWorkerSize); + pContext->workerPrivateState.pfnInitWorkerData(pWorkerData, i); } + pWorkerData = PtrAdd(pWorkerData, perWorkerSize); } } diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp index adf8924..21e3d47c 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp @@ -42,7 +42,9 @@ namespace SwrJit mpTranslationFuncTy = nullptr; mpfnTranslateGfxAddressForRead = nullptr; mpfnTranslateGfxAddressForWrite = nullptr; + mpfnTrackMemAccess = nullptr; mpParamSimDC = nullptr; + mpWorkerData = nullptr; } @@ -167,9 +169,57 @@ namespace SwrJit return Ptr; } + void BuilderGfxMem::TrackerHelper(Value* Ptr, Type* Ty, JIT_MEM_CLIENT usage, bool isRead) + { +#if defined(KNOB_ENABLE_AR) + if (!KNOB_TRACK_MEMORY_WORKING_SET) + { + return; + } + + Value* tmpPtr; + // convert actual pointers to int64. + uint32_t size = 0; + + if (Ptr->getType() == mInt64Ty) + { + DataLayout dataLayout(JM()->mpCurrentModule); + size = (uint32_t)dataLayout.getTypeAllocSize(Ty); + + tmpPtr = Ptr; + } + else + { + DataLayout dataLayout(JM()->mpCurrentModule); + size = (uint32_t)dataLayout.getTypeAllocSize(Ptr->getType()); + + tmpPtr = PTR_TO_INT(Ptr, mInt64Ty); + } + + // There are some shader compile setups where there's no translation functions set up. + // This would be a situation where the accesses are to internal rasterizer memory and won't + // be logged. + // TODO: we may wish to revisit this for URB reads/writes, though. + if (mpfnTrackMemAccess) + { + SWR_ASSERT(mpWorkerData != nullptr); + CALL(mpfnTrackMemAccess, + {mpParamSimDC, + mpWorkerData, + tmpPtr, + C((uint32_t)size), + C((uint8_t)isRead), + C((uint32_t)usage)}); + } +#endif + + return; + } + LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const char* Name, Type* Ty, JIT_MEM_CLIENT usage) { AssertGFXMemoryParams(Ptr, usage); + TrackerHelper(Ptr, Ty, usage, true); Ptr = TranslationHelper(Ptr, Ty); return Builder::LOAD(Ptr, Name); @@ -178,6 +228,7 @@ namespace SwrJit LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage) { AssertGFXMemoryParams(Ptr, usage); + TrackerHelper(Ptr, Ty, usage, true); Ptr = TranslationHelper(Ptr, Ty); return Builder::LOAD(Ptr, Name); @@ -188,6 +239,7 @@ namespace SwrJit Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage) { AssertGFXMemoryParams(Ptr, usage); + TrackerHelper(Ptr, Ty, usage, true); Ptr = TranslationHelper(Ptr, Ty); return Builder::LOAD(Ptr, isVolatile, Name); @@ -232,6 +284,7 @@ namespace SwrJit JIT_MEM_CLIENT usage) { AssertGFXMemoryParams(Ptr, usage); + TrackerHelper(Ptr, Ty, usage, true); Ptr = TranslationHelper(Ptr, Ty); return Builder::MASKED_LOAD(Ptr, Align, Mask, PassThru, Name, Ty, usage); @@ -241,6 +294,7 @@ namespace SwrJit BuilderGfxMem::STORE(Value* Val, Value* Ptr, bool isVolatile, Type* Ty, JIT_MEM_CLIENT usage) { AssertGFXMemoryParams(Ptr, usage); + TrackerHelper(Ptr, Ty, usage, false); Ptr = TranslationHelper(Ptr, Ty); return Builder::STORE(Val, Ptr, isVolatile, Ty, usage); @@ -253,6 +307,7 @@ namespace SwrJit JIT_MEM_CLIENT usage) { AssertGFXMemoryParams(BasePtr, usage); + TrackerHelper(BasePtr, Ty, usage, false); BasePtr = TranslationHelper(BasePtr, Ty); return Builder::STORE(Val, BasePtr, offset, Ty, usage); @@ -263,6 +318,8 @@ namespace SwrJit { AssertGFXMemoryParams(Ptr, usage); + TrackerHelper(Ptr, Ty, usage, false); + Ptr = TranslationHelper(Ptr, Ty); return Builder::MASKED_STORE(Val, Ptr, Align, Mask, Ty, usage); } diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h index 1bbe86d..52bd3ac 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h @@ -110,7 +110,7 @@ namespace SwrJit Type* PtrTy = nullptr, const Twine& Name = "", JIT_MEM_CLIENT usage = JIT_MEM_CLIENT::MEM_CLIENT_INTERNAL); - + protected: void AssertGFXMemoryParams(Value* ptr, Builder::JIT_MEM_CLIENT usage); @@ -120,6 +120,8 @@ namespace SwrJit virtual Value* OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset); Value* TranslationHelper(Value* Ptr, Type* Ty); + void TrackerHelper(Value* Ptr, Type* Ty, JIT_MEM_CLIENT usage, bool isRead); + FunctionType* GetTranslationFunctionType() { return mpTranslationFuncTy; } Value* GetTranslationFunctionForRead() { return mpfnTranslateGfxAddressForRead; } @@ -127,10 +129,14 @@ namespace SwrJit Value* GetParamSimDC() { return mpParamSimDC; } + Value* mpWorkerData; + private: FunctionType* mpTranslationFuncTy; Value* mpfnTranslateGfxAddressForRead; Value* mpfnTranslateGfxAddressForWrite; Value* mpParamSimDC; + FunctionType* mpTrackMemAccessFuncTy; + Value* mpfnTrackMemAccess; }; } // namespace SwrJit diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index 5a09653..8601d05 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -113,7 +113,6 @@ struct FetchJit : public BuilderGfxMem SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]); void ConvertFormat(SWR_FORMAT format, Value* texels[4]); - Value* mpWorkerData; Value* mpFetchInfo; }; @@ -141,6 +140,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) mpWorkerData = &*argitr; ++argitr; mpWorkerData->setName("pWorkerData"); + mpFetchInfo = &*argitr; ++argitr; mpFetchInfo->setName("fetchInfo"); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp index 43e2c44..c47acf7 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp @@ -263,12 +263,10 @@ struct StreamOutJit : public BuilderGfxMem std::ios_base::in | std::ios_base::out | std::ios_base::ate); fnName << ComputeCRC(0, &state, sizeof(state)); - Type* typeParam0; - typeParam0 = mInt8PtrTy; - std::vector args{ - typeParam0, - PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT* + mInt8PtrTy, + mInt8PtrTy, + PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT* }; FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false); @@ -290,6 +288,10 @@ struct StreamOutJit : public BuilderGfxMem privateContext->setName("privateContext"); SetPrivateContext(privateContext); + mpWorkerData = &*argitr; + ++argitr; + mpWorkerData->setName("pWorkerData"); + Value* pSoCtx = &*argitr++; pSoCtx->setName("pSoCtx"); -- 2.7.4