From 4e8763cb0904c30d1962cf5ad52fe3a87be7b4bd Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Sat, 6 Aug 2016 20:10:14 -0600 Subject: [PATCH] swr: [rasterizer core] split FE and BE stats Separated FE stats out into its own structure. There are 17 FE vs 3 BE stat fields. Since there is only one FE thread per DC then we don't have to loop over all threads and sum up FE stats over all the worker threads. This also reduces size of DC since we only need to store one copy of the FE stats and not one per worker. Finally, we can use the new FE callback mechanism to update these. Signed-off-by: Tim Rowley --- src/gallium/drivers/swr/rasterizer/core/api.cpp | 1 + src/gallium/drivers/swr/rasterizer/core/api.h | 21 +++++++++---- src/gallium/drivers/swr/rasterizer/core/clip.h | 6 ++-- src/gallium/drivers/swr/rasterizer/core/context.h | 15 ++++++---- .../drivers/swr/rasterizer/core/frontend.cpp | 18 +++++------ src/gallium/drivers/swr/rasterizer/core/state.h | 16 ++++++++-- .../drivers/swr/rasterizer/core/threads.cpp | 21 ++++--------- src/gallium/drivers/swr/swr_context.cpp | 19 ++++++++++-- src/gallium/drivers/swr/swr_context.h | 1 + src/gallium/drivers/swr/swr_query.cpp | 35 ++++++++++++---------- src/gallium/drivers/swr/swr_query.h | 1 + 11 files changed, 95 insertions(+), 59 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index 0797c8a..d6aa80d 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -144,6 +144,7 @@ HANDLE SwrCreateContext( pContext->pfnClearTile = pCreateInfo->pfnClearTile; pContext->pfnUpdateSoWriteOffset = pCreateInfo->pfnUpdateSoWriteOffset; pContext->pfnUpdateStats = pCreateInfo->pfnUpdateStats; + pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE; // pass pointer to bucket manager back to caller #ifdef KNOB_ENABLE_RDTSC diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h 
b/src/gallium/drivers/swr/rasterizer/core/api.h index 4ee04dc..ed18fe0 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.h +++ b/src/gallium/drivers/swr/rasterizer/core/api.h @@ -95,6 +95,16 @@ typedef void(SWR_API *PFN_UPDATE_SO_WRITE_OFFSET)(HANDLE hPrivateContext, typedef void(SWR_API *PFN_UPDATE_STATS)(HANDLE hPrivateContext, const SWR_STATS* pStats); +////////////////////////////////////////////////////////////////////////// +/// @brief Callback to allow driver to update their copy of FE stats. +/// @note It's optimal to have a separate callback for FE stats since +/// there is only one FE thread per DC. This means we do not have +/// to sum up the stats across all of the workers. +/// @param hPrivateContext - handle to private data +/// @param pStats - pointer to draw stats +typedef void(SWR_API *PFN_UPDATE_STATS_FE)(HANDLE hPrivateContext, + const SWR_STATS_FE* pStats); + class BucketManager; ////////////////////////////////////////////////////////////////////////// @@ -121,11 +131,12 @@ struct SWR_CREATECONTEXT_INFO uint32_t privateStateSize; // Callback functions - PFN_LOAD_TILE pfnLoadTile; - PFN_STORE_TILE pfnStoreTile; - PFN_CLEAR_TILE pfnClearTile; - PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset; - PFN_UPDATE_STATS pfnUpdateStats; + PFN_LOAD_TILE pfnLoadTile; + PFN_STORE_TILE pfnStoreTile; + PFN_CLEAR_TILE pfnClearTile; + PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset; + PFN_UPDATE_STATS pfnUpdateStats; + PFN_UPDATE_STATS_FE pfnUpdateStatsFE; // Pointer to rdtsc buckets mgr returned to the caller. 
// Only populated when KNOB_ENABLE_RDTSC is set diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index b2b3bb4..a2ba769 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -495,7 +495,7 @@ public: // update global pipeline stat SWR_CONTEXT* pContext = this->pDC->pContext; - UPDATE_STAT(CPrimitives, numClippedPrims); + UPDATE_STAT_FE(CPrimitives, numClippedPrims); } // execute the clipper stage @@ -523,7 +523,7 @@ public: // update clipper invocations pipeline stat SWR_CONTEXT* pContext = this->pDC->pContext; uint32_t numInvoc = _mm_popcnt_u32(primMask); - UPDATE_STAT(CInvocations, numInvoc); + UPDATE_STAT_FE(CInvocations, numInvoc); ComputeClipCodes(prim); @@ -559,7 +559,7 @@ public: { // update CPrimitives pipeline state SWR_CONTEXT* pContext = this->pDC->pContext; - UPDATE_STAT(CPrimitives, _mm_popcnt_u32(validMask)); + UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask)); // forward valid prims directly to binner pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId); diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index c478ee9..144fcef 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -365,7 +365,8 @@ struct DRAW_DYNAMIC_STATE uint32_t SoWriteOffset[4]; bool SoWriteOffsetDirty[4]; - SWR_STATS stats[KNOB_MAX_NUM_THREADS]; + SWR_STATS_FE statsFE; // Only one FE thread per DC. 
+ SWR_STATS stats[KNOB_MAX_NUM_THREADS]; }; // Draw Context @@ -470,11 +471,12 @@ struct SWR_CONTEXT HotTileMgr *pHotTileMgr; // Callback functions, passed in at create context time - PFN_LOAD_TILE pfnLoadTile; - PFN_STORE_TILE pfnStoreTile; - PFN_CLEAR_TILE pfnClearTile; - PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset; - PFN_UPDATE_STATS pfnUpdateStats; + PFN_LOAD_TILE pfnLoadTile; + PFN_STORE_TILE pfnStoreTile; + PFN_CLEAR_TILE pfnClearTile; + PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset; + PFN_UPDATE_STATS pfnUpdateStats; + PFN_UPDATE_STATS_FE pfnUpdateStatsFE; // Global Stats SWR_STATS stats[KNOB_MAX_NUM_THREADS]; @@ -492,3 +494,4 @@ void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId); void WakeAllThreads(SWR_CONTEXT *pContext); #define UPDATE_STAT(name, count) if (GetApiState(pDC).enableStats) { pDC->dynState.stats[workerId].name += count; } +#define UPDATE_STAT_FE(name, count) if (GetApiState(pDC).enableStats) { pDC->dynState.statsFE.name += count; } diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index e32f743..3014c7d 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -580,8 +580,8 @@ static void StreamOut( } } - UPDATE_STAT(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded); - UPDATE_STAT(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten); + UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded); + UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten); RDTSC_STOP(FEStreamout, 1, 0); } @@ -843,8 +843,8 @@ static void GeometryShaderStage( } // update GS pipeline stats - UPDATE_STAT(GsInvocations, numInputPrims * pState->instanceCount); - UPDATE_STAT(GsPrimitives, totalPrimsGenerated); + UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount); + UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated); 
RDTSC_STOP(FEGeometryShader, 1, 0); } @@ -1009,7 +1009,7 @@ static void TessellationStages( state.pfnHsFunc(GetPrivateState(pDC), &hsContext); RDTSC_STOP(FEHullShader, 0, 0); - UPDATE_STAT(HsInvocations, numPrims); + UPDATE_STAT_FE(HsInvocations, numPrims); const uint32_t* pPrimId = (const uint32_t*)&primID; @@ -1065,7 +1065,7 @@ static void TessellationStages( dsInvocations += KNOB_SIMD_WIDTH; } - UPDATE_STAT(DsInvocations, tsData.NumDomainPoints); + UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints); PA_TESS tessPa( pDC, @@ -1302,7 +1302,7 @@ void ProcessDraw( *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask)); } - UPDATE_STAT(IaVertices, GetNumInvocations(i, endVertex)); + UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex)); #if KNOB_ENABLE_TOSS_POINTS if (!KNOB_TOSS_FETCH) @@ -1312,7 +1312,7 @@ void ProcessDraw( state.pfnVertexFunc(GetPrivateState(pDC), &vsContext); RDTSC_STOP(FEVertexShader, 0, 0); - UPDATE_STAT(VsInvocations, GetNumInvocations(i, endVertex)); + UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex)); } } @@ -1335,7 +1335,7 @@ void ProcessDraw( { if (assemble) { - UPDATE_STAT(IaPrimitives, pa.NumPrims()); + UPDATE_STAT_FE(IaPrimitives, pa.NumPrims()); if (HasTessellationT::value) { diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index fdf5d7e..988de75 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -564,17 +564,27 @@ struct SWR_STATS uint64_t DepthPassCount; // Number of passing depth tests. Not exact. // Pipeline Stats + uint64_t PsInvocations; // Number of Pixel Shader invocations + uint64_t CsInvocations; // Number of Compute Shader invocations + +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_STATS_FE +/// +/// @brief All statistics generated by FE. 
+///////////////////////////////////////////////////////////////////////// +struct SWR_STATS_FE +{ uint64_t IaVertices; // Number of Fetch Shader vertices uint64_t IaPrimitives; // Number of PA primitives. uint64_t VsInvocations; // Number of Vertex Shader invocations uint64_t HsInvocations; // Number of Hull Shader invocations uint64_t DsInvocations; // Number of Domain Shader invocations uint64_t GsInvocations; // Number of Geometry Shader invocations - uint64_t PsInvocations; // Number of Pixel Shader invocations - uint64_t CsInvocations; // Number of Compute Shader invocations + uint64_t GsPrimitives; // Number of prims GS outputs. uint64_t CInvocations; // Number of clipper invocations uint64_t CPrimitives; // Number of clipper primitives. - uint64_t GsPrimitives; // Number of prims GS outputs. // Streamout Stats uint64_t SoPrimStorageNeeded[4]; diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index fb17af1..dce23b2 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -322,23 +322,9 @@ INLINE void UpdateClientStats(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC) for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) { stats.DepthPassCount += dynState.stats[i].DepthPassCount; - stats.IaVertices += dynState.stats[i].IaVertices; - stats.IaPrimitives += dynState.stats[i].IaPrimitives; - stats.VsInvocations += dynState.stats[i].VsInvocations; - stats.HsInvocations += dynState.stats[i].HsInvocations; - stats.DsInvocations += dynState.stats[i].DsInvocations; - stats.GsInvocations += dynState.stats[i].GsInvocations; + stats.PsInvocations += dynState.stats[i].PsInvocations; - stats.CInvocations += dynState.stats[i].CInvocations; stats.CsInvocations += dynState.stats[i].CsInvocations; - stats.CPrimitives += dynState.stats[i].CPrimitives; - stats.GsPrimitives += dynState.stats[i].GsPrimitives; - - for (uint32_t stream = 0; stream < 
MAX_SO_STREAMS; ++stream) - { - stats.SoPrimStorageNeeded[stream] += dynState.stats[i].SoPrimStorageNeeded[stream]; - stats.SoNumPrimsWritten[stream] += dynState.stats[i].SoNumPrimsWritten[stream]; - } } pContext->pfnUpdateStats(GetPrivateState(pDC), &stats); @@ -560,6 +546,11 @@ INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC) { _ReadWriteBarrier(); + if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStats) + { + pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &pDC->dynState.statsFE); + } + if (pContext->pfnUpdateSoWriteOffset) { for (uint32_t i = 0; i < MAX_SO_BUFFERS; ++i) diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp index 53d2b93..15e60cd 100644 --- a/src/gallium/drivers/swr/swr_context.cpp +++ b/src/gallium/drivers/swr/swr_context.cpp @@ -355,15 +355,29 @@ swr_UpdateStats(HANDLE hPrivateContext, const SWR_STATS *pStats) struct swr_context *ctx = (struct swr_context *)pDC->swr_ctx; SWR_STATS *pSwrStats = &ctx->stats; + pSwrStats->DepthPassCount += pStats->DepthPassCount; + pSwrStats->PsInvocations += pStats->PsInvocations; + pSwrStats->CsInvocations += pStats->CsInvocations; +} + +static void +swr_UpdateStatsFE(HANDLE hPrivateContext, const SWR_STATS_FE *pStats) +{ + swr_draw_context *pDC = (swr_draw_context*)hPrivateContext; + + if (!pDC) + return; + + struct swr_context *ctx = (struct swr_context *)pDC->swr_ctx; + + SWR_STATS_FE *pSwrStats = &ctx->statsFE; pSwrStats->IaVertices += pStats->IaVertices; pSwrStats->IaPrimitives += pStats->IaPrimitives; pSwrStats->VsInvocations += pStats->VsInvocations; pSwrStats->HsInvocations += pStats->HsInvocations; pSwrStats->DsInvocations += pStats->DsInvocations; pSwrStats->GsInvocations += pStats->GsInvocations; - pSwrStats->PsInvocations += pStats->PsInvocations; - pSwrStats->CsInvocations += pStats->CsInvocations; pSwrStats->CInvocations += pStats->CInvocations; pSwrStats->CPrimitives += pStats->CPrimitives; pSwrStats->GsPrimitives += 
pStats->GsPrimitives; @@ -389,6 +403,7 @@ swr_create_context(struct pipe_screen *p_screen, void *priv, unsigned flags) createInfo.pfnStoreTile = swr_StoreHotTile; createInfo.pfnClearTile = swr_StoreHotTileClear; createInfo.pfnUpdateStats = swr_UpdateStats; + createInfo.pfnUpdateStatsFE = swr_UpdateStatsFE; ctx->swrContext = SwrCreateContext(&createInfo); /* Init Load/Store/ClearTiles Tables */ diff --git a/src/gallium/drivers/swr/swr_context.h b/src/gallium/drivers/swr/swr_context.h index 4133720..b4553fb 100644 --- a/src/gallium/drivers/swr/swr_context.h +++ b/src/gallium/drivers/swr/swr_context.h @@ -159,6 +159,7 @@ struct swr_context { struct swr_draw_context swrDC; SWR_STATS stats; + SWR_STATS_FE statsFE; unsigned dirty; /**< Mask of SWR_NEW_x flags */ }; diff --git a/src/gallium/drivers/swr/swr_query.cpp b/src/gallium/drivers/swr/swr_query.cpp index 35d0e53..c51c529 100644 --- a/src/gallium/drivers/swr/swr_query.cpp +++ b/src/gallium/drivers/swr/swr_query.cpp @@ -94,6 +94,7 @@ swr_gather_stats(struct pipe_context *pipe, struct swr_query *pq) /* TODO: should fence instead of stalling pipeline */ SwrWaitForIdle(ctx->swrContext); memcpy(&result->core, &ctx->stats, sizeof(result->core)); + memcpy(&result->coreFE, &ctx->statsFE, sizeof(result->coreFE)); #if 0 if (!pq->fence) { @@ -150,17 +151,17 @@ swr_get_query_result(struct pipe_context *pipe, result->u64 = end->timestamp - start->timestamp; break; case PIPE_QUERY_PRIMITIVES_GENERATED: - result->u64 = end->core.IaPrimitives - start->core.IaPrimitives; + result->u64 = end->coreFE.IaPrimitives - start->coreFE.IaPrimitives; break; case PIPE_QUERY_PRIMITIVES_EMITTED: - result->u64 = end->core.SoNumPrimsWritten[index] - - start->core.SoNumPrimsWritten[index]; + result->u64 = end->coreFE.SoNumPrimsWritten[index] + - start->coreFE.SoNumPrimsWritten[index]; break; /* Structures */ case PIPE_QUERY_SO_STATISTICS: { struct pipe_query_data_so_statistics *so_stats = &result->so_statistics; - struct SWR_STATS *start = 
&pq->start.core; - struct SWR_STATS *end = &pq->end.core; + struct SWR_STATS_FE *start = &pq->start.coreFE; + struct SWR_STATS_FE *end = &pq->end.coreFE; so_stats->num_primitives_written = end->SoNumPrimsWritten[index] - start->SoNumPrimsWritten[index]; so_stats->primitives_storage_needed = @@ -176,21 +177,23 @@ swr_get_query_result(struct pipe_context *pipe, &result->pipeline_statistics; struct SWR_STATS *start = &pq->start.core; struct SWR_STATS *end = &pq->end.core; - p_stats->ia_vertices = end->IaVertices - start->IaVertices; - p_stats->ia_primitives = end->IaPrimitives - start->IaPrimitives; - p_stats->vs_invocations = end->VsInvocations - start->VsInvocations; - p_stats->gs_invocations = end->GsInvocations - start->GsInvocations; - p_stats->gs_primitives = end->GsPrimitives - start->GsPrimitives; - p_stats->c_invocations = end->CPrimitives - start->CPrimitives; - p_stats->c_primitives = end->CPrimitives - start->CPrimitives; + struct SWR_STATS_FE *startFE = &pq->start.coreFE; + struct SWR_STATS_FE *endFE = &pq->end.coreFE; + p_stats->ia_vertices = endFE->IaVertices - startFE->IaVertices; + p_stats->ia_primitives = endFE->IaPrimitives - startFE->IaPrimitives; + p_stats->vs_invocations = endFE->VsInvocations - startFE->VsInvocations; + p_stats->gs_invocations = endFE->GsInvocations - startFE->GsInvocations; + p_stats->gs_primitives = endFE->GsPrimitives - startFE->GsPrimitives; + p_stats->c_invocations = endFE->CPrimitives - startFE->CPrimitives; + p_stats->c_primitives = endFE->CPrimitives - startFE->CPrimitives; p_stats->ps_invocations = end->PsInvocations - start->PsInvocations; - p_stats->hs_invocations = end->HsInvocations - start->HsInvocations; - p_stats->ds_invocations = end->DsInvocations - start->DsInvocations; + p_stats->hs_invocations = endFE->HsInvocations - startFE->HsInvocations; + p_stats->ds_invocations = endFE->DsInvocations - startFE->DsInvocations; p_stats->cs_invocations = end->CsInvocations - start->CsInvocations; } break; case 
PIPE_QUERY_SO_OVERFLOW_PREDICATE: { - struct SWR_STATS *start = &pq->start.core; - struct SWR_STATS *end = &pq->end.core; + struct SWR_STATS_FE *start = &pq->start.coreFE; + struct SWR_STATS_FE *end = &pq->end.coreFE; uint64_t num_primitives_written = end->SoNumPrimsWritten[index] - start->SoNumPrimsWritten[index]; uint64_t primitives_storage_needed = diff --git a/src/gallium/drivers/swr/swr_query.h b/src/gallium/drivers/swr/swr_query.h index 0ab034d..931d687 100644 --- a/src/gallium/drivers/swr/swr_query.h +++ b/src/gallium/drivers/swr/swr_query.h @@ -29,6 +29,7 @@ struct swr_query_result { SWR_STATS core; + SWR_STATS_FE coreFE; uint64_t timestamp; }; -- 2.7.4