swr: [rasterizer core] Add macros for mapping ArchRast to buckets
authorTim Rowley <timothy.o.rowley@intel.com>
Tue, 6 Sep 2016 17:36:02 +0000 (12:36 -0500)
committerTim Rowley <timothy.o.rowley@intel.com>
Tue, 20 Sep 2016 01:10:19 +0000 (20:10 -0500)
Switch all RDTSC_START/STOP macros to use AR_BEGIN/END macros.

Signed-off-by: Tim Rowley <timothy.o.rowley@intel.com>
src/gallium/drivers/swr/rasterizer/core/api.cpp
src/gallium/drivers/swr/rasterizer/core/backend.cpp
src/gallium/drivers/swr/rasterizer/core/backend.h
src/gallium/drivers/swr/rasterizer/core/clip.cpp
src/gallium/drivers/swr/rasterizer/core/clip.h
src/gallium/drivers/swr/rasterizer/core/context.h
src/gallium/drivers/swr/rasterizer/core/frontend.cpp
src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
src/gallium/drivers/swr/rasterizer/core/threads.cpp
src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
src/gallium/drivers/swr/rasterizer/core/tilemgr.h

index 6bdb8f4..df87d14 100644 (file)
@@ -46,8 +46,6 @@
 #include "common/simdintrin.h"
 #include "common/os.h"
 
-#include "archrast/archrast.h"
-
 static const SWR_RECT g_MaxScissorRect = { 0, 0, KNOB_MAX_SCISSOR_X, KNOB_MAX_SCISSOR_Y };
 
 void SetupDefaultState(SWR_CONTEXT *pContext);
@@ -264,9 +262,9 @@ void QueueWork(SWR_CONTEXT *pContext)
     }
     else
     {
-        RDTSC_START(APIDrawWakeAllThreads);
+        AR_API_BEGIN(APIDrawWakeAllThreads, pDC->drawId);
         WakeAllThreads(pContext);
-        RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
+        AR_API_END(APIDrawWakeAllThreads, 1);
     }
 
     // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
@@ -286,7 +284,7 @@ INLINE void QueueDispatch(SWR_CONTEXT* pContext)
 
 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
 {
-    RDTSC_START(APIGetDrawContext);
+    AR_API_BEGIN(APIGetDrawContext, 0);
     // If current draw context is null then need to obtain a new draw context to use from ring.
     if (pContext->pCurDrawContext == nullptr)
     {
@@ -372,7 +370,7 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
         SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
     }
 
-    RDTSC_STOP(APIGetDrawContext, 0, 0);
+    AR_API_END(APIGetDrawContext, 0);
     return pContext->pCurDrawContext;
 }
 
@@ -418,13 +416,13 @@ void SetupDefaultState(SWR_CONTEXT *pContext)
 
 void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3)
 {
-    RDTSC_START(APISync);
-
     SWR_ASSERT(pfnFunc != nullptr);
 
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
+    AR_API_BEGIN(APISync, 0);
+
     pDC->FeWork.type = SYNC;
     pDC->FeWork.pfnWork = ProcessSync;
 
@@ -437,35 +435,35 @@ void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint
     //enqueue
     QueueDraw(pContext);
 
-    RDTSC_STOP(APISync, 1, 0);
+    AR_API_END(APISync, 1);
 }
 
 void SwrWaitForIdle(HANDLE hContext)
 {
     SWR_CONTEXT *pContext = GetContext(hContext);
 
-    RDTSC_START(APIWaitForIdle);
+    AR_API_BEGIN(APIWaitForIdle, 0);
 
     while (!pContext->dcRing.IsEmpty())
     {
         _mm_pause();
     }
 
-    RDTSC_STOP(APIWaitForIdle, 1, 0);
+    AR_API_END(APIWaitForIdle, 1);
 }
 
 void SwrWaitForIdleFE(HANDLE hContext)
 {
     SWR_CONTEXT *pContext = GetContext(hContext);
 
-    RDTSC_START(APIWaitForIdle);
+    AR_API_BEGIN(APIWaitForIdle, 0);
 
     while (pContext->drawsOutstandingFE > 0)
     {
         _mm_pause();
     }
 
-    RDTSC_STOP(APIWaitForIdle, 1, 0);
+    AR_API_END(APIWaitForIdle, 1);
 }
 
 void SwrSetVertexBuffers(
@@ -1080,11 +1078,11 @@ void DrawInstanced(
         return;
     }
 
-    RDTSC_START(APIDraw);
-
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
+    AR_API_BEGIN(APIDraw, pDC->drawId);
+
     uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
     uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
     uint32_t remainingVerts = numVertices;
@@ -1139,7 +1137,7 @@ void DrawInstanced(
     pDC = GetDrawContext(pContext);
     pDC->pState->state.rastState.cullMode = oldCullMode;
 
-    RDTSC_STOP(APIDraw, numVertices * numInstances, 0);
+    AR_API_END(APIDraw, numVertices * numInstances);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1200,14 +1198,12 @@ void DrawIndexedInstance(
         return;
     }
 
-    RDTSC_START(APIDrawIndexed);
-
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
     API_STATE* pState = &pDC->pState->state;
 
-    AR_BEGIN(AR_API_CTX, APIDrawIndexed, pDC->drawId);
-    AR_EVENT(AR_API_CTX, DrawIndexedInstance(topology, numIndices, indexOffset, baseVertex, numInstances, startInstance));
+    AR_API_BEGIN(APIDrawIndexed, pDC->drawId);
+    AR_API_EVENT(DrawIndexedInstance(topology, numIndices, indexOffset, baseVertex, numInstances, startInstance));
 
     uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
     uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
@@ -1280,8 +1276,7 @@ void DrawIndexedInstance(
     pDC = GetDrawContext(pContext);
     pDC->pState->state.rastState.cullMode = oldCullMode;
 
-    AR_END(AR_API_CTX, APIDrawIndexed, numIndices * numInstances);
-    RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0);
+    AR_API_END(APIDrawIndexed, numIndices * numInstances);
 }
 
 
@@ -1406,10 +1401,11 @@ void SwrDispatch(
         return;
     }
 
-    RDTSC_START(APIDispatch);
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
+    AR_API_BEGIN(APIDispatch, pDC->drawId);
+
     pDC->isCompute = true;      // This is a compute context.
 
     COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
@@ -1424,7 +1420,7 @@ void SwrDispatch(
     pDC->pDispatch->initialize(totalThreadGroups, pTaskData);
 
     QueueDispatch(pContext);
-    RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0);
+    AR_API_END(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ);
 }
 
 // Deswizzles, converts and stores current contents of the hot tiles to surface
@@ -1440,11 +1436,11 @@ void SWR_API SwrStoreTiles(
         return;
     }
 
-    RDTSC_START(APIStoreTiles);
-
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
+    AR_API_BEGIN(APIStoreTiles, pDC->drawId);
+
     pDC->FeWork.type = STORETILES;
     pDC->FeWork.pfnWork = ProcessStoreTiles;
     pDC->FeWork.desc.storeTiles.attachment = attachment;
@@ -1455,7 +1451,7 @@ void SWR_API SwrStoreTiles(
     //enqueue
     QueueDraw(pContext);
 
-    RDTSC_STOP(APIStoreTiles, 0, 0);
+    AR_API_END(APIStoreTiles, 1);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1479,11 +1475,11 @@ void SWR_API SwrClearRenderTarget(
         return;
     }
 
-    RDTSC_START(APIClearRenderTarget);
-
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
+    AR_API_BEGIN(APIClearRenderTarget, pDC->drawId);
+
     CLEAR_FLAGS flags;
     flags.bits = 0;
     flags.mask = clearMask;
@@ -1503,7 +1499,7 @@ void SWR_API SwrClearRenderTarget(
     // enqueue draw
     QueueDraw(pContext);
 
-    RDTSC_STOP(APIClearRenderTarget, 0, pDC->drawId);
+    AR_API_END(APIClearRenderTarget, 1);
 }
 
 //////////////////////////////////////////////////////////////////////////
index 0e92ccf..d3d114e 100644 (file)
@@ -47,10 +47,10 @@ static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
 /// @param threadGroupId - the linear index for the thread group within the dispatch.
 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
 {
-    RDTSC_START(BEDispatch);
-
     SWR_CONTEXT *pContext = pDC->pContext;
 
+    AR_BEGIN(BEDispatch, pDC->drawId);
+
     const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
     SWR_ASSERT(pTaskData != nullptr);
 
@@ -75,7 +75,7 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup
 
     UPDATE_STAT(CsInvocations, state.totalThreadsInGroup);
 
-    RDTSC_STOP(BEDispatch, 1, 0);
+    AR_END(BEDispatch, 1);
 }
 
 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
@@ -180,16 +180,17 @@ INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, ui
 
 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
 {
+    SWR_CONTEXT *pContext = pDC->pContext;
+
     if (KNOB_FAST_CLEAR)
     {
         CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
-        SWR_CONTEXT *pContext = pDC->pContext;
         SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
         uint32_t numSamples = GetNumSamples(sampleCount);
 
         SWR_ASSERT(pClear->flags.bits != 0); // shouldn't be here without a reason.
 
-        RDTSC_START(BEClear);
+        AR_BEGIN(BEClear, pDC->drawId);
 
         if (pClear->flags.mask & SWR_CLEAR_COLOR)
         {
@@ -217,13 +218,13 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo
             pHotTile->state = HOTTILE_CLEAR;
         }
 
-        RDTSC_STOP(BEClear, 0, 0);
+        AR_END(BEClear, 1);
     }
     else
     {
         // Legacy clear
         CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
-        RDTSC_START(BEClear);
+        AR_BEGIN(BEClear, pDC->drawId);
 
         if (pClear->flags.mask & SWR_CLEAR_COLOR)
         {
@@ -265,17 +266,18 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo
             pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, clearData, pClear->rect);
         }
 
-        RDTSC_STOP(BEClear, 0, 0);
+        AR_END(BEClear, 1);
     }
 }
 
 
 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
 {
-    RDTSC_START(BEStoreTiles);
     STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData;
     SWR_CONTEXT *pContext = pDC->pContext;
 
+    AR_BEGIN(BEStoreTiles, pDC->drawId);
+
 #ifdef KNOB_ENABLE_RDTSC
     uint32_t numTiles = 0;
 #endif
@@ -326,7 +328,7 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile
             pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
         }
     }
-    RDTSC_STOP(BEStoreTiles, numTiles, pDC->drawId);
+    AR_END(BEStoreTiles, numTiles);
 }
 
 
@@ -387,8 +389,10 @@ simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscala
 template<typename T>
 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
 {
-    RDTSC_START(BESingleSampleBackend);
-    RDTSC_START(BESetup);
+    SWR_CONTEXT *pContext = pDC->pContext;
+
+    AR_BEGIN(BESingleSampleBackend, pDC->drawId);
+    AR_BEGIN(BESetup, pDC->drawId);
 
     const API_STATE& state = GetApiState(pDC);
     const SWR_RASTSTATE& rastState = state.rastState;
@@ -423,7 +427,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
         pColorBase[rt] = renderBuffers.pColor[rt];
     }
     uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
-    RDTSC_STOP(BESetup, 0, 0);
+    AR_END(BESetup, 1);
 
     SWR_PS_CONTEXT psContext;
     psContext.pAttribs = work.pAttribs;
@@ -462,7 +466,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                     generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, pBlendState->sampleMask);
                 }
 
-                RDTSC_START(BEBarycentric);
+                AR_BEGIN(BEBarycentric, pDC->drawId);
                 CalcPixelBarycentrics(coeffs, psContext);
 
                 // for 1x case, centroid is pixel center
@@ -475,7 +479,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 // interpolate and quantize z
                 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
                 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-                RDTSC_STOP(BEBarycentric, 0, 0);
+                AR_END(BEBarycentric, 1);
 
                 simdmask clipCoverageMask = coverageMask & MASK;
                 // interpolate user clip distance if available
@@ -492,10 +496,10 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 // Early-Z?
                 if(T::bCanEarlyZ)
                 {
-                    RDTSC_START(BEEarlyDepthTest);
+                    AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
                     depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                                                      psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
-                    RDTSC_STOP(BEEarlyDepthTest, 0, 0);
+                    AR_END(BEEarlyDepthTest, 0);
 
                     // early-exit if no pixels passed depth or earlyZ is forced on
                     if(pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
@@ -514,20 +518,20 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 psContext.activeMask = _simd_castps_si(vCoverageMask);
 
                 // execute pixel shader
-                RDTSC_START(BEPixelShader);
+                AR_BEGIN(BEPixelShader, pDC->drawId);
                 UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
                 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
-                RDTSC_STOP(BEPixelShader, 0, 0);
+                AR_END(BEPixelShader, 0);
 
                 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
 
                 // late-Z
                 if(!T::bCanEarlyZ)
                 {
-                    RDTSC_START(BELateDepthTest);
+                    AR_BEGIN(BELateDepthTest, pDC->drawId);
                     depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                                                         psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
-                    RDTSC_STOP(BELateDepthTest, 0, 0);
+                    AR_END(BELateDepthTest, 0);
 
                     if(!_simd_movemask_ps(depthPassMask))
                     {
@@ -543,7 +547,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 UPDATE_STAT(DepthPassCount, statCount);
 
                 // output merger
-                RDTSC_START(BEOutputMerger);
+                AR_BEGIN(BEOutputMerger, pDC->drawId);
                 OutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets);
 
                 // do final depth write after all pixel kills
@@ -552,11 +556,11 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                     DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                         pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
                 }
-                RDTSC_STOP(BEOutputMerger, 0, 0);
+                AR_END(BEOutputMerger, 0);
             }
 
 Endtile:
-            RDTSC_START(BEEndTile);
+            AR_BEGIN(BEEndTile, pDC->drawId);
             coverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
             {
@@ -569,17 +573,19 @@ Endtile:
             {
                 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
             }
-            RDTSC_STOP(BEEndTile, 0, 0);
+            AR_END(BEEndTile, 0);
         }
     }
-    RDTSC_STOP(BESingleSampleBackend, 0, 0);
+    AR_END(BESingleSampleBackend, 0);
 }
 
 template<typename T>
 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
 {
-    RDTSC_START(BESampleRateBackend);
-    RDTSC_START(BESetup);
+    SWR_CONTEXT *pContext = pDC->pContext;
+
+    AR_BEGIN(BESampleRateBackend, pDC->drawId);
+    AR_BEGIN(BESetup, pDC->drawId);
 
     const API_STATE& state = GetApiState(pDC);
     const SWR_RASTSTATE& rastState = state.rastState;
@@ -613,7 +619,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
         pColorBase[rt] = renderBuffers.pColor[rt];
     }
     uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
-    RDTSC_STOP(BESetup, 0, 0);
+    AR_END(BESetup, 0);
 
     SWR_PS_CONTEXT psContext;
     psContext.pAttribs = work.pAttribs;
@@ -643,9 +649,9 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
             // pixel center
             psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
 
-            RDTSC_START(BEBarycentric);
+            AR_BEGIN(BEBarycentric, pDC->drawId);
             CalcPixelBarycentrics(coeffs, psContext);
-            RDTSC_STOP(BEBarycentric, 0, 0);
+            AR_END(BEBarycentric, 0);
 
             if(T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
             {
@@ -657,7 +663,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
             if(T::bCentroidPos)
             {
                 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
-                RDTSC_START(BEBarycentric);
+                AR_BEGIN(BEBarycentric, pDC->drawId);
                 if(T::bIsStandardPattern)
                 {
                     CalcCentroidPos<T>(psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
@@ -668,7 +674,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                     psContext.vY.centroid = _simd_add_ps(psContext.vY.UL, _simd_set1_ps(0.5f));
                 }
                 CalcCentroidBarycentrics(coeffs, psContext, psContext.vX.UL, psContext.vY.UL);
-                RDTSC_STOP(BEBarycentric, 0, 0);
+                AR_END(BEBarycentric, 0);
             }
             else
             {
@@ -681,7 +687,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                 simdmask coverageMask = work.coverageMask[sample] & MASK;
                 if (coverageMask)
                 {
-                    RDTSC_START(BEBarycentric);
+                    AR_BEGIN(BEBarycentric, pDC->drawId);
                     // calculate per sample positions
                     psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
                     psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
@@ -691,7 +697,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                     // interpolate and quantize z
                     psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
                     psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-                    RDTSC_STOP(BEBarycentric, 0, 0);
+                    AR_END(BEBarycentric, 0);
 
                     // interpolate user clip distance if available
                     if (rastState.clipDistanceMask)
@@ -711,10 +717,10 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                     // Early-Z?
                     if (T::bCanEarlyZ)
                     {
-                        RDTSC_START(BEEarlyDepthTest);
+                        AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
                         depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                                               psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
-                        RDTSC_STOP(BEEarlyDepthTest, 0, 0);
+                        AR_END(BEEarlyDepthTest, 0);
 
                         // early-exit if no samples passed depth or earlyZ is forced on.
                         if (pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
@@ -734,20 +740,20 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                     psContext.activeMask = _simd_castps_si(vCoverageMask);
 
                     // execute pixel shader
-                    RDTSC_START(BEPixelShader);
+                    AR_BEGIN(BEPixelShader, pDC->drawId);
                     UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
                     state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
-                    RDTSC_STOP(BEPixelShader, 0, 0);
+                    AR_END(BEPixelShader, 0);
 
                     vCoverageMask = _simd_castsi_ps(psContext.activeMask);
 
                     // late-Z
                     if (!T::bCanEarlyZ)
                     {
-                        RDTSC_START(BELateDepthTest);
+                        AR_BEGIN(BELateDepthTest, pDC->drawId);
                         depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                                               psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
-                        RDTSC_STOP(BELateDepthTest, 0, 0);
+                        AR_END(BELateDepthTest, 0);
 
                         if (!_simd_movemask_ps(depthPassMask))
                         {
@@ -765,7 +771,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                     UPDATE_STAT(DepthPassCount, statCount);
 
                     // output merger
-                    RDTSC_START(BEOutputMerger);
+                    AR_BEGIN(BEOutputMerger, pDC->drawId);
                     OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets);
 
                     // do final depth write after all pixel kills
@@ -774,11 +780,11 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                         DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                             pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
                     }
-                    RDTSC_STOP(BEOutputMerger, 0, 0);
+                    AR_END(BEOutputMerger, 0);
                 }
                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             }
-            RDTSC_START(BEEndTile);
+            AR_BEGIN(BEEndTile, pDC->drawId);
             if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
             {
                 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
@@ -790,17 +796,19 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
             {
                 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
             }
-            RDTSC_STOP(BEEndTile, 0, 0);
+            AR_END(BEEndTile, 0);
         }
     }
-    RDTSC_STOP(BESampleRateBackend, 0, 0);
+    AR_END(BESampleRateBackend, 0);
 }
 
 template<typename T>
 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
 {
-    RDTSC_START(BEPixelRateBackend);
-    RDTSC_START(BESetup);
+    SWR_CONTEXT *pContext = pDC->pContext;
+
+    AR_BEGIN(BEPixelRateBackend, pDC->drawId);
+    AR_BEGIN(BESetup, pDC->drawId);
 
     const API_STATE& state = GetApiState(pDC);
     const SWR_RASTSTATE& rastState = state.rastState;
@@ -834,7 +842,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
         pColorBase[rt] = renderBuffers.pColor[rt];
     }
     uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
-    RDTSC_STOP(BESetup, 0, 0);
+    AR_END(BESetup, 0);
 
     SWR_PS_CONTEXT psContext;
     psContext.pAttribs = work.pAttribs;
@@ -852,7 +860,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
 
     psContext.sampleIndex = 0;
     
-    PixelRateZTestLoop<T> PixelRateZTest(pDC, work, coeffs, state, pDepthBase, pStencilBase, rastState.clipDistanceMask);
+    PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBase, pStencilBase, rastState.clipDistanceMask);
 
     for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
@@ -868,9 +876,9 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
             // set pixel center positions
             psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
 
-            RDTSC_START(BEBarycentric);
+            AR_BEGIN(BEBarycentric, pDC->drawId);
             CalcPixelBarycentrics(coeffs, psContext);
-            RDTSC_STOP(BEBarycentric, 0, 0);
+            AR_END(BEBarycentric, 0);
 
             if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
             {
@@ -882,7 +890,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
             if(T::bCentroidPos)
             {
                 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
-                RDTSC_START(BEBarycentric);
+                AR_BEGIN(BEBarycentric, pDC->drawId);
                 if(T::bIsStandardPattern)
                 {
                     CalcCentroidPos<T>(psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
@@ -894,7 +902,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
                 }
 
                 CalcCentroidBarycentrics(coeffs, psContext, psContext.vX.UL, psContext.vY.UL);
-                RDTSC_STOP(BEBarycentric, 0, 0);
+                AR_END(BEBarycentric, 0);
             }
             else
             {
@@ -921,11 +929,11 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
 
             if(pPSState->usesSourceDepth)
             {
-                RDTSC_START(BEBarycentric);
+                AR_BEGIN(BEBarycentric, pDC->drawId);
                 // interpolate and quantize z
                 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
                 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-                RDTSC_STOP(BEBarycentric, 0, 0);
+                AR_END(BEBarycentric, 0);
             }
 
             // pixels that are currently active
@@ -933,10 +941,10 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
             psContext.oMask = T::MultisampleT::FullSampleMask();
 
             // execute pixel shader
-            RDTSC_START(BEPixelShader);
+            AR_BEGIN(BEPixelShader, pDC->drawId);
             state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
             UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
-            RDTSC_STOP(BEPixelShader, 0, 0);
+            AR_END(BEPixelShader, 0);
 
             // update active lanes to remove any discarded or oMask'd pixels
             activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
@@ -956,7 +964,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
             // loop over all samples, broadcasting the results of the PS to all passing pixels
             for(uint32_t sample = 0; sample < GetNumOMSamples<T>(pBlendState->sampleCount); sample++)
             {
-                RDTSC_START(BEOutputMerger);
+                AR_BEGIN(BEOutputMerger, pDC->drawId);
                 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
                 uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0;
                 simdscalar coverageMask, depthMask;
@@ -971,7 +979,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
                     if(!_simd_movemask_ps(depthMask))
                     {
                         // stencil should already have been written in early/lateZ tests
-                        RDTSC_STOP(BEOutputMerger, 0, 0);
+                        AR_END(BEOutputMerger, 0);
                         continue;
                     }
                 }
@@ -987,10 +995,10 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
                     DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
                                       pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
                 }
-                RDTSC_STOP(BEOutputMerger, 0, 0);        
+                AR_END(BEOutputMerger, 0);
             }
 Endtile:
-            RDTSC_START(BEEndTile);
+            AR_BEGIN(BEEndTile, pDC->drawId);
             for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
             {
                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
@@ -1008,19 +1016,21 @@ Endtile:
             {
                 pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
             }
-            RDTSC_STOP(BEEndTile, 0, 0);
+            AR_END(BEEndTile, 0);
         }
     }
-    RDTSC_STOP(BEPixelRateBackend, 0, 0);
+    AR_END(BEPixelRateBackend, 0);
 }
 // optimized backend flow with NULL PS
 template<uint32_t sampleCountT>
 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
 {
-    RDTSC_START(BENullBackend);
+    SWR_CONTEXT *pContext = pDC->pContext;
+
+    AR_BEGIN(BENullBackend, pDC->drawId);
     ///@todo: handle center multisample pattern
     typedef SwrBackendTraits<sampleCountT, SWR_MSAA_STANDARD_PATTERN> T;
-    RDTSC_START(BESetup);
+    AR_BEGIN(BESetup, pDC->drawId);
 
     const API_STATE& state = GetApiState(pDC);
     const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
@@ -1043,7 +1053,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
 
     uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
 
-    RDTSC_STOP(BESetup, 0, 0);
+    AR_END(BESetup, 0);
 
     SWR_PS_CONTEXT psContext;
     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
@@ -1065,7 +1075,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
                 simdmask coverageMask = work.coverageMask[sample] & MASK;
                 if (coverageMask)
                 {
-                    RDTSC_START(BEBarycentric);
+                    AR_BEGIN(BEBarycentric, pDC->drawId);
                     // calculate per sample positions
                     psContext.vX.sample = _simd_add_ps(vXSamplePosUL, T::MultisampleT::vX(sample));
                     psContext.vY.sample = _simd_add_ps(vYSamplePosUL, T::MultisampleT::vY(sample));
@@ -1076,7 +1086,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
                     psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
                     psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
 
-                    RDTSC_STOP(BEBarycentric, 0, 0);
+                    AR_END(BEBarycentric, 0);
 
                     // interpolate user clip distance if available
                     if (rastState.clipDistanceMask)
@@ -1092,12 +1102,12 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
                     uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
                     uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
 
-                    RDTSC_START(BEEarlyDepthTest);
+                    AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
                     simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                         psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                     DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                         pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
-                    RDTSC_STOP(BEEarlyDepthTest, 0, 0);
+                    AR_END(BEEarlyDepthTest, 0);
 
                     uint32_t statMask = _simd_movemask_ps(depthPassMask);
                     uint32_t statCount = _mm_popcnt_u32(statMask);
@@ -1109,7 +1119,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
             pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
         }
     }
-    RDTSC_STOP(BENullBackend, 0, 0);
+    AR_END(BENullBackend, 0);
 }
 
 void InitClearTilesTable()
index fde5a3f..9d2f317 100644 (file)
@@ -432,15 +432,17 @@ INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
 template<typename T>
 struct PixelRateZTestLoop
 {
-    PixelRateZTestLoop(DRAW_CONTEXT *DC, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState, 
+    PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState,
                        uint8_t*& depthBase, uint8_t*& stencilBase, const uint8_t ClipDistanceMask) :
-                       work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
+                       pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
                        clipDistanceMask(ClipDistanceMask), pDepthBase(depthBase), pStencilBase(stencilBase) {};
            
     INLINE
     uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext, 
                         const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0)
     {
+        SWR_CONTEXT *pContext = pDC->pContext;
+
         uint32_t statCount = 0;
         simdscalar anyDepthSamplePassed = _simd_setzero_ps();
         for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
@@ -454,7 +456,7 @@ struct PixelRateZTestLoop
                 continue;
             }
 
-            RDTSC_START(BEBarycentric);
+            AR_BEGIN(BEBarycentric, pDC->drawId);
             // calculate per sample positions
             psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
             psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
@@ -472,7 +474,7 @@ struct PixelRateZTestLoop
                 vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
                 vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
             }
-            RDTSC_STOP(BEBarycentric, 0, 0);
+            AR_END(BEBarycentric, 0);
 
             ///@todo: perspective correct vs non-perspective correct clipping?
             // if clip distances are enabled, we need to interpolate for each sample
@@ -488,13 +490,14 @@ struct PixelRateZTestLoop
             uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
 
             // ZTest for this sample
-            RDTSC_START(BEDepthBucket);
+            ///@todo Need to uncomment out this bucket.
+            //AR_BEGIN(BEDepthBucket, pDC->drawId);
             depthPassMask[sample] = vCoverageMask[sample];
             stencilPassMask[sample] = vCoverageMask[sample];
             depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                                                      vZ[sample], pDepthSample, vCoverageMask[sample], 
                                                      pStencilSample, &stencilPassMask[sample]);
-            RDTSC_STOP(BEDepthBucket, 0, 0);
+            //AR_END(BEDepthBucket, 0);
 
             // early-exit if no pixels passed depth or earlyZ is forced on
             if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
@@ -525,6 +528,9 @@ struct PixelRateZTestLoop
 
 private:
     // functor inputs
+    DRAW_CONTEXT* pDC;
+    uint32_t workerId;
+
     const SWR_TRIANGLE_DESC& work;
     const BarycentricCoeffs& coeffs;
     const API_STATE& state;
index 21cbb0a..7b1e09d 100644 (file)
@@ -181,24 +181,27 @@ void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *
 
 void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
 {
-    RDTSC_START(FEClipTriangles);
+    SWR_CONTEXT *pContext = pDC->pContext;
+    AR_BEGIN(FEClipTriangles, pDC->drawId);
     Clipper<3> clipper(workerId, pDC);
     clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx);
-    RDTSC_STOP(FEClipTriangles, 1, 0);
+    AR_END(FEClipTriangles, 1);
 }
 
 void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
 {
-    RDTSC_START(FEClipLines);
+    SWR_CONTEXT *pContext = pDC->pContext;
+    AR_BEGIN(FEClipLines, pDC->drawId);
     Clipper<2> clipper(workerId, pDC);
     clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx);
-    RDTSC_STOP(FEClipLines, 1, 0);
+    AR_END(FEClipLines, 1);
 }
 void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
 {
-    RDTSC_START(FEClipPoints);
+    SWR_CONTEXT *pContext = pDC->pContext;
+    AR_BEGIN(FEClipPoints, pDC->drawId);
     Clipper<1> clipper(workerId, pDC);
     clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx);
-    RDTSC_STOP(FEClipPoints, 1, 0);
+    AR_END(FEClipPoints, 1);
 }
 
index 2f3ce85..43bc522 100644 (file)
@@ -501,6 +501,10 @@ public:
     // execute the clipper stage
     void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
     {
+        SWR_ASSERT(pa.pDC != nullptr);
+
+        SWR_CONTEXT *pContext = pa.pDC->pContext;
+
         // set up binner based on PA state
         PFN_PROCESS_PRIMS pfnBinner;
         switch (pa.binTopology)
@@ -548,11 +552,11 @@ public:
 
         if (clipMask)
         {
-            RDTSC_START(FEGuardbandClip);
+            AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
             // we have to clip tris, execute the clipper, which will also
             // call the binner
             ClipSimd(vMask(primMask), vMask(clipMask), pa, primId, viewportIdx);
-            RDTSC_STOP(FEGuardbandClip, 1, 0);
+            AR_END(FEGuardbandClip, 1);
         }
         else if (validMask)
         {
index 6d63e08..a4dbbc5 100644 (file)
@@ -42,6 +42,7 @@
 #include "common/simdintrin.h"
 #include "core/threads.h"
 #include "ringbuffer.h"
+#include "archrast/archrast.h"
 
 // x.8 fixed point precision values
 #define FIXED_POINT_SHIFT 8
@@ -515,15 +516,30 @@ struct SWR_CONTEXT
 #define UPDATE_STAT_FE(name, count) if (GetApiState(pDC).enableStats) { pDC->dynState.statsFE.name += count; }
 
 // ArchRast instrumentation framework
-#ifdef KNOB_ENABLE_AR
-#define AR_WORKER_CTX  pDC->pContext->pArContext[workerId]
-#define AR_API_CTX     pDC->pContext->pArContext[pContext->NumWorkerThreads]
+#define AR_WORKER_CTX  pContext->pArContext[workerId]
+#define AR_API_CTX     pContext->pArContext[pContext->NumWorkerThreads]
 
-#define AR_BEGIN(ctx, type, id)    ArchRast::dispatch(ctx, ArchRast::Start(ArchRast::type, id))
-#define AR_END(ctx, type, count)   ArchRast::dispatch(ctx, ArchRast::End(ArchRast::type, count))
-#define AR_EVENT(ctx, event)       ArchRast::dispatch(ctx, ArchRast::event)
+#ifdef KNOB_ENABLE_AR
+    #define _AR_BEGIN(ctx, type, id)    ArchRast::dispatch(ctx, ArchRast::Start(ArchRast::type, id))
+    #define _AR_END(ctx, type, count)   ArchRast::dispatch(ctx, ArchRast::End(ArchRast::type, count))
+    #define _AR_EVENT(ctx, event)       ArchRast::dispatch(ctx, ArchRast::event)
 #else
-#define AR_BEGIN(ctx, type, id)
-#define AR_END(ctx, type, id)
-#define AR_EVENT(ctx, event)
-#endif
\ No newline at end of file
+    #ifdef KNOB_ENABLE_RDTSC
+        #define _AR_BEGIN(ctx, type, id) (void)ctx; RDTSC_START(type)
+        #define _AR_END(ctx, type, id)   RDTSC_STOP(type, id, 0)
+    #else
+        #define _AR_BEGIN(ctx, type, id) (void)ctx
+        #define _AR_END(ctx, type, id)
+    #endif
+    #define _AR_EVENT(ctx, event)
+#endif
+
+// Use these macros for api thread.
+#define AR_API_BEGIN(type, id) _AR_BEGIN(AR_API_CTX, type, id)
+#define AR_API_END(type, count) _AR_END(AR_API_CTX, type, count)
+#define AR_API_EVENT(event) _AR_EVENT(AR_API_CTX, event)
+
+// Use these macros for worker threads.
+#define AR_BEGIN(type, id) _AR_BEGIN(AR_WORKER_CTX, type, id)
+#define AR_END(type, count) _AR_END(AR_WORKER_CTX, type, count)
+#define AR_EVENT(event) _AR_EVENT(AR_WORKER_CTX, event)
index db47078..decc161 100644 (file)
@@ -130,7 +130,7 @@ void ProcessStoreTiles(
     uint32_t workerId,
     void *pUserData)
 {
-    RDTSC_START(FEProcessStoreTiles);
+    AR_BEGIN(FEProcessStoreTiles, pDC->drawId);
     MacroTileMgr *pTileMgr = pDC->pTileMgr;
     STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
 
@@ -155,7 +155,7 @@ void ProcessStoreTiles(
         }
     }
 
-    RDTSC_STOP(FEProcessStoreTiles, 0, pDC->drawId);
+    AR_END(FEProcessStoreTiles, 0);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -171,7 +171,7 @@ void ProcessDiscardInvalidateTiles(
     uint32_t workerId,
     void *pUserData)
 {
-    RDTSC_START(FEProcessInvalidateTiles);
+    AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
     DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
     MacroTileMgr *pTileMgr = pDC->pTileMgr;
 
@@ -210,7 +210,7 @@ void ProcessDiscardInvalidateTiles(
         }
     }
 
-    RDTSC_STOP(FEProcessInvalidateTiles, 0, pDC->drawId);
+    AR_END(FEProcessInvalidateTiles, 0);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -542,7 +542,9 @@ static void StreamOut(
     uint32_t* pPrimData,
     uint32_t streamIndex)
 {
-    RDTSC_START(FEStreamout);
+    SWR_CONTEXT *pContext = pDC->pContext;
+
+    AR_BEGIN(FEStreamout, pDC->drawId);
 
     const API_STATE& state = GetApiState(pDC);
     const SWR_STREAMOUT_STATE &soState = state.soState;
@@ -615,7 +617,7 @@ static void StreamOut(
     UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
     UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
 
-    RDTSC_STOP(FEStreamout, 1, 0);
+    AR_END(FEStreamout, 1);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -698,7 +700,9 @@ static void GeometryShaderStage(
     uint32_t* pSoPrimData,
     simdscalari primID)
 {
-    RDTSC_START(FEGeometryShader);
+    SWR_CONTEXT *pContext = pDC->pContext;
+
+    AR_BEGIN(FEGeometryShader, pDC->drawId);
 
     const API_STATE& state = GetApiState(pDC);
     const SWR_GS_STATE* pState = &state.gsState;
@@ -895,7 +899,7 @@ static void GeometryShaderStage(
     UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
     UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
 
-    RDTSC_STOP(FEGeometryShader, 1, 0);
+    AR_END(FEGeometryShader, 1);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -990,6 +994,7 @@ static void TessellationStages(
     uint32_t* pSoPrimData,
     simdscalari primID)
 {
+    SWR_CONTEXT *pContext = pDC->pContext;
     const API_STATE& state = GetApiState(pDC);
     const SWR_TS_STATE& tsState = state.tsState;
 
@@ -1053,9 +1058,9 @@ static void TessellationStages(
     hsContext.mask = GenerateMask(numPrims);
 
     // Run the HS
-    RDTSC_START(FEHullShader);
+    AR_BEGIN(FEHullShader, pDC->drawId);
     state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
-    RDTSC_STOP(FEHullShader, 0, 0);
+    AR_END(FEHullShader, 0);
 
     UPDATE_STAT_FE(HsInvocations, numPrims);
 
@@ -1065,9 +1070,9 @@ static void TessellationStages(
     {
         // Run Tessellator
         SWR_TS_TESSELLATED_DATA tsData = { 0 };
-        RDTSC_START(FETessellation);
+        AR_BEGIN(FETessellation, pDC->drawId);
         TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
-        RDTSC_STOP(FETessellation, 0, 0);
+        AR_END(FETessellation, 0);
 
         if (tsData.NumPrimitives == 0)
         {
@@ -1107,9 +1112,9 @@ static void TessellationStages(
         {
             dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
 
-            RDTSC_START(FEDomainShader);
+            AR_BEGIN(FEDomainShader, pDC->drawId);
             state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
-            RDTSC_STOP(FEDomainShader, 0, 0);
+            AR_END(FEDomainShader, 0);
 
             dsInvocations += KNOB_SIMD_WIDTH;
         }
@@ -1142,12 +1147,12 @@ static void TessellationStages(
                 if (HasRastT::value)
                 {
                     simdvector prim[3]; // Only deal with triangles, lines, or points
-                    RDTSC_START(FEPAAssemble);
+                    AR_BEGIN(FEPAAssemble, pDC->drawId);
 #if SWR_ENABLE_ASSERTS
                     bool assemble =
 #endif
                         tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
-                    RDTSC_STOP(FEPAAssemble, 1, 0);
+                    AR_END(FEPAAssemble, 1);
                     SWR_ASSERT(assemble);
 
                     SWR_ASSERT(pfnClipFunc);
@@ -1196,7 +1201,7 @@ void ProcessDraw(
     }
 #endif
 
-    RDTSC_START(FEProcessDraw);
+    AR_BEGIN(FEProcessDraw, pDC->drawId);
 
     DRAW_WORK&          work = *(DRAW_WORK*)pUserData;
     const API_STATE&    state = GetApiState(pDC);
@@ -1334,9 +1339,9 @@ void ProcessDraw(
             {
 
                 // 1. Execute FS/VS for a single SIMD.
-                RDTSC_START(FEFetchShader);
+                AR_BEGIN(FEFetchShader, pDC->drawId);
                 state.pfnFetchFunc(fetchInfo, vin);
-                RDTSC_STOP(FEFetchShader, 0, 0);
+                AR_END(FEFetchShader, 0);
 
                 // forward fetch generated vertex IDs to the vertex shader
                 vsContext.VertexID = fetchInfo.VertexID;
@@ -1356,9 +1361,9 @@ void ProcessDraw(
                 if (!KNOB_TOSS_FETCH)
 #endif
                 {
-                    RDTSC_START(FEVertexShader);
+                    AR_BEGIN(FEVertexShader, pDC->drawId);
                     state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
-                    RDTSC_STOP(FEVertexShader, 0, 0);
+                    AR_END(FEVertexShader, 0);
 
                     UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                 }
@@ -1369,9 +1374,9 @@ void ProcessDraw(
             {
                 simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                 // PaAssemble returns false if there is not enough verts to assemble.
-                RDTSC_START(FEPAAssemble);
+                AR_BEGIN(FEPAAssemble, pDC->drawId);
                 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
-                RDTSC_STOP(FEPAAssemble, 1, 0);
+                AR_END(FEPAAssemble, 1);
 
 #if KNOB_ENABLE_TOSS_POINTS
                 if (!KNOB_TOSS_FETCH)
@@ -1428,7 +1433,7 @@ void ProcessDraw(
         pa.Reset();
     }
 
-    RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId);
+    AR_END(FEProcessDraw, numPrims * work.numInstances);
 }
 
 struct FEDrawChooser
@@ -1787,7 +1792,9 @@ void BinTriangles(
     simdscalari primID,
     simdscalari viewportIdx)
 {
-    RDTSC_START(FEBinTriangles);
+    SWR_CONTEXT *pContext = pDC->pContext;
+
+    AR_BEGIN(FEBinTriangles, pDC->drawId);
 
     const API_STATE& state = GetApiState(pDC);
     const SWR_RASTSTATE& rastState = state.rastState;
@@ -2168,7 +2175,7 @@ void BinTriangles(
     }
 
 endBinTriangles:
-    RDTSC_STOP(FEBinTriangles, 1, 0);
+    AR_END(FEBinTriangles, 1);
 }
 
 struct FEBinTrianglesChooser
@@ -2204,7 +2211,9 @@ void BinPoints(
     simdscalari primID,
     simdscalari viewportIdx)
 {
-    RDTSC_START(FEBinPoints);
+    SWR_CONTEXT *pContext = pDC->pContext;
+
+    AR_BEGIN(FEBinPoints, pDC->drawId);
 
     simdvector& primVerts = prim[0];
 
@@ -2519,10 +2528,7 @@ void BinPoints(
         }
     }
 
-
-
-    
-    RDTSC_STOP(FEBinPoints, 1, 0);
+    AR_END(FEBinPoints, 1);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -2542,7 +2548,9 @@ void BinLines(
     simdscalari primID,
     simdscalari viewportIdx)
 {
-    RDTSC_START(FEBinLines);
+    SWR_CONTEXT *pContext = pDC->pContext;
+
+    AR_BEGIN(FEBinLines, pDC->drawId);
 
     const API_STATE& state = GetApiState(pDC);
     const SWR_RASTSTATE& rastState = state.rastState;
@@ -2765,5 +2773,5 @@ void BinLines(
 
 endBinLines:
 
-    RDTSC_STOP(FEBinLines, 1, 0);
+    AR_END(FEBinLines, 1);
 }
index c9380da..6d4e504 100644 (file)
@@ -758,7 +758,7 @@ INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1,
 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
 struct GenerateSVInnerCoverage
 {
-    INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, EDGE*, double*,  uint64_t &){};
+    INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*,  uint64_t &){};
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -768,8 +768,10 @@ struct GenerateSVInnerCoverage
 template <typename RT>
 struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
 {
-    INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, EDGE* pRastEdges, double* pStartQuadEdges,  uint64_t &innerCoverageMask)
+    INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, uint32_t workerId, EDGE* pRastEdges, double* pStartQuadEdges,  uint64_t &innerCoverageMask)
     {
+        SWR_CONTEXT *pContext = pDC->pContext;
+
         double startQuadEdgesAdj[RT::NumEdgesT::value];
         for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
         {
@@ -777,9 +779,9 @@ struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
         }
 
         // not trivial accept or reject, must rasterize full tile
-        RDTSC_START(BERasterizePartial);
+        AR_BEGIN(BERasterizePartial, pDC->drawId);
         innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges);
-        RDTSC_STOP(BERasterizePartial, 0, 0);
+        AR_END(BERasterizePartial, 0);
     }
 };
 
@@ -835,6 +837,7 @@ struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCov
 template <typename RT>
 void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
 {
+    SWR_CONTEXT *pContext = pDC->pContext;
     const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
 #if KNOB_ENABLE_TOSS_POINTS
     if (KNOB_TOSS_BIN_TRIS)
@@ -842,9 +845,9 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
         return;
     }
 #endif
-    RDTSC_START(BERasterizeTriangle);
+    AR_BEGIN(BERasterizeTriangle, pDC->drawId);
+    AR_BEGIN(BETriangleSetup, pDC->drawId);
 
-    RDTSC_START(BETriangleSetup);
     const API_STATE &state = GetApiState(pDC);
     const SWR_RASTSTATE &rastState = state.rastState;
     const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
@@ -1009,7 +1012,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
 
     SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0);
 
-    RDTSC_STOP(BETriangleSetup, 0, pDC->drawId);
+    AR_END(BETriangleSetup, 0);
 
     // update triangle desc
     uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
@@ -1022,11 +1025,11 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
     if (numTilesX == 0 || numTilesY == 0) 
     {
         RDTSC_EVENT(BEEmptyTriangle, 1, 0);
-        RDTSC_STOP(BERasterizeTriangle, 1, 0);
+        AR_END(BERasterizeTriangle, 1);
         return;
     }
 
-    RDTSC_START(BEStepSetup);
+    AR_BEGIN(BEStepSetup, pDC->drawId);
 
     // Step to pixel center of top-left pixel of the triangle bbox
     // Align intersect bbox (top/left) to raster tile's (top/left).
@@ -1134,7 +1137,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
         }
     }
 
-    RDTSC_STOP(BEStepSetup, 0, pDC->drawId);
+    AR_END(BEStepSetup, 0);
 
     uint32_t tY = minTileY;
     uint32_t tX = minTileX;
@@ -1226,14 +1229,14 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
                         }
 
                         // not trivial accept or reject, must rasterize full tile
-                        RDTSC_START(BERasterizePartial);
+                        AR_BEGIN(BERasterizePartial, pDC->drawId);
                         triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges);
-                        RDTSC_STOP(BERasterizePartial, 0, 0);
+                        AR_END(BERasterizePartial, 0);
 
                         triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum]; 
                         
                         // Output SV InnerCoverage, if needed
-                        GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
+                        GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
                     }
                 }
                 else
@@ -1264,9 +1267,9 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
                     UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
                 }
 
-                RDTSC_START(BEPixelBackend);
+                AR_BEGIN(BEPixelBackend, pDC->drawId);
                 backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
-                RDTSC_STOP(BEPixelBackend, 0, 0);
+                AR_END(BEPixelBackend, 0);
             }
 
             // step to the next tile in X
@@ -1285,7 +1288,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
         StepRasterTileY<RT>(state.psState.numRenderTargets, renderBuffers, currentRenderBufferRow);
     }
 
-    RDTSC_STOP(BERasterizeTriangle, 1, 0);
+    AR_END(BERasterizeTriangle, 1);
 }
 
 void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
@@ -1420,6 +1423,8 @@ void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile,
 
 void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
 {
+    SWR_CONTEXT *pContext = pDC->pContext;
+
 #if KNOB_ENABLE_TOSS_POINTS
     if (KNOB_TOSS_BIN_TRIS)
     {
@@ -1475,9 +1480,9 @@ void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTi
     GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT, 
         renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
 
-    RDTSC_START(BEPixelBackend);
+    AR_BEGIN(BEPixelBackend, pDC->drawId);
     backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
-    RDTSC_STOP(BEPixelBackend, 0, 0);
+    AR_END(BEPixelBackend, 0);
 }
 
 // Get pointers to hot tile memory for color RT, depth, stencil
@@ -1561,6 +1566,7 @@ INLINE void StepRasterTileY(uint32_t NumRT, RenderOutputBuffers &buffers, Render
 
 void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
 {
+    SWR_CONTEXT *pContext = pDC->pContext;
     const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData);
 #if KNOB_ENABLE_TOSS_POINTS
     if (KNOB_TOSS_BIN_TRIS)
@@ -1570,7 +1576,7 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
 #endif
 
     // bloat line to two tris and call the triangle rasterizer twice
-    RDTSC_START(BERasterizeLine);
+    AR_BEGIN(BERasterizeLine, pDC->drawId);
 
     const API_STATE &state = GetApiState(pDC);
     const SWR_RASTSTATE &rastState = state.rastState;
@@ -1763,7 +1769,7 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
         pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
     }
 
-    RDTSC_STOP(BERasterizeLine, 1, 0);
+    AR_END(BERasterizeLine, 1);
 }
 
 struct RasterizerChooser
index 24e7812..446e795 100644 (file)
@@ -501,7 +501,7 @@ void WorkOnFifoBE(
             {
                 BE_WORK *pWork;
 
-                RDTSC_START(WorkerFoundWork);
+                AR_BEGIN(WorkerFoundWork, pDC->drawId);
 
                 uint32_t numWorkItems = tile->getNumQueued();
                 SWR_ASSERT(numWorkItems);
@@ -510,7 +510,7 @@ void WorkOnFifoBE(
                 SWR_ASSERT(pWork);
                 if (pWork->type == DRAW)
                 {
-                    pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID);
+                    pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, workerId, tileID);
                 }
 
                 while ((pWork = tile->peek()) != nullptr)
@@ -518,7 +518,7 @@ void WorkOnFifoBE(
                     pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
                     tile->dequeue();
                 }
-                RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId);
+                AR_END(WorkerFoundWork, numWorkItems);
 
                 _ReadWriteBarrier();
 
@@ -735,12 +735,12 @@ DWORD workerThreadMain(LPVOID pData)
                 break;
             }
 
-            RDTSC_START(WorkerWaitForThreadEvent);
+            AR_BEGIN(WorkerWaitForThreadEvent, 0);
 
             pContext->FifosNotEmpty.wait(lock);
             lock.unlock();
 
-            RDTSC_STOP(WorkerWaitForThreadEvent, 0, 0);
+            AR_END(WorkerWaitForThreadEvent, 0);
 
             if (pContext->threadPool.inThreadShutdown)
             {
@@ -750,9 +750,9 @@ DWORD workerThreadMain(LPVOID pData)
 
         if (IsBEThread)
         {
-            RDTSC_START(WorkerWorkOnFifoBE);
+            AR_BEGIN(WorkerWorkOnFifoBE, 0);
             WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
-            RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
+            AR_END(WorkerWorkOnFifoBE, 0);
 
             WorkOnCompute(pContext, workerId, curDrawBE);
         }
index 1bd1805..bd189ab 100644 (file)
@@ -281,7 +281,7 @@ void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
 /// to avoid unnecessary setup every triangle
 /// @todo support deferred clear
 /// @param pCreateInfo - pointer to creation info.
-void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID)
+void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroID)
 {
     const API_STATE& state = GetApiState(pDC);
 
@@ -301,19 +301,19 @@ void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, ui
 
         if (pHotTile->state == HOTTILE_INVALID)
         {
-            RDTSC_START(BELoadTiles);
+            AR_BEGIN(BELoadTiles, pDC->drawId);
             // invalid hottile before draw requires a load from surface before we can draw to it
             pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
             pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_STOP(BELoadTiles, 0, 0);
+            AR_END(BELoadTiles, 0);
         }
         else if (pHotTile->state == HOTTILE_CLEAR)
         {
-            RDTSC_START(BELoadTiles);
+            AR_BEGIN(BELoadTiles, pDC->drawId);
             // Clear the tile.
             ClearColorHotTile(pHotTile);
             pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_STOP(BELoadTiles, 0, 0);
+            AR_END(BELoadTiles, 0);
         }
         colorHottileEnableMask &= ~(1 << rtSlot);
     }
@@ -324,19 +324,19 @@ void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, ui
         HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
         if (pHotTile->state == HOTTILE_INVALID)
         {
-            RDTSC_START(BELoadTiles);
+            AR_BEGIN(BELoadTiles, pDC->drawId);
             // invalid hottile before draw requires a load from surface before we can draw to it
             pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
             pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_STOP(BELoadTiles, 0, 0);
+            AR_END(BELoadTiles, 0);
         }
         else if (pHotTile->state == HOTTILE_CLEAR)
         {
-            RDTSC_START(BELoadTiles);
+            AR_BEGIN(BELoadTiles, pDC->drawId);
             // Clear the tile.
             ClearDepthHotTile(pHotTile);
             pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_STOP(BELoadTiles, 0, 0);
+            AR_END(BELoadTiles, 0);
         }
     }
 
@@ -346,19 +346,19 @@ void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, ui
         HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
         if (pHotTile->state == HOTTILE_INVALID)
         {
-            RDTSC_START(BELoadTiles);
+            AR_BEGIN(BELoadTiles, pDC->drawId);
             // invalid hottile before draw requires a load from surface before we can draw to it
             pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
             pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_STOP(BELoadTiles, 0, 0);
+            AR_END(BELoadTiles, 0);
         }
         else if (pHotTile->state == HOTTILE_CLEAR)
         {
-            RDTSC_START(BELoadTiles);
+            AR_BEGIN(BELoadTiles, pDC->drawId);
             // Clear the tile.
             ClearStencilHotTile(pHotTile);
             pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_STOP(BELoadTiles, 0, 0);
+            AR_END(BELoadTiles, 0);
         }
     }
 }
index 4ec0283..2befe97 100644 (file)
@@ -291,7 +291,7 @@ public:
         }
     }
 
-    void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID);
+    void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroID);
 
     HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1,
         uint32_t renderTargetArrayIndex = 0);