swr: [rasterizer core] Better thread destruction
authorTim Rowley <timothy.o.rowley@intel.com>
Mon, 12 Sep 2016 18:08:12 +0000 (13:08 -0500)
committerTim Rowley <timothy.o.rowley@intel.com>
Tue, 20 Sep 2016 01:10:19 +0000 (20:10 -0500)
Signed-off-by: Tim Rowley <timothy.o.rowley@intel.com>
src/gallium/drivers/swr/rasterizer/core/api.cpp
src/gallium/drivers/swr/rasterizer/core/backend.cpp
src/gallium/drivers/swr/rasterizer/core/backend.h
src/gallium/drivers/swr/rasterizer/core/context.h
src/gallium/drivers/swr/rasterizer/core/frontend.cpp
src/gallium/drivers/swr/rasterizer/core/frontend.h
src/gallium/drivers/swr/rasterizer/core/threads.cpp
src/gallium/drivers/swr/rasterizer/core/threads.h

index df87d14..703f239 100644 (file)
@@ -157,46 +157,6 @@ HANDLE SwrCreateContext(
     return (HANDLE)pContext;
 }
 
-void SwrDestroyContext(HANDLE hContext)
-{
-    SWR_CONTEXT *pContext = GetContext(hContext);
-    DestroyThreadPool(pContext, &pContext->threadPool);
-
-    // free the fifos
-    for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
-    {
-        delete [] pContext->dcRing[i].dynState.pStats;
-        delete pContext->dcRing[i].pArena;
-        delete pContext->dsRing[i].pArena;
-        pContext->pMacroTileManagerArray[i].~MacroTileMgr();
-        pContext->pDispatchQueueArray[i].~DispatchQueue();
-    }
-
-    AlignedFree(pContext->pDispatchQueueArray);
-    AlignedFree(pContext->pMacroTileManagerArray);
-
-    // Free scratch space.
-    for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
-    {
-#if defined(_WIN32)
-        VirtualFree(pContext->ppScratch[i], 0, MEM_RELEASE);
-#else
-        AlignedFree(pContext->ppScratch[i]);
-#endif
-
-        ArchRast::DestroyThreadContext(pContext->pArContext[i]);
-    }
-
-    delete [] pContext->ppScratch;
-    delete [] pContext->pArContext;
-    delete [] pContext->pStats;
-
-    delete(pContext->pHotTileMgr);
-
-    pContext->~SWR_CONTEXT();
-    AlignedFree(GetContext(hContext));
-}
-
 void CopyState(DRAW_STATE& dst, const DRAW_STATE& src)
 {
     memcpy(&dst.state, &src.state, sizeof(API_STATE));
@@ -382,6 +342,54 @@ API_STATE* GetDrawState(SWR_CONTEXT *pContext)
     return &pDC->pState->state;
 }
 
+void SwrDestroyContext(HANDLE hContext)
+{
+    SWR_CONTEXT *pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
+    pDC->FeWork.type = SHUTDOWN;
+    pDC->FeWork.pfnWork = ProcessShutdown;
+
+    //enqueue
+    QueueDraw(pContext);
+
+    DestroyThreadPool(pContext, &pContext->threadPool);
+
+    // free the fifos
+    for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
+    {
+        delete[] pContext->dcRing[i].dynState.pStats;
+        delete pContext->dcRing[i].pArena;
+        delete pContext->dsRing[i].pArena;
+        pContext->pMacroTileManagerArray[i].~MacroTileMgr();
+        pContext->pDispatchQueueArray[i].~DispatchQueue();
+    }
+
+    AlignedFree(pContext->pDispatchQueueArray);
+    AlignedFree(pContext->pMacroTileManagerArray);
+
+    // Free scratch space.
+    for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
+    {
+#if defined(_WIN32)
+        VirtualFree(pContext->ppScratch[i], 0, MEM_RELEASE);
+#else
+        AlignedFree(pContext->ppScratch[i]);
+#endif
+
+        ArchRast::DestroyThreadContext(pContext->pArContext[i]);
+    }
+
+    delete[] pContext->ppScratch;
+    delete[] pContext->pArContext;
+    delete[] pContext->pStats;
+
+    delete(pContext->pHotTileMgr);
+
+    pContext->~SWR_CONTEXT();
+    AlignedFree(GetContext(hContext));
+}
+
 void SWR_API SwrSaveState(
     HANDLE hContext,
     void* pOutputStateBlock,
index d3d114e..0a0001d 100644 (file)
@@ -78,6 +78,16 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup
     AR_END(BEDispatch, 1);
 }
 
+//////////////////////////////////////////////////////////////////////////
+/// @brief Process shutdown.
+/// @param pDC - pointer to draw context (dispatch).
+/// @param workerId - The unique worker ID that is assigned to this thread.
+/// @param threadGroupId - the linear index for the thread group within the dispatch.
+void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
+{
+    // Dummy function
+}
+
 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
 {
     uint32_t x, y;
index 9d2f317..e19a53d 100644 (file)
@@ -38,6 +38,7 @@ void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
+void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers);
 void InitClearTilesTable();
 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ);
index a4dbbc5..dfcc1c0 100644 (file)
@@ -158,6 +158,7 @@ enum WORK_TYPE
     CLEAR,
     DISCARDINVALIDATETILES,
     STORETILES,
+    SHUTDOWN,
 };
 
 struct BE_WORK
index decc161..5d54987 100644 (file)
@@ -81,6 +81,36 @@ void ProcessSync(
 }
 
 //////////////////////////////////////////////////////////////////////////
+/// @brief FE handler for SwrDestroyContext.
+/// @param pContext - pointer to SWR context.
+/// @param pDC - pointer to draw context.
+/// @param workerId - thread's worker id. Even thread has a unique id.
+/// @param pUserData - Pointer to user data passed back to sync callback.
+void ProcessShutdown(
+    SWR_CONTEXT *pContext,
+    DRAW_CONTEXT *pDC,
+    uint32_t workerId,
+    void *pUserData)
+{
+    BE_WORK work;
+    work.type = SHUTDOWN;
+    work.pfnWork = ProcessShutdownBE;
+
+    MacroTileMgr *pTileMgr = pDC->pTileMgr;
+    // Enqueue at least 1 work item for each worker thread
+    // account for number of numa nodes
+    uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
+
+    for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
+    {
+        for (uint32_t n = 0; n < numNumaNodes; ++n)
+        {
+            pTileMgr->enqueue(i, n, &work);
+        }
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
 /// @brief FE handler for SwrClearRenderTarget.
 /// @param pContext - pointer to SWR context.
 /// @param pDC - pointer to draw context.
index 6316156..4692494 100644 (file)
@@ -304,6 +304,7 @@ void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, v
 void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
 void ProcessDiscardInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
 void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+void ProcessShutdown(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
 
 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative);
 
index 446e795..b1a27f3 100644 (file)
@@ -428,7 +428,8 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint32_t& curDrawBE,
 ///                      still have work pending in a previous draw. Additionally, the lockedTiles is
 ///                      hueristic that can steer a worker back to the same macrotile that it had been
 ///                      working on in a previous draw.
-void WorkOnFifoBE(
+/// @returns        true if worker thread should shutdown
+bool WorkOnFifoBE(
     SWR_CONTEXT *pContext,
     uint32_t workerId,
     uint32_t &curDrawBE,
@@ -436,12 +437,14 @@ void WorkOnFifoBE(
     uint32_t numaNode,
     uint32_t numaMask)
 {
+    bool bShutdown = false;
+
     // Find the first incomplete draw that has pending work. If no such draw is found then
     // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
     uint32_t drawEnqueued = 0;
     if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false)
     {
-        return;
+        return false;
     }
 
     uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
@@ -458,17 +461,17 @@ void WorkOnFifoBE(
     {
         DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];
 
-        if (pDC->isCompute) return; // We don't look at compute work.
+        if (pDC->isCompute) return false; // We don't look at compute work.
 
         // First wait for FE to be finished with this draw. This keeps threading model simple
         // but if there are lots of bubbles between draws then serializing FE and BE may
         // need to be revisited.
-        if (!pDC->doneFE) return;
+        if (!pDC->doneFE) return false;
         
         // If this draw is dependent on a previous draw then we need to bail.
         if (CheckDependency(pContext, pDC, lastRetiredDraw))
         {
-            return;
+            return false;
         }
 
         // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
@@ -512,6 +515,10 @@ void WorkOnFifoBE(
                 {
                     pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, workerId, tileID);
                 }
+                else if (pWork->type == SHUTDOWN)
+                {
+                    bShutdown = true;
+                }
 
                 while ((pWork = tile->peek()) != nullptr)
                 {
@@ -526,7 +533,7 @@ void WorkOnFifoBE(
 
                 // Optimization: If the draw is complete and we're the last one to have worked on it then
                 // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
-                if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete())
+                if ((curDrawBE == i) && (bShutdown || pDC->pTileMgr->isWorkComplete()))
                 {
                     // We can increment the current BE and safely move to next draw since we know this draw is complete.
                     curDrawBE++;
@@ -537,6 +544,11 @@ void WorkOnFifoBE(
                     lockedTiles.clear();
                     break;
                 }
+
+                if (bShutdown)
+                {
+                    break;
+                }
             }
             else
             {
@@ -545,6 +557,8 @@ void WorkOnFifoBE(
             }
         }
     }
+
+    return bShutdown;
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -710,8 +724,15 @@ DWORD workerThreadMain(LPVOID pData)
     uint32_t curDrawBE = 0;
     uint32_t curDrawFE = 0;
 
-    while (pContext->threadPool.inThreadShutdown == false)
+    bool bShutdown = false;
+
+    while (true)
     {
+        if (bShutdown && !threadHasWork(curDrawBE))
+        {
+            break;
+        }
+
         uint32_t loop = 0;
         while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
         {
@@ -729,29 +750,18 @@ DWORD workerThreadMain(LPVOID pData)
                 continue;
             }
 
-            if (pContext->threadPool.inThreadShutdown)
-            {
-                lock.unlock();
-                break;
-            }
-
             AR_BEGIN(WorkerWaitForThreadEvent, 0);
 
             pContext->FifosNotEmpty.wait(lock);
             lock.unlock();
 
             AR_END(WorkerWaitForThreadEvent, 0);
-
-            if (pContext->threadPool.inThreadShutdown)
-            {
-                break;
-            }
         }
 
         if (IsBEThread)
         {
             AR_BEGIN(WorkerWorkOnFifoBE, 0);
-            WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
+            bShutdown |= WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
             AR_END(WorkerWorkOnFifoBE, 0);
 
             WorkOnCompute(pContext, workerId, curDrawBE);
@@ -918,7 +928,6 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
     pPool->numThreads = numThreads;
     pContext->NumWorkerThreads = pPool->numThreads;
 
-    pPool->inThreadShutdown = false;
     pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
     pPool->numaMask = 0;
 
@@ -1001,17 +1010,15 @@ void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
 {
     if (!pContext->threadInfo.SINGLE_THREADED)
     {
-        // Inform threads to finish up
-        std::unique_lock<std::mutex> lock(pContext->WaitLock);
-        pPool->inThreadShutdown = true;
-        _mm_mfence();
-        pContext->FifosNotEmpty.notify_all();
-        lock.unlock();
+        // Wait for all threads to finish
+        SwrWaitForIdle(pContext);
 
         // Wait for threads to finish and destroy them
         for (uint32_t t = 0; t < pPool->numThreads; ++t)
         {
-            pPool->pThreads[t]->join();
+            // Detach from thread.  Cannot join() due to possibility (in Windows) of code
+            // in some DLLMain(THREAD_DETATCH case) blocking the thread until after this returns.
+            pPool->pThreads[t]->detach();
             delete(pPool->pThreads[t]);
         }
 
index 05231c5..c802c57 100644 (file)
@@ -54,7 +54,6 @@ struct THREAD_POOL
     THREAD_PTR* pThreads;
     uint32_t numThreads;
     uint32_t numaMask;
-    volatile bool inThreadShutdown;
     THREAD_DATA *pThreadData;
 };
 
@@ -65,6 +64,6 @@ void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
 
 // Expose FE and BE worker functions to the API thread if single threaded
 void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE);
-void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
+bool WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
 void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE);
 int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);