From 45f0ce168ce21a7a95f48d3164e42a947732b896 Mon Sep 17 00:00:00 2001
From: Tim Rowley
Date: Fri, 19 Feb 2016 17:55:23 -0600
Subject: [PATCH] swr: [rasterizer core] RingBuffer class for DC/DS

Use head/tail ring buffer indices for thread synchronization.

1. SwrWaitForIdle loops until the ring is empty (head == tail).
2. GetDrawContext waits while the ring is full ((head - tail) == ring size).
3. Draw enqueues by incrementing head.
4. The last worker thread to move past a DC dequeues by incrementing tail.

Todo: To reduce contention we can cache the tail in the API thread. For
example, if you know you have 64 free entries in the ring then you don't
need to keep checking the tail until you've used those 64 entries.
---
 src/gallium/drivers/swr/Makefile.sources-arch      |   1 +
 src/gallium/drivers/swr/rasterizer/common/os.h     |   1 +
 src/gallium/drivers/swr/rasterizer/core/api.cpp    | 153 +++++++--------------
 src/gallium/drivers/swr/rasterizer/core/context.h  |  19 +--
 .../drivers/swr/rasterizer/core/ringbuffer.h       | 102 ++++++++++++++
 .../drivers/swr/rasterizer/core/threads.cpp        |  40 +++---
 6 files changed, 180 insertions(+), 136 deletions(-)
 create mode 100644 src/gallium/drivers/swr/rasterizer/core/ringbuffer.h

diff --git a/src/gallium/drivers/swr/Makefile.sources-arch b/src/gallium/drivers/swr/Makefile.sources-arch
index 6c105f4..7544f8e 100644
--- a/src/gallium/drivers/swr/Makefile.sources-arch
+++ b/src/gallium/drivers/swr/Makefile.sources-arch
@@ -83,6 +83,7 @@ CORE_CXX_SOURCES := \
 	rasterizer/core/rasterizer.h \
 	rasterizer/core/rdtsc_core.cpp \
 	rasterizer/core/rdtsc_core.h \
+	rasterizer/core/ringbuffer.h \
 	rasterizer/core/state.h \
 	rasterizer/core/threads.cpp \
 	rasterizer/core/threads.h \
diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
index 522ae0d..265b879 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -192,6 +192,7 @@ unsigned int _mm_popcnt_u32(unsigned int v)
 #define InterlockedCompareExchange(Dest, Exchange, Comparand) __sync_val_compare_and_swap(Dest, Comparand, Exchange)
 #define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value)
 #define InterlockedDecrement(Append) __sync_sub_and_fetch(Append, 1)
+#define InterlockedDecrement64(Append) __sync_sub_and_fetch(Append, 1)
 #define InterlockedIncrement(Append) __sync_add_and_fetch(Append, 1)
 #define _ReadWriteBarrier() asm volatile("" ::: "memory")
 #define __stdcall
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index c70b4fa..e18f9e7 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -61,11 +61,8 @@ HANDLE SwrCreateContext(
     pContext->driverType = pCreateInfo->driver;
     pContext->privateStateSize = pCreateInfo->privateStateSize;
 
-    pContext->dcRing = (DRAW_CONTEXT*)_aligned_malloc(sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT, 64);
-    memset(pContext->dcRing, 0, sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT);
-
-    pContext->dsRing = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT, 64);
-    memset(pContext->dsRing, 0, sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT);
+    pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
+    pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
 
     pContext->numSubContexts = pCreateInfo->maxSubContexts;
     if (pContext->numSubContexts > 1)
@@ -77,7 +74,6 @@ HANDLE SwrCreateContext(
     for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
     {
         pContext->dcRing[dc].pArena = new Arena();
-        pContext->dcRing[dc].inUse = false;
         pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
         pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
 
@@ -108,9 +104,6 @@ HANDLE SwrCreateContext(
         pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4);
     }
 
-    pContext->nextDrawId = 1;
-    pContext->DrawEnqueued = 1;
-
     // State setup AFTER context is fully initialized
     SetupDefaultState(pContext);
 
@@ -148,8 +141,6 @@ void SwrDestroyContext(HANDLE hContext)
         _aligned_free(pContext->pScratch[i]);
     }
 
-    _aligned_free(pContext->dcRing);
-    _aligned_free(pContext->dsRing);
     _aligned_free(pContext->subCtxSave);
 
     delete(pContext->pHotTileMgr);
@@ -168,49 +159,28 @@ void WakeAllThreads(SWR_CONTEXT *pContext)
     pContext->FifosNotEmpty.notify_all();
 }
 
-bool StillDrawing(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC)
+template<bool IsDraw>
+void QueueWork(SWR_CONTEXT *pContext)
 {
-    // For single thread nothing should still be drawing.
-    if (KNOB_SINGLE_THREADED) { return false; }
-
-    if (pDC->isCompute)
+    if (IsDraw)
     {
-        if (pDC->doneCompute)
-        {
-            pDC->inUse = false;
-            return false;
-        }
+        // Each worker thread looks at a DC for both FE and BE work at different times and so we
+        // multiply threadsDone by 2. When the threadsDone counter has reached 0 then all workers
+        // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
+        // then moved on if all work is done.)
+        pContext->pCurDrawContext->threadsDone =
+            pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2;
     }
-
-    // Check if backend work is done. First make sure all triangles have been binned.
-    if (pDC->doneFE == true)
+    else
    {
-        // ensure workers have all moved passed this draw
-        if (pDC->threadsDoneFE != pContext->NumWorkerThreads)
-        {
-            return true;
-        }
-
-        if (pDC->threadsDoneBE != pContext->NumWorkerThreads)
-        {
-            return true;
-        }
-
-        pDC->inUse = false;    // all work is done.
+        pContext->pCurDrawContext->threadsDone =
+            pContext->NumWorkerThreads ? pContext->NumWorkerThreads : 1;
     }
 
-    return pDC->inUse;
-}
-
-void QueueDraw(SWR_CONTEXT *pContext)
-{
-    SWR_ASSERT(pContext->pCurDrawContext->inUse == false);
-    pContext->pCurDrawContext->inUse = true;
-
     _ReadWriteBarrier();
     {
         std::unique_lock<std::mutex> lock(pContext->WaitLock);
-        pContext->DrawEnqueued++;
+        pContext->dcRing.Enqueue();
     }
 
     if (KNOB_SINGLE_THREADED)
@@ -219,10 +189,24 @@ void QueueDraw(SWR_CONTEXT *pContext)
         uint32_t mxcsr = _mm_getcsr();
         _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
 
-        std::unordered_set<uint32_t> lockedTiles;
-        uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
-        WorkOnFifoFE(pContext, 0, curDraw[0], 0);
-        WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles);
+        if (IsDraw)
+        {
+            std::unordered_set<uint32_t> lockedTiles;
+            uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
+            WorkOnFifoFE(pContext, 0, curDraw[0], 0);
+            WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles);
+        }
+        else
+        {
+            uint64_t curDispatch = pContext->pCurDrawContext->drawId;
+            WorkOnCompute(pContext, 0, curDispatch);
+        }
+
+        // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
+        if (!pContext->dcRing.IsEmpty())
+        {
+            pContext->dcRing.Dequeue();
+        }
 
         // restore csr
         _mm_setcsr(mxcsr);
@@ -239,40 +223,14 @@ void QueueDraw(SWR_CONTEXT *pContext)
     pContext->pCurDrawContext = nullptr;
 }
 
-///@todo Combine this with QueueDraw
-void QueueDispatch(SWR_CONTEXT *pContext)
+INLINE void QueueDraw(SWR_CONTEXT* pContext)
 {
-    SWR_ASSERT(pContext->pCurDrawContext->inUse == false);
-    pContext->pCurDrawContext->inUse = true;
-
-    _ReadWriteBarrier();
-    {
-        std::unique_lock<std::mutex> lock(pContext->WaitLock);
-        pContext->DrawEnqueued++;
-    }
-
-    if (KNOB_SINGLE_THREADED)
-    {
-        // flush denormals to 0
-        uint32_t mxcsr = _mm_getcsr();
-        _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
-
-        uint64_t curDispatch = pContext->pCurDrawContext->drawId;
-        WorkOnCompute(pContext, 0, curDispatch);
-
-        // restore csr
-        _mm_setcsr(mxcsr);
-    }
-    else
-    {
-        RDTSC_START(APIDrawWakeAllThreads);
-        WakeAllThreads(pContext);
-        RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
-    }
+    QueueWork<true>(pContext);
+}
 
-    // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
-    pContext->pPrevDrawContext = pContext->pCurDrawContext;
-    pContext->pCurDrawContext = nullptr;
+INLINE void QueueDispatch(SWR_CONTEXT* pContext)
+{
+    QueueWork<false>(pContext);
 }
 
 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
@@ -281,17 +239,17 @@
     // If current draw context is null then need to obtain a new draw context to use from ring.
     if (pContext->pCurDrawContext == nullptr)
     {
-        uint32_t dcIndex = pContext->nextDrawId % KNOB_MAX_DRAWS_IN_FLIGHT;
-
-        DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
-        pContext->pCurDrawContext = pCurDrawContext;
-
-        // Need to wait until this draw context is available to use.
-        while (StillDrawing(pContext, pCurDrawContext))
+        // Need to wait for a free entry.
+        while (pContext->dcRing.IsFull())
         {
            _mm_pause();
        }
 
+        uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT;
+
+        DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
+        pContext->pCurDrawContext = pCurDrawContext;
+
         // Assign next available entry in DS ring to this DC.
         uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
         pCurDrawContext->pState = &pContext->dsRing[dsIndex];
@@ -332,18 +290,15 @@
         pCurDrawContext->pArena->Reset();
         pCurDrawContext->pContext = pContext;
         pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
-        pCurDrawContext->inUse = false;
-        pCurDrawContext->doneCompute = false;
         pCurDrawContext->doneFE = false;
         pCurDrawContext->FeLock = 0;
-        pCurDrawContext->threadsDoneFE = 0;
-        pCurDrawContext->threadsDoneBE = 0;
+        pCurDrawContext->threadsDone = 0;
 
         pCurDrawContext->pTileMgr->initialize();
 
         // Assign unique drawId for this DC
-        pCurDrawContext->drawId = pContext->nextDrawId++;
+        pCurDrawContext->drawId = pContext->dcRing.GetHead();
     }
     else
     {
@@ -431,16 +386,12 @@ void SwrWaitForIdle(HANDLE hContext)
     SWR_CONTEXT *pContext = GetContext(hContext);
     RDTSC_START(APIWaitForIdle);
 
-    // Wait for all work to complete.
-    for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
-    {
-        DRAW_CONTEXT *pDC = &pContext->dcRing[dc];
-
-        while (StillDrawing(pContext, pDC))
-        {
-            _mm_pause();
-        }
+    while (!pContext->dcRing.IsEmpty())
+    {
+        _mm_pause();
     }
+
     RDTSC_STOP(APIWaitForIdle, 1, 0);
 }
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index 4a214af..d75d975 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -41,6 +41,7 @@
 #include "core/knobs.h"
 #include "common/simdintrin.h"
 #include "core/threads.h"
+#include "ringbuffer.h"
 
 // x.8 fixed point precision values
 #define FIXED_POINT_SHIFT 8
@@ -381,19 +382,14 @@ struct DRAW_CONTEXT
     FE_WORK FeWork;
     volatile OSALIGNLINE(uint32_t) FeLock;
-    volatile OSALIGNLINE(bool) inUse;
     volatile OSALIGNLINE(bool) doneFE;    // Is FE work done for this draw?
-
-    // Have all worker threads moved past draw in DC ring?
-    volatile OSALIGNLINE(uint32_t) threadsDoneFE;
-    volatile OSALIGNLINE(uint32_t) threadsDoneBE;
+    volatile OSALIGNLINE(int64_t) threadsDone;
 
     uint64_t dependency;
 
     MacroTileMgr* pTileMgr;
 
     // The following fields are valid if isCompute is true.
-    volatile OSALIGNLINE(bool) doneCompute;    // Is this dispatch done? (isCompute)
     DispatchQueue* pDispatch;    // Queue for thread groups. (isCompute)
 
     DRAW_STATE* pState;
@@ -438,7 +434,7 @@ struct SWR_CONTEXT
     // 3. State - When an applications sets state after draw
     //    a. Same as step 1.
     //    b. State is copied from prev draw context to current.
-    DRAW_CONTEXT* dcRing;
+    RingBuffer<DRAW_CONTEXT> dcRing;
 
     DRAW_CONTEXT *pCurDrawContext;    // This points to DC entry in ring for an unsubmitted draw.
     DRAW_CONTEXT *pPrevDrawContext;   // This points to DC entry for the previous context submitted that we can copy state from.
@@ -448,7 +444,7 @@ struct SWR_CONTEXT
     // These split draws all have identical state. So instead of storing the state directly
     // in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
     // to reference a single entry in the DS ring.
-    DRAW_STATE* dsRing;
+    RingBuffer<DRAW_STATE> dsRing;
 
     uint32_t curStateId;               // Current index to the next available entry in the DS ring.
@@ -463,13 +459,6 @@ struct SWR_CONTEXT
     std::condition_variable FifosNotEmpty;
     std::mutex WaitLock;
 
-    // Draw Contexts will get a unique drawId generated from this
-    uint64_t nextDrawId;
-
-    // most recent draw id enqueued by the API thread
-    // written by api thread, read by multiple workers
-    OSALIGNLINE(volatile uint64_t) DrawEnqueued;
-
     DRIVER_TYPE driverType;
 
     uint32_t privateStateSize;
diff --git a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
new file mode 100644
index 0000000..e323136
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
@@ -0,0 +1,102 @@
+/****************************************************************************
+* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file ringbuffer.h
+*
+* @brief RingBuffer
+*        The RingBuffer class manages all aspects of the ring buffer including
+*        the head/tail indices, etc.
+*
+******************************************************************************/
+#pragma once
+
+template<typename T>
+class RingBuffer
+{
+public:
+    RingBuffer()
+        : mpRingBuffer(nullptr), mNumEntries(0), mRingHead(0), mRingTail(0)
+    {
+    }
+
+    ~RingBuffer()
+    {
+        Destroy();
+    }
+
+    void Init(uint32_t numEntries)
+    {
+        SWR_ASSERT(numEntries > 0);
+        mNumEntries = numEntries;
+        mpRingBuffer = (T*)_aligned_malloc(sizeof(T)*numEntries, 64);
+        SWR_ASSERT(mpRingBuffer != nullptr);
+        memset(mpRingBuffer, 0, sizeof(T)*numEntries);
+    }
+
+    void Destroy()
+    {
+        _aligned_free(mpRingBuffer);
+        mpRingBuffer = nullptr;
+    }
+
+    T& operator[](const uint32_t index)
+    {
+        SWR_ASSERT(index < mNumEntries);
+        return mpRingBuffer[index];
+    }
+
+    INLINE void Enqueue()
+    {
+        mRingHead++;  // There's only one producer.
+    }
+
+    INLINE void Dequeue()
+    {
+        InterlockedIncrement(&mRingTail);  // There are multiple consumers.
+    }
+
+    INLINE bool IsEmpty()
+    {
+        return (GetHead() == GetTail());
+    }
+
+    INLINE bool IsFull()
+    {
+        ///@note We don't handle the wrap case due to using 64-bit indices.
+        ///      It would take 11 million years to wrap at 50,000 DCs per sec.
+        ///      If we used 32-bit indices then it's about 23 hours to wrap.
+        uint64_t numEnqueued = GetHead() - GetTail();
+        SWR_ASSERT(numEnqueued <= mNumEntries);
+
+        return (numEnqueued == mNumEntries);
+    }
+
+    INLINE volatile uint64_t GetTail() { return mRingTail; }
+    INLINE volatile uint64_t GetHead() { return mRingHead; }
+
+private:
+    T* mpRingBuffer;
+    uint32_t mNumEntries;
+
+    OSALIGNLINE(volatile uint64_t) mRingHead;  // Producer Counter
+    OSALIGNLINE(volatile uint64_t) mRingTail;  // Consumer Counter
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 24c5588..8f0d924 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -265,9 +265,7 @@ void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=
 INLINE
 uint64_t GetEnqueuedDraw(SWR_CONTEXT *pContext)
 {
-    //uint64_t result = _InterlockedCompareExchange64((volatile __int64*)&pContext->DrawEnqueued, 0, 0);
-    //return result;
-    return pContext->DrawEnqueued;
+    return pContext->dcRing.GetHead();
 }
 
 INLINE
@@ -449,6 +447,18 @@ void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macro
     }
 }
 
+INLINE void CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
+{
+    int64_t result = InterlockedDecrement64(&pDC->threadsDone);
+
+    if (result == 0)
+    {
+        _ReadWriteBarrier();
+
+        pContext->dcRing.Dequeue();  // Remove from tail
+    }
+}
+
 INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
 {
     // increment our current draw id to the first incomplete draw
@@ -466,7 +476,7 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
         if (isWorkComplete)
         {
             curDrawBE++;
-            InterlockedIncrement(&pDC->threadsDoneBE);
+            CompleteDrawContext(pContext, pDC);
         }
         else
         {
@@ -579,7 +589,7 @@ void WorkOnFifoBE(
         {
             // We can increment the current BE and safely move to next draw since we know this draw is complete.
             curDrawBE++;
-            InterlockedIncrement(&pDC->threadsDoneBE);
+            CompleteDrawContext(pContext, pDC);
 
             lastRetiredDraw++;
 
@@ -608,8 +618,8 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE,
         DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
         if (pDC->isCompute || pDC->doneFE || pDC->FeLock)
         {
+            CompleteDrawContext(pContext, pDC);
             curDrawFE++;
-            InterlockedIncrement(&pDC->threadsDoneFE);
         }
         else
         {
@@ -673,22 +683,12 @@ void WorkOnCompute(
     // Is there any work remaining?
     if (queue.getNumQueued() > 0)
     {
-        bool lastToComplete = false;
-
         uint32_t threadGroupId = 0;
         while (queue.getWork(threadGroupId))
         {
             ProcessComputeBE(pDC, workerId, threadGroupId);
 
-            lastToComplete = queue.finishedWork();
-        }
-
-        _ReadWriteBarrier();
-
-        if (lastToComplete)
-        {
-            SWR_ASSERT(queue.isWorkComplete() == true);
-            pDC->doneCompute = true;
+            queue.finishedWork();
         }
     }
 }
@@ -732,10 +732,10 @@ DWORD workerThreadMain(LPVOID pData)
     //    the worker can safely increment its oldestDraw counter and move on to the next draw.
     std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);
 
-    auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->DrawEnqueued; };
+    auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };
 
-    uint64_t curDrawBE = 1;
-    uint64_t curDrawFE = 1;
+    uint64_t curDrawBE = 0;
+    uint64_t curDrawFE = 0;
 
     while (pContext->threadPool.inThreadShutdown == false)
     {
-- 
2.7.4
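
Note for reviewers (not part of the patch): the head/tail protocol above is easier to follow stripped of the SWR plumbing. The sketch below re-implements the same single-producer ring in isolation; the DrawRing name, the std::atomic counters, and std::this_thread::yield() are illustrative stand-ins for SWR's volatile counters, Interlocked* macros, and _mm_pause(), chosen only to keep the example self-contained.

// Minimal sketch of the ring protocol described in the commit message
// (assumed names; not the SWR code).
#include <atomic>
#include <cstdint>
#include <thread>

struct DrawRing
{
    explicit DrawRing(uint64_t entries) : numEntries(entries) {}

    uint64_t Head() const { return head.load(std::memory_order_acquire); }
    uint64_t Tail() const { return tail.load(std::memory_order_acquire); }
    bool     IsEmpty() const { return Head() == Tail(); }
    bool     IsFull()  const { return (Head() - Tail()) == numEntries; }

    // API thread only (single producer): publish one more entry.
    void Enqueue() { head.fetch_add(1, std::memory_order_release); }

    // Called exactly once per entry, by whichever worker finishes it last.
    void Dequeue() { tail.fetch_add(1, std::memory_order_release); }

    const uint64_t numEntries;
    std::atomic<uint64_t> head{0};   // producer counter
    std::atomic<uint64_t> tail{0};   // consumer counter
};

// API-thread side, shaped like GetDrawContext()/QueueWork()/SwrWaitForIdle():
void SubmitDraw(DrawRing& ring)
{
    while (ring.IsFull()) { std::this_thread::yield(); }    // wait for a free slot
    uint64_t slot = ring.Head() % ring.numEntries;          // slot to populate
    (void)slot;                                             // ...fill the DC here...
    ring.Enqueue();                                         // make it visible to workers
}

void WaitForIdle(DrawRing& ring)
{
    while (!ring.IsEmpty()) { std::this_thread::yield(); }  // drain: head == tail
}

Because the counters are 64-bit and only ever increase, slot indices are taken modulo the ring size, which is the same shape as the patch's dcIndex = GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT.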
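
The retire side hinges on the per-DC threadsDone countdown armed in QueueWork and drained in CompleteDrawContext: each worker visits a draw's DC twice (FE pass, then BE pass) and a dispatch's DC once, and only the thread that performs the final decrement advances the ring tail. A hedged sketch of that rule under the same assumptions as above (std::atomic in place of the volatile int64_t plus InterlockedDecrement64; ArmCounter/CompleteVisit are illustrative names):

// Illustrative sketch of the "last worker retires the DC" rule.
#include <atomic>
#include <cstdint>

struct DrawContextCounter
{
    std::atomic<int64_t> threadsDone{0};
};

// Producer side (QueueWork): a draw is visited twice per worker, a dispatch
// once; with zero workers (single-threaded knob) it degenerates to 2 or 1.
void ArmCounter(DrawContextCounter& dc, uint32_t numWorkers, bool isDraw)
{
    int64_t perWorker = isDraw ? 2 : 1;
    int64_t visits = numWorkers ? int64_t(numWorkers) * perWorker : perWorker;
    dc.threadsDone.store(visits, std::memory_order_release);
}

// Worker side (CompleteDrawContext): only the thread that performs the final
// decrement retires the entry, so the ring tail advances exactly once per DC.
template <typename RetireFn>
void CompleteVisit(DrawContextCounter& dc, RetireFn&& retire)
{
    if (dc.threadsDone.fetch_sub(1, std::memory_order_acq_rel) == 1)
    {
        retire();  // e.g. ring.Dequeue() from the previous sketch
    }
}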