From 29e1c4a8a9f26ce41aa53dc9bf39852a8530adc6 Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Wed, 3 Aug 2016 17:59:37 -0600 Subject: [PATCH] swr: [rasterizer core] allow override of KNOB thread settings - Remove HYPERTHREADED_FE support - Add threading info as optional data passed to SwrCreateContext. If supplied this data will override any KNOB thread settings. Signed-off-by: Tim Rowley --- src/gallium/drivers/swr/rasterizer/core/api.cpp | 18 ++++-- src/gallium/drivers/swr/rasterizer/core/api.h | 15 +++++ src/gallium/drivers/swr/rasterizer/core/context.h | 1 + .../drivers/swr/rasterizer/core/threads.cpp | 73 +++++++--------------- src/gallium/drivers/swr/rasterizer/core/threads.h | 4 +- .../drivers/swr/rasterizer/scripts/knob_defs.py | 12 ---- 6 files changed, 53 insertions(+), 70 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index a4856ee..3922606 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -75,6 +75,17 @@ HANDLE SwrCreateContext( pContext->pMacroTileManagerArray = (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64); pContext->pDispatchQueueArray = (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64); + pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS; + pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES; + pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE; + pContext->threadInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE; + pContext->threadInfo.SINGLE_THREADED = KNOB_SINGLE_THREADED; + + if (pCreateInfo->pThreadInfo) + { + pContext->threadInfo = *pCreateInfo->pThreadInfo; + } + for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) { pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); @@ -84,7 +95,7 @@ HANDLE SwrCreateContext( pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); } - if (!KNOB_SINGLE_THREADED) + if (!pContext->threadInfo.SINGLE_THREADED) { memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock)); memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty)); @@ -95,9 +106,8 @@ HANDLE SwrCreateContext( } // Calling createThreadPool() above can set SINGLE_THREADED - if (KNOB_SINGLE_THREADED) + if (pContext->threadInfo.SINGLE_THREADED) { - SET_KNOB(HYPERTHREADED_FE, false); pContext->NumWorkerThreads = 1; pContext->NumFEThreads = 1; pContext->NumBEThreads = 1; @@ -218,7 +228,7 @@ void QueueWork(SWR_CONTEXT *pContext) pContext->dcRing.Enqueue(); } - if (KNOB_SINGLE_THREADED) + if (pContext->threadInfo.SINGLE_THREADED) { // flush denormals to 0 uint32_t mxcsr = _mm_getcsr(); diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h index b45d449..d7621d5 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.h +++ b/src/gallium/drivers/swr/rasterizer/core/api.h @@ -91,6 +91,18 @@ typedef void(SWR_API *PFN_UPDATE_SO_WRITE_OFFSET)(HANDLE hPrivateContext, class BucketManager; ////////////////////////////////////////////////////////////////////////// +/// SWR_THREADING_INFO +///////////////////////////////////////////////////////////////////////// +struct SWR_THREADING_INFO +{ + uint32_t MAX_WORKER_THREADS; + uint32_t MAX_NUMA_NODES; + uint32_t MAX_CORES_PER_NUMA_NODE; + uint32_t MAX_THREADS_PER_CORE; + bool SINGLE_THREADED; +}; + +////////////////////////////////////////////////////////////////////////// /// SWR_CREATECONTEXT_INFO ///////////////////////////////////////////////////////////////////////// struct SWR_CREATECONTEXT_INFO @@ -113,6 +125,9 @@ struct SWR_CREATECONTEXT_INFO // Output: size required memory passed to for SwrSaveState / SwrRestoreState size_t contextSaveSize; + + // Input (optional): Threading info that overrides any set KNOB values. + SWR_THREADING_INFO* pThreadInfo; }; ////////////////////////////////////////////////////////////////////////// diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index 7e6a167..47fea16 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -464,6 +464,7 @@ struct SWR_CONTEXT uint32_t NumBEThreads; THREAD_POOL threadPool; // Thread pool associated with this context + SWR_THREADING_INFO threadInfo; std::condition_variable FifosNotEmpty; std::mutex WaitLock; diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index b207ebd..143a77f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -239,10 +239,10 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread } -void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=false) +void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=false) { // Only bind threads when MAX_WORKER_THREADS isn't set. - if (KNOB_MAX_WORKER_THREADS && bindProcGroup == false) + if (pContext->threadInfo.MAX_WORKER_THREADS && bindProcGroup == false) { return; } @@ -267,9 +267,9 @@ void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup= else #endif { - // If KNOB_MAX_WORKER_THREADS is set, only bind to the proc group, + // If MAX_WORKER_THREADS is set, only bind to the proc group, // Not the individual HW thread. - if (!KNOB_MAX_WORKER_THREADS) + if (!pContext->threadInfo.MAX_WORKER_THREADS) { affinity.Mask = KAFFINITY(1) << threadId; } @@ -648,7 +648,7 @@ DWORD workerThreadMain(LPVOID pData) uint32_t threadId = pThreadData->threadId; uint32_t workerId = pThreadData->workerId; - bindThread(threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup); + bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup); RDTSC_INIT(threadId); @@ -771,7 +771,7 @@ template<> DWORD workerThreadInit(LPVOID pData) = delete; void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) { - bindThread(0); + bindThread(pContext, 0); CPUNumaNodes nodes; uint32_t numThreadsPerProcGroup = 0; @@ -796,33 +796,23 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) uint32_t numCoresPerNode = numHWCoresPerNode; uint32_t numHyperThreads = numHWHyperThreads; - if (KNOB_MAX_WORKER_THREADS) + if (pContext->threadInfo.MAX_NUMA_NODES) { - SET_KNOB(HYPERTHREADED_FE, false); + numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES); } - if (KNOB_HYPERTHREADED_FE) + if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE) { - SET_KNOB(MAX_THREADS_PER_CORE, 0); + numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE); } - if (KNOB_MAX_NUMA_NODES) + if (pContext->threadInfo.MAX_THREADS_PER_CORE) { - numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES); - } - - if (KNOB_MAX_CORES_PER_NUMA_NODE) - { - numCoresPerNode = std::min(numCoresPerNode, KNOB_MAX_CORES_PER_NUMA_NODE); - } - - if (KNOB_MAX_THREADS_PER_CORE) - { - numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE); + numHyperThreads = std::min(numHyperThreads, pContext->threadInfo.MAX_THREADS_PER_CORE); } #if defined(_WIN32) && !defined(_WIN64) - if (!KNOB_MAX_WORKER_THREADS) + if (!pContext->threadInfo.MAX_WORKER_THREADS) { // Limit 32-bit windows to bindable HW threads only if ((numCoresPerNode * numHWHyperThreads) > 32) @@ -832,19 +822,14 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) } #endif - if (numHyperThreads < 2) - { - SET_KNOB(HYPERTHREADED_FE, false); - } - // Calculate numThreads uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads; numThreads = std::min(numThreads, numHWThreads); - if (KNOB_MAX_WORKER_THREADS) + if (pContext->threadInfo.MAX_WORKER_THREADS) { uint32_t maxHWThreads = numHWNodes * numHWCoresPerNode * numHWHyperThreads; - numThreads = std::min(KNOB_MAX_WORKER_THREADS, maxHWThreads); + numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, maxHWThreads); } if (numThreads > KNOB_MAX_NUM_THREADS) @@ -900,7 +885,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA)); pPool->numaMask = 0; - if (KNOB_MAX_WORKER_THREADS) + if (pContext->threadInfo.MAX_WORKER_THREADS) { bool bForceBindProcGroup = (numThreads > numThreadsPerProcGroup); uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup; @@ -962,25 +947,9 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) pPool->pThreadData[workerId].htId = t; pPool->pThreadData[workerId].pContext = pContext; - if (KNOB_HYPERTHREADED_FE) - { - if (t == 0) - { - pContext->NumBEThreads++; - pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]); - } - else - { - pContext->NumFEThreads++; - pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]); - } - } - else - { - pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]); - pContext->NumBEThreads++; - pContext->NumFEThreads++; - } + pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]); + pContext->NumBEThreads++; + pContext->NumFEThreads++; ++workerId; } @@ -991,7 +960,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) { - if (!KNOB_SINGLE_THREADED) + if (!pContext->threadInfo.SINGLE_THREADED) { // Inform threads to finish up std::unique_lock lock(pContext->WaitLock); diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h index 215c699..157f46a 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.h +++ b/src/gallium/drivers/swr/rasterizer/core/threads.h @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -45,7 +45,7 @@ struct THREAD_DATA uint32_t htId; // Hyperthread id uint32_t workerId; SWR_CONTEXT *pContext; - bool forceBindProcGroup; // Only useful when KNOB_MAX_WORKER_THREADS is set. + bool forceBindProcGroup; // Only useful when MAX_WORKER_THREADS is set. }; diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py index 56c3144..f93147c 100644 --- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py +++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py @@ -30,18 +30,6 @@ KNOBS = [ 'category' : 'debug', }], - ['HYPERTHREADED_FE', { - 'type' : 'bool', - 'default' : 'false', - 'desc' : ['EXPERIMENTAL!!', - 'If enabled will attempt to use secondary threads per core to perform', - 'front-end (VS/GS) work.', - '', - 'Note: Setting this will cause KNOB_MAX_THREADS_PER_CORE to be ignored.'], - 'category' : 'perf', - 'advanced' : 'true', - }], - ['DUMP_SHADER_IR', { 'type' : 'bool', 'default' : 'false', -- 2.7.4