Mesa (master): swr: [rasterizer core] allow override of KNOB thread settings

Tim Rowley torowley at kemper.freedesktop.org
Wed Aug 10 18:17:13 UTC 2016


Module: Mesa
Branch: master
Commit: 29e1c4a8a9f26ce41aa53dc9bf39852a8530adc6
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=29e1c4a8a9f26ce41aa53dc9bf39852a8530adc6

Author: Tim Rowley <timothy.o.rowley at intel.com>
Date:   Wed Aug  3 17:59:37 2016 -0600

swr: [rasterizer core] allow override of KNOB thread settings

- Remove HYPERTHREADED_FE support
- Add threading info as optional data passed to SwrCreateContext.
  If supplied, this data will override any KNOB thread settings.

Signed-off-by: Tim Rowley <timothy.o.rowley at intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/api.cpp    | 18 ++++--
 src/gallium/drivers/swr/rasterizer/core/api.h      | 15 +++++
 src/gallium/drivers/swr/rasterizer/core/context.h  |  1 +
 .../drivers/swr/rasterizer/core/threads.cpp        | 73 +++++++---------------
 src/gallium/drivers/swr/rasterizer/core/threads.h  |  4 +-
 .../drivers/swr/rasterizer/scripts/knob_defs.py    | 12 ----
 6 files changed, 53 insertions(+), 70 deletions(-)
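
For reference, here is a minimal caller-side sketch of the new interface. It is not part of this commit; the include path, the helper name, and the specific limit values are illustrative assumptions. A driver that wants one worker thread per physical core on a single NUMA node, regardless of any KNOB_* environment settings, could fill in SWR_THREADING_INFO and point SWR_CREATECONTEXT_INFO::pThreadInfo at it before calling SwrCreateContext. Leaving pThreadInfo null (or zero-initializing the create info) keeps the existing KNOB-driven defaults.

    #include "core/api.h"   // assumed include path for the SWR core API

    // Hypothetical helper: limit the rasterizer to one NUMA node and one
    // worker thread per physical core, overriding the KNOB thread settings.
    static HANDLE CreateSwrContextOneThreadPerCore()
    {
        SWR_THREADING_INFO threadInfo = {};
        threadInfo.MAX_WORKER_THREADS      = 0;     // 0 = derive the count from the limits below
        threadInfo.MAX_NUMA_NODES          = 1;     // use a single NUMA node
        threadInfo.MAX_CORES_PER_NUMA_NODE = 0;     // 0 = no extra per-node core limit
        threadInfo.MAX_THREADS_PER_CORE    = 1;     // skip hyperthread siblings
        threadInfo.SINGLE_THREADED         = false; // keep the worker thread pool

        SWR_CREATECONTEXT_INFO createInfo = {};
        // ... other SWR_CREATECONTEXT_INFO fields the driver normally sets ...
        createInfo.pThreadInfo = &threadInfo;       // non-null => overrides all KNOB thread settings

        return SwrCreateContext(&createInfo);
    }
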

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index a4856ee..3922606 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -75,6 +75,17 @@ HANDLE SwrCreateContext(
     pContext->pMacroTileManagerArray = (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
     pContext->pDispatchQueueArray = (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
 
+    pContext->threadInfo.MAX_WORKER_THREADS        = KNOB_MAX_WORKER_THREADS;
+    pContext->threadInfo.MAX_NUMA_NODES            = KNOB_MAX_NUMA_NODES;
+    pContext->threadInfo.MAX_CORES_PER_NUMA_NODE   = KNOB_MAX_CORES_PER_NUMA_NODE;
+    pContext->threadInfo.MAX_THREADS_PER_CORE      = KNOB_MAX_THREADS_PER_CORE;
+    pContext->threadInfo.SINGLE_THREADED           = KNOB_SINGLE_THREADED;
+
+    if (pCreateInfo->pThreadInfo)
+    {
+        pContext->threadInfo = *pCreateInfo->pThreadInfo;
+    }
+
     for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
     {
         pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
@@ -84,7 +95,7 @@ HANDLE SwrCreateContext(
         pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
     }
 
-    if (!KNOB_SINGLE_THREADED)
+    if (!pContext->threadInfo.SINGLE_THREADED)
     {
         memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
         memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
@@ -95,9 +106,8 @@ HANDLE SwrCreateContext(
     }
 
     // Calling createThreadPool() above can set SINGLE_THREADED
-    if (KNOB_SINGLE_THREADED)
+    if (pContext->threadInfo.SINGLE_THREADED)
     {
-        SET_KNOB(HYPERTHREADED_FE, false);
         pContext->NumWorkerThreads = 1;
         pContext->NumFEThreads = 1;
         pContext->NumBEThreads = 1;
@@ -218,7 +228,7 @@ void QueueWork(SWR_CONTEXT *pContext)
         pContext->dcRing.Enqueue();
     }
 
-    if (KNOB_SINGLE_THREADED)
+    if (pContext->threadInfo.SINGLE_THREADED)
     {
         // flush denormals to 0
         uint32_t mxcsr = _mm_getcsr();
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h
index b45d449..d7621d5 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -91,6 +91,18 @@ typedef void(SWR_API *PFN_UPDATE_SO_WRITE_OFFSET)(HANDLE hPrivateContext,
 class BucketManager;
 
 //////////////////////////////////////////////////////////////////////////
+/// SWR_THREADING_INFO
+/////////////////////////////////////////////////////////////////////////
+struct SWR_THREADING_INFO
+{
+    uint32_t    MAX_WORKER_THREADS;
+    uint32_t    MAX_NUMA_NODES;
+    uint32_t    MAX_CORES_PER_NUMA_NODE;
+    uint32_t    MAX_THREADS_PER_CORE;
+    bool        SINGLE_THREADED;
+};
+
+//////////////////////////////////////////////////////////////////////////
 /// SWR_CREATECONTEXT_INFO
 /////////////////////////////////////////////////////////////////////////
 struct SWR_CREATECONTEXT_INFO
@@ -113,6 +125,9 @@ struct SWR_CREATECONTEXT_INFO
 
     // Output: size required memory passed to for SwrSaveState / SwrRestoreState
     size_t  contextSaveSize;
+
+    // Input (optional): Threading info that overrides any set KNOB values.
+    SWR_THREADING_INFO* pThreadInfo;
 };
 
 //////////////////////////////////////////////////////////////////////////
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index 7e6a167..47fea16 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -464,6 +464,7 @@ struct SWR_CONTEXT
     uint32_t NumBEThreads;
 
     THREAD_POOL threadPool; // Thread pool associated with this context
+    SWR_THREADING_INFO threadInfo;
 
     std::condition_variable FifosNotEmpty;
     std::mutex WaitLock;
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index b207ebd..143a77f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -239,10 +239,10 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
 }
 
 
-void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=false)
+void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=false)
 {
     // Only bind threads when MAX_WORKER_THREADS isn't set.
-    if (KNOB_MAX_WORKER_THREADS && bindProcGroup == false)
+    if (pContext->threadInfo.MAX_WORKER_THREADS && bindProcGroup == false)
     {
         return;
     }
@@ -267,9 +267,9 @@ void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=
     else
 #endif
     {
-        // If KNOB_MAX_WORKER_THREADS is set, only bind to the proc group,
+        // If MAX_WORKER_THREADS is set, only bind to the proc group,
         // Not the individual HW thread.
-        if (!KNOB_MAX_WORKER_THREADS)
+        if (!pContext->threadInfo.MAX_WORKER_THREADS)
         {
             affinity.Mask = KAFFINITY(1) << threadId;
         }
@@ -648,7 +648,7 @@ DWORD workerThreadMain(LPVOID pData)
     uint32_t threadId = pThreadData->threadId;
     uint32_t workerId = pThreadData->workerId;
 
-    bindThread(threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup); 
+    bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup); 
 
     RDTSC_INIT(threadId);
 
@@ -771,7 +771,7 @@ template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
 
 void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
 {
-    bindThread(0);
+    bindThread(pContext, 0);
 
     CPUNumaNodes nodes;
     uint32_t numThreadsPerProcGroup = 0;
@@ -796,33 +796,23 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
     uint32_t numCoresPerNode    = numHWCoresPerNode;
     uint32_t numHyperThreads    = numHWHyperThreads;
 
-    if (KNOB_MAX_WORKER_THREADS)
+    if (pContext->threadInfo.MAX_NUMA_NODES)
     {
-        SET_KNOB(HYPERTHREADED_FE, false);
+        numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
     }
 
-    if (KNOB_HYPERTHREADED_FE)
+    if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
     {
-        SET_KNOB(MAX_THREADS_PER_CORE, 0);
+        numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
     }
 
-    if (KNOB_MAX_NUMA_NODES)
+    if (pContext->threadInfo.MAX_THREADS_PER_CORE)
     {
-        numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES);
-    }
-
-    if (KNOB_MAX_CORES_PER_NUMA_NODE)
-    {
-        numCoresPerNode = std::min(numCoresPerNode, KNOB_MAX_CORES_PER_NUMA_NODE);
-    }
-
-    if (KNOB_MAX_THREADS_PER_CORE)
-    {
-        numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE);
+        numHyperThreads = std::min(numHyperThreads, pContext->threadInfo.MAX_THREADS_PER_CORE);
     }
 
 #if defined(_WIN32) && !defined(_WIN64)
-    if (!KNOB_MAX_WORKER_THREADS)
+    if (!pContext->threadInfo.MAX_WORKER_THREADS)
     {
         // Limit 32-bit windows to bindable HW threads only
         if ((numCoresPerNode * numHWHyperThreads) > 32)
@@ -832,19 +822,14 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
     }
 #endif
 
-    if (numHyperThreads < 2)
-    {
-        SET_KNOB(HYPERTHREADED_FE, false);
-    }
-
     // Calculate numThreads
     uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
     numThreads = std::min(numThreads, numHWThreads);
 
-    if (KNOB_MAX_WORKER_THREADS)
+    if (pContext->threadInfo.MAX_WORKER_THREADS)
     {
         uint32_t maxHWThreads = numHWNodes * numHWCoresPerNode * numHWHyperThreads;
-        numThreads = std::min(KNOB_MAX_WORKER_THREADS, maxHWThreads);
+        numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, maxHWThreads);
     }
 
     if (numThreads > KNOB_MAX_NUM_THREADS)
@@ -900,7 +885,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
     pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
     pPool->numaMask = 0;
 
-    if (KNOB_MAX_WORKER_THREADS)
+    if (pContext->threadInfo.MAX_WORKER_THREADS)
     {
         bool bForceBindProcGroup = (numThreads > numThreadsPerProcGroup);
         uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup;
@@ -962,25 +947,9 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
                     pPool->pThreadData[workerId].htId = t;
                     pPool->pThreadData[workerId].pContext = pContext;
 
-                    if (KNOB_HYPERTHREADED_FE)
-                    {
-                        if (t == 0)
-                        {
-                            pContext->NumBEThreads++;
-                            pPool->threads[workerId] = new std::thread(workerThreadInit<false, true>, &pPool->pThreadData[workerId]);
-                        }
-                        else
-                        {
-                            pContext->NumFEThreads++;
-                            pPool->threads[workerId] = new std::thread(workerThreadInit<true, false>, &pPool->pThreadData[workerId]);
-                        }
-                    }
-                    else
-                    {
-                        pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
-                        pContext->NumBEThreads++;
-                        pContext->NumFEThreads++;
-                    }
+                    pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
+                    pContext->NumBEThreads++;
+                    pContext->NumFEThreads++;
 
                     ++workerId;
                 }
@@ -991,7 +960,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
 
 void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
 {
-    if (!KNOB_SINGLE_THREADED)
+    if (!pContext->threadInfo.SINGLE_THREADED)
     {
         // Inform threads to finish up
         std::unique_lock<std::mutex> lock(pContext->WaitLock);
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h
index 215c699..157f46a 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -45,7 +45,7 @@ struct THREAD_DATA
     uint32_t htId;          // Hyperthread id
     uint32_t workerId;
     SWR_CONTEXT *pContext;
-    bool forceBindProcGroup; // Only useful when KNOB_MAX_WORKER_THREADS is set.
+    bool forceBindProcGroup; // Only useful when MAX_WORKER_THREADS is set.
 };
 
 
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
index 56c3144..f93147c 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
@@ -30,18 +30,6 @@ KNOBS = [
         'category'  : 'debug',
     }],
 
-    ['HYPERTHREADED_FE', {
-        'type'      : 'bool',
-        'default'   : 'false',
-        'desc'      : ['EXPERIMENTAL!!',
-                       'If enabled will attempt to use secondary threads per core to perform',
-                       'front-end (VS/GS) work.',
-                       '',
-                       'Note: Setting this will cause KNOB_MAX_THREADS_PER_CORE to be ignored.'],
-        'category'  : 'perf',
-        'advanced'  : 'true',
-    }],
-
     ['DUMP_SHADER_IR', {
         'type'      : 'bool',
         'default'   : 'false',



