[Mesa-dev] [PATCH v2 13/14] swr: [rasterizer core] use wrap-around safe compares for dependency checking
Tim Rowley
timothy.o.rowley at intel.com
Mon Jun 20 21:37:07 UTC 2016
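Move drawIDs from 64-bit to 32-bit to increase perf.

Because 32-bit draw IDs can wrap around to 0, ordering checks on draw IDs
now go through a signed-delta compare (IDComparesLess) instead of direct
</> comparisons. The MAX_DRAWS_IN_FLIGHT default changes from 96 to 128 so
that it divides 2^32 evenly, which keeps the
"drawId % KNOB_MAX_DRAWS_IN_FLIGHT" ring-slot indexing consistent across
the wrap.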
---
src/gallium/drivers/swr/rasterizer/core/api.cpp | 4 +-
src/gallium/drivers/swr/rasterizer/core/context.h | 4 +-
.../drivers/swr/rasterizer/core/ringbuffer.h | 8 ++--
.../drivers/swr/rasterizer/core/threads.cpp | 54 +++++++++++++---------
src/gallium/drivers/swr/rasterizer/core/threads.h | 6 +--
.../drivers/swr/rasterizer/scripts/knob_defs.py | 5 +-
6 files changed, 45 insertions(+), 36 deletions(-)
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index cec4519..b63d547 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -215,13 +215,13 @@ void QueueWork(SWR_CONTEXT *pContext)
if (IsDraw)
{
- uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
+ uint32_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
WorkOnFifoFE(pContext, 0, curDraw[0]);
WorkOnFifoBE(pContext, 0, curDraw[1], gSingleThreadLockedTiles, 0, 0);
}
else
{
- uint64_t curDispatch = pContext->pCurDrawContext->drawId;
+ uint32_t curDispatch = pContext->pCurDrawContext->drawId;
WorkOnCompute(pContext, 0, curDispatch);
}
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index 3204352..08eadf4 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -381,13 +381,13 @@ struct DRAW_STATE
struct DRAW_CONTEXT
{
SWR_CONTEXT* pContext;
- uint64_t drawId;
+ uint32_t drawId;
+ uint32_t dependency;
union
{
MacroTileMgr* pTileMgr;
DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
};
- uint64_t dependency;
DRAW_STATE* pState;
CachingArena* pArena;
diff --git a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
index b9076de..97f75c6 100644
--- a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
+++ b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
@@ -90,13 +90,13 @@ public:
return (numEnqueued == mNumEntries);
}
- INLINE uint64_t GetTail() volatile { return mRingTail; }
- INLINE uint64_t GetHead() volatile { return mRingHead; }
+ INLINE uint32_t GetTail() volatile { return mRingTail; }
+ INLINE uint32_t GetHead() volatile { return mRingHead; }
protected:
T* mpRingBuffer;
uint32_t mNumEntries;
- OSALIGNLINE(volatile uint64_t) mRingHead; // Consumer Counter
- OSALIGNLINE(volatile uint64_t) mRingTail; // Producer Counter
+ OSALIGNLINE(volatile uint32_t) mRingHead; // Consumer Counter
+ OSALIGNLINE(volatile uint32_t) mRingTail; // Producer Counter
};
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 17bf616..fe164a0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -294,22 +294,30 @@ void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=
}
INLINE
-uint64_t GetEnqueuedDraw(SWR_CONTEXT *pContext)
+uint32_t GetEnqueuedDraw(SWR_CONTEXT *pContext)
{
return pContext->dcRing.GetHead();
}
INLINE
-DRAW_CONTEXT *GetDC(SWR_CONTEXT *pContext, uint64_t drawId)
+DRAW_CONTEXT *GetDC(SWR_CONTEXT *pContext, uint32_t drawId)
{
return &pContext->dcRing[(drawId-1) % KNOB_MAX_DRAWS_IN_FLIGHT];
}
+INLINE
+bool IDComparesLess(uint32_t a, uint32_t b)
+{
+ // Use signed delta to ensure that wrap-around to 0 is correctly handled.
+ int32_t delta = int32_t(a - b);
+ return (delta < 0);
+}
+
// returns true if dependency not met
INLINE
-bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint64_t lastRetiredDraw)
+bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t lastRetiredDraw)
{
- return (pDC->dependency > lastRetiredDraw);
+ return IDComparesLess(lastRetiredDraw, pDC->dependency);
}
// inlined-only version
@@ -345,11 +353,11 @@ int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
return CompleteDrawContextInl(pContext, pDC);
}
-INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE, uint64_t& drawEnqueued)
+INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint32_t& curDrawBE, uint32_t& drawEnqueued)
{
// increment our current draw id to the first incomplete draw
drawEnqueued = GetEnqueuedDraw(pContext);
- while (curDrawBE < drawEnqueued)
+ while (IDComparesLess(curDrawBE, drawEnqueued))
{
DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
@@ -372,7 +380,7 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE,
}
// If there are no more incomplete draws then return false.
- return (curDrawBE >= drawEnqueued) ? false : true;
+ return IDComparesLess(curDrawBE, drawEnqueued);
}
//////////////////////////////////////////////////////////////////////////
@@ -392,20 +400,20 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE,
void WorkOnFifoBE(
SWR_CONTEXT *pContext,
uint32_t workerId,
- uint64_t &curDrawBE,
+ uint32_t &curDrawBE,
TileSet& lockedTiles,
uint32_t numaNode,
uint32_t numaMask)
{
// Find the first incomplete draw that has pending work. If no such draw is found then
// return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
- uint64_t drawEnqueued = 0;
+ uint32_t drawEnqueued = 0;
if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false)
{
return;
}
- uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
+ uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
// Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
lockedTiles.clear();
@@ -415,7 +423,7 @@ void WorkOnFifoBE(
// 2. If we're trying to work on draws after curDrawBE, we are restricted to
// working on those macrotiles that are known to be complete in the prior draw to
// maintain order. The locked tiles provides the history to ensures this.
- for (uint64_t i = curDrawBE; i < drawEnqueued; ++i)
+ for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
{
DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];
@@ -508,11 +516,11 @@ void WorkOnFifoBE(
}
}
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE)
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE)
{
// Try to grab the next DC from the ring
- uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
- while (curDrawFE < drawEnqueued)
+ uint32_t drawEnqueued = GetEnqueuedDraw(pContext);
+ while (IDComparesLess(curDrawFE, drawEnqueued))
{
uint32_t dcSlot = curDrawFE % KNOB_MAX_DRAWS_IN_FLIGHT;
DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
@@ -527,8 +535,8 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE)
}
}
- uint64_t curDraw = curDrawFE;
- while (curDraw < drawEnqueued)
+ uint32_t curDraw = curDrawFE;
+ while (IDComparesLess(curDraw, drawEnqueued))
{
uint32_t dcSlot = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT;
DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
@@ -559,17 +567,17 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE)
void WorkOnCompute(
SWR_CONTEXT *pContext,
uint32_t workerId,
- uint64_t& curDrawBE)
+ uint32_t& curDrawBE)
{
- uint64_t drawEnqueued = 0;
+ uint32_t drawEnqueued = 0;
if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false)
{
return;
}
- uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
+ uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
- for (uint64_t i = curDrawBE; curDrawBE < drawEnqueued; ++i)
+ for (uint64_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
{
DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];
if (pDC->isCompute == false) return;
@@ -639,10 +647,10 @@ DWORD workerThreadMain(LPVOID pData)
// the worker can safely increment its oldestDraw counter and move on to the next draw.
std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);
- auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };
+ auto threadHasWork = [&](uint32_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };
- uint64_t curDrawBE = 0;
- uint64_t curDrawFE = 0;
+ uint32_t curDrawBE = 0;
+ uint32_t curDrawFE = 0;
while (pContext->threadPool.inThreadShutdown == false)
{
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h
index 3aba632..e7b4924 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -64,7 +64,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
// Expose FE and BE worker functions to the API thread if single threaded
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE);
-void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
-void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE);
+void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
+void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE);
int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
\ No newline at end of file
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
index ab6ec56..56c3144 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
@@ -133,8 +133,9 @@ KNOBS = [
['MAX_DRAWS_IN_FLIGHT', {
'type' : 'uint32_t',
- 'default' : '96',
- 'desc' : ['Maximum number of draws outstanding before API thread blocks.'],
+ 'default' : '128',
+ 'desc' : ['Maximum number of draws outstanding before API thread blocks.',
+ 'This value MUST be evenly divisible into 2^32'],
'category' : 'perf',
}],
--
1.9.1
More information about the mesa-dev
mailing list