<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=us-ascii">
</head>
<body style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;" class="">
OK, I made the following changes. Would you like a full v2 commit, or is it OK to fold this in on push?
<div class=""><br class="">
</div>
<div class="">
<div class="">--- a/src/gallium/drivers/swr/swr_shader.cpp</div>
<div class="">+++ b/src/gallium/drivers/swr/swr_shader.cpp</div>
<div class="">@@ -533,12 +533,12 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key)</div>
<div class=""> pGS->inputVertStride = pGS->numInputAttribs + pGS->vertexAttribOffset;</div>
<div class=""> pGS->outputVertexSize = SWR_VTX_NUM_SLOTS;</div>
<div class=""> pGS->controlDataSize = 8; // GS ouputs max of 8 32B units</div>
<div class="">- pGS->controlDataOffset = 32;</div>
<div class="">- pGS->outputVertexOffset = pGS->controlDataOffset + pGS->controlDataSize * 32;</div>
<div class="">+ pGS->controlDataOffset = VERTEX_COUNT_SIZE;</div>
<div class="">+ pGS->outputVertexOffset = pGS->controlDataOffset + CONTROL_HEADER_SIZE;</div>
<div class=""> </div>
<div class=""> pGS->allocationSize =</div>
<div class="">- 32 + // vertex count</div>
<div class="">- (8 * 32) + // control header</div>
<div class="">+ VERTEX_COUNT_SIZE + // vertex count</div>
<div class="">+ CONTROL_HEADER_SIZE + // control header</div>
<div class=""> (SWR_VTX_NUM_SLOTS * 16) * // sizeof vertex</div>
<div class=""> pGS->maxNumVerts; // num verts</div>
</div>
<div class=""><br class="">
</div>
<div class=""><br class="">
<div>
<blockquote type="cite" class="">
<div class="">On Sep 23, 2017, at 9:51 PM, Cherniak, Bruce &lt;<a href="mailto:bruce.cherniak@intel.com" class="">bruce.cherniak@intel.com</a>&gt; wrote:</div>
<br class="Apple-interchange-newline">
<div class="">
<blockquote type="cite" style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; orphans: auto; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; widows: auto; word-spacing: 0px; -webkit-text-size-adjust: auto; -webkit-text-stroke-width: 0px;" class="">
<br class="Apple-interchange-newline">
On Sep 21, 2017, at 7:46 PM, Tim Rowley &lt;<a href="mailto:timothy.o.rowley@intel.com" class="">timothy.o.rowley@intel.com</a>&gt; wrote:<br class="">
<br class="">
One piglit regression, which was a false pass:<br class="">
spec@glsl-1.50@execution@geometry@dynamic_input_array_index<br class="">
---<br class="">
.../drivers/swr/rasterizer/core/frontend.cpp | 227 ++++++++++++---------<br class="">
src/gallium/drivers/swr/rasterizer/core/state.h | 55 +++--<br class="">
src/gallium/drivers/swr/swr_shader.cpp | 183 ++++++++---------<br class="">
3 files changed, 253 insertions(+), 212 deletions(-)<br class="">
<br class="">
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp<br class="">
index f882869..26e76a9 100644<br class="">
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp<br class="">
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp<br class="">
@@ -710,45 +710,67 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num<br class="">
<br class="">
THREAD SWR_GS_CONTEXT tlsGsContext;<br class="">
<br class="">
-template&lt;typename SIMDVERTEX, uint32_t SIMD_WIDTH&gt;<br class="">
-struct GsBufferInfo<br class="">
+// Buffers that are allocated if GS is enabled<br class="">
+struct GsBuffers<br class="">
{<br class="">
- GsBufferInfo(const SWR_GS_STATE &gsState)<br class="">
- {<br class="">
- const uint32_t vertexCount = gsState.maxNumVerts;<br class="">
- const uint32_t vertexStride = sizeof(SIMDVERTEX);<br class="">
- const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / SIMD_WIDTH;<br class="">
+ uint8_t* pGsIn;<br class="">
+ uint8_t* pGsOut[KNOB_SIMD_WIDTH];<br class="">
+ uint8_t* pGsTransposed;<br class="">
+ void* pStreamCutBuffer;<br class="">
+};<br class="">
<br class="">
- vertexPrimitiveStride = vertexStride * numSimdBatches;<br class="">
- vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH;<br class="">
+//////////////////////////////////////////////////////////////////////////<br class="">
+/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler<br class="">
+/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive assembler<br class="">
+/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader<br class="">
+/// @param numVerts - Number of vertices outputted by the GS<br class="">
+/// @param numAttribs - Number of attributes per vertex<br class="">
+template&lt;typename SIMD_T, uint32_t SimdWidth&gt;<br class="">
+void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)<br class="">
+{<br class="">
+ uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;<br class="">
+ uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 4;<br class="">
<br class="">
- if (gsState.isSingleStream)<br class="">
- {<br class="">
- cutPrimitiveStride = (vertexCount + 7) / 8;<br class="">
- cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;<br class="">
+ OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];<br class="">
<br class="">
- streamCutPrimitiveStride = 0;<br class="">
- streamCutInstanceStride = 0;<br class="">
- }<br class="">
- else<br class="">
- {<br class="">
- cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4);<br class="">
- cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;<br class="">
-<br class="">
- streamCutPrimitiveStride = (vertexCount + 7) / 8;<br class="">
- streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH;<br class="">
- }<br class="">
+ for (uint32_t i = 0; i < SimdWidth; ++i)<br class="">
+ {<br class="">
+ gatherOffsets[i] = srcVertexStride * i;<br class="">
}<br class="">
+ auto vGatherOffsets = SIMD_T::load_si((typename SIMD_T::Integer*)&gatherOffsets[0]);<br class="">
<br class="">
- uint32_t vertexPrimitiveStride;<br class="">
- uint32_t vertexInstanceStride;<br class="">
+ uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;<br class="">
+ uint32_t remainingVerts = numVerts;<br class="">
<br class="">
- uint32_t cutPrimitiveStride;<br class="">
- uint32_t cutInstanceStride;<br class="">
+ for (uint32_t s = 0; s < numSimd; ++s)<br class="">
+ {<br class="">
+ uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;<br class="">
+ uint8_t* pDstBase = pDst + s * dstVertexStride;<br class="">
<br class="">
- uint32_t streamCutPrimitiveStride;<br class="">
- uint32_t streamCutInstanceStride;<br class="">
-};<br class="">
+ // Compute mask to prevent src overflow<br class="">
+ uint32_t mask = std::min(remainingVerts, SimdWidth);<br class="">
+ mask = GenMask(mask);<br class="">
+ auto vMask = SIMD_T::vmask_ps(mask);<br class="">
+ auto viMask = SIMD_T::castps_si(vMask);<br class="">
+<br class="">
+ for (uint32_t a = 0; a < numAttribs; ++a)<br class="">
+ {<br class="">
+ auto attribGatherX = SIMD_T::template mask_i32gather_ps&lt;typename SIMD_T::ScaleFactor(1)&gt;(SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);<br class="">
+ auto attribGatherY = SIMD_T::template mask_i32gather_ps&lt;typename SIMD_T::ScaleFactor(1)&gt;(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float)), vGatherOffsets, vMask);<br class="">
+ auto attribGatherZ = SIMD_T::template mask_i32gather_ps&lt;typename SIMD_T::ScaleFactor(1)&gt;(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 2), vGatherOffsets, vMask);<br class="">
+ auto attribGatherW = SIMD_T::template mask_i32gather_ps&lt;typename SIMD_T::ScaleFactor(1)&gt;(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 3), vGatherOffsets, vMask);<br class="">
+<br class="">
+ SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);<br class="">
+ SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float)), viMask, attribGatherY);<br class="">
+ SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 2), viMask, attribGatherZ);<br class="">
+ SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 3), viMask, attribGatherW);<br class="">
+<br class="">
+ pSrcBase += sizeof(float) * 4;<br class="">
+ pDstBase += sizeof(typename SIMD_T::Float) * 4;<br class="">
+ }<br class="">
+ remainingVerts -= SimdWidth;<br class="">
+ }<br class="">
+}<br class="">
<br class="">
//////////////////////////////////////////////////////////////////////////<br class="">
/// @brief Implements GS stage.<br class="">
@@ -763,9 +785,7 @@ static void GeometryShaderStage(<br class="">
DRAW_CONTEXT *pDC,<br class="">
uint32_t workerId,<br class="">
PA_STATE& pa,<br class="">
- void* pGsOut,<br class="">
- void* pCutBuffer,<br class="">
- void* pStreamCutBuffer,<br class="">
+ GsBuffers* pGsBuffers,<br class="">
uint32_t* pSoPrimData,<br class="">
#if USE_SIMD16_FRONTEND<br class="">
uint32_t numPrims_simd8,<br class="">
@@ -779,25 +799,29 @@ static void GeometryShaderStage(<br class="">
const API_STATE& state = GetApiState(pDC);<br class="">
const SWR_GS_STATE* pState = &state.gsState;<br class="">
<br class="">
- SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");<br class="">
- SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");<br class="">
+ static uint8_t sNullBuffer[1024] = { 0 };<br class="">
<br class="">
- tlsGsContext.pStream = (uint8_t*)pGsOut;<br class="">
- tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;<br class="">
+ for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)<br class="">
+ {<br class="">
+ tlsGsContext.pStreams[i] = pGsBuffers->pGsOut[i];<br class="">
+ }<br class="">
+ tlsGsContext.pVerts = (simdvector*)pGsBuffers->pGsIn;<br class="">
tlsGsContext.PrimitiveID = primID;<br class="">
<br class="">
uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);<br class="">
simdvector attrib[MAX_NUM_VERTS_PER_PRIM];<br class="">
<br class="">
// assemble all attributes for the input primitive<br class="">
+ tlsGsContext.inputVertStride = pState->inputVertStride;<br class="">
for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)<br class="">
{<br class="">
+ uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot;<br class="">
uint32_t attribSlot = pState->vertexAttribOffset + slot;<br class="">
- pa.Assemble(attribSlot, attrib);<br class="">
+ pa.Assemble(srcAttribSlot, attrib);<br class="">
<br class="">
for (uint32_t i = 0; i < numVertsPerPrim; ++i)<br class="">
{<br class="">
- tlsGsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = attrib[i];<br class="">
+ tlsGsContext.pVerts[attribSlot + pState->inputVertStride * i] = attrib[i];<br class="">
}<br class="">
}<br class="">
<br class="">
@@ -805,15 +829,9 @@ static void GeometryShaderStage(<br class="">
pa.Assemble(VERTEX_POSITION_SLOT, attrib);<br class="">
for (uint32_t i = 0; i < numVertsPerPrim; ++i)<br class="">
{<br class="">
- tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];<br class="">
+ tlsGsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = attrib[i];<br class="">
}<br class="">
<br class="">
-#if USE_SIMD16_FRONTEND<br class="">
- const GsBufferInfo<simd16vertex, KNOB_SIMD16_WIDTH> bufferInfo(state.gsState);<br class="">
-#else<br class="">
- const GsBufferInfo<simdvertex, KNOB_SIMD_WIDTH> bufferInfo(state.gsState);<br class="">
-#endif<br class="">
-<br class="">
// record valid prims from the frontend to avoid over binning the newly generated<br class="">
// prims from the GS<br class="">
#if USE_SIMD16_FRONTEND<br class="">
@@ -830,8 +848,10 @@ static void GeometryShaderStage(<br class="">
// execute the geometry shader<br class="">
state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);<br class="">
<br class="">
- tlsGsContext.pStream += bufferInfo.vertexInstanceStride;<br class="">
- tlsGsContext.pCutOrStreamIdBuffer += bufferInfo.cutInstanceStride;<br class="">
+ for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)<br class="">
+ {<br class="">
+ tlsGsContext.pStreams[i] += pState->allocationSize;<br class="">
+ }<br class="">
}<br class="">
<br class="">
// set up new binner and state for the GS output topology<br class="">
@@ -865,32 +885,48 @@ static void GeometryShaderStage(<br class="">
// foreach input prim:<br class="">
// - setup a new PA based on the emitted verts for that prim<br class="">
// - loop over the new verts, calling PA to assemble each prim<br class="">
- uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;<br class="">
uint32_t* pPrimitiveId = (uint32_t*)&primID;<br class="">
<br class="">
uint32_t totalPrimsGenerated = 0;<br class="">
for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)<br class="">
{<br class="">
- uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * bufferInfo.vertexPrimitiveStride;<br class="">
- uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * bufferInfo.cutPrimitiveStride;<br class="">
+ uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];<br class="">
+<br class="">
+ // Vertex count is either emitted by shader or static<br class="">
+ uint32_t vertexCount = 0;<br class="">
+ if (pState->staticVertexCount)<br class="">
+ {<br class="">
+ vertexCount = pState->staticVertexCount;<br class="">
+ }<br class="">
+ else<br class="">
+ {<br class="">
+ // If emitted in shader, it should be the stored in the first dword of the output buffer<br class="">
+ vertexCount = *(uint32_t*)pInstanceBase;<br class="">
+ }<br class="">
<br class="">
for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)<br class="">
{<br class="">
- uint32_t numEmittedVerts = pVertexCount[inputPrim];<br class="">
+ uint32_t numEmittedVerts = vertexCount;<br class="">
if (numEmittedVerts == 0)<br class="">
{<br class="">
continue;<br class="">
}<br class="">
<br class="">
- uint8_t* pBase = pInstanceBase + instance * bufferInfo.vertexInstanceStride;<br class="">
- uint8_t* pCutBase = pCutBufferBase + instance * bufferInfo.cutInstanceStride;<br class="">
+ uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;<br class="">
+ uint8_t* pCutBase = pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;<br class="">
+ uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;<br class="">
+<br class="">
+#if USE_SIMD16_FRONTEND<br class="">
+ TransposeSOAtoAOS&lt;SIMD512, KNOB_SIMD16_WIDTH&gt;((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);<br class="">
+#else<br class="">
+ TransposeSOAtoAOS&lt;SIMD256, KNOB_SIMD_WIDTH&gt;((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);<br class="">
+#endif<br class="">
<br class="">
uint32_t numAttribs = state.feNumAttributes;<br class="">
<br class="">
for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)<br class="">
{<br class="">
bool processCutVerts = false;<br class="">
-<br class="">
uint8_t* pCutBuffer = pCutBase;<br class="">
<br class="">
// assign default stream ID, only relevant when GS is outputting a single stream<br class="">
@@ -910,16 +946,16 @@ static void GeometryShaderStage(<br class="">
}<br class="">
<br class="">
// multi-stream output, need to translate StreamID buffer to a cut buffer<br class="">
- ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);<br class="">
- pCutBuffer = (uint8_t*)pStreamCutBuffer;<br class="">
+ ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);<br class="">
+ pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer;<br class="">
processCutVerts = false;<br class="">
}<br class="">
<br class="">
#if USE_SIMD16_FRONTEND<br class="">
- PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, SWR_VTX_NUM_SLOTS, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);<br class="">
+ PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);<br class="">
<br class="">
#else<br class="">
- PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, SWR_VTX_NUM_SLOTS, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);<br class="">
+ PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);<br class="">
<br class="">
#endif<br class="">
while (gsPa.GetNextStreamOutput())<br class="">
@@ -979,42 +1015,40 @@ static void GeometryShaderStage(<br class="">
/// @param state - API state<br class="">
/// @param ppGsOut - pointer to GS output buffer allocation<br class="">
/// @param ppCutBuffer - pointer to GS output cut buffer allocation<br class="">
-template&lt;typename SIMDVERTEX, uint32_t SIMD_WIDTH&gt;<br class="">
-static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,<br class="">
- void **ppStreamCutBuffer)<br class="">
+template&lt;typename SIMD_T, uint32_t SIMD_WIDTH&gt;<br class="">
+static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, uint32_t vertsPerPrim, GsBuffers* pGsBuffers)<br class="">
{<br class="">
auto pArena = pDC->pArena;<br class="">
SWR_ASSERT(pArena != nullptr);<br class="">
SWR_ASSERT(state.gsState.gsEnable);<br class="">
<br class="">
- // allocate arena space to hold GS output verts<br class="">
- // @todo pack attribs<br class="">
- // @todo support multiple streams<br class="">
+ const SWR_GS_STATE& gsState = state.gsState;<br class="">
<br class="">
- const GsBufferInfo<SIMDVERTEX, SIMD_WIDTH> bufferInfo(state.gsState);<br class="">
+ // Allocate storage for vertex inputs<br class="">
+ uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;<br class="">
+ pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);<br class="">
<br class="">
- const uint32_t vertexBufferSize = state.gsState.instanceCount * bufferInfo.vertexInstanceStride;<br class="">
+ // Allocate arena space to hold GS output verts<br class="">
+ const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;<br class="">
<br class="">
- *ppGsOut = pArena->AllocAligned(vertexBufferSize, SIMD_WIDTH * sizeof(float));<br class="">
+ for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)<br class="">
+ {<br class="">
+ pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32);<br class="">
+ }<br class="">
<br class="">
- // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the<br class="">
- // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance<br class="">
+ // Allocate storage for transposed GS output<br class="">
+ uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;<br class="">
+ uint32_t transposedBufferSize = numSimdBatches * gsState.outputVertexSize * sizeof(typename SIMD_T::Vec4);<br class="">
+ pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);<br class="">
<br class="">
- // allocate space for temporary per-stream cut buffer if multi-stream is enabled<br class="">
+ // Allocate storage to hold temporary stream->cut buffer, if necessary<br class="">
if (state.gsState.isSingleStream)<br class="">
{<br class="">
- const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;<br class="">
-<br class="">
- *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));<br class="">
- *ppStreamCutBuffer = nullptr;<br class="">
+ pGsBuffers->pStreamCutBuffer = nullptr;<br class="">
}<br class="">
else<br class="">
{<br class="">
- const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;<br class="">
- const uint32_t streamCutBufferSize = state.gsState.instanceCount * bufferInfo.streamCutInstanceStride;<br class="">
-<br class="">
- *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));<br class="">
- *ppStreamCutBuffer = pArena->AllocAligned(streamCutBufferSize, SIMD_WIDTH * sizeof(float));<br class="">
+ pGsBuffers->pStreamCutBuffer = (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);<br class="">
}<br class="">
}<br class="">
<br class="">
@@ -1062,9 +1096,7 @@ static void TessellationStages(<br class="">
DRAW_CONTEXT *pDC,<br class="">
uint32_t workerId,<br class="">
PA_STATE& pa,<br class="">
- void* pGsOut,<br class="">
- void* pCutBuffer,<br class="">
- void* pCutStreamBuffer,<br class="">
+ GsBuffers* pGsBuffers,<br class="">
uint32_t* pSoPrimData,<br class="">
#if USE_SIMD16_FRONTEND<br class="">
uint32_t numPrims_simd8,<br class="">
@@ -1264,17 +1296,16 @@ static void TessellationStages(<br class="">
{<br class="">
#if USE_SIMD16_FRONTEND<br class="">
tessPa.useAlternateOffset = false;<br class="">
- GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_lo, primID_lo);<br class="">
+ GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);<br class="">
<br class="">
if (numPrims_hi)<br class="">
{<br class="">
tessPa.useAlternateOffset = true;<br class="">
- GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_hi, primID_hi);<br class="">
+ GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);<br class="">
}<br class="">
#else<br class="">
GeometryShaderStage<HasStreamOutT, HasRastT>(<br class="">
- pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,<br class="">
- _simd_set1_epi32(dsContext.PrimitiveID));<br class="">
+ pDC, workerId, tessPa, pGsBuffers, pSoPrimData, _simd_set1_epi32(dsContext.PrimitiveID));<br class="">
#endif<br class="">
}<br class="">
else<br class="">
@@ -1408,15 +1439,13 @@ void ProcessDraw(<br class="">
uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);<br class="">
#endif<br class="">
<br class="">
- void* pGsOut = nullptr;<br class="">
- void* pCutBuffer = nullptr;<br class="">
- void* pStreamCutBuffer = nullptr;<br class="">
+ GsBuffers gsBuffers;<br class="">
if (HasGeometryShaderT::value)<br class="">
{<br class="">
#if USE_SIMD16_FRONTEND<br class="">
- AllocateGsBuffers<simd16vertex, KNOB_SIMD16_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);<br class="">
+ AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);<br class="">
#else<br class="">
- AllocateGsBuffers<simdvertex, KNOB_SIMD_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);<br class="">
+ AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);<br class="">
#endif<br class="">
}<br class="">
<br class="">
@@ -1672,23 +1701,23 @@ void ProcessDraw(<br class="">
if (HasTessellationT::value)<br class="">
{<br class="">
pa.useAlternateOffset = false;<br class="">
- TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);<br class="">
+ TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);<br class="">
<br class="">
if (numPrims_hi)<br class="">
{<br class="">
pa.useAlternateOffset = true;<br class="">
- TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);<br class="">
+ TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);<br class="">
}<br class="">
}<br class="">
else if (HasGeometryShaderT::value)<br class="">
{<br class="">
pa.useAlternateOffset = false;<br class="">
- GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);<br class="">
+ GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);<br class="">
<br class="">
if (numPrims_hi)<br class="">
{<br class="">
pa.useAlternateOffset = true;<br class="">
- GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);<br class="">
+ GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);<br class="">
}<br class="">
}<br class="">
else<br class="">
@@ -1847,12 +1876,12 @@ void ProcessDraw(<br class="">
if (HasTessellationT::value)<br class="">
{<br class="">
TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(<br class="">
- pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));<br class="">
+ pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));<br class="">
}<br class="">
else if (HasGeometryShaderT::value)<br class="">
{<br class="">
GeometryShaderStage<HasStreamOutT, HasRastT>(<br class="">
- pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));<br class="">
+ pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));<br class="">
}<br class="">
else<br class="">
{<br class="">
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h<br class="">
index 13c1d8b..f7c9308 100644<br class="">
--- a/src/gallium/drivers/swr/rasterizer/core/state.h<br class="">
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h<br class="">
@@ -301,13 +301,12 @@ struct SWR_DS_CONTEXT<br class="">
/////////////////////////////////////////////////////////////////////////<br class="">
struct SWR_GS_CONTEXT<br class="">
{<br class="">
- simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: input primitive data for SIMD prims<br class="">
- simdscalari PrimitiveID; // IN: input primitive ID generated from the draw call<br class="">
- uint32_t InstanceID; // IN: input instance ID<br class="">
- simdscalari mask; // IN: Active mask for shader<br class="">
- uint8_t* pStream; // OUT: output stream (contains vertices for all output streams)<br class="">
- uint8_t* pCutOrStreamIdBuffer; // OUT: cut or stream id buffer<br class="">
- simdscalari vertexCount; // OUT: num vertices emitted per SIMD lane<br class="">
+ simdvector* pVerts; // IN: input primitive data for SIMD prims<br class="">
+ uint32_t inputVertStride; // IN: input vertex stride, in attributes<br class="">
+ simdscalari PrimitiveID; // IN: input primitive ID generated from the draw call<br class="">
+ uint32_t InstanceID; // IN: input instance ID<br class="">
+ simdscalari mask; // IN: Active mask for shader<br class="">
+ uint8_t* pStreams[KNOB_SIMD_WIDTH]; // OUT: output stream (contains vertices for all output streams)<br class="">
};<br class="">
<br class="">
struct PixelPositions<br class="">
@@ -714,30 +713,56 @@ struct SWR_GS_STATE<br class="">
{<br class="">
bool gsEnable;<br class="">
<br class="">
- // number of input attributes per vertex. used by the frontend to<br class="">
+ // Number of input attributes per vertex. Used by the frontend to<br class="">
// optimize assembling primitives for GS<br class="">
uint32_t numInputAttribs;<br class="">
<br class="">
- // output topology - can be point, tristrip, or linestrip<br class="">
+ // Stride of incoming verts in attributes<br class="">
+ uint32_t inputVertStride;<br class="">
+<br class="">
+ // Output topology - can be point, tristrip, or linestrip<br class="">
PRIMITIVE_TOPOLOGY outputTopology; // @llvm_enum<br class="">
<br class="">
- // maximum number of verts that can be emitted by a single instance of the GS<br class="">
+ // Maximum number of verts that can be emitted by a single instance of the GS<br class="">
uint32_t maxNumVerts;<br class="">
<br class="">
- // instance count<br class="">
+ // Instance count<br class="">
uint32_t instanceCount;<br class="">
<br class="">
- // if true, geometry shader emits a single stream, with separate cut buffer.<br class="">
- // if false, geometry shader emits vertices for multiple streams to the stream buffer, with a separate StreamID buffer<br class="">
+ // If true, geometry shader emits a single stream, with separate cut buffer.<br class="">
+ // If false, geometry shader emits vertices for multiple streams to the stream buffer, with a separate StreamID buffer<br class="">
// to map vertices to streams<br class="">
bool isSingleStream;<br class="">
<br class="">
- // when single stream is enabled, singleStreamID dictates which stream is being output.<br class="">
+ // When single stream is enabled, singleStreamID dictates which stream is being output.<br class="">
// field ignored if isSingleStream is false<br class="">
uint32_t singleStreamID;<br class="">
<br class="">
- // Offset to the start of the attributes of the input vertices, in simdvector units<br class="">
+ // Total amount of memory to allocate for one instance of the shader output in bytes<br class="">
+ uint32_t allocationSize;<br class="">
+<br class="">
+ // Offset to the start of the attributes of the input vertices, in simdvector units, as read by the GS<br class="">
uint32_t vertexAttribOffset;<br class="">
+<br class="">
+ // Offset to the attributes as stored by the preceding shader stage.<br class="">
+ uint32_t srcVertexAttribOffset;<br class="">
+<br class="">
+ // Size of the control data section which contains cut or streamID data, in simdscalar units. Should be sized to handle<br class="">
+ // the maximum number of verts output by the GS. Can be 0 if there are no cuts or streamID bits.<br class="">
+ uint32_t controlDataSize;<br class="">
+<br class="">
+ // Offset to the control data section, in bytes<br class="">
+ uint32_t controlDataOffset;<br class="">
+<br class="">
+ // Total size of an output vertex, in simdvector units<br class="">
+ uint32_t outputVertexSize;<br class="">
+<br class="">
+ // Offset to the start of the vertex section, in bytes<br class="">
+ uint32_t outputVertexOffset;<br class="">
+<br class="">
+ // Set this to non-zero to indicate that the shader outputs a static number of verts. If zero, shader is<br class="">
+ // expected to store the final vertex count in the first dword of the gs output stream.<br class="">
+ uint32_t staticVertexCount;<br class="">
};<br class="">
<br class="">
<br class="">
diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp<br class="">
index 0a81eaa..7f11e72 100644<br class="">
--- a/src/gallium/drivers/swr/swr_shader.cpp<br class="">
+++ b/src/gallium/drivers/swr/swr_shader.cpp<br class="">
@@ -347,18 +347,20 @@ BuilderSWR::swr_gs_llvm_fetch_input(const struct lp_build_tgsi_gs_iface *gs_ifac<br class="">
Value *attrib =<br class="">
LOAD(GEP(iface->pVtxAttribMap, {C(0), unwrap(attrib_index)}));<br class="">
<br class="">
- Value *pInput =<br class="">
- LOAD(GEP(iface->pGsCtx,<br class="">
- {C(0),<br class="">
- C(SWR_GS_CONTEXT_vert),<br class="">
- unwrap(vertex_index),<br class="">
- C(0),<br class="">
- attrib,<br class="">
- unwrap(swizzle_index)}));<br class="">
+ Value *pVertex = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pVerts});<br class="">
+ Value *pInputVertStride = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_inputVertStride});<br class="">
+<br class="">
+ Value *pVector = ADD(MUL(unwrap(vertex_index), pInputVertStride), attrib);<br class="">
+<br class="">
+ Value *pInput = LOAD(GEP(pVertex, {pVector, unwrap(swizzle_index)}));<br class="">
<br class="">
return wrap(pInput);<br class="">
}<br class="">
<br class="">
+// GS output stream layout<br class="">
+#define VERTEX_COUNT_SIZE 32<br class="">
+#define CONTROL_HEADER_SIZE (8*32)<br class="">
+<br class="">
void<br class="">
BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base,<br class="">
struct lp_build_tgsi_context * bld_base,<br class="">
@@ -366,41 +368,19 @@ BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base<br class="">
LLVMValueRef emitted_vertices_vec)<br class="">
{<br class="">
swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base;<br class="">
- SWR_GS_STATE *pGS = iface->pGsState;<br class="">
<br class="">
IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));<br class="">
<br class="">
-#if USE_SIMD16_FRONTEND<br class="">
- const uint32_t simdVertexStride = sizeof(simdvertex) * 2;<br class="">
- const uint32_t numSimdBatches = (pGS->maxNumVerts + (mVWidth * 2) - 1) / (mVWidth * 2);<br class="">
-#else<br class="">
- const uint32_t simdVertexStride = sizeof(simdvertex);<br class="">
- const uint32_t numSimdBatches = (pGS->maxNumVerts + mVWidth - 1) / mVWidth;<br class="">
-#endif<br class="">
- const uint32_t inputPrimStride = numSimdBatches * simdVertexStride;<br class="">
-<br class="">
- Value *pStream = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_pStream });<br class="">
- Value *vMask = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_mask });<br class="">
- Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, 8));<br class="">
+ const uint32_t headerSize = VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE;<br class="">
+ const uint32_t attribSize = 4 * sizeof(float);<br class="">
+ const uint32_t vertSize = attribSize * SWR_VTX_NUM_SLOTS;<br class="">
+ Value *pVertexOffset = MUL(unwrap(emitted_vertices_vec), VIMMED1(vertSize));<br class="">
<br class="">
- Value *vOffsets = C({<br class="">
- inputPrimStride * 0,<br class="">
- inputPrimStride * 1,<br class="">
- inputPrimStride * 2,<br class="">
- inputPrimStride * 3,<br class="">
- inputPrimStride * 4,<br class="">
- inputPrimStride * 5,<br class="">
- inputPrimStride * 6,<br class="">
- inputPrimStride * 7 } );<br class="">
+ Value *vMask = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_mask});<br class="">
+ Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, mVWidth));<br class="">
<br class="">
-#if USE_SIMD16_FRONTEND<br class="">
- const uint32_t simdShift = log2(mVWidth * 2);<br class="">
- Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), (mVWidth * 2) - 1);<br class="">
-#else<br class="">
- const uint32_t simdShift = log2(mVWidth);<br class="">
- Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), mVWidth - 1);<br class="">
-#endif<br class="">
- Value *vVertexSlot = ASHR(unwrap(emitted_vertices_vec), simdShift);<br class="">
+ Value *pStack = STACKSAVE();<br class="">
+ Value *pTmpPtr = ALLOCA(mFP32Ty, C(4)); // used for dummy write for lane masking<br class="">
<br class="">
for (uint32_t attrib = 0; attrib < iface->num_outputs; ++attrib) {<br class="">
uint32_t attribSlot = attrib;<br class="">
@@ -420,46 +400,36 @@ BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base<br class="">
}<br class="">
}<br class="">
<br class="">
-#if USE_SIMD16_FRONTEND<br class="">
- Value *vOffsetsAttrib =<br class="">
- ADD(vOffsets, MUL(vVertexSlot, VIMMED1((uint32_t)sizeof(simdvertex) * 2)));<br class="">
- vOffsetsAttrib =<br class="">
- ADD(vOffsetsAttrib, VIMMED1((uint32_t)(attribSlot*sizeof(simdvector) * 2)));<br class="">
-#else<br class="">
- Value *vOffsetsAttrib =<br class="">
- ADD(vOffsets, MUL(vVertexSlot, VIMMED1((uint32_t)sizeof(simdvertex))));<br class="">
- vOffsetsAttrib =<br class="">
- ADD(vOffsetsAttrib, VIMMED1((uint32_t)(attribSlot*sizeof(simdvector))));<br class="">
-#endif<br class="">
- vOffsetsAttrib =<br class="">
- ADD(vOffsetsAttrib, MUL(vSimdSlot, VIMMED1((uint32_t)sizeof(float))));<br class="">
+ Value *pOutputOffset = ADD(pVertexOffset, VIMMED1(headerSize + attribSize * attribSlot)); // + sgvChannel ?<br class="">
<br class="">
- for (uint32_t channel = 0; channel < 4; ++channel) {<br class="">
- Value *vPtrs = GEP(pStream, vOffsetsAttrib);<br class="">
- Value *vData;<br class="">
+ for (uint32_t lane = 0; lane < mVWidth; ++lane) {<br class="">
+ Value *pLaneOffset = VEXTRACT(pOutputOffset, C(lane));<br class="">
+ Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});<br class="">
+ Value *pStreamOffset = GEP(pStream, pLaneOffset);<br class="">
+ pStreamOffset = BITCAST(pStreamOffset, mFP32PtrTy);<br class="">
<br class="">
- if (attribSlot == VERTEX_SGV_SLOT)<br class="">
- vData = LOAD(unwrap(outputs[attrib][0]));<br class="">
- else<br class="">
- vData = LOAD(unwrap(outputs[attrib][channel]));<br class="">
+ Value *pLaneMask = VEXTRACT(vMask1, C(lane));<br class="">
+ pStreamOffset = SELECT(pLaneMask, pStreamOffset, pTmpPtr);<br class="">
<br class="">
- if (attribSlot != VERTEX_SGV_SLOT ||<br class="">
- sgvChannel == channel) {<br class="">
- vPtrs = BITCAST(vPtrs,<br class="">
- VectorType::get(PointerType::get(mFP32Ty, 0), 8));<br class="">
+ for (uint32_t channel = 0; channel < 4; ++channel) {<br class="">
+ Value *vData;<br class="">
<br class="">
- MASKED_SCATTER(vData, vPtrs, 32, vMask1);<br class="">
- }<br class="">
+ if (attribSlot == VERTEX_SGV_SLOT)<br class="">
+ vData = LOAD(unwrap(outputs[attrib][0]));<br class="">
+ else<br class="">
+ vData = LOAD(unwrap(outputs[attrib][channel]));<br class="">
<br class="">
-#if USE_SIMD16_FRONTEND<br class="">
- vOffsetsAttrib =<br class="">
- ADD(vOffsetsAttrib, VIMMED1((uint32_t)sizeof(simdscalar) * 2));<br class="">
-#else<br class="">
- vOffsetsAttrib =<br class="">
- ADD(vOffsetsAttrib, VIMMED1((uint32_t)sizeof(simdscalar)));<br class="">
-#endif<br class="">
+ if (attribSlot != VERTEX_SGV_SLOT ||<br class="">
+ sgvChannel == channel) {<br class="">
+ vData = VEXTRACT(vData, C(lane));<br class="">
+ STORE(vData, pStreamOffset);<br class="">
+ }<br class="">
+ pStreamOffset = GEP(pStreamOffset, C(1));<br class="">
+ }<br class="">
}<br class="">
}<br class="">
+<br class="">
+ STACKRESTORE(pStack);<br class="">
}<br class="">
<br class="">
void<br class="">
@@ -469,12 +439,9 @@ BuilderSWR::swr_gs_llvm_end_primitive(const struct lp_build_tgsi_gs_iface *gs_ba<br class="">
LLVMValueRef emitted_prims_vec)<br class="">
{<br class="">
swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base;<br class="">
- SWR_GS_STATE *pGS = iface->pGsState;<br class="">
<br class="">
IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));<br class="">
<br class="">
- Value *pCutBuffer =<br class="">
- LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pCutOrStreamIdBuffer});<br class="">
Value *vMask = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_mask });<br class="">
Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, 8));<br class="">
<br class="">
@@ -496,31 +463,29 @@ BuilderSWR::swr_gs_llvm_end_primitive(const struct lp_build_tgsi_gs_iface *gs_ba<br class="">
mask = AND(mask, cmpMask);<br class="">
vMask1 = TRUNC(mask, VectorType::get(mInt1Ty, 8));<br class="">
<br class="">
- const uint32_t cutPrimStride =<br class="">
- (pGS->maxNumVerts + JM()->mVWidth - 1) / JM()->mVWidth;<br class="">
- Value *vOffsets = C({<br class="">
- (uint32_t)(cutPrimStride * 0),<br class="">
- (uint32_t)(cutPrimStride * 1),<br class="">
- (uint32_t)(cutPrimStride * 2),<br class="">
- (uint32_t)(cutPrimStride * 3),<br class="">
- (uint32_t)(cutPrimStride * 4),<br class="">
- (uint32_t)(cutPrimStride * 5),<br class="">
- (uint32_t)(cutPrimStride * 6),<br class="">
- (uint32_t)(cutPrimStride * 7) } );<br class="">
-<br class="">
vCount = SUB(vCount, VIMMED1(1));<br class="">
- Value *vOffset = ADD(UDIV(vCount, VIMMED1(8)), vOffsets);<br class="">
+ Value *vOffset = ADD(UDIV(vCount, VIMMED1(8)), VIMMED1(VERTEX_COUNT_SIZE));<br class="">
Value *vValue = SHL(VIMMED1(1), UREM(vCount, VIMMED1(8)));<br class="">
<br class="">
vValue = TRUNC(vValue, VectorType::get(mInt8Ty, 8));<br class="">
<br class="">
- Value *vPtrs = GEP(pCutBuffer, vOffset);<br class="">
- vPtrs =<br class="">
- BITCAST(vPtrs, VectorType::get(PointerType::get(mInt8Ty, 0), JM()->mVWidth));<br class="">
+ Value *pStack = STACKSAVE();<br class="">
+ Value *pTmpPtr = ALLOCA(mInt8Ty, C(4)); // used for dummy read/write for lane masking<br class="">
+<br class="">
+ for (uint32_t lane = 0; lane < mVWidth; ++lane) {<br class="">
+ Value *vLaneOffset = VEXTRACT(vOffset, C(lane));<br class="">
+ Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});<br class="">
+ Value *pStreamOffset = GEP(pStream, vLaneOffset);<br class="">
+<br class="">
+ Value *pLaneMask = VEXTRACT(vMask1, C(lane));<br class="">
+ pStreamOffset = SELECT(pLaneMask, pStreamOffset, pTmpPtr);<br class="">
<br class="">
- Value *vGather = MASKED_GATHER(vPtrs, 32, vMask1);<br class="">
- vValue = OR(vGather, vValue);<br class="">
- MASKED_SCATTER(vValue, vPtrs, 32, vMask1);<br class="">
+ Value *vVal = LOAD(pStreamOffset);<br class="">
+ vVal = OR(vVal, VEXTRACT(vValue, C(lane)));<br class="">
+ STORE(vVal, pStreamOffset);<br class="">
+ }<br class="">
+<br class="">
+ STACKRESTORE(pStack);<br class="">
}<br class="">
<br class="">
void<br class="">
@@ -533,7 +498,14 @@ BuilderSWR::swr_gs_llvm_epilogue(const struct lp_build_tgsi_gs_iface *gs_base,<br class="">
<br class="">
IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));<br class="">
<br class="">
- STORE(unwrap(total_emitted_vertices_vec), iface->pGsCtx, {0, SWR_GS_CONTEXT_vertexCount});<br class="">
+ // Store emit count to each output stream in the first DWORD<br class="">
+ for (uint32_t lane = 0; lane < mVWidth; ++lane)<br class="">
+ {<br class="">
+ Value* pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});<br class="">
+ pStream = BITCAST(pStream, mInt32PtrTy);<br class="">
+ Value* pLaneCount = VEXTRACT(unwrap(total_emitted_vertices_vec), C(lane));<br class="">
+ STORE(pLaneCount, pStream);<br class="">
+ }<br class="">
}<br class="">
<br class="">
PFN_GS_FUNC<br class="">
@@ -542,6 +514,8 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key)<br class="">
SWR_GS_STATE *pGS = &ctx->gs->gsState;<br class="">
struct tgsi_shader_info *info = &ctx->gs->info.base;<br class="">
<br class="">
+ memset(pGS, 0, sizeof(*pGS));<br class="">
+<br class="">
pGS->gsEnable = true;<br class="">
<br class="">
pGS->numInputAttribs = info->num_inputs;<br class="">
@@ -555,6 +529,18 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key)<br class="">
pGS->singleStreamID = 0;<br class="">
<br class="">
pGS->vertexAttribOffset = VERTEX_ATTRIB_START_SLOT; // TODO: optimize<br class="">
+ pGS->srcVertexAttribOffset = VERTEX_ATTRIB_START_SLOT; // TODO: optimize<br class="">
+ pGS->inputVertStride = pGS->numInputAttribs + pGS->vertexAttribOffset;<br class="">
+ pGS->outputVertexSize = SWR_VTX_NUM_SLOTS;<br class="">
+ pGS->controlDataSize = 8; // GS ouputs max of 8 32B units<br class="">
+ pGS->controlDataOffset = 32;<br class="">
+ pGS->outputVertexOffset = pGS->controlDataOffset + pGS->controlDataSize * 32;<br class="">
+<br class="">
+ pGS->allocationSize =<br class="">
+ 32 + // vertex count<br class="">
+ (8 * 32) + // control header<br class="">
+ (SWR_VTX_NUM_SLOTS * 16) * // sizeof vertex<br class="">
+ pGS->maxNumVerts; // num verts<br class="">
</blockquote>
<br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<span style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px; float: none; display: inline !important;" class="">Consider
using VERTEX_COUNT_SIZE and CONTROL_HEADER_SIZE defines?</span><br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<span style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px; float: none; display: inline !important;" class=""> pGS->controlDataOffset
= VERTEX_COUNT_SIZE;</span><br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<span style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px; float: none; display: inline !important;" class=""> pGS->outputVertexOffset
= VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE;</span><br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<span style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px; float: none; display: inline !important;" class=""> pGS->allocationSize
=</span><br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<span style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px; float: none; display: inline !important;" class=""> VERTEX_COUNT_SIZE
+<span class="Apple-converted-space"> </span></span><br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<span style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px; float: none; display: inline !important;" class=""> CONTROL_HEADER_SIZE +</span><br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<span style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px; float: none; display: inline !important;" class=""> (SWR_VTX_NUM_SLOTS
* 16) * // sizeof vertex</span><br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<span style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px; float: none; display: inline !important;" class=""> pGS->maxNumVerts;
// num verts</span><br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<blockquote type="cite" style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; orphans: auto; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; widows: auto; word-spacing: 0px; -webkit-text-size-adjust: auto; -webkit-text-stroke-width: 0px;" class="">
struct swr_geometry_shader *gs = ctx->gs;<br class="">
<br class="">
@@ -635,10 +621,11 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key)<br class="">
lp_type_float_vec(32, 32 * 8), wrap(mask_val));<br class="">
<br class="">
// zero out cut buffer so we can load/modify/store bits<br class="">
- MEMSET(LOAD(pGsCtx, {0, SWR_GS_CONTEXT_pCutOrStreamIdBuffer}),<br class="">
- C((char)0),<br class="">
- pGS->instanceCount * ((pGS->maxNumVerts + 7) / 8) * JM()->mVWidth,<br class="">
- sizeof(float) * KNOB_SIMD_WIDTH);<br class="">
+ for (uint32_t lane = 0; lane < mVWidth; ++lane)<br class="">
+ {<br class="">
+ Value* pStream = LOAD(pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});<br class="">
+ MEMSET(pStream, C((char)0), VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE, sizeof(float) * KNOB_SIMD_WIDTH);<br class="">
+ }<br class="">
<br class="">
struct swr_gs_llvm_iface gs_iface;<br class="">
gs_iface.base.fetch_input = ::swr_gs_llvm_fetch_input;<br class="">
--<span class="Apple-converted-space"> </span><br class="">
2.7.4<br class="">
<br class="">
_______________________________________________<br class="">
mesa-dev mailing list<br class="">
<a href="mailto:mesa-dev@lists.freedesktop.org" class="">mesa-dev@lists.freedesktop.org</a><br class="">
<a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" class="">https://lists.freedesktop.org/mailman/listinfo/mesa-dev</a></blockquote>
</div>
</blockquote>
</div>
<br class="">
</div>
</body>
</html>