<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=us-ascii">
</head>
<body style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;" class="">
On Sep 25, 2017, at 11:31 AM, Rowley, Timothy O &lt;<a href="mailto:timothy.o.rowley@intel.com" class="">timothy.o.rowley@intel.com</a>&gt; wrote:<br class="">
<div>
<blockquote type="cite" class=""><br class="Apple-interchange-newline">
<div class="">
<div style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;" class="">
Ok, made the following changes - want a full v2 commit, or ok to do this on push?</div>
</div>
</blockquote>
<div><br class="">
</div>
I'm fine with doing it on push and don't need a full v2.  It simply replaces a couple of magic numbers with their defines -- no functional change.</div>
<div>I'll mark the entire set as Reviewed-by in patch 0/9.<br class="">
<br class="">
<blockquote type="cite" class="">
<div class="">
<div style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;" class="">
<div class=""></div>
<div class="">
<div class="">--- a/src/gallium/drivers/swr/swr_shader.cpp</div>
<div class="">+++ b/src/gallium/drivers/swr/swr_shader.cpp</div>
<div class="">@@ -533,12 +533,12 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key)</div>
<div class="">    pGS->inputVertStride = pGS->numInputAttribs + pGS->vertexAttribOffset;</div>
<div class="">    pGS->outputVertexSize = SWR_VTX_NUM_SLOTS;</div>
<div class="">    pGS->controlDataSize = 8; // GS ouputs max of 8 32B units</div>
<div class="">-   pGS->controlDataOffset = 32;</div>
<div class="">-   pGS->outputVertexOffset = pGS->controlDataOffset + pGS->controlDataSize * 32;</div>
<div class="">+   pGS->controlDataOffset = VERTEX_COUNT_SIZE;</div>
<div class="">+   pGS->outputVertexOffset = pGS->controlDataOffset + CONTROL_HEADER_SIZE;</div>
<div class=""> </div>
<div class="">    pGS->allocationSize =</div>
<div class="">-      32 + // vertex count</div>
<div class="">-      (8 * 32) + // control header</div>
<div class="">+      VERTEX_COUNT_SIZE + // vertex count</div>
<div class="">+      CONTROL_HEADER_SIZE + // control header</div>
<div class="">       (SWR_VTX_NUM_SLOTS * 16) * // sizeof vertex</div>
<div class="">       pGS->maxNumVerts; // num verts</div>
</div>
<div class=""><br class="">
</div>
<div class=""><br class="">
<div class="">
<blockquote type="cite" class="">
<div class="">On Sep 23, 2017, at 9:51 PM, Cherniak, Bruce &lt;<a href="mailto:bruce.cherniak@intel.com" class="">bruce.cherniak@intel.com</a>&gt; wrote:</div>
<br class="Apple-interchange-newline">
<div class="">
<blockquote type="cite" style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; orphans: auto; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; widows: auto; word-spacing: 0px; -webkit-text-size-adjust: auto; -webkit-text-stroke-width: 0px;" class="">
<br class="Apple-interchange-newline">
On Sep 21, 2017, at 7:46 PM, Tim Rowley &lt;<a href="mailto:timothy.o.rowley@intel.com" class="">timothy.o.rowley@intel.com</a>&gt; wrote:<br class="">
<br class="">
One piglit regression, which was a false pass:<br class="">
spec@glsl-1.50@execution@geometry@dynamic_input_array_index<br class="">
---<br class="">
.../drivers/swr/rasterizer/core/frontend.cpp       | 227 ++++++++++++---------<br class="">
src/gallium/drivers/swr/rasterizer/core/state.h    |  55 +++--<br class="">
src/gallium/drivers/swr/swr_shader.cpp             | 183 ++++++++---------<br class="">
3 files changed, 253 insertions(+), 212 deletions(-)<br class="">
<br class="">
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp<br class="">
index f882869..26e76a9 100644<br class="">
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp<br class="">
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp<br class="">
@@ -710,45 +710,67 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num<br class="">
<br class="">
THREAD SWR_GS_CONTEXT tlsGsContext;<br class="">
<br class="">
-template&lt;typename SIMDVERTEX, uint32_t SIMD_WIDTH&gt;<br class="">
-struct GsBufferInfo<br class="">
+// Buffers that are allocated if GS is enabled<br class="">
+struct GsBuffers<br class="">
{<br class="">
-    GsBufferInfo(const SWR_GS_STATE &gsState)<br class="">
-    {<br class="">
-        const uint32_t vertexCount = gsState.maxNumVerts;<br class="">
-        const uint32_t vertexStride = sizeof(SIMDVERTEX);<br class="">
-        const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / SIMD_WIDTH;<br class="">
+    uint8_t* pGsIn;<br class="">
+    uint8_t* pGsOut[KNOB_SIMD_WIDTH];<br class="">
+    uint8_t* pGsTransposed;<br class="">
+    void* pStreamCutBuffer;<br class="">
+};<br class="">
<br class="">
-        vertexPrimitiveStride = vertexStride * numSimdBatches;<br class="">
-        vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH;<br class="">
+//////////////////////////////////////////////////////////////////////////<br class="">
+/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler<br class="">
+/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive assembler<br class="">
+/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader<br class="">
+/// @param numVerts - Number of vertices outputted by the GS<br class="">
+/// @param numAttribs - Number of attributes per vertex<br class="">
+template&lt;typename SIMD_T, uint32_t SimdWidth&gt;<br class="">
+void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)<br class="">
+{<br class="">
+    uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;<br class="">
+    uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 4;<br class="">
<br class="">
-        if (gsState.isSingleStream)<br class="">
-        {<br class="">
-            cutPrimitiveStride = (vertexCount + 7) / 8;<br class="">
-            cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;<br class="">
+    OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];<br class="">
<br class="">
-            streamCutPrimitiveStride = 0;<br class="">
-            streamCutInstanceStride = 0;<br class="">
-        }<br class="">
-        else<br class="">
-        {<br class="">
-            cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4);<br class="">
-            cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;<br class="">
-<br class="">
-            streamCutPrimitiveStride = (vertexCount + 7) / 8;<br class="">
-            streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH;<br class="">
-        }<br class="">
+    for (uint32_t i = 0; i < SimdWidth; ++i)<br class="">
+    {<br class="">
+        gatherOffsets[i] = srcVertexStride * i;<br class="">
   }<br class="">
+    auto vGatherOffsets = SIMD_T::load_si((typename SIMD_T::Integer*)&gatherOffsets[0]);<br class="">
<br class="">
-    uint32_t vertexPrimitiveStride;<br class="">
-    uint32_t vertexInstanceStride;<br class="">
+    uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;<br class="">
+    uint32_t remainingVerts = numVerts;<br class="">
<br class="">
-    uint32_t cutPrimitiveStride;<br class="">
-    uint32_t cutInstanceStride;<br class="">
+    for (uint32_t s = 0; s < numSimd; ++s)<br class="">
+    {<br class="">
+        uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;<br class="">
+        uint8_t* pDstBase = pDst + s * dstVertexStride;<br class="">
<br class="">
-    uint32_t streamCutPrimitiveStride;<br class="">
-    uint32_t streamCutInstanceStride;<br class="">
-};<br class="">
+        // Compute mask to prevent src overflow<br class="">
+        uint32_t mask = std::min(remainingVerts, SimdWidth);<br class="">
+        mask = GenMask(mask);<br class="">
+        auto vMask = SIMD_T::vmask_ps(mask);<br class="">
+        auto viMask = SIMD_T::castps_si(vMask);<br class="">
+<br class="">
+        for (uint32_t a = 0; a < numAttribs; ++a)<br class="">
+        {<br class="">
+            auto attribGatherX = SIMD_T::template mask_i32gather_ps&lt;typename SIMD_T::ScaleFactor(1)&gt;(SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);<br class="">
+            auto attribGatherY = SIMD_T::template mask_i32gather_ps&lt;typename SIMD_T::ScaleFactor(1)&gt;(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float)), vGatherOffsets, vMask);<br class="">
+            auto attribGatherZ = SIMD_T::template mask_i32gather_ps&lt;typename SIMD_T::ScaleFactor(1)&gt;(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 2), vGatherOffsets, vMask);<br class="">
+            auto attribGatherW = SIMD_T::template mask_i32gather_ps&lt;typename SIMD_T::ScaleFactor(1)&gt;(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 3), vGatherOffsets, vMask);<br class="">
+<br class="">
+            SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);<br class="">
+            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float)), viMask, attribGatherY);<br class="">
+            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 2), viMask, attribGatherZ);<br class="">
+            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 3), viMask, attribGatherW);<br class="">
+<br class="">
+            pSrcBase += sizeof(float) * 4;<br class="">
+            pDstBase += sizeof(typename SIMD_T::Float) * 4;<br class="">
+        }<br class="">
+        remainingVerts -= SimdWidth;<br class="">
+    }<br class="">
+}<br class="">
<br class="">
//////////////////////////////////////////////////////////////////////////<br class="">
/// @brief Implements GS stage.<br class="">
@@ -763,9 +785,7 @@ static void GeometryShaderStage(<br class="">
   DRAW_CONTEXT *pDC,<br class="">
   uint32_t workerId,<br class="">
   PA_STATE& pa,<br class="">
-    void* pGsOut,<br class="">
-    void* pCutBuffer,<br class="">
-    void* pStreamCutBuffer,<br class="">
+    GsBuffers* pGsBuffers,<br class="">
   uint32_t* pSoPrimData,<br class="">
#if USE_SIMD16_FRONTEND<br class="">
   uint32_t numPrims_simd8,<br class="">
@@ -779,25 +799,29 @@ static void GeometryShaderStage(<br class="">
   const API_STATE& state = GetApiState(pDC);<br class="">
   const SWR_GS_STATE* pState = &state.gsState;<br class="">
<br class="">
-    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");<br class="">
-    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");<br class="">
+    static uint8_t sNullBuffer[1024] = { 0 };<br class="">
<br class="">
-    tlsGsContext.pStream = (uint8_t*)pGsOut;<br class="">
-    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;<br class="">
+    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)<br class="">
+    {<br class="">
+        tlsGsContext.pStreams[i] = pGsBuffers->pGsOut[i];<br class="">
+    }<br class="">
+    tlsGsContext.pVerts = (simdvector*)pGsBuffers->pGsIn;<br class="">
   tlsGsContext.PrimitiveID = primID;<br class="">
<br class="">
   uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);<br class="">
   simdvector attrib[MAX_NUM_VERTS_PER_PRIM];<br class="">
<br class="">
   // assemble all attributes for the input primitive<br class="">
+    tlsGsContext.inputVertStride = pState->inputVertStride;<br class="">
   for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)<br class="">
   {<br class="">
+        uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot;<br class="">
       uint32_t attribSlot = pState->vertexAttribOffset + slot;<br class="">
-        pa.Assemble(attribSlot, attrib);<br class="">
+        pa.Assemble(srcAttribSlot, attrib);<br class="">
<br class="">
       for (uint32_t i = 0; i < numVertsPerPrim; ++i)<br class="">
       {<br class="">
-            tlsGsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = attrib[i];<br class="">
+            tlsGsContext.pVerts[attribSlot + pState->inputVertStride * i] = attrib[i];<br class="">
       }<br class="">
   }<br class="">
<br class="">
@@ -805,15 +829,9 @@ static void GeometryShaderStage(<br class="">
   pa.Assemble(VERTEX_POSITION_SLOT, attrib);<br class="">
   for (uint32_t i = 0; i < numVertsPerPrim; ++i)<br class="">
   {<br class="">
-        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];<br class="">
+        tlsGsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = attrib[i];<br class="">
   }<br class="">
<br class="">
-#if USE_SIMD16_FRONTEND<br class="">
-    const GsBufferInfo&lt;simd16vertex, KNOB_SIMD16_WIDTH&gt; bufferInfo(state.gsState);<br class="">
-#else<br class="">
-    const GsBufferInfo&lt;simdvertex, KNOB_SIMD_WIDTH&gt; bufferInfo(state.gsState);<br class="">
-#endif<br class="">
-<br class="">
   // record valid prims from the frontend to avoid over binning the newly generated<br class="">
   // prims from the GS<br class="">
#if USE_SIMD16_FRONTEND<br class="">
@@ -830,8 +848,10 @@ static void GeometryShaderStage(<br class="">
       // execute the geometry shader<br class="">
       state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);<br class="">
<br class="">
-        tlsGsContext.pStream += bufferInfo.vertexInstanceStride;<br class="">
-        tlsGsContext.pCutOrStreamIdBuffer += bufferInfo.cutInstanceStride;<br class="">
+        for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)<br class="">
+        {<br class="">
+            tlsGsContext.pStreams[i] += pState->allocationSize;<br class="">
+        }<br class="">
   }<br class="">
<br class="">
   // set up new binner and state for the GS output topology<br class="">
@@ -865,32 +885,48 @@ static void GeometryShaderStage(<br class="">
   // foreach input prim:<br class="">
   // - setup a new PA based on the emitted verts for that prim<br class="">
   // - loop over the new verts, calling PA to assemble each prim<br class="">
-    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;<br class="">
   uint32_t* pPrimitiveId = (uint32_t*)&primID;<br class="">
<br class="">
   uint32_t totalPrimsGenerated = 0;<br class="">
   for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)<br class="">
   {<br class="">
-        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * bufferInfo.vertexPrimitiveStride;<br class="">
-        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * bufferInfo.cutPrimitiveStride;<br class="">
+        uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];<br class="">
+<br class="">
+        // Vertex count is either emitted by shader or static<br class="">
+        uint32_t vertexCount = 0;<br class="">
+        if (pState->staticVertexCount)<br class="">
+        {<br class="">
+            vertexCount = pState->staticVertexCount;<br class="">
+        }<br class="">
+        else<br class="">
+        {<br class="">
+            // If emitted in shader, it should be the stored in the first dword of the output buffer<br class="">
+            vertexCount = *(uint32_t*)pInstanceBase;<br class="">
+        }<br class="">
<br class="">
       for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)<br class="">
       {<br class="">
-            uint32_t numEmittedVerts = pVertexCount[inputPrim];<br class="">
+            uint32_t numEmittedVerts = vertexCount;<br class="">
           if (numEmittedVerts == 0)<br class="">
           {<br class="">
               continue;<br class="">
           }<br class="">
<br class="">
-            uint8_t* pBase = pInstanceBase + instance * bufferInfo.vertexInstanceStride;<br class="">
-            uint8_t* pCutBase = pCutBufferBase + instance * bufferInfo.cutInstanceStride;<br class="">
+            uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;<br class="">
+            uint8_t* pCutBase = pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;<br class="">
+            uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;<br class="">
+<br class="">
+#if USE_SIMD16_FRONTEND<br class="">
+            TransposeSOAtoAOS&lt;SIMD512, KNOB_SIMD16_WIDTH&gt;((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);<br class="">
+#else<br class="">
+            TransposeSOAtoAOS&lt;SIMD256, KNOB_SIMD_WIDTH&gt;((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);<br class="">
+#endif<br class="">
<br class="">
           uint32_t numAttribs = state.feNumAttributes;<br class="">
<br class="">
           for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)<br class="">
           {<br class="">
               bool processCutVerts = false;<br class="">
-<br class="">
               uint8_t* pCutBuffer = pCutBase;<br class="">
<br class="">
               // assign default stream ID, only relevant when GS is outputting a single stream<br class="">
@@ -910,16 +946,16 @@ static void GeometryShaderStage(<br class="">
                   }<br class="">
<br class="">
                   // multi-stream output, need to translate StreamID buffer to a cut buffer<br class="">
-                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);<br class="">
-                    pCutBuffer = (uint8_t*)pStreamCutBuffer;<br class="">
+                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);<br class="">
+                    pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer;<br class="">
                   processCutVerts = false;<br class="">
               }<br class="">
<br class="">
#if USE_SIMD16_FRONTEND<br class="">
-                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, SWR_VTX_NUM_SLOTS, reinterpret_cast&lt;simd16mask *&gt;(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);<br class="">
+                PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, reinterpret_cast&lt;simd16mask *&gt;(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);<br class="">
<br class="">
#else<br class="">
-                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, SWR_VTX_NUM_SLOTS, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);<br class="">
+                PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);<br class="">
<br class="">
#endif<br class="">
               while (gsPa.GetNextStreamOutput())<br class="">
@@ -979,42 +1015,40 @@ static void GeometryShaderStage(<br class="">
/// @param state - API state<br class="">
/// @param ppGsOut - pointer to GS output buffer allocation<br class="">
/// @param ppCutBuffer - pointer to GS output cut buffer allocation<br class="">
-template&lt;typename SIMDVERTEX, uint32_t SIMD_WIDTH&gt;<br class="">
-static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,<br class="">
-    void **ppStreamCutBuffer)<br class="">
+template&lt;typename SIMD_T, uint32_t SIMD_WIDTH&gt;<br class="">
+static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, uint32_t vertsPerPrim, GsBuffers* pGsBuffers)<br class="">
{<br class="">
   auto pArena = pDC->pArena;<br class="">
   SWR_ASSERT(pArena != nullptr);<br class="">
   SWR_ASSERT(state.gsState.gsEnable);<br class="">
<br class="">
-    // allocate arena space to hold GS output verts<br class="">
-    // @todo pack attribs<br class="">
-    // @todo support multiple streams<br class="">
+    const SWR_GS_STATE& gsState = state.gsState;<br class="">
<br class="">
-    const GsBufferInfo&lt;SIMDVERTEX, SIMD_WIDTH&gt; bufferInfo(state.gsState);<br class="">
+    // Allocate storage for vertex inputs<br class="">
+    uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;<br class="">
+    pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);<br class="">
<br class="">
-    const uint32_t vertexBufferSize = state.gsState.instanceCount * bufferInfo.vertexInstanceStride;<br class="">
+    // Allocate arena space to hold GS output verts<br class="">
+    const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;<br class="">
<br class="">
-    *ppGsOut = pArena->AllocAligned(vertexBufferSize, SIMD_WIDTH * sizeof(float));<br class="">
+    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)<br class="">
+    {<br class="">
+        pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32);<br class="">
+    }<br class="">
<br class="">
-    // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the<br class="">
-    // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance<br class="">
+    // Allocate storage for transposed GS output<br class="">
+    uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;<br class="">
+    uint32_t transposedBufferSize = numSimdBatches * gsState.outputVertexSize * sizeof(typename SIMD_T::Vec4);<br class="">
+    pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);<br class="">
<br class="">
-    // allocate space for temporary per-stream cut buffer if multi-stream is enabled<br class="">
+    // Allocate storage to hold temporary stream->cut buffer, if necessary<br class="">
   if (state.gsState.isSingleStream)<br class="">
   {<br class="">
-        const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;<br class="">
-<br class="">
-        *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));<br class="">
-        *ppStreamCutBuffer = nullptr;<br class="">
+        pGsBuffers->pStreamCutBuffer = nullptr;<br class="">
   }<br class="">
   else<br class="">
   {<br class="">
-        const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;<br class="">
-        const uint32_t streamCutBufferSize = state.gsState.instanceCount * bufferInfo.streamCutInstanceStride;<br class="">
-<br class="">
-        *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));<br class="">
-        *ppStreamCutBuffer = pArena->AllocAligned(streamCutBufferSize, SIMD_WIDTH * sizeof(float));<br class="">
+        pGsBuffers->pStreamCutBuffer = (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);<br class="">
   }<br class="">
}<br class="">
<br class="">
@@ -1062,9 +1096,7 @@ static void TessellationStages(<br class="">
   DRAW_CONTEXT *pDC,<br class="">
   uint32_t workerId,<br class="">
   PA_STATE& pa,<br class="">
-    void* pGsOut,<br class="">
-    void* pCutBuffer,<br class="">
-    void* pCutStreamBuffer,<br class="">
+    GsBuffers* pGsBuffers,<br class="">
   uint32_t* pSoPrimData,<br class="">
#if USE_SIMD16_FRONTEND<br class="">
   uint32_t numPrims_simd8,<br class="">
@@ -1264,17 +1296,16 @@ static void TessellationStages(<br class="">
           {<br class="">
#if USE_SIMD16_FRONTEND<br class="">
               tessPa.useAlternateOffset = false;<br class="">
-                GeometryShaderStage&lt;HasStreamOutT, HasRastT&gt;(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_lo, primID_lo);<br class="">
+                GeometryShaderStage&lt;HasStreamOutT, HasRastT&gt;(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);<br class="">
<br class="">
               if (numPrims_hi)<br class="">
               {<br class="">
                   tessPa.useAlternateOffset = true;<br class="">
-                    GeometryShaderStage&lt;HasStreamOutT, HasRastT&gt;(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_hi, primID_hi);<br class="">
+                    GeometryShaderStage&lt;HasStreamOutT, HasRastT&gt;(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);<br class="">
               }<br class="">
#else<br class="">
               GeometryShaderStage&lt;HasStreamOutT, HasRastT&gt;(<br class="">
-                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,<br class="">
-                    _simd_set1_epi32(dsContext.PrimitiveID));<br class="">
+                    pDC, workerId, tessPa, pGsBuffers, pSoPrimData, _simd_set1_epi32(dsContext.PrimitiveID));<br class="">
#endif<br class="">
           }<br class="">
           else<br class="">
@@ -1408,15 +1439,13 @@ void ProcessDraw(<br class="">
   uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);<br class="">
#endif<br class="">
<br class="">
-    void* pGsOut = nullptr;<br class="">
-    void* pCutBuffer = nullptr;<br class="">
-    void* pStreamCutBuffer = nullptr;<br class="">
+    GsBuffers gsBuffers;<br class="">
   if (HasGeometryShaderT::value)<br class="">
   {<br class="">
#if USE_SIMD16_FRONTEND<br class="">
-        AllocateGsBuffers&lt;simd16vertex, KNOB_SIMD16_WIDTH&gt;(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);<br class="">
+        AllocateGsBuffers&lt;SIMD512, KNOB_SIMD16_WIDTH&gt;(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);<br class="">
#else<br class="">
-        AllocateGsBuffers&lt;simdvertex, KNOB_SIMD_WIDTH&gt;(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);<br class="">
+        AllocateGsBuffers&lt;SIMD256, KNOB_SIMD_WIDTH&gt;(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);<br class="">
#endif<br class="">
   }<br class="">
<br class="">
@@ -1672,23 +1701,23 @@ void ProcessDraw(<br class="">
                           if (HasTessellationT::value)<br class="">
                           {<br class="">
                               pa.useAlternateOffset = false;<br class="">
-                                TessellationStages&lt;HasGeometryShaderT, HasStreamOutT, HasRastT&gt;(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);<br class="">
+                                TessellationStages&lt;HasGeometryShaderT, HasStreamOutT, HasRastT&gt;(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);<br class="">
<br class="">
                               if (numPrims_hi)<br class="">
                               {<br class="">
                                   pa.useAlternateOffset = true;<br class="">
-                                    TessellationStages&lt;HasGeometryShaderT, HasStreamOutT, HasRastT&gt;(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);<br class="">
+                                    TessellationStages&lt;HasGeometryShaderT, HasStreamOutT, HasRastT&gt;(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);<br class="">
                               }<br class="">
                           }<br class="">
                           else if (HasGeometryShaderT::value)<br class="">
                           {<br class="">
                               pa.useAlternateOffset = false;<br class="">
-                                GeometryShaderStage&lt;HasStreamOutT, HasRastT&gt;(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);<br class="">
+                                GeometryShaderStage&lt;HasStreamOutT, HasRastT&gt;(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);<br class="">
<br class="">
                               if (numPrims_hi)<br class="">
                               {<br class="">
                                   pa.useAlternateOffset = true;<br class="">
-                                    GeometryShaderStage&lt;HasStreamOutT, HasRastT&gt;(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);<br class="">
+                                    GeometryShaderStage&lt;HasStreamOutT, HasRastT&gt;(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);<br class="">
                               }<br class="">
                           }<br class="">
                           else<br class="">
@@ -1847,12 +1876,12 @@ void ProcessDraw(<br class="">
                           if (HasTessellationT::value)<br class="">
                           {<br class="">
                               TessellationStages&lt;HasGeometryShaderT, HasStreamOutT, HasRastT&gt;(<br class="">
-                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));<br class="">
+                                    pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));<br class="">
                           }<br class="">
                           else if (HasGeometryShaderT::value)<br class="">
                           {<br class="">
                               GeometryShaderStage&lt;HasStreamOutT, HasRastT&gt;(<br class="">
-                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));<br class="">
+                                    pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));<br class="">
                           }<br class="">
                           else<br class="">
                           {<br class="">
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h<br class="">
index 13c1d8b..f7c9308 100644<br class="">
--- a/src/gallium/drivers/swr/rasterizer/core/state.h<br class="">
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h<br class="">
@@ -301,13 +301,12 @@ struct SWR_DS_CONTEXT<br class="">
/////////////////////////////////////////////////////////////////////////<br class="">
struct SWR_GS_CONTEXT<br class="">
{<br class="">
-    simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: input primitive data for SIMD prims<br class="">
-    simdscalari PrimitiveID;        // IN: input primitive ID generated from the draw call<br class="">
-    uint32_t InstanceID;            // IN: input instance ID<br class="">
-    simdscalari mask;               // IN: Active mask for shader<br class="">
-    uint8_t* pStream;               // OUT: output stream (contains vertices for all output streams)<br class="">
-    uint8_t* pCutOrStreamIdBuffer;  // OUT: cut or stream id buffer<br class="">
-    simdscalari vertexCount;        // OUT: num vertices emitted per SIMD lane<br class="">
+    simdvector* pVerts;                 // IN: input primitive data for SIMD prims<br class="">
+    uint32_t inputVertStride;           // IN: input vertex stride, in attributes<br class="">
+    simdscalari PrimitiveID;            // IN: input primitive ID generated from the draw call<br class="">
+    uint32_t InstanceID;                // IN: input instance ID<br class="">
+    simdscalari mask;                   // IN: Active mask for shader<br class="">
+    uint8_t* pStreams[KNOB_SIMD_WIDTH]; // OUT: output stream (contains vertices for all output streams)<br class="">
};<br class="">
<br class="">
struct PixelPositions<br class="">
@@ -714,30 +713,56 @@ struct SWR_GS_STATE<br class="">
{<br class="">
   bool gsEnable;<br class="">
<br class="">
-    // number of input attributes per vertex. used by the frontend to<br class="">
+    // Number of input attributes per vertex. Used by the frontend to<br class="">
   // optimize assembling primitives for GS<br class="">
   uint32_t numInputAttribs;<br class="">
<br class="">
-    // output topology - can be point, tristrip, or linestrip<br class="">
+    // Stride of incoming verts in attributes<br class="">
+    uint32_t inputVertStride;<br class="">
+<br class="">
+    // Output topology - can be point, tristrip, or linestrip<br class="">
   PRIMITIVE_TOPOLOGY outputTopology;      // @llvm_enum<br class="">
<br class="">
-    // maximum number of verts that can be emitted by a single instance of the GS<br class="">
+    // Maximum number of verts that can be emitted by a single instance of the GS<br class="">
   uint32_t maxNumVerts;<br class="">
<br class="">
-    // instance count<br class="">
+    // Instance count<br class="">
   uint32_t instanceCount;<br class="">
<br class="">
-    // if true, geometry shader emits a single stream, with separate cut buffer.<br class="">
-    // if false, geometry shader emits vertices for multiple streams to the stream buffer, with a separate StreamID buffer<br class="">
+    // If true, geometry shader emits a single stream, with separate cut buffer.<br class="">
+    // If false, geometry shader emits vertices for multiple streams to the stream buffer, with a separate StreamID buffer<br class="">
   // to map vertices to streams<br class="">
   bool isSingleStream;<br class="">
<br class="">
-    // when single stream is enabled, singleStreamID dictates which stream is being output.<br class="">
+    // When single stream is enabled, singleStreamID dictates which stream is being output.<br class="">
   // field ignored if isSingleStream is false<br class="">
   uint32_t singleStreamID;<br class="">
<br class="">
-    // Offset to the start of the attributes of the input vertices, in simdvector units<br class="">
+    // Total amount of memory to allocate for one instance of the shader output in bytes<br class="">
+    uint32_t allocationSize;<br class="">
+<br class="">
+    // Offset to the start of the attributes of the input vertices, in simdvector units, as read by the GS<br class="">
   uint32_t vertexAttribOffset;<br class="">
+<br class="">
+    // Offset to the attributes as stored by the preceding shader stage.<br class="">
+    uint32_t srcVertexAttribOffset;<br class="">
+<br class="">
+    // Size of the control data section which contains cut or streamID data, in simdscalar units. Should be sized to handle<br class="">
+    // the maximum number of verts output by the GS. Can be 0 if there are no cuts or streamID bits.<br class="">
+    uint32_t controlDataSize;<br class="">
+<br class="">
+    // Offset to the control data section, in bytes<br class="">
+    uint32_t controlDataOffset;<br class="">
+<br class="">
+    // Total size of an output vertex, in simdvector units<br class="">
+    uint32_t outputVertexSize;<br class="">
+<br class="">
+    // Offset to the start of the vertex section, in bytes<br class="">
+    uint32_t outputVertexOffset;<br class="">
+<br class="">
+    // Set this to non-zero to indicate that the shader outputs a static number of verts. If zero, shader is<br class="">
+    // expected to store the final vertex count in the first dword of the gs output stream.<br class="">
+    uint32_t staticVertexCount;<br class="">
};<br class="">
<br class="">
<br class="">
diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp<br class="">
index 0a81eaa..7f11e72 100644<br class="">
--- a/src/gallium/drivers/swr/swr_shader.cpp<br class="">
+++ b/src/gallium/drivers/swr/swr_shader.cpp<br class="">
@@ -347,18 +347,20 @@ BuilderSWR::swr_gs_llvm_fetch_input(const struct lp_build_tgsi_gs_iface *gs_ifac<br class="">
   Value *attrib =<br class="">
      LOAD(GEP(iface->pVtxAttribMap, {C(0), unwrap(attrib_index)}));<br class="">
<br class="">
-    Value *pInput =<br class="">
-       LOAD(GEP(iface->pGsCtx,<br class="">
-                {C(0),<br class="">
-                 C(SWR_GS_CONTEXT_vert),<br class="">
-                 unwrap(vertex_index),<br class="">
-                 C(0),<br class="">
-                 attrib,<br class="">
-                 unwrap(swizzle_index)}));<br class="">
+    Value *pVertex = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pVerts});<br class="">
+    Value *pInputVertStride = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_inputVertStride});<br class="">
+<br class="">
+    Value *pVector = ADD(MUL(unwrap(vertex_index), pInputVertStride), attrib);<br class="">
+<br class="">
+    Value *pInput = LOAD(GEP(pVertex, {pVector, unwrap(swizzle_index)}));<br class="">
<br class="">
   return wrap(pInput);<br class="">
}<br class="">
<br class="">
+// GS output stream layout<br class="">
+#define VERTEX_COUNT_SIZE 32<br class="">
+#define CONTROL_HEADER_SIZE (8*32)<br class="">
+<br class="">
void<br class="">
BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base,<br class="">
                          struct lp_build_tgsi_context * bld_base,<br class="">
@@ -366,41 +368,19 @@ BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base<br class="">
                          LLVMValueRef emitted_vertices_vec)<br class="">
{<br class="">
   swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base;<br class="">
-    SWR_GS_STATE *pGS = iface->pGsState;<br class="">
<br class="">
   IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));<br class="">
<br class="">
-#if USE_SIMD16_FRONTEND<br class="">
-    const uint32_t simdVertexStride = sizeof(simdvertex) * 2;<br class="">
-    const uint32_t numSimdBatches = (pGS->maxNumVerts + (mVWidth * 2) - 1) / (mVWidth * 2);<br class="">
-#else<br class="">
-    const uint32_t simdVertexStride = sizeof(simdvertex);<br class="">
-    const uint32_t numSimdBatches = (pGS->maxNumVerts + mVWidth - 1) / mVWidth;<br class="">
-#endif<br class="">
-    const uint32_t inputPrimStride = numSimdBatches * simdVertexStride;<br class="">
-<br class="">
-    Value *pStream = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_pStream });<br class="">
-    Value *vMask = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_mask });<br class="">
-    Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, 8));<br class="">
+    const uint32_t headerSize = VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE;<br class="">
+    const uint32_t attribSize = 4 * sizeof(float);<br class="">
+    const uint32_t vertSize = attribSize * SWR_VTX_NUM_SLOTS;<br class="">
+    Value *pVertexOffset = MUL(unwrap(emitted_vertices_vec), VIMMED1(vertSize));<br class="">
<br class="">
-    Value *vOffsets = C({<br class="">
-          inputPrimStride * 0,<br class="">
-          inputPrimStride * 1,<br class="">
-          inputPrimStride * 2,<br class="">
-          inputPrimStride * 3,<br class="">
-          inputPrimStride * 4,<br class="">
-          inputPrimStride * 5,<br class="">
-          inputPrimStride * 6,<br class="">
-          inputPrimStride * 7 } );<br class="">
+    Value *vMask = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_mask});<br class="">
+    Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, mVWidth));<br class="">
<br class="">
-#if USE_SIMD16_FRONTEND<br class="">
-    const uint32_t simdShift = log2(mVWidth * 2);<br class="">
-    Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), (mVWidth * 2) - 1);<br class="">
-#else<br class="">
-    const uint32_t simdShift = log2(mVWidth);<br class="">
-    Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), mVWidth - 1);<br class="">
-#endif<br class="">
-    Value *vVertexSlot = ASHR(unwrap(emitted_vertices_vec), simdShift);<br class="">
+    Value *pStack = STACKSAVE();<br class="">
+    Value *pTmpPtr = ALLOCA(mFP32Ty, C(4)); // used for dummy write for lane masking<br class="">
<br class="">
   for (uint32_t attrib = 0; attrib < iface->num_outputs; ++attrib) {<br class="">
      uint32_t attribSlot = attrib;<br class="">
@@ -420,46 +400,36 @@ BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base<br class="">
         }<br class="">
      }<br class="">
<br class="">
-#if USE_SIMD16_FRONTEND<br class="">
-       Value *vOffsetsAttrib =<br class="">
-          ADD(vOffsets, MUL(vVertexSlot, VIMMED1((uint32_t)sizeof(simdvertex) * 2)));<br class="">
-       vOffsetsAttrib =<br class="">
-          ADD(vOffsetsAttrib, VIMMED1((uint32_t)(attribSlot*sizeof(simdvector) * 2)));<br class="">
-#else<br class="">
-       Value *vOffsetsAttrib =<br class="">
-          ADD(vOffsets, MUL(vVertexSlot, VIMMED1((uint32_t)sizeof(simdvertex))));<br class="">
-       vOffsetsAttrib =<br class="">
-          ADD(vOffsetsAttrib, VIMMED1((uint32_t)(attribSlot*sizeof(simdvector))));<br class="">
-#endif<br class="">
-       vOffsetsAttrib =<br class="">
-          ADD(vOffsetsAttrib, MUL(vSimdSlot, VIMMED1((uint32_t)sizeof(float))));<br class="">
+       Value *pOutputOffset = ADD(pVertexOffset, VIMMED1(headerSize + attribSize * attribSlot)); // + sgvChannel ?<br class="">
<br class="">
-       for (uint32_t channel = 0; channel < 4; ++channel) {<br class="">
-          Value *vPtrs = GEP(pStream, vOffsetsAttrib);<br class="">
-          Value *vData;<br class="">
+       for (uint32_t lane = 0; lane < mVWidth; ++lane) {<br class="">
+          Value *pLaneOffset = VEXTRACT(pOutputOffset, C(lane));<br class="">
+          Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});<br class="">
+          Value *pStreamOffset = GEP(pStream, pLaneOffset);<br class="">
+          pStreamOffset = BITCAST(pStreamOffset, mFP32PtrTy);<br class="">
<br class="">
-          if (attribSlot == VERTEX_SGV_SLOT)<br class="">
-             vData = LOAD(unwrap(outputs[attrib][0]));<br class="">
-          else<br class="">
-             vData = LOAD(unwrap(outputs[attrib][channel]));<br class="">
+          Value *pLaneMask = VEXTRACT(vMask1, C(lane));<br class="">
+          pStreamOffset = SELECT(pLaneMask, pStreamOffset, pTmpPtr);<br class="">
<br class="">
-          if (attribSlot != VERTEX_SGV_SLOT ||<br class="">
-              sgvChannel == channel) {<br class="">
-             vPtrs = BITCAST(vPtrs,<br class="">
-                             VectorType::get(PointerType::get(mFP32Ty, 0), 8));<br class="">
+          for (uint32_t channel = 0; channel < 4; ++channel) {<br class="">
+             Value *vData;<br class="">
<br class="">
-             MASKED_SCATTER(vData, vPtrs, 32, vMask1);<br class="">
-          }<br class="">
+             if (attribSlot == VERTEX_SGV_SLOT)<br class="">
+                vData = LOAD(unwrap(outputs[attrib][0]));<br class="">
+             else<br class="">
+                vData = LOAD(unwrap(outputs[attrib][channel]));<br class="">
<br class="">
-#if USE_SIMD16_FRONTEND<br class="">
-          vOffsetsAttrib =<br class="">
-             ADD(vOffsetsAttrib, VIMMED1((uint32_t)sizeof(simdscalar) * 2));<br class="">
-#else<br class="">
-          vOffsetsAttrib =<br class="">
-             ADD(vOffsetsAttrib, VIMMED1((uint32_t)sizeof(simdscalar)));<br class="">
-#endif<br class="">
+             if (attribSlot != VERTEX_SGV_SLOT ||<br class="">
+                 sgvChannel == channel) {<br class="">
+                vData = VEXTRACT(vData, C(lane));<br class="">
+                STORE(vData, pStreamOffset);<br class="">
+             }<br class="">
+             pStreamOffset = GEP(pStreamOffset, C(1));<br class="">
+          }<br class="">
      }<br class="">
   }<br class="">
+<br class="">
+    STACKRESTORE(pStack);<br class="">
}<br class="">
<br class="">
void<br class="">
@@ -469,12 +439,9 @@ BuilderSWR::swr_gs_llvm_end_primitive(const struct lp_build_tgsi_gs_iface *gs_ba<br class="">
                            LLVMValueRef emitted_prims_vec)<br class="">
{<br class="">
   swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base;<br class="">
-    SWR_GS_STATE *pGS = iface->pGsState;<br class="">
<br class="">
   IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));<br class="">
<br class="">
-    Value *pCutBuffer =<br class="">
-       LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pCutOrStreamIdBuffer});<br class="">
   Value *vMask = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_mask });<br class="">
   Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, 8));<br class="">
<br class="">
@@ -496,31 +463,29 @@ BuilderSWR::swr_gs_llvm_end_primitive(const struct lp_build_tgsi_gs_iface *gs_ba<br class="">
   mask = AND(mask, cmpMask);<br class="">
   vMask1 = TRUNC(mask, VectorType::get(mInt1Ty, 8));<br class="">
<br class="">
-    const uint32_t cutPrimStride =<br class="">
-       (pGS->maxNumVerts + JM()->mVWidth - 1) / JM()->mVWidth;<br class="">
-    Value *vOffsets = C({<br class="">
-          (uint32_t)(cutPrimStride * 0),<br class="">
-          (uint32_t)(cutPrimStride * 1),<br class="">
-          (uint32_t)(cutPrimStride * 2),<br class="">
-          (uint32_t)(cutPrimStride * 3),<br class="">
-          (uint32_t)(cutPrimStride * 4),<br class="">
-          (uint32_t)(cutPrimStride * 5),<br class="">
-          (uint32_t)(cutPrimStride * 6),<br class="">
-          (uint32_t)(cutPrimStride * 7) } );<br class="">
-<br class="">
   vCount = SUB(vCount, VIMMED1(1));<br class="">
-    Value *vOffset = ADD(UDIV(vCount, VIMMED1(8)), vOffsets);<br class="">
+    Value *vOffset = ADD(UDIV(vCount, VIMMED1(8)), VIMMED1(VERTEX_COUNT_SIZE));<br class="">
   Value *vValue = SHL(VIMMED1(1), UREM(vCount, VIMMED1(8)));<br class="">
<br class="">
   vValue = TRUNC(vValue, VectorType::get(mInt8Ty, 8));<br class="">
<br class="">
-    Value *vPtrs = GEP(pCutBuffer, vOffset);<br class="">
-    vPtrs =<br class="">
-       BITCAST(vPtrs, VectorType::get(PointerType::get(mInt8Ty, 0), JM()->mVWidth));<br class="">
+    Value *pStack = STACKSAVE();<br class="">
+    Value *pTmpPtr = ALLOCA(mInt8Ty, C(4)); // used for dummy read/write for lane masking<br class="">
+<br class="">
+    for (uint32_t lane = 0; lane < mVWidth; ++lane) {<br class="">
+       Value *vLaneOffset = VEXTRACT(vOffset, C(lane));<br class="">
+       Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});<br class="">
+       Value *pStreamOffset = GEP(pStream, vLaneOffset);<br class="">
+<br class="">
+       Value *pLaneMask = VEXTRACT(vMask1, C(lane));<br class="">
+       pStreamOffset = SELECT(pLaneMask, pStreamOffset, pTmpPtr);<br class="">
<br class="">
-    Value *vGather = MASKED_GATHER(vPtrs, 32, vMask1);<br class="">
-    vValue = OR(vGather, vValue);<br class="">
-    MASKED_SCATTER(vValue, vPtrs, 32, vMask1);<br class="">
+       Value *vVal = LOAD(pStreamOffset);<br class="">
+       vVal = OR(vVal, VEXTRACT(vValue, C(lane)));<br class="">
+       STORE(vVal, pStreamOffset);<br class="">
+    }<br class="">
+<br class="">
+    STACKRESTORE(pStack);<br class="">
}<br class="">
<br class="">
void<br class="">
@@ -533,7 +498,14 @@ BuilderSWR::swr_gs_llvm_epilogue(const struct lp_build_tgsi_gs_iface *gs_base,<br class="">
<br class="">
  IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));<br class="">
<br class="">
-   STORE(unwrap(total_emitted_vertices_vec), iface->pGsCtx, {0, SWR_GS_CONTEXT_vertexCount});<br class="">
+   // Store emit count to each output stream in the first DWORD<br class="">
+   for (uint32_t lane = 0; lane < mVWidth; ++lane)<br class="">
+   {<br class="">
+      Value* pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});<br class="">
+      pStream = BITCAST(pStream, mInt32PtrTy);<br class="">
+      Value* pLaneCount = VEXTRACT(unwrap(total_emitted_vertices_vec), C(lane));<br class="">
+      STORE(pLaneCount, pStream);<br class="">
+   }<br class="">
}<br class="">
<br class="">
PFN_GS_FUNC<br class="">
@@ -542,6 +514,8 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key)<br class="">
  SWR_GS_STATE *pGS = &ctx->gs->gsState;<br class="">
  struct tgsi_shader_info *info = &ctx->gs->info.base;<br class="">
<br class="">
+   memset(pGS, 0, sizeof(*pGS));<br class="">
+<br class="">
  pGS->gsEnable = true;<br class="">
<br class="">
  pGS->numInputAttribs = info->num_inputs;<br class="">
@@ -555,6 +529,18 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key)<br class="">
  pGS->singleStreamID = 0;<br class="">
<br class="">
  pGS->vertexAttribOffset = VERTEX_ATTRIB_START_SLOT; // TODO: optimize<br class="">
+   pGS->srcVertexAttribOffset = VERTEX_ATTRIB_START_SLOT; // TODO: optimize<br class="">
+   pGS->inputVertStride = pGS->numInputAttribs + pGS->vertexAttribOffset;<br class="">
+   pGS->outputVertexSize = SWR_VTX_NUM_SLOTS;<br class="">
+   pGS->controlDataSize = 8; // GS ouputs max of 8 32B units<br class="">
+   pGS->controlDataOffset = 32;<br class="">
+   pGS->outputVertexOffset = pGS->controlDataOffset + pGS->controlDataSize * 32;<br class="">
+<br class="">
+   pGS->allocationSize =<br class="">
+      32 + // vertex count<br class="">
+      (8 * 32) + // control header<br class="">
+      (SWR_VTX_NUM_SLOTS * 16) * // sizeof vertex<br class="">
+      pGS->maxNumVerts; // num verts<br class="">
</blockquote>
<br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<span style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px; float: none; display: inline !important;" class="">Consider
 using VERTEX_COUNT_SIZE and CONTROL_HEADER_SIZE defines?</span><br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<span style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px; float: none; display: inline !important;" class="">     pGS->controlDataOffset
 = VERTEX_COUNT_SIZE;</span><br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<span style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px; float: none; display: inline !important;" class="">     pGS->outputVertexOffset
 = VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE;</span><br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<span style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px; float: none; display: inline !important;" class="">     pGS->allocationSize
 =</span><br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<span style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px; float: none; display: inline !important;" class="">        VERTEX_COUNT_SIZE
 +<span class="Apple-converted-space"> </span></span><br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<span style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px; float: none; display: inline !important;" class="">        CONTROL_HEADER_SIZE +</span><br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<span style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px; float: none; display: inline !important;" class="">        (SWR_VTX_NUM_SLOTS
 * 16) * // sizeof vertex</span><br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<span style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px; float: none; display: inline !important;" class="">        pGS->maxNumVerts;
 // num verts</span><br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<br style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; word-spacing: 0px; -webkit-text-stroke-width: 0px;" class="">
<blockquote type="cite" style="font-family: Helvetica; font-size: 12px; font-style: normal; font-variant-caps: normal; font-weight: normal; letter-spacing: normal; orphans: auto; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; widows: auto; word-spacing: 0px; -webkit-text-size-adjust: auto; -webkit-text-stroke-width: 0px;" class="">
  struct swr_geometry_shader *gs = ctx->gs;<br class="">
<br class="">
@@ -635,10 +621,11 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key)<br class="">
                      lp_type_float_vec(32, 32 * 8), wrap(mask_val));<br class="">
<br class="">
  // zero out cut buffer so we can load/modify/store bits<br class="">
-   MEMSET(LOAD(pGsCtx, {0, SWR_GS_CONTEXT_pCutOrStreamIdBuffer}),<br class="">
-          C((char)0),<br class="">
-          pGS->instanceCount * ((pGS->maxNumVerts + 7) / 8) * JM()->mVWidth,<br class="">
-          sizeof(float) * KNOB_SIMD_WIDTH);<br class="">
+   for (uint32_t lane = 0; lane < mVWidth; ++lane)<br class="">
+   {<br class="">
+      Value* pStream = LOAD(pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});<br class="">
+      MEMSET(pStream, C((char)0), VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE, sizeof(float) * KNOB_SIMD_WIDTH);<br class="">
+   }<br class="">
<br class="">
  struct swr_gs_llvm_iface gs_iface;<br class="">
  gs_iface.base.fetch_input = ::swr_gs_llvm_fetch_input;<br class="">
--<span class="Apple-converted-space"> </span><br class="">
2.7.4<br class="">
<br class="">
_______________________________________________<br class="">
mesa-dev mailing list<br class="">
<a href="mailto:mesa-dev@lists.freedesktop.org" class="">mesa-dev@lists.freedesktop.org</a><br class="">
<a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" class="">https://lists.freedesktop.org/mailman/listinfo/mesa-dev</a></blockquote>
</div>
</blockquote>
</div>
<br class="">
</div>
</div>
</div>
</blockquote>
</div>
<br class="">
</body>
</html>