[Mesa-dev] [PATCH 2/9] swr/rast: New GS state/context API

Mon Sep 25 16:47:48 UTC 2017

On Sep 25, 2017, at 11:31 AM, Rowley, Timothy O <timothy.o.rowley at intel.com<mailto:timothy.o.rowley at intel.com>> wrote:

Ok, made the following changes - want a full v2 commit, or ok to do this on push?

I'm fine with doing it on push and don't need a full v2.  It simply replaces a couple of magic numbers with their defines -- no functional change.
I'll mark the entire set rvb in patch 0/9.

--- a/src/gallium/drivers/swr/swr_shader.cpp
+++ b/src/gallium/drivers/swr/swr_shader.cpp
@@ -533,12 +533,12 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key)
    pGS->inputVertStride = pGS->numInputAttribs + pGS->vertexAttribOffset;
    pGS->outputVertexSize = SWR_VTX_NUM_SLOTS;
    pGS->controlDataSize = 8; // GS ouputs max of 8 32B units
-   pGS->controlDataOffset = 32;
-   pGS->outputVertexOffset = pGS->controlDataOffset + pGS->controlDataSize * 32;
+   pGS->controlDataOffset = VERTEX_COUNT_SIZE;
+   pGS->outputVertexOffset = pGS->controlDataOffset + CONTROL_HEADER_SIZE;

    pGS->allocationSize =
-      32 + // vertex count
-      (8 * 32) + // control header
+      VERTEX_COUNT_SIZE + // vertex count
+      CONTROL_HEADER_SIZE + // control header
       (SWR_VTX_NUM_SLOTS * 16) * // sizeof vertex
       pGS->maxNumVerts; // num verts


On Sep 23, 2017, at 9:51 PM, Cherniak, Bruce <bruce.cherniak at intel.com<mailto:bruce.cherniak at intel.com>> wrote:


On Sep 21, 2017, at 7:46 PM, Tim Rowley <timothy.o.rowley at intel.com<mailto:timothy.o.rowley at intel.com>> wrote:

One piglit regression, which was a false pass:
spec@glsl-1.50@execution@geometry@dynamic_input_array_index
---
.../drivers/swr/rasterizer/core/frontend.cpp       | 227 ++++++++++++---------
src/gallium/drivers/swr/rasterizer/core/state.h    |  55 +++--
src/gallium/drivers/swr/swr_shader.cpp             | 183 ++++++++---------
3 files changed, 253 insertions(+), 212 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index f882869..26e76a9 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -710,45 +710,67 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num

THREAD SWR_GS_CONTEXT tlsGsContext;

-template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
-struct GsBufferInfo
+// Buffers that are allocated if GS is enabled
+struct GsBuffers
{
-    GsBufferInfo(const SWR_GS_STATE &gsState)
-    {
-        const uint32_t vertexCount = gsState.maxNumVerts;
-        const uint32_t vertexStride = sizeof(SIMDVERTEX);
-        const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / SIMD_WIDTH;
+    uint8_t* pGsIn;
+    uint8_t* pGsOut[KNOB_SIMD_WIDTH];
+    uint8_t* pGsTransposed;
+    void* pStreamCutBuffer;
+};

-        vertexPrimitiveStride = vertexStride * numSimdBatches;
-        vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH;
+//////////////////////////////////////////////////////////////////////////
+/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
+/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive assembler
+/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
+/// @param numVerts - Number of vertices outputted by the GS
+/// @param numAttribs - Number of attributes per vertex
+template<typename SIMD_T, uint32_t SimdWidth>
+void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
+{
+    uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
+    uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 4;

-        if (gsState.isSingleStream)
-        {
-            cutPrimitiveStride = (vertexCount + 7) / 8;
-            cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
+    OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];

-            streamCutPrimitiveStride = 0;
-            streamCutInstanceStride = 0;
-        }
-        else
-        {
-            cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4);
-            cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
-
-            streamCutPrimitiveStride = (vertexCount + 7) / 8;
-            streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH;
-        }
+    for (uint32_t i = 0; i < SimdWidth; ++i)
+    {
+        gatherOffsets[i] = srcVertexStride * i;
   }
+    auto vGatherOffsets = SIMD_T::load_si((typename SIMD_T::Integer*)&gatherOffsets[0]);

-    uint32_t vertexPrimitiveStride;
-    uint32_t vertexInstanceStride;
+    uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;
+    uint32_t remainingVerts = numVerts;

-    uint32_t cutPrimitiveStride;
-    uint32_t cutInstanceStride;
+    for (uint32_t s = 0; s < numSimd; ++s)
+    {
+        uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
+        uint8_t* pDstBase = pDst + s * dstVertexStride;

-    uint32_t streamCutPrimitiveStride;
-    uint32_t streamCutInstanceStride;
-};
+        // Compute mask to prevent src overflow
+        uint32_t mask = std::min(remainingVerts, SimdWidth);
+        mask = GenMask(mask);
+        auto vMask = SIMD_T::vmask_ps(mask);
+        auto viMask = SIMD_T::castps_si(vMask);
+
+        for (uint32_t a = 0; a < numAttribs; ++a)
+        {
+            auto attribGatherX = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
+            auto attribGatherY = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float)), vGatherOffsets, vMask);
+            auto attribGatherZ = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 2), vGatherOffsets, vMask);
+            auto attribGatherW = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 3), vGatherOffsets, vMask);
+
+            SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
+            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float)), viMask, attribGatherY);
+            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 2), viMask, attribGatherZ);
+            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 3), viMask, attribGatherW);
+
+            pSrcBase += sizeof(float) * 4;
+            pDstBase += sizeof(typename SIMD_T::Float) * 4;
+        }
+        remainingVerts -= SimdWidth;
+    }
+}

//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
@@ -763,9 +785,7 @@ static void GeometryShaderStage(
   DRAW_CONTEXT *pDC,
   uint32_t workerId,
   PA_STATE& pa,
-    void* pGsOut,
-    void* pCutBuffer,
-    void* pStreamCutBuffer,
+    GsBuffers* pGsBuffers,
   uint32_t* pSoPrimData,
#if USE_SIMD16_FRONTEND
   uint32_t numPrims_simd8,
@@ -779,25 +799,29 @@ static void GeometryShaderStage(
   const API_STATE& state = GetApiState(pDC);
   const SWR_GS_STATE* pState = &state.gsState;

-    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
-    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");
+    static uint8_t sNullBuffer[1024] = { 0 };

-    tlsGsContext.pStream = (uint8_t*)pGsOut;
-    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
+    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
+    {
+        tlsGsContext.pStreams[i] = pGsBuffers->pGsOut[i];
+    }
+    tlsGsContext.pVerts = (simdvector*)pGsBuffers->pGsIn;
   tlsGsContext.PrimitiveID = primID;

   uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
   simdvector attrib[MAX_NUM_VERTS_PER_PRIM];

   // assemble all attributes for the input primitive
+    tlsGsContext.inputVertStride = pState->inputVertStride;
   for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
   {
+        uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot;
       uint32_t attribSlot = pState->vertexAttribOffset + slot;
-        pa.Assemble(attribSlot, attrib);
+        pa.Assemble(srcAttribSlot, attrib);

       for (uint32_t i = 0; i < numVertsPerPrim; ++i)
       {
-            tlsGsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = attrib[i];
+            tlsGsContext.pVerts[attribSlot + pState->inputVertStride * i] = attrib[i];
       }
   }

@@ -805,15 +829,9 @@ static void GeometryShaderStage(
   pa.Assemble(VERTEX_POSITION_SLOT, attrib);
   for (uint32_t i = 0; i < numVertsPerPrim; ++i)
   {
-        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
+        tlsGsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = attrib[i];
   }

-#if USE_SIMD16_FRONTEND
-    const GsBufferInfo<simd16vertex, KNOB_SIMD16_WIDTH> bufferInfo(state.gsState);
-#else
-    const GsBufferInfo<simdvertex, KNOB_SIMD_WIDTH> bufferInfo(state.gsState);
-#endif
-
   // record valid prims from the frontend to avoid over binning the newly generated
   // prims from the GS
#if USE_SIMD16_FRONTEND
@@ -830,8 +848,10 @@ static void GeometryShaderStage(
       // execute the geometry shader
       state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);

-        tlsGsContext.pStream += bufferInfo.vertexInstanceStride;
-        tlsGsContext.pCutOrStreamIdBuffer += bufferInfo.cutInstanceStride;
+        for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
+        {
+            tlsGsContext.pStreams[i] += pState->allocationSize;
+        }
   }

   // set up new binner and state for the GS output topology
@@ -865,32 +885,48 @@ static void GeometryShaderStage(
   // foreach input prim:
   // - setup a new PA based on the emitted verts for that prim
   // - loop over the new verts, calling PA to assemble each prim
-    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
   uint32_t* pPrimitiveId = (uint32_t*)&primID;

   uint32_t totalPrimsGenerated = 0;
   for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
   {
-        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * bufferInfo.vertexPrimitiveStride;
-        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * bufferInfo.cutPrimitiveStride;
+        uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];
+
+        // Vertex count is either emitted by shader or static
+        uint32_t vertexCount = 0;
+        if (pState->staticVertexCount)
+        {
+            vertexCount = pState->staticVertexCount;
+        }
+        else
+        {
+            // If emitted in shader, it should be stored in the first dword of the output buffer
+            vertexCount = *(uint32_t*)pInstanceBase;
+        }

       for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
       {
-            uint32_t numEmittedVerts = pVertexCount[inputPrim];
+            uint32_t numEmittedVerts = vertexCount;
           if (numEmittedVerts == 0)
           {
               continue;
           }

-            uint8_t* pBase = pInstanceBase + instance * bufferInfo.vertexInstanceStride;
-            uint8_t* pCutBase = pCutBufferBase + instance * bufferInfo.cutInstanceStride;
+            uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;
+            uint8_t* pCutBase = pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
+            uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;
+
+#if USE_SIMD16_FRONTEND
+            TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);
+#else
+            TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);
+#endif

           uint32_t numAttribs = state.feNumAttributes;

           for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
           {
               bool processCutVerts = false;
-
               uint8_t* pCutBuffer = pCutBase;

               // assign default stream ID, only relevant when GS is outputting a single stream
@@ -910,16 +946,16 @@ static void GeometryShaderStage(
                   }

                   // multi-stream output, need to translate StreamID buffer to a cut buffer
-                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
-                    pCutBuffer = (uint8_t*)pStreamCutBuffer;
+                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
+                    pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer;
                   processCutVerts = false;
               }

#if USE_SIMD16_FRONTEND
-                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, SWR_VTX_NUM_SLOTS, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
+                PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);

#else
-                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, SWR_VTX_NUM_SLOTS, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
+                PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);

#endif
               while (gsPa.GetNextStreamOutput())
@@ -979,42 +1015,40 @@ static void GeometryShaderStage(
/// @param state - API state
/// @param ppGsOut - pointer to GS output buffer allocation
/// @param ppCutBuffer - pointer to GS output cut buffer allocation
-template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
-static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
-    void **ppStreamCutBuffer)
+template<typename SIMD_T, uint32_t SIMD_WIDTH>
+static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, uint32_t vertsPerPrim, GsBuffers* pGsBuffers)
{
   auto pArena = pDC->pArena;
   SWR_ASSERT(pArena != nullptr);
   SWR_ASSERT(state.gsState.gsEnable);

-    // allocate arena space to hold GS output verts
-    // @todo pack attribs
-    // @todo support multiple streams
+    const SWR_GS_STATE& gsState = state.gsState;

-    const GsBufferInfo<SIMDVERTEX, SIMD_WIDTH> bufferInfo(state.gsState);
+    // Allocate storage for vertex inputs
+    uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;
+    pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);

-    const uint32_t vertexBufferSize = state.gsState.instanceCount * bufferInfo.vertexInstanceStride;
+    // Allocate arena space to hold GS output verts
+    const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;

-    *ppGsOut = pArena->AllocAligned(vertexBufferSize, SIMD_WIDTH * sizeof(float));
+    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
+    {
+        pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32);
+    }

-    // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
-    // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
+    // Allocate storage for transposed GS output
+    uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
+    uint32_t transposedBufferSize = numSimdBatches * gsState.outputVertexSize * sizeof(typename SIMD_T::Vec4);
+    pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);

-    // allocate space for temporary per-stream cut buffer if multi-stream is enabled
+    // Allocate storage to hold temporary stream->cut buffer, if necessary
   if (state.gsState.isSingleStream)
   {
-        const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
-
-        *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
-        *ppStreamCutBuffer = nullptr;
+        pGsBuffers->pStreamCutBuffer = nullptr;
   }
   else
   {
-        const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
-        const uint32_t streamCutBufferSize = state.gsState.instanceCount * bufferInfo.streamCutInstanceStride;
-
-        *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
-        *ppStreamCutBuffer = pArena->AllocAligned(streamCutBufferSize, SIMD_WIDTH * sizeof(float));
+        pGsBuffers->pStreamCutBuffer = (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);
   }
}

@@ -1062,9 +1096,7 @@ static void TessellationStages(
   DRAW_CONTEXT *pDC,
   uint32_t workerId,
   PA_STATE& pa,
-    void* pGsOut,
-    void* pCutBuffer,
-    void* pCutStreamBuffer,
+    GsBuffers* pGsBuffers,
   uint32_t* pSoPrimData,
#if USE_SIMD16_FRONTEND
   uint32_t numPrims_simd8,
@@ -1264,17 +1296,16 @@ static void TessellationStages(
           {
#if USE_SIMD16_FRONTEND
               tessPa.useAlternateOffset = false;
-                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_lo, primID_lo);
+                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);

               if (numPrims_hi)
               {
                   tessPa.useAlternateOffset = true;
-                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_hi, primID_hi);
+                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
               }
#else
               GeometryShaderStage<HasStreamOutT, HasRastT>(
-                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
-                    _simd_set1_epi32(dsContext.PrimitiveID));
+                    pDC, workerId, tessPa, pGsBuffers, pSoPrimData, _simd_set1_epi32(dsContext.PrimitiveID));
#endif
           }
           else
@@ -1408,15 +1439,13 @@ void ProcessDraw(
   uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

-    void* pGsOut = nullptr;
-    void* pCutBuffer = nullptr;
-    void* pStreamCutBuffer = nullptr;
+    GsBuffers gsBuffers;
   if (HasGeometryShaderT::value)
   {
#if USE_SIMD16_FRONTEND
-        AllocateGsBuffers<simd16vertex, KNOB_SIMD16_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
+        AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
#else
-        AllocateGsBuffers<simdvertex, KNOB_SIMD_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
+        AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
#endif
   }

@@ -1672,23 +1701,23 @@ void ProcessDraw(
                           if (HasTessellationT::value)
                           {
                               pa.useAlternateOffset = false;
-                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);
+                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);

                               if (numPrims_hi)
                               {
                                   pa.useAlternateOffset = true;
-                                    TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
+                                    TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);
                               }
                           }
                           else if (HasGeometryShaderT::value)
                           {
                               pa.useAlternateOffset = false;
-                                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);
+                                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);

                               if (numPrims_hi)
                               {
                                   pa.useAlternateOffset = true;
-                                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
+                                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);
                               }
                           }
                           else
@@ -1847,12 +1876,12 @@ void ProcessDraw(
                           if (HasTessellationT::value)
                           {
                               TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
-                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
+                                    pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));
                           }
                           else if (HasGeometryShaderT::value)
                           {
                               GeometryShaderStage<HasStreamOutT, HasRastT>(
-                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
+                                    pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));
                           }
                           else
                           {
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h
index 13c1d8b..f7c9308 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -301,13 +301,12 @@ struct SWR_DS_CONTEXT
/////////////////////////////////////////////////////////////////////////
struct SWR_GS_CONTEXT
{
-    simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: input primitive data for SIMD prims
-    simdscalari PrimitiveID;        // IN: input primitive ID generated from the draw call
-    uint32_t InstanceID;            // IN: input instance ID
-    simdscalari mask;               // IN: Active mask for shader
-    uint8_t* pStream;               // OUT: output stream (contains vertices for all output streams)
-    uint8_t* pCutOrStreamIdBuffer;  // OUT: cut or stream id buffer
-    simdscalari vertexCount;        // OUT: num vertices emitted per SIMD lane
+    simdvector* pVerts;                 // IN: input primitive data for SIMD prims
+    uint32_t inputVertStride;           // IN: input vertex stride, in attributes
+    simdscalari PrimitiveID;            // IN: input primitive ID generated from the draw call
+    uint32_t InstanceID;                // IN: input instance ID
+    simdscalari mask;                   // IN: Active mask for shader
+    uint8_t* pStreams[KNOB_SIMD_WIDTH]; // OUT: output stream (contains vertices for all output streams)
};

struct PixelPositions
@@ -714,30 +713,56 @@ struct SWR_GS_STATE
{
   bool gsEnable;

-    // number of input attributes per vertex. used by the frontend to
+    // Number of input attributes per vertex. Used by the frontend to
   // optimize assembling primitives for GS
   uint32_t numInputAttribs;

-    // output topology - can be point, tristrip, or linestrip
+    // Stride of incoming verts in attributes
+    uint32_t inputVertStride;
+
+    // Output topology - can be point, tristrip, or linestrip
   PRIMITIVE_TOPOLOGY outputTopology;      // @llvm_enum

-    // maximum number of verts that can be emitted by a single instance of the GS
+    // Maximum number of verts that can be emitted by a single instance of the GS
   uint32_t maxNumVerts;

-    // instance count
+    // Instance count
   uint32_t instanceCount;

-    // if true, geometry shader emits a single stream, with separate cut buffer.
-    // if false, geometry shader emits vertices for multiple streams to the stream buffer, with a separate StreamID buffer
+    // If true, geometry shader emits a single stream, with separate cut buffer.
+    // If false, geometry shader emits vertices for multiple streams to the stream buffer, with a separate StreamID buffer
   // to map vertices to streams
   bool isSingleStream;

-    // when single stream is enabled, singleStreamID dictates which stream is being output.
+    // When single stream is enabled, singleStreamID dictates which stream is being output.
   // field ignored if isSingleStream is false
   uint32_t singleStreamID;

-    // Offset to the start of the attributes of the input vertices, in simdvector units
+    // Total amount of memory to allocate for one instance of the shader output in bytes
+    uint32_t allocationSize;
+
+    // Offset to the start of the attributes of the input vertices, in simdvector units, as read by the GS
   uint32_t vertexAttribOffset;
+
+    // Offset to the attributes as stored by the preceding shader stage.
+    uint32_t srcVertexAttribOffset;
+
+    // Size of the control data section which contains cut or streamID data, in simdscalar units. Should be sized to handle
+    // the maximum number of verts output by the GS. Can be 0 if there are no cuts or streamID bits.
+    uint32_t controlDataSize;
+
+    // Offset to the control data section, in bytes
+    uint32_t controlDataOffset;
+
+    // Total size of an output vertex, in simdvector units
+    uint32_t outputVertexSize;
+
+    // Offset to the start of the vertex section, in bytes
+    uint32_t outputVertexOffset;
+
+    // Set this to non-zero to indicate that the shader outputs a static number of verts. If zero, shader is
+    // expected to store the final vertex count in the first dword of the gs output stream.
+    uint32_t staticVertexCount;
};


diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp
index 0a81eaa..7f11e72 100644
--- a/src/gallium/drivers/swr/swr_shader.cpp
+++ b/src/gallium/drivers/swr/swr_shader.cpp
@@ -347,18 +347,20 @@ BuilderSWR::swr_gs_llvm_fetch_input(const struct lp_build_tgsi_gs_iface *gs_ifac
   Value *attrib =
      LOAD(GEP(iface->pVtxAttribMap, {C(0), unwrap(attrib_index)}));

-    Value *pInput =
-       LOAD(GEP(iface->pGsCtx,
-                {C(0),
-                 C(SWR_GS_CONTEXT_vert),
-                 unwrap(vertex_index),
-                 C(0),
-                 attrib,
-                 unwrap(swizzle_index)}));
+    Value *pVertex = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pVerts});
+    Value *pInputVertStride = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_inputVertStride});
+
+    Value *pVector = ADD(MUL(unwrap(vertex_index), pInputVertStride), attrib);
+
+    Value *pInput = LOAD(GEP(pVertex, {pVector, unwrap(swizzle_index)}));

   return wrap(pInput);
}

+// GS output stream layout
+#define VERTEX_COUNT_SIZE 32
+#define CONTROL_HEADER_SIZE (8*32)
+
void
BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base,
                          struct lp_build_tgsi_context * bld_base,
@@ -366,41 +368,19 @@ BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base
                          LLVMValueRef emitted_vertices_vec)
{
   swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base;
-    SWR_GS_STATE *pGS = iface->pGsState;

   IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));

-#if USE_SIMD16_FRONTEND
-    const uint32_t simdVertexStride = sizeof(simdvertex) * 2;
-    const uint32_t numSimdBatches = (pGS->maxNumVerts + (mVWidth * 2) - 1) / (mVWidth * 2);
-#else
-    const uint32_t simdVertexStride = sizeof(simdvertex);
-    const uint32_t numSimdBatches = (pGS->maxNumVerts + mVWidth - 1) / mVWidth;
-#endif
-    const uint32_t inputPrimStride = numSimdBatches * simdVertexStride;
-
-    Value *pStream = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_pStream });
-    Value *vMask = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_mask });
-    Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, 8));
+    const uint32_t headerSize = VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE;
+    const uint32_t attribSize = 4 * sizeof(float);
+    const uint32_t vertSize = attribSize * SWR_VTX_NUM_SLOTS;
+    Value *pVertexOffset = MUL(unwrap(emitted_vertices_vec), VIMMED1(vertSize));

-    Value *vOffsets = C({
-          inputPrimStride * 0,
-          inputPrimStride * 1,
-          inputPrimStride * 2,
-          inputPrimStride * 3,
-          inputPrimStride * 4,
-          inputPrimStride * 5,
-          inputPrimStride * 6,
-          inputPrimStride * 7 } );
+    Value *vMask = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_mask});
+    Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, mVWidth));

-#if USE_SIMD16_FRONTEND
-    const uint32_t simdShift = log2(mVWidth * 2);
-    Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), (mVWidth * 2) - 1);
-#else
-    const uint32_t simdShift = log2(mVWidth);
-    Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), mVWidth - 1);
-#endif
-    Value *vVertexSlot = ASHR(unwrap(emitted_vertices_vec), simdShift);
+    Value *pStack = STACKSAVE();
+    Value *pTmpPtr = ALLOCA(mFP32Ty, C(4)); // used for dummy write for lane masking

   for (uint32_t attrib = 0; attrib < iface->num_outputs; ++attrib) {
      uint32_t attribSlot = attrib;
@@ -420,46 +400,36 @@ BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base
         }
      }

-#if USE_SIMD16_FRONTEND
-       Value *vOffsetsAttrib =
-          ADD(vOffsets, MUL(vVertexSlot, VIMMED1((uint32_t)sizeof(simdvertex) * 2)));
-       vOffsetsAttrib =
-          ADD(vOffsetsAttrib, VIMMED1((uint32_t)(attribSlot*sizeof(simdvector) * 2)));
-#else
-       Value *vOffsetsAttrib =
-          ADD(vOffsets, MUL(vVertexSlot, VIMMED1((uint32_t)sizeof(simdvertex))));
-       vOffsetsAttrib =
-          ADD(vOffsetsAttrib, VIMMED1((uint32_t)(attribSlot*sizeof(simdvector))));
-#endif
-       vOffsetsAttrib =
-          ADD(vOffsetsAttrib, MUL(vSimdSlot, VIMMED1((uint32_t)sizeof(float))));
+       Value *pOutputOffset = ADD(pVertexOffset, VIMMED1(headerSize + attribSize * attribSlot)); // + sgvChannel ?

-       for (uint32_t channel = 0; channel < 4; ++channel) {
-          Value *vPtrs = GEP(pStream, vOffsetsAttrib);
-          Value *vData;
+       for (uint32_t lane = 0; lane < mVWidth; ++lane) {
+          Value *pLaneOffset = VEXTRACT(pOutputOffset, C(lane));
+          Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});
+          Value *pStreamOffset = GEP(pStream, pLaneOffset);
+          pStreamOffset = BITCAST(pStreamOffset, mFP32PtrTy);

-          if (attribSlot == VERTEX_SGV_SLOT)
-             vData = LOAD(unwrap(outputs[attrib][0]));
-          else
-             vData = LOAD(unwrap(outputs[attrib][channel]));
+          Value *pLaneMask = VEXTRACT(vMask1, C(lane));
+          pStreamOffset = SELECT(pLaneMask, pStreamOffset, pTmpPtr);

-          if (attribSlot != VERTEX_SGV_SLOT ||
-              sgvChannel == channel) {
-             vPtrs = BITCAST(vPtrs,
-                             VectorType::get(PointerType::get(mFP32Ty, 0), 8));
+          for (uint32_t channel = 0; channel < 4; ++channel) {
+             Value *vData;

-             MASKED_SCATTER(vData, vPtrs, 32, vMask1);
-          }
+             if (attribSlot == VERTEX_SGV_SLOT)
+                vData = LOAD(unwrap(outputs[attrib][0]));
+             else
+                vData = LOAD(unwrap(outputs[attrib][channel]));

-#if USE_SIMD16_FRONTEND
-          vOffsetsAttrib =
-             ADD(vOffsetsAttrib, VIMMED1((uint32_t)sizeof(simdscalar) * 2));
-#else
-          vOffsetsAttrib =
-             ADD(vOffsetsAttrib, VIMMED1((uint32_t)sizeof(simdscalar)));
-#endif
+             if (attribSlot != VERTEX_SGV_SLOT ||
+                 sgvChannel == channel) {
+                vData = VEXTRACT(vData, C(lane));
+                STORE(vData, pStreamOffset);
+             }
+             pStreamOffset = GEP(pStreamOffset, C(1));
+          }
      }
   }
+
+    STACKRESTORE(pStack);
}

void
@@ -469,12 +439,9 @@ BuilderSWR::swr_gs_llvm_end_primitive(const struct lp_build_tgsi_gs_iface *gs_ba
                            LLVMValueRef emitted_prims_vec)
{
   swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base;
-    SWR_GS_STATE *pGS = iface->pGsState;

   IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));

-    Value *pCutBuffer =
-       LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pCutOrStreamIdBuffer});
   Value *vMask = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_mask });
   Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, 8));

@@ -496,31 +463,29 @@ BuilderSWR::swr_gs_llvm_end_primitive(const struct lp_build_tgsi_gs_iface *gs_ba
   mask = AND(mask, cmpMask);
   vMask1 = TRUNC(mask, VectorType::get(mInt1Ty, 8));

-    const uint32_t cutPrimStride =
-       (pGS->maxNumVerts + JM()->mVWidth - 1) / JM()->mVWidth;
-    Value *vOffsets = C({
-          (uint32_t)(cutPrimStride * 0),
-          (uint32_t)(cutPrimStride * 1),
-          (uint32_t)(cutPrimStride * 2),
-          (uint32_t)(cutPrimStride * 3),
-          (uint32_t)(cutPrimStride * 4),
-          (uint32_t)(cutPrimStride * 5),
-          (uint32_t)(cutPrimStride * 6),
-          (uint32_t)(cutPrimStride * 7) } );
-
   vCount = SUB(vCount, VIMMED1(1));
-    Value *vOffset = ADD(UDIV(vCount, VIMMED1(8)), vOffsets);
+    Value *vOffset = ADD(UDIV(vCount, VIMMED1(8)), VIMMED1(VERTEX_COUNT_SIZE));
   Value *vValue = SHL(VIMMED1(1), UREM(vCount, VIMMED1(8)));

   vValue = TRUNC(vValue, VectorType::get(mInt8Ty, 8));

-    Value *vPtrs = GEP(pCutBuffer, vOffset);
-    vPtrs =
-       BITCAST(vPtrs, VectorType::get(PointerType::get(mInt8Ty, 0), JM()->mVWidth));
+    Value *pStack = STACKSAVE();
+    Value *pTmpPtr = ALLOCA(mInt8Ty, C(4)); // used for dummy read/write for lane masking
+
+    for (uint32_t lane = 0; lane < mVWidth; ++lane) {
+       Value *vLaneOffset = VEXTRACT(vOffset, C(lane));
+       Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});
+       Value *pStreamOffset = GEP(pStream, vLaneOffset);
+
+       Value *pLaneMask = VEXTRACT(vMask1, C(lane));
+       pStreamOffset = SELECT(pLaneMask, pStreamOffset, pTmpPtr);

-    Value *vGather = MASKED_GATHER(vPtrs, 32, vMask1);
-    vValue = OR(vGather, vValue);
-    MASKED_SCATTER(vValue, vPtrs, 32, vMask1);
+       Value *vVal = LOAD(pStreamOffset);
+       vVal = OR(vVal, VEXTRACT(vValue, C(lane)));
+       STORE(vVal, pStreamOffset);
+    }
+
+    STACKRESTORE(pStack);
}

void
@@ -533,7 +498,14 @@ BuilderSWR::swr_gs_llvm_epilogue(const struct lp_build_tgsi_gs_iface *gs_base,

  IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));

-   STORE(unwrap(total_emitted_vertices_vec), iface->pGsCtx, {0, SWR_GS_CONTEXT_vertexCount});
+   // Store emit count to each output stream in the first DWORD
+   for (uint32_t lane = 0; lane < mVWidth; ++lane)
+   {
+      Value* pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});
+      pStream = BITCAST(pStream, mInt32PtrTy);
+      Value* pLaneCount = VEXTRACT(unwrap(total_emitted_vertices_vec), C(lane));
+      STORE(pLaneCount, pStream);
+   }
}

PFN_GS_FUNC
@@ -542,6 +514,8 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key)
  SWR_GS_STATE *pGS = &ctx->gs->gsState;
  struct tgsi_shader_info *info = &ctx->gs->info.base;

+   memset(pGS, 0, sizeof(*pGS));
+
  pGS->gsEnable = true;

  pGS->numInputAttribs = info->num_inputs;
@@ -555,6 +529,18 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key)
  pGS->singleStreamID = 0;

  pGS->vertexAttribOffset = VERTEX_ATTRIB_START_SLOT; // TODO: optimize
+   pGS->srcVertexAttribOffset = VERTEX_ATTRIB_START_SLOT; // TODO: optimize
+   pGS->inputVertStride = pGS->numInputAttribs + pGS->vertexAttribOffset;
+   pGS->outputVertexSize = SWR_VTX_NUM_SLOTS;
+   pGS->controlDataSize = 8; // GS ouputs max of 8 32B units
+   pGS->controlDataOffset = 32;
+   pGS->outputVertexOffset = pGS->controlDataOffset + pGS->controlDataSize * 32;
+
+   pGS->allocationSize =
+      32 + // vertex count
+      (8 * 32) + // control header
+      (SWR_VTX_NUM_SLOTS * 16) * // sizeof vertex
+      pGS->maxNumVerts; // num verts

Consider using VERTEX_COUNT_SIZE and CONTROL_HEADER_SIZE defines?

     pGS->controlDataOffset = VERTEX_COUNT_SIZE;
     pGS->outputVertexOffset = VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE;

     pGS->allocationSize =
        VERTEX_COUNT_SIZE +
        CONTROL_HEADER_SIZE +
        (SWR_VTX_NUM_SLOTS * 16) * // sizeof vertex
        pGS->maxNumVerts; // num verts

  struct swr_geometry_shader *gs = ctx->gs;

@@ -635,10 +621,11 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key)
                      lp_type_float_vec(32, 32 * 8), wrap(mask_val));

  // zero out cut buffer so we can load/modify/store bits
-   MEMSET(LOAD(pGsCtx, {0, SWR_GS_CONTEXT_pCutOrStreamIdBuffer}),
-          C((char)0),
-          pGS->instanceCount * ((pGS->maxNumVerts + 7) / 8) * JM()->mVWidth,
-          sizeof(float) * KNOB_SIMD_WIDTH);
+   for (uint32_t lane = 0; lane < mVWidth; ++lane)
+   {
+      Value* pStream = LOAD(pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});
+      MEMSET(pStream, C((char)0), VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE, sizeof(float) * KNOB_SIMD_WIDTH);
+   }

  struct swr_gs_llvm_iface gs_iface;
  gs_iface.base.fetch_input = ::swr_gs_llvm_fetch_input;
--
2.7.4

_______________________________________________
mesa-dev mailing list
mesa-dev at lists.freedesktop.org<mailto:mesa-dev at lists.freedesktop.org>
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20170925/236003bd/attachment-0001.html>