[Mesa-dev] [PATCH 2/9] swr/rast: New GS state/context API
Cherniak, Bruce
bruce.cherniak at intel.com
Sun Sep 24 02:51:03 UTC 2017
> On Sep 21, 2017, at 7:46 PM, Tim Rowley <timothy.o.rowley at intel.com> wrote:
>
> One piglit regression, which was a false pass:
> spec@glsl-1.50@execution@geometry@dynamic_input_array_index
> ---
> .../drivers/swr/rasterizer/core/frontend.cpp | 227 ++++++++++++---------
> src/gallium/drivers/swr/rasterizer/core/state.h | 55 +++--
> src/gallium/drivers/swr/swr_shader.cpp | 183 ++++++++---------
> 3 files changed, 253 insertions(+), 212 deletions(-)
>
> diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
> index f882869..26e76a9 100644
> --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
> +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
> @@ -710,45 +710,67 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num
>
> THREAD SWR_GS_CONTEXT tlsGsContext;
>
> -template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
> -struct GsBufferInfo
> +// Buffers that are allocated if GS is enabled
> +struct GsBuffers
> {
> - GsBufferInfo(const SWR_GS_STATE &gsState)
> - {
> - const uint32_t vertexCount = gsState.maxNumVerts;
> - const uint32_t vertexStride = sizeof(SIMDVERTEX);
> - const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / SIMD_WIDTH;
> + uint8_t* pGsIn;
> + uint8_t* pGsOut[KNOB_SIMD_WIDTH];
> + uint8_t* pGsTransposed;
> + void* pStreamCutBuffer;
> +};
>
> - vertexPrimitiveStride = vertexStride * numSimdBatches;
> - vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH;
> +//////////////////////////////////////////////////////////////////////////
> +/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
> +/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive assembler
> +/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
> +/// @param numVerts - Number of vertices outputted by the GS
> +/// @param numAttribs - Number of attributes per vertex
> +template<typename SIMD_T, uint32_t SimdWidth>
> +void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
> +{
> + uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
> + uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 4;
>
> - if (gsState.isSingleStream)
> - {
> - cutPrimitiveStride = (vertexCount + 7) / 8;
> - cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
> + OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];
>
> - streamCutPrimitiveStride = 0;
> - streamCutInstanceStride = 0;
> - }
> - else
> - {
> - cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4);
> - cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
> -
> - streamCutPrimitiveStride = (vertexCount + 7) / 8;
> - streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH;
> - }
> + for (uint32_t i = 0; i < SimdWidth; ++i)
> + {
> + gatherOffsets[i] = srcVertexStride * i;
> }
> + auto vGatherOffsets = SIMD_T::load_si((typename SIMD_T::Integer*)&gatherOffsets[0]);
>
> - uint32_t vertexPrimitiveStride;
> - uint32_t vertexInstanceStride;
> + uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;
> + uint32_t remainingVerts = numVerts;
>
> - uint32_t cutPrimitiveStride;
> - uint32_t cutInstanceStride;
> + for (uint32_t s = 0; s < numSimd; ++s)
> + {
> + uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
> + uint8_t* pDstBase = pDst + s * dstVertexStride;
>
> - uint32_t streamCutPrimitiveStride;
> - uint32_t streamCutInstanceStride;
> -};
> + // Compute mask to prevent src overflow
> + uint32_t mask = std::min(remainingVerts, SimdWidth);
> + mask = GenMask(mask);
> + auto vMask = SIMD_T::vmask_ps(mask);
> + auto viMask = SIMD_T::castps_si(vMask);
> +
> + for (uint32_t a = 0; a < numAttribs; ++a)
> + {
> + auto attribGatherX = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
> + auto attribGatherY = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float)), vGatherOffsets, vMask);
> + auto attribGatherZ = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 2), vGatherOffsets, vMask);
> + auto attribGatherW = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 3), vGatherOffsets, vMask);
> +
> + SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
> + SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float)), viMask, attribGatherY);
> + SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 2), viMask, attribGatherZ);
> + SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 3), viMask, attribGatherW);
> +
> + pSrcBase += sizeof(float) * 4;
> + pDstBase += sizeof(typename SIMD_T::Float) * 4;
> + }
> + remainingVerts -= SimdWidth;
> + }
> +}
>
> //////////////////////////////////////////////////////////////////////////
> /// @brief Implements GS stage.
> @@ -763,9 +785,7 @@ static void GeometryShaderStage(
> DRAW_CONTEXT *pDC,
> uint32_t workerId,
> PA_STATE& pa,
> - void* pGsOut,
> - void* pCutBuffer,
> - void* pStreamCutBuffer,
> + GsBuffers* pGsBuffers,
> uint32_t* pSoPrimData,
> #if USE_SIMD16_FRONTEND
> uint32_t numPrims_simd8,
> @@ -779,25 +799,29 @@ static void GeometryShaderStage(
> const API_STATE& state = GetApiState(pDC);
> const SWR_GS_STATE* pState = &state.gsState;
>
> - SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
> - SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");
> + static uint8_t sNullBuffer[1024] = { 0 };
>
> - tlsGsContext.pStream = (uint8_t*)pGsOut;
> - tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
> + for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
> + {
> + tlsGsContext.pStreams[i] = pGsBuffers->pGsOut[i];
> + }
> + tlsGsContext.pVerts = (simdvector*)pGsBuffers->pGsIn;
> tlsGsContext.PrimitiveID = primID;
>
> uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
> simdvector attrib[MAX_NUM_VERTS_PER_PRIM];
>
> // assemble all attributes for the input primitive
> + tlsGsContext.inputVertStride = pState->inputVertStride;
> for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
> {
> + uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot;
> uint32_t attribSlot = pState->vertexAttribOffset + slot;
> - pa.Assemble(attribSlot, attrib);
> + pa.Assemble(srcAttribSlot, attrib);
>
> for (uint32_t i = 0; i < numVertsPerPrim; ++i)
> {
> - tlsGsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = attrib[i];
> + tlsGsContext.pVerts[attribSlot + pState->inputVertStride * i] = attrib[i];
> }
> }
>
> @@ -805,15 +829,9 @@ static void GeometryShaderStage(
> pa.Assemble(VERTEX_POSITION_SLOT, attrib);
> for (uint32_t i = 0; i < numVertsPerPrim; ++i)
> {
> - tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
> + tlsGsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = attrib[i];
> }
>
> -#if USE_SIMD16_FRONTEND
> - const GsBufferInfo<simd16vertex, KNOB_SIMD16_WIDTH> bufferInfo(state.gsState);
> -#else
> - const GsBufferInfo<simdvertex, KNOB_SIMD_WIDTH> bufferInfo(state.gsState);
> -#endif
> -
> // record valid prims from the frontend to avoid over binning the newly generated
> // prims from the GS
> #if USE_SIMD16_FRONTEND
> @@ -830,8 +848,10 @@ static void GeometryShaderStage(
> // execute the geometry shader
> state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
>
> - tlsGsContext.pStream += bufferInfo.vertexInstanceStride;
> - tlsGsContext.pCutOrStreamIdBuffer += bufferInfo.cutInstanceStride;
> + for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
> + {
> + tlsGsContext.pStreams[i] += pState->allocationSize;
> + }
> }
>
> // set up new binner and state for the GS output topology
> @@ -865,32 +885,48 @@ static void GeometryShaderStage(
> // foreach input prim:
> // - setup a new PA based on the emitted verts for that prim
> // - loop over the new verts, calling PA to assemble each prim
> - uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
> uint32_t* pPrimitiveId = (uint32_t*)&primID;
>
> uint32_t totalPrimsGenerated = 0;
> for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
> {
> - uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * bufferInfo.vertexPrimitiveStride;
> - uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * bufferInfo.cutPrimitiveStride;
> + uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];
> +
> + // Vertex count is either emitted by shader or static
> + uint32_t vertexCount = 0;
> + if (pState->staticVertexCount)
> + {
> + vertexCount = pState->staticVertexCount;
> + }
> + else
> + {
> + // If emitted in shader, it should be stored in the first dword of the output buffer
> + vertexCount = *(uint32_t*)pInstanceBase;
> + }
>
> for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
> {
> - uint32_t numEmittedVerts = pVertexCount[inputPrim];
> + uint32_t numEmittedVerts = vertexCount;
> if (numEmittedVerts == 0)
> {
> continue;
> }
>
> - uint8_t* pBase = pInstanceBase + instance * bufferInfo.vertexInstanceStride;
> - uint8_t* pCutBase = pCutBufferBase + instance * bufferInfo.cutInstanceStride;
> + uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;
> + uint8_t* pCutBase = pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
> + uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;
> +
> +#if USE_SIMD16_FRONTEND
> + TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);
> +#else
> + TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);
> +#endif
>
> uint32_t numAttribs = state.feNumAttributes;
>
> for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
> {
> bool processCutVerts = false;
> -
> uint8_t* pCutBuffer = pCutBase;
>
> // assign default stream ID, only relevant when GS is outputting a single stream
> @@ -910,16 +946,16 @@ static void GeometryShaderStage(
> }
>
> // multi-stream output, need to translate StreamID buffer to a cut buffer
> - ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
> - pCutBuffer = (uint8_t*)pStreamCutBuffer;
> + ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
> + pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer;
> processCutVerts = false;
> }
>
> #if USE_SIMD16_FRONTEND
> - PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, SWR_VTX_NUM_SLOTS, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
> + PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
>
> #else
> - PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, SWR_VTX_NUM_SLOTS, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
> + PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
>
> #endif
> while (gsPa.GetNextStreamOutput())
> @@ -979,42 +1015,40 @@ static void GeometryShaderStage(
> /// @param state - API state
> /// @param ppGsOut - pointer to GS output buffer allocation
> /// @param ppCutBuffer - pointer to GS output cut buffer allocation
> -template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
> -static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
> - void **ppStreamCutBuffer)
> +template<typename SIMD_T, uint32_t SIMD_WIDTH>
> +static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, uint32_t vertsPerPrim, GsBuffers* pGsBuffers)
> {
> auto pArena = pDC->pArena;
> SWR_ASSERT(pArena != nullptr);
> SWR_ASSERT(state.gsState.gsEnable);
>
> - // allocate arena space to hold GS output verts
> - // @todo pack attribs
> - // @todo support multiple streams
> + const SWR_GS_STATE& gsState = state.gsState;
>
> - const GsBufferInfo<SIMDVERTEX, SIMD_WIDTH> bufferInfo(state.gsState);
> + // Allocate storage for vertex inputs
> + uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;
> + pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);
>
> - const uint32_t vertexBufferSize = state.gsState.instanceCount * bufferInfo.vertexInstanceStride;
> + // Allocate arena space to hold GS output verts
> + const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;
>
> - *ppGsOut = pArena->AllocAligned(vertexBufferSize, SIMD_WIDTH * sizeof(float));
> + for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
> + {
> + pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32);
> + }
>
> - // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
> - // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
> + // Allocate storage for transposed GS output
> + uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
> + uint32_t transposedBufferSize = numSimdBatches * gsState.outputVertexSize * sizeof(typename SIMD_T::Vec4);
> + pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);
>
> - // allocate space for temporary per-stream cut buffer if multi-stream is enabled
> + // Allocate storage to hold temporary stream->cut buffer, if necessary
> if (state.gsState.isSingleStream)
> {
> - const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
> -
> - *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
> - *ppStreamCutBuffer = nullptr;
> + pGsBuffers->pStreamCutBuffer = nullptr;
> }
> else
> {
> - const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
> - const uint32_t streamCutBufferSize = state.gsState.instanceCount * bufferInfo.streamCutInstanceStride;
> -
> - *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
> - *ppStreamCutBuffer = pArena->AllocAligned(streamCutBufferSize, SIMD_WIDTH * sizeof(float));
> + pGsBuffers->pStreamCutBuffer = (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);
> }
> }
>
> @@ -1062,9 +1096,7 @@ static void TessellationStages(
> DRAW_CONTEXT *pDC,
> uint32_t workerId,
> PA_STATE& pa,
> - void* pGsOut,
> - void* pCutBuffer,
> - void* pCutStreamBuffer,
> + GsBuffers* pGsBuffers,
> uint32_t* pSoPrimData,
> #if USE_SIMD16_FRONTEND
> uint32_t numPrims_simd8,
> @@ -1264,17 +1296,16 @@ static void TessellationStages(
> {
> #if USE_SIMD16_FRONTEND
> tessPa.useAlternateOffset = false;
> - GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_lo, primID_lo);
> + GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);
>
> if (numPrims_hi)
> {
> tessPa.useAlternateOffset = true;
> - GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_hi, primID_hi);
> + GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
> }
> #else
> GeometryShaderStage<HasStreamOutT, HasRastT>(
> - pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
> - _simd_set1_epi32(dsContext.PrimitiveID));
> + pDC, workerId, tessPa, pGsBuffers, pSoPrimData, _simd_set1_epi32(dsContext.PrimitiveID));
> #endif
> }
> else
> @@ -1408,15 +1439,13 @@ void ProcessDraw(
> uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
> #endif
>
> - void* pGsOut = nullptr;
> - void* pCutBuffer = nullptr;
> - void* pStreamCutBuffer = nullptr;
> + GsBuffers gsBuffers;
> if (HasGeometryShaderT::value)
> {
> #if USE_SIMD16_FRONTEND
> - AllocateGsBuffers<simd16vertex, KNOB_SIMD16_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
> + AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
> #else
> - AllocateGsBuffers<simdvertex, KNOB_SIMD_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
> + AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
> #endif
> }
>
> @@ -1672,23 +1701,23 @@ void ProcessDraw(
> if (HasTessellationT::value)
> {
> pa.useAlternateOffset = false;
> - TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);
> + TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);
>
> if (numPrims_hi)
> {
> pa.useAlternateOffset = true;
> - TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
> + TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);
> }
> }
> else if (HasGeometryShaderT::value)
> {
> pa.useAlternateOffset = false;
> - GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);
> + GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);
>
> if (numPrims_hi)
> {
> pa.useAlternateOffset = true;
> - GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
> + GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);
> }
> }
> else
> @@ -1847,12 +1876,12 @@ void ProcessDraw(
> if (HasTessellationT::value)
> {
> TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
> - pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
> + pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));
> }
> else if (HasGeometryShaderT::value)
> {
> GeometryShaderStage<HasStreamOutT, HasRastT>(
> - pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
> + pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));
> }
> else
> {
> diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h
> index 13c1d8b..f7c9308 100644
> --- a/src/gallium/drivers/swr/rasterizer/core/state.h
> +++ b/src/gallium/drivers/swr/rasterizer/core/state.h
> @@ -301,13 +301,12 @@ struct SWR_DS_CONTEXT
> /////////////////////////////////////////////////////////////////////////
> struct SWR_GS_CONTEXT
> {
> - simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: input primitive data for SIMD prims
> - simdscalari PrimitiveID; // IN: input primitive ID generated from the draw call
> - uint32_t InstanceID; // IN: input instance ID
> - simdscalari mask; // IN: Active mask for shader
> - uint8_t* pStream; // OUT: output stream (contains vertices for all output streams)
> - uint8_t* pCutOrStreamIdBuffer; // OUT: cut or stream id buffer
> - simdscalari vertexCount; // OUT: num vertices emitted per SIMD lane
> + simdvector* pVerts; // IN: input primitive data for SIMD prims
> + uint32_t inputVertStride; // IN: input vertex stride, in attributes
> + simdscalari PrimitiveID; // IN: input primitive ID generated from the draw call
> + uint32_t InstanceID; // IN: input instance ID
> + simdscalari mask; // IN: Active mask for shader
> + uint8_t* pStreams[KNOB_SIMD_WIDTH]; // OUT: output stream (contains vertices for all output streams)
> };
>
> struct PixelPositions
> @@ -714,30 +713,56 @@ struct SWR_GS_STATE
> {
> bool gsEnable;
>
> - // number of input attributes per vertex. used by the frontend to
> + // Number of input attributes per vertex. Used by the frontend to
> // optimize assembling primitives for GS
> uint32_t numInputAttribs;
>
> - // output topology - can be point, tristrip, or linestrip
> + // Stride of incoming verts in attributes
> + uint32_t inputVertStride;
> +
> + // Output topology - can be point, tristrip, or linestrip
> PRIMITIVE_TOPOLOGY outputTopology; // @llvm_enum
>
> - // maximum number of verts that can be emitted by a single instance of the GS
> + // Maximum number of verts that can be emitted by a single instance of the GS
> uint32_t maxNumVerts;
>
> - // instance count
> + // Instance count
> uint32_t instanceCount;
>
> - // if true, geometry shader emits a single stream, with separate cut buffer.
> - // if false, geometry shader emits vertices for multiple streams to the stream buffer, with a separate StreamID buffer
> + // If true, geometry shader emits a single stream, with separate cut buffer.
> + // If false, geometry shader emits vertices for multiple streams to the stream buffer, with a separate StreamID buffer
> // to map vertices to streams
> bool isSingleStream;
>
> - // when single stream is enabled, singleStreamID dictates which stream is being output.
> + // When single stream is enabled, singleStreamID dictates which stream is being output.
> // field ignored if isSingleStream is false
> uint32_t singleStreamID;
>
> - // Offset to the start of the attributes of the input vertices, in simdvector units
> + // Total amount of memory to allocate for one instance of the shader output in bytes
> + uint32_t allocationSize;
> +
> + // Offset to the start of the attributes of the input vertices, in simdvector units, as read by the GS
> uint32_t vertexAttribOffset;
> +
> + // Offset to the attributes as stored by the preceding shader stage.
> + uint32_t srcVertexAttribOffset;
> +
> + // Size of the control data section which contains cut or streamID data, in simdscalar units. Should be sized to handle
> + // the maximum number of verts output by the GS. Can be 0 if there are no cuts or streamID bits.
> + uint32_t controlDataSize;
> +
> + // Offset to the control data section, in bytes
> + uint32_t controlDataOffset;
> +
> + // Total size of an output vertex, in simdvector units
> + uint32_t outputVertexSize;
> +
> + // Offset to the start of the vertex section, in bytes
> + uint32_t outputVertexOffset;
> +
> + // Set this to non-zero to indicate that the shader outputs a static number of verts. If zero, shader is
> + // expected to store the final vertex count in the first dword of the gs output stream.
> + uint32_t staticVertexCount;
> };
>
>
> diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp
> index 0a81eaa..7f11e72 100644
> --- a/src/gallium/drivers/swr/swr_shader.cpp
> +++ b/src/gallium/drivers/swr/swr_shader.cpp
> @@ -347,18 +347,20 @@ BuilderSWR::swr_gs_llvm_fetch_input(const struct lp_build_tgsi_gs_iface *gs_ifac
> Value *attrib =
> LOAD(GEP(iface->pVtxAttribMap, {C(0), unwrap(attrib_index)}));
>
> - Value *pInput =
> - LOAD(GEP(iface->pGsCtx,
> - {C(0),
> - C(SWR_GS_CONTEXT_vert),
> - unwrap(vertex_index),
> - C(0),
> - attrib,
> - unwrap(swizzle_index)}));
> + Value *pVertex = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pVerts});
> + Value *pInputVertStride = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_inputVertStride});
> +
> + Value *pVector = ADD(MUL(unwrap(vertex_index), pInputVertStride), attrib);
> +
> + Value *pInput = LOAD(GEP(pVertex, {pVector, unwrap(swizzle_index)}));
>
> return wrap(pInput);
> }
>
> +// GS output stream layout
> +#define VERTEX_COUNT_SIZE 32
> +#define CONTROL_HEADER_SIZE (8*32)
> +
> void
> BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base,
> struct lp_build_tgsi_context * bld_base,
> @@ -366,41 +368,19 @@ BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base
> LLVMValueRef emitted_vertices_vec)
> {
> swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base;
> - SWR_GS_STATE *pGS = iface->pGsState;
>
> IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
>
> -#if USE_SIMD16_FRONTEND
> - const uint32_t simdVertexStride = sizeof(simdvertex) * 2;
> - const uint32_t numSimdBatches = (pGS->maxNumVerts + (mVWidth * 2) - 1) / (mVWidth * 2);
> -#else
> - const uint32_t simdVertexStride = sizeof(simdvertex);
> - const uint32_t numSimdBatches = (pGS->maxNumVerts + mVWidth - 1) / mVWidth;
> -#endif
> - const uint32_t inputPrimStride = numSimdBatches * simdVertexStride;
> -
> - Value *pStream = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_pStream });
> - Value *vMask = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_mask });
> - Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, 8));
> + const uint32_t headerSize = VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE;
> + const uint32_t attribSize = 4 * sizeof(float);
> + const uint32_t vertSize = attribSize * SWR_VTX_NUM_SLOTS;
> + Value *pVertexOffset = MUL(unwrap(emitted_vertices_vec), VIMMED1(vertSize));
>
> - Value *vOffsets = C({
> - inputPrimStride * 0,
> - inputPrimStride * 1,
> - inputPrimStride * 2,
> - inputPrimStride * 3,
> - inputPrimStride * 4,
> - inputPrimStride * 5,
> - inputPrimStride * 6,
> - inputPrimStride * 7 } );
> + Value *vMask = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_mask});
> + Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, mVWidth));
>
> -#if USE_SIMD16_FRONTEND
> - const uint32_t simdShift = log2(mVWidth * 2);
> - Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), (mVWidth * 2) - 1);
> -#else
> - const uint32_t simdShift = log2(mVWidth);
> - Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), mVWidth - 1);
> -#endif
> - Value *vVertexSlot = ASHR(unwrap(emitted_vertices_vec), simdShift);
> + Value *pStack = STACKSAVE();
> + Value *pTmpPtr = ALLOCA(mFP32Ty, C(4)); // used for dummy write for lane masking
>
> for (uint32_t attrib = 0; attrib < iface->num_outputs; ++attrib) {
> uint32_t attribSlot = attrib;
> @@ -420,46 +400,36 @@ BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base
> }
> }
>
> -#if USE_SIMD16_FRONTEND
> - Value *vOffsetsAttrib =
> - ADD(vOffsets, MUL(vVertexSlot, VIMMED1((uint32_t)sizeof(simdvertex) * 2)));
> - vOffsetsAttrib =
> - ADD(vOffsetsAttrib, VIMMED1((uint32_t)(attribSlot*sizeof(simdvector) * 2)));
> -#else
> - Value *vOffsetsAttrib =
> - ADD(vOffsets, MUL(vVertexSlot, VIMMED1((uint32_t)sizeof(simdvertex))));
> - vOffsetsAttrib =
> - ADD(vOffsetsAttrib, VIMMED1((uint32_t)(attribSlot*sizeof(simdvector))));
> -#endif
> - vOffsetsAttrib =
> - ADD(vOffsetsAttrib, MUL(vSimdSlot, VIMMED1((uint32_t)sizeof(float))));
> + Value *pOutputOffset = ADD(pVertexOffset, VIMMED1(headerSize + attribSize * attribSlot)); // + sgvChannel ?
>
> - for (uint32_t channel = 0; channel < 4; ++channel) {
> - Value *vPtrs = GEP(pStream, vOffsetsAttrib);
> - Value *vData;
> + for (uint32_t lane = 0; lane < mVWidth; ++lane) {
> + Value *pLaneOffset = VEXTRACT(pOutputOffset, C(lane));
> + Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});
> + Value *pStreamOffset = GEP(pStream, pLaneOffset);
> + pStreamOffset = BITCAST(pStreamOffset, mFP32PtrTy);
>
> - if (attribSlot == VERTEX_SGV_SLOT)
> - vData = LOAD(unwrap(outputs[attrib][0]));
> - else
> - vData = LOAD(unwrap(outputs[attrib][channel]));
> + Value *pLaneMask = VEXTRACT(vMask1, C(lane));
> + pStreamOffset = SELECT(pLaneMask, pStreamOffset, pTmpPtr);
>
> - if (attribSlot != VERTEX_SGV_SLOT ||
> - sgvChannel == channel) {
> - vPtrs = BITCAST(vPtrs,
> - VectorType::get(PointerType::get(mFP32Ty, 0), 8));
> + for (uint32_t channel = 0; channel < 4; ++channel) {
> + Value *vData;
>
> - MASKED_SCATTER(vData, vPtrs, 32, vMask1);
> - }
> + if (attribSlot == VERTEX_SGV_SLOT)
> + vData = LOAD(unwrap(outputs[attrib][0]));
> + else
> + vData = LOAD(unwrap(outputs[attrib][channel]));
>
> -#if USE_SIMD16_FRONTEND
> - vOffsetsAttrib =
> - ADD(vOffsetsAttrib, VIMMED1((uint32_t)sizeof(simdscalar) * 2));
> -#else
> - vOffsetsAttrib =
> - ADD(vOffsetsAttrib, VIMMED1((uint32_t)sizeof(simdscalar)));
> -#endif
> + if (attribSlot != VERTEX_SGV_SLOT ||
> + sgvChannel == channel) {
> + vData = VEXTRACT(vData, C(lane));
> + STORE(vData, pStreamOffset);
> + }
> + pStreamOffset = GEP(pStreamOffset, C(1));
> + }
> }
> }
> +
> + STACKRESTORE(pStack);
> }
>
> void
> @@ -469,12 +439,9 @@ BuilderSWR::swr_gs_llvm_end_primitive(const struct lp_build_tgsi_gs_iface *gs_ba
> LLVMValueRef emitted_prims_vec)
> {
> swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base;
> - SWR_GS_STATE *pGS = iface->pGsState;
>
> IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
>
> - Value *pCutBuffer =
> - LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pCutOrStreamIdBuffer});
> Value *vMask = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_mask });
> Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, 8));
>
> @@ -496,31 +463,29 @@ BuilderSWR::swr_gs_llvm_end_primitive(const struct lp_build_tgsi_gs_iface *gs_ba
> mask = AND(mask, cmpMask);
> vMask1 = TRUNC(mask, VectorType::get(mInt1Ty, 8));
>
> - const uint32_t cutPrimStride =
> - (pGS->maxNumVerts + JM()->mVWidth - 1) / JM()->mVWidth;
> - Value *vOffsets = C({
> - (uint32_t)(cutPrimStride * 0),
> - (uint32_t)(cutPrimStride * 1),
> - (uint32_t)(cutPrimStride * 2),
> - (uint32_t)(cutPrimStride * 3),
> - (uint32_t)(cutPrimStride * 4),
> - (uint32_t)(cutPrimStride * 5),
> - (uint32_t)(cutPrimStride * 6),
> - (uint32_t)(cutPrimStride * 7) } );
> -
> vCount = SUB(vCount, VIMMED1(1));
> - Value *vOffset = ADD(UDIV(vCount, VIMMED1(8)), vOffsets);
> + Value *vOffset = ADD(UDIV(vCount, VIMMED1(8)), VIMMED1(VERTEX_COUNT_SIZE));
> Value *vValue = SHL(VIMMED1(1), UREM(vCount, VIMMED1(8)));
>
> vValue = TRUNC(vValue, VectorType::get(mInt8Ty, 8));
>
> - Value *vPtrs = GEP(pCutBuffer, vOffset);
> - vPtrs =
> - BITCAST(vPtrs, VectorType::get(PointerType::get(mInt8Ty, 0), JM()->mVWidth));
> + Value *pStack = STACKSAVE();
> + Value *pTmpPtr = ALLOCA(mInt8Ty, C(4)); // used for dummy read/write for lane masking
> +
> + for (uint32_t lane = 0; lane < mVWidth; ++lane) {
> + Value *vLaneOffset = VEXTRACT(vOffset, C(lane));
> + Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});
> + Value *pStreamOffset = GEP(pStream, vLaneOffset);
> +
> + Value *pLaneMask = VEXTRACT(vMask1, C(lane));
> + pStreamOffset = SELECT(pLaneMask, pStreamOffset, pTmpPtr);
>
> - Value *vGather = MASKED_GATHER(vPtrs, 32, vMask1);
> - vValue = OR(vGather, vValue);
> - MASKED_SCATTER(vValue, vPtrs, 32, vMask1);
> + Value *vVal = LOAD(pStreamOffset);
> + vVal = OR(vVal, VEXTRACT(vValue, C(lane)));
> + STORE(vVal, pStreamOffset);
> + }
> +
> + STACKRESTORE(pStack);
> }
>
> void
> @@ -533,7 +498,14 @@ BuilderSWR::swr_gs_llvm_epilogue(const struct lp_build_tgsi_gs_iface *gs_base,
>
> IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
>
> - STORE(unwrap(total_emitted_vertices_vec), iface->pGsCtx, {0, SWR_GS_CONTEXT_vertexCount});
> + // Store emit count to each output stream in the first DWORD
> + for (uint32_t lane = 0; lane < mVWidth; ++lane)
> + {
> + Value* pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});
> + pStream = BITCAST(pStream, mInt32PtrTy);
> + Value* pLaneCount = VEXTRACT(unwrap(total_emitted_vertices_vec), C(lane));
> + STORE(pLaneCount, pStream);
> + }
> }
>
> PFN_GS_FUNC
> @@ -542,6 +514,8 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key)
> SWR_GS_STATE *pGS = &ctx->gs->gsState;
> struct tgsi_shader_info *info = &ctx->gs->info.base;
>
> + memset(pGS, 0, sizeof(*pGS));
> +
> pGS->gsEnable = true;
>
> pGS->numInputAttribs = info->num_inputs;
> @@ -555,6 +529,18 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key)
> pGS->singleStreamID = 0;
>
> pGS->vertexAttribOffset = VERTEX_ATTRIB_START_SLOT; // TODO: optimize
> + pGS->srcVertexAttribOffset = VERTEX_ATTRIB_START_SLOT; // TODO: optimize
> + pGS->inputVertStride = pGS->numInputAttribs + pGS->vertexAttribOffset;
> + pGS->outputVertexSize = SWR_VTX_NUM_SLOTS;
> + pGS->controlDataSize = 8; // GS outputs max of 8 32B units
> + pGS->controlDataOffset = 32;
> + pGS->outputVertexOffset = pGS->controlDataOffset + pGS->controlDataSize * 32;
> +
> + pGS->allocationSize =
> + 32 + // vertex count
> + (8 * 32) + // control header
> + (SWR_VTX_NUM_SLOTS * 16) * // sizeof vertex
> + pGS->maxNumVerts; // num verts
Consider using the VERTEX_COUNT_SIZE and CONTROL_HEADER_SIZE defines here instead of the literal values? For example:
pGS->controlDataOffset = VERTEX_COUNT_SIZE;
pGS->outputVertexOffset = VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE;
pGS->allocationSize =
VERTEX_COUNT_SIZE + // vertex count
CONTROL_HEADER_SIZE + // control header
(SWR_VTX_NUM_SLOTS * 16) * // sizeof vertex
pGS->maxNumVerts; // num verts
> struct swr_geometry_shader *gs = ctx->gs;
>
> @@ -635,10 +621,11 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key)
> lp_type_float_vec(32, 32 * 8), wrap(mask_val));
>
> // zero out cut buffer so we can load/modify/store bits
> - MEMSET(LOAD(pGsCtx, {0, SWR_GS_CONTEXT_pCutOrStreamIdBuffer}),
> - C((char)0),
> - pGS->instanceCount * ((pGS->maxNumVerts + 7) / 8) * JM()->mVWidth,
> - sizeof(float) * KNOB_SIMD_WIDTH);
> + for (uint32_t lane = 0; lane < mVWidth; ++lane)
> + {
> + Value* pStream = LOAD(pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});
> + MEMSET(pStream, C((char)0), VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE, sizeof(float) * KNOB_SIMD_WIDTH);
> + }
>
> struct swr_gs_llvm_iface gs_iface;
> gs_iface.base.fetch_input = ::swr_gs_llvm_fetch_input;
> --
> 2.7.4
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list