[Mesa-dev] [PATCH 16/16] swr/rast: Fix read-back of viewport array index

Tim Rowley timothy.o.rowley at intel.com
Thu Jun 15 18:37:17 UTC 2017


Binner/clipper read viewport array index from the vertex header as needed.
Move viewport state to BACKEND_STATE.
---
 src/gallium/drivers/swr/rasterizer/core/api.cpp    |   4 +-
 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 129 +++++++++++++++++----
 src/gallium/drivers/swr/rasterizer/core/clip.cpp   |  24 ++--
 src/gallium/drivers/swr/rasterizer/core/clip.h     |  63 +++++++---
 src/gallium/drivers/swr/rasterizer/core/context.h  |   4 +-
 .../drivers/swr/rasterizer/core/frontend.cpp       |  49 +-------
 src/gallium/drivers/swr/rasterizer/core/frontend.h |   8 +-
 src/gallium/drivers/swr/rasterizer/core/state.h    |   4 +-
 src/gallium/drivers/swr/swr_shader.cpp             |   2 -
 src/gallium/drivers/swr/swr_state.cpp              |  12 +-
 10 files changed, 182 insertions(+), 117 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index eacce1c..ae9ced2 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -680,7 +680,7 @@ void SwrSetBlendFunc(
 // update guardband multipliers for the viewport
 void updateGuardbands(API_STATE *pState)
 {
-    uint32_t numGbs = pState->backendState.readRenderTargetArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
+    uint32_t numGbs = pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
 
     for(uint32_t i = 0; i < numGbs; ++i)
     {
@@ -736,7 +736,7 @@ void SwrSetScissorRects(
 void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
 {
     API_STATE *pState = &pDC->pState->state;
-    uint32_t numScissors = pState->gsState.emitsViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
+    uint32_t numScissors = pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
     pState->scissorsTileAligned = true;
 
     for (uint32_t index = 0; index < numScissors; ++index)
diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index a73816b..036d8b1 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -434,8 +434,7 @@ void BinTriangles(
     uint32_t workerId,
     simdvector tri[3],
     uint32_t triMask,
-    simdscalari primID,
-    simdscalari viewportIdx)
+    simdscalari primID)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
 
@@ -451,6 +450,21 @@ void BinTriangles(
     simdscalar vRecipW1 = _simd_set1_ps(1.0f);
     simdscalar vRecipW2 = _simd_set1_ps(1.0f);
 
+    // Read viewport array index if needed
+    simdscalari viewportIdx = _simd_set1_epi32(0);
+    if (state.backendState.readViewportArrayIndex)
+    {
+        simdvector vpiAttrib[3];
+        pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
+
+        // OOB indices => forced to zero.
+        simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+        vpai = _simd_max_epi32(_simd_setzero_si(), vpai);
+        simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+        simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports);
+        viewportIdx = _simd_and_si(vClearMask, vpai);
+    }
+
     if (feState.vpTransformDisable)
     {
         // RHW is passed in directly when VP transform is disabled
@@ -478,7 +492,7 @@ void BinTriangles(
         tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
 
         // Viewport transform to screen space coords
-        if (state.gsState.emitsViewportArrayIndex)
+        if (state.backendState.readViewportArrayIndex)
         {
             viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
         }
@@ -661,7 +675,7 @@ void BinTriangles(
     // Gather the AOS effective scissor rects based on the per-prim VP index.
     /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
     simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
-    if (state.gsState.emitsViewportArrayIndex)
+    if (state.backendState.readViewportArrayIndex)
     {
         GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
             scisXmin, scisYmin, scisXmax, scisYmax);
@@ -863,8 +877,7 @@ void SIMDAPI BinTriangles_simd16(
     uint32_t workerId,
     simd16vector tri[3],
     uint32_t triMask,
-    simd16scalari primID,
-    simd16scalari viewportIdx)
+    simd16scalari primID)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
 
@@ -880,6 +893,20 @@ void SIMDAPI BinTriangles_simd16(
     simd16scalar vRecipW0 = _simd16_set1_ps(1.0f);
     simd16scalar vRecipW1 = _simd16_set1_ps(1.0f);
     simd16scalar vRecipW2 = _simd16_set1_ps(1.0f);
+    
+    simd16scalari viewportIdx = _simd16_set1_epi32(0);
+    if (state.backendState.readViewportArrayIndex)
+    {
+        simd16vector vpiAttrib[3];
+        pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib);
+
+        // OOB indices => forced to zero.
+        simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+        vpai = _simd16_max_epi32(_simd16_setzero_si(), vpai);
+        simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+        simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports);
+        viewportIdx = _simd16_and_si(vClearMask, vpai);
+    }
 
     if (feState.vpTransformDisable)
     {
@@ -908,7 +935,7 @@ void SIMDAPI BinTriangles_simd16(
         tri[2].v[2] = _simd16_mul_ps(tri[2].v[2], vRecipW2);
 
         // Viewport transform to screen space coords
-        if (state.gsState.emitsViewportArrayIndex)
+        if (state.backendState.readViewportArrayIndex)
         {
             viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
         }
@@ -1101,7 +1128,7 @@ void SIMDAPI BinTriangles_simd16(
     /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
     simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
 
-    if (state.gsState.emitsViewportArrayIndex)
+    if (state.backendState.readViewportArrayIndex)
     {
         GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
             scisXmin, scisYmin, scisXmax, scisYmax);
@@ -1524,7 +1551,7 @@ void BinPostSetupPoints(
         // Gather the AOS effective scissor rects based on the per-prim VP index.
         /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
         simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
-        if (state.gsState.emitsViewportArrayIndex)
+        if (state.backendState.readViewportArrayIndex)
         {
             GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
                 scisXmin, scisYmin, scisXmax, scisYmax);
@@ -1672,8 +1699,7 @@ void BinPoints(
     uint32_t workerId,
     simdvector prim[3],
     uint32_t primMask,
-    simdscalari primID,
-    simdscalari viewportIdx)
+    simdscalari primID)
 {
     simdvector& primVerts = prim[0];
 
@@ -1681,6 +1707,21 @@ void BinPoints(
     const SWR_FRONTEND_STATE& feState = state.frontendState;
     const SWR_RASTSTATE& rastState = state.rastState;
 
+    // Read back viewport index if required
+    simdscalari viewportIdx = _simd_set1_epi32(0);
+    if (state.backendState.readViewportArrayIndex)
+    {
+        simdvector vpiAttrib[1];
+        pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
+        simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+
+        // OOB indices => forced to zero.
+        vpai = _simd_max_epi32(_simd_setzero_si(), vpai);
+        simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+        simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports);
+        viewportIdx = _simd_and_si(vClearMask, vpai);
+    }
+
     if (!feState.vpTransformDisable)
     {
         // perspective divide
@@ -1690,7 +1731,7 @@ void BinPoints(
         primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);
 
         // viewport transform to screen coords
-        if (state.gsState.emitsViewportArrayIndex)
+        if (state.backendState.readViewportArrayIndex)
         {
             viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
         }
@@ -1898,7 +1939,7 @@ void BinPostSetupPoints_simd16(
         // Gather the AOS effective scissor rects based on the per-prim VP index.
         /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
         simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
-        if (state.gsState.emitsViewportArrayIndex)
+        if (state.backendState.readViewportArrayIndex)
         {
             GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
                 scisXmin, scisYmin, scisXmax, scisYmax);
@@ -2040,8 +2081,7 @@ void SIMDAPI BinPoints_simd16(
     uint32_t workerId,
     simd16vector prim[3],
     uint32_t primMask,
-    simd16scalari primID,
-    simd16scalari viewportIdx)
+    simd16scalari primID)
 {
     simd16vector& primVerts = prim[0];
 
@@ -2049,6 +2089,21 @@ void SIMDAPI BinPoints_simd16(
     const SWR_FRONTEND_STATE& feState = state.frontendState;
     const SWR_RASTSTATE& rastState = state.rastState;
 
+    // Read back viewport index if required
+    simd16scalari viewportIdx = _simd16_set1_epi32(0);
+    if (state.backendState.readViewportArrayIndex)
+    {
+        simd16vector vpiAttrib[1];
+        pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib);
+
+        // OOB indices => forced to zero.
+        simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+        vpai = _simd16_max_epi32(_simd16_setzero_si(), vpai)
+        simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+        simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports);
+        viewportIdx = _simd16_and_si(vClearMask, vpai);
+    }
+
     if (!feState.vpTransformDisable)
     {
         // perspective divide
@@ -2059,7 +2114,7 @@ void SIMDAPI BinPoints_simd16(
         primVerts.z = _simd16_mul_ps(primVerts.z, vRecipW0);
 
         // viewport transform to screen coords
-        if (state.gsState.emitsViewportArrayIndex)
+        if (state.backendState.readViewportArrayIndex)
         {
             viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
         }
@@ -2165,7 +2220,7 @@ void BinPostSetupLines(
 
     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
     simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
-    if (state.gsState.emitsViewportArrayIndex)
+    if (state.backendState.readViewportArrayIndex)
     {
         GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
             scisXmin, scisYmin, scisXmax, scisYmax);
@@ -2370,7 +2425,7 @@ void BinPostSetupLines_simd16(
     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
     simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
 
-    if (state.gsState.emitsViewportArrayIndex)
+    if (state.backendState.readViewportArrayIndex)
     {
         GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
             scisXmin, scisYmin, scisXmax, scisYmax);
@@ -2533,8 +2588,7 @@ void BinLines(
     uint32_t workerId,
     simdvector prim[],
     uint32_t primMask,
-    simdscalari primID,
-    simdscalari viewportIdx)
+    simdscalari primID)
 {
     const API_STATE& state = GetApiState(pDC);
     const SWR_RASTSTATE& rastState = state.rastState;
@@ -2542,6 +2596,20 @@ void BinLines(
 
     simdscalar vRecipW[2] = { _simd_set1_ps(1.0f), _simd_set1_ps(1.0f) };
 
+    simdscalari viewportIdx = _simd_set1_epi32(0);
+    if (state.backendState.readViewportArrayIndex)
+    {
+        simdvector vpiAttrib[2];
+        pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
+        simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+        vpai = _simd_max_epi32(_simd_setzero_si(), vpai);
+
+        // OOB indices => forced to zero.
+        simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+        simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports);
+        viewportIdx = _simd_and_si(vClearMask, vpai);
+    }
+
     if (!feState.vpTransformDisable)
     {
         // perspective divide
@@ -2558,7 +2626,7 @@ void BinLines(
         prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW[1]);
 
         // viewport transform to screen coords
-        if (state.gsState.emitsViewportArrayIndex)
+        if (state.backendState.readViewportArrayIndex)
         {
             viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
         }
@@ -2594,8 +2662,7 @@ void SIMDAPI BinLines_simd16(
     uint32_t workerId,
     simd16vector prim[3],
     uint32_t primMask,
-    simd16scalari primID,
-    simd16scalari viewportIdx)
+    simd16scalari primID)
 {
     const API_STATE& state = GetApiState(pDC);
     const SWR_RASTSTATE& rastState = state.rastState;
@@ -2603,6 +2670,20 @@ void SIMDAPI BinLines_simd16(
 
     simd16scalar vRecipW[2] = { _simd16_set1_ps(1.0f), _simd16_set1_ps(1.0f) };
 
+    simd16scalari viewportIdx = _simd16_set1_epi32(0);
+    if (state.backendState.readViewportArrayIndex)
+    {
+        simd16vector vpiAttrib[2];
+        pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib);
+
+        // OOB indices => forced to zero.
+        simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+        vpai = _simd16_max_epi32(_simd16_setzero_si(), vpai);
+        simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+        simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports);
+        viewportIdx = _simd16_and_si(vClearMask, vpai);
+    }
+
     if (!feState.vpTransformDisable)
     {
         // perspective divide
@@ -2619,7 +2700,7 @@ void SIMDAPI BinLines_simd16(
         prim[1].v[2] = _simd16_mul_ps(prim[1].v[2], vRecipW[1]);
 
         // viewport transform to screen coords
-        if (state.gsState.emitsViewportArrayIndex)
+        if (state.backendState.readViewportArrayIndex)
         {
             viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
         }
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
index c93e0fb..bd62b58 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
@@ -160,35 +160,35 @@ int ClipTriToPlane( const float *pInPts, int numInPts,
     return i;
 }
 
-void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
+void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
     AR_BEGIN(FEClipTriangles, pDC->drawId);
     Clipper<3> clipper(workerId, pDC);
-    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx);
+    clipper.ExecuteStage(pa, prims, primMask, primId);
     AR_END(FEClipTriangles, 1);
 }
 
-void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
+void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
     AR_BEGIN(FEClipLines, pDC->drawId);
     Clipper<2> clipper(workerId, pDC);
-    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx);
+    clipper.ExecuteStage(pa, prims, primMask, primId);
     AR_END(FEClipLines, 1);
 }
 
-void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
+void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
     AR_BEGIN(FEClipPoints, pDC->drawId);
     Clipper<1> clipper(workerId, pDC);
-    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx);
+    clipper.ExecuteStage(pa, prims, primMask, primId);
     AR_END(FEClipPoints, 1);
 }
 
 #if USE_SIMD16_FRONTEND
-void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx)
+void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
     AR_BEGIN(FEClipTriangles, pDC->drawId);
@@ -198,12 +198,12 @@ void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t work
     Clipper<VERTS_PER_PRIM> clipper(workerId, pDC);
 
     pa.useAlternateOffset = false;
-    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx);
+    clipper.ExecuteStage(pa, prims, primMask, primId);
 
     AR_END(FEClipTriangles, 1);
 }
 
-void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx)
+void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
     AR_BEGIN(FEClipLines, pDC->drawId);
@@ -213,12 +213,12 @@ void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId
     Clipper<VERTS_PER_PRIM> clipper(workerId, pDC);
 
     pa.useAlternateOffset = false;
-    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx);
+    clipper.ExecuteStage(pa, prims, primMask, primId);
 
     AR_END(FEClipLines, 1);
 }
 
-void SIMDAPI ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx)
+void SIMDAPI ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
     AR_BEGIN(FEClipPoints, pDC->drawId);
@@ -228,7 +228,7 @@ void SIMDAPI ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerI
     Clipper<VERTS_PER_PRIM> clipper(workerId, pDC);
 
     pa.useAlternateOffset = false;
-    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx);
+    clipper.ExecuteStage(pa, prims, primMask, primId);
 
     AR_END(FEClipPoints, 1);
 }
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h
index 9235618..12b52c5 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -459,7 +459,7 @@ public:
 
 #endif
     // clip SIMD primitives
-    void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId, const simdscalari& vViewportIdx)
+    void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId)
     {
         // input/output vertex store for clipper
         simdvertex vertices[7]; // maximum 7 verts generated per triangle
@@ -559,7 +559,6 @@ public:
         
         uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts;
         uint32_t* pPrimitiveId = (uint32_t*)&vPrimId;
-        uint32_t* pViewportIdx = (uint32_t*)&vViewportIdx;
 
         const simdscalari vOffsets = _mm256_set_epi32(
             0 * sizeof(simdvertex),  // unused lane
@@ -697,7 +696,7 @@ public:
                         }
 
                         clipPa.useAlternateOffset = false;
-                        pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]), _simd_set1_epi32(pViewportIdx[inputPrim]));
+                        pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]));
                     }
 #else
                     simdvector attrib[NumVertsPerPrim];
@@ -705,7 +704,7 @@ public:
                     if (assemble)
                     {
                         static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
-                        pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]), _simd_set1_epi32(pViewportIdx[inputPrim]));
+                        pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]));
                     }
 #endif
                 } while (clipPa.NextPrim());
@@ -717,7 +716,7 @@ public:
     }
     
 #if USE_SIMD16_FRONTEND
-    void ClipSimd(const simd16scalar& vPrimMask, const simd16scalar& vClipMask, PA_STATE& pa, const simd16scalari& vPrimId, const simd16scalari& vViewportIdx)
+    void ClipSimd(const simd16scalar& vPrimMask, const simd16scalar& vClipMask, PA_STATE& pa, const simd16scalari& vPrimId)
     {
         // input/output vertex store for clipper
         simd16vertex vertices[7]; // maximum 7 verts generated per triangle
@@ -817,7 +816,6 @@ public:
 
         uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts;
         uint32_t* pPrimitiveId = (uint32_t*)&vPrimId;
-        uint32_t* pViewportIdx = (uint32_t*)&vViewportIdx;
 
         const simdscalari vOffsets = _simd_set_epi32(
             0 * sizeof(simd16vertex),   // unused lane
@@ -928,7 +926,7 @@ public:
                         static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff, 0x7ff, 0xfff, 0x1fff, 0x3fff, 0x7fff, 0xffff };
 
                         clipPa.useAlternateOffset = false;
-                        pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd16_set1_epi32(pPrimitiveId[inputPrim]), _simd16_set1_epi32(pViewportIdx[inputPrim]));
+                        pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd16_set1_epi32(pPrimitiveId[inputPrim]));
                     }
 
                 } while (clipPa.NextPrim());
@@ -945,7 +943,7 @@ public:
 
 #endif
     // execute the clipper stage
-    void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
+    void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId)
     {
         SWR_ASSERT(this->pDC != nullptr);
         SWR_CONTEXT* pContext = this->pDC->pContext;
@@ -973,6 +971,20 @@ public:
         // update clipper invocations pipeline stat
         uint32_t numInvoc = _mm_popcnt_u32(primMask);
         UPDATE_STAT_FE(CInvocations, numInvoc);
+        
+        // Read back viewport index if required
+        simdscalari viewportIdx = _simd_set1_epi32(0);
+        if (state.backendState.readViewportArrayIndex)
+        {
+            simdvector vpiAttrib[NumVertsPerPrim];
+            pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
+            simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+
+            // OOB indices => forced to zero.
+            simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+            simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports);
+            viewportIdx = _simd_and_si(vClearMask, vpai);
+        }
 
         ComputeClipCodes(prim, viewportIdx);
 
@@ -1001,7 +1013,7 @@ public:
             AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
             // we have to clip tris, execute the clipper, which will also
             // call the binner
-            ClipSimd(vMask(primMask), vMask(clipMask), pa, primId, viewportIdx);
+            ClipSimd(vMask(primMask), vMask(clipMask), pa, primId);
             AR_END(FEGuardbandClip, 1);
         }
         else if (validMask)
@@ -1010,12 +1022,12 @@ public:
             UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
 
             // forward valid prims directly to binner
-            pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx);
+            pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId);
         }
     }
 
 #if USE_SIMD16_FRONTEND
-    void ExecuteStage(PA_STATE& pa, simd16vector prim[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx)
+    void ExecuteStage(PA_STATE& pa, simd16vector prim[], uint32_t primMask, simd16scalari primId)
     {
         SWR_ASSERT(pa.pDC != nullptr);
         SWR_CONTEXT* pContext = pa.pDC->pContext;
@@ -1043,6 +1055,19 @@ public:
         uint32_t numInvoc = _mm_popcnt_u32(primMask);
         UPDATE_STAT_FE(CInvocations, numInvoc);
 
+        // Read back viewport index if required
+        simd16scalari viewportIdx = _simd16_set1_epi32(0);
+        if (state.backendState.readViewportArrayIndex)
+        {
+            simd16vector vpiAttrib[NumVertsPerPrim];
+            pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib);
+
+            // OOB indices => forced to zero.
+            simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+            simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+            simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports);
+            viewportIdx = _simd16_and_si(vClearMask, vpai);
+        }
         ComputeClipCodes(prim, viewportIdx);
 
         // cull prims with NAN coords
@@ -1070,7 +1095,7 @@ public:
             AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
             // we have to clip tris, execute the clipper, which will also
             // call the binner
-            ClipSimd(vMask16(primMask), vMask16(clipMask), pa, primId, viewportIdx);
+            ClipSimd(vMask16(primMask), vMask16(clipMask), pa, primId);
             AR_END(FEGuardbandClip, 1);
         }
         else if (validMask)
@@ -1079,7 +1104,7 @@ public:
             UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
 
             // forward valid prims directly to binner
-            pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx);
+            pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId);
         }
     }
 
@@ -1854,12 +1879,12 @@ private:
 
 
 // pipeline stage functions
-void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
-void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
-void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
+void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
+void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
+void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
 #if USE_SIMD16_FRONTEND
-void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx);
-void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx);
-void SIMDAPI ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx);
+void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
+void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
+void SIMDAPI ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
 #endif
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index f60ddfd..81bf9ff 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -214,12 +214,12 @@ struct PA_STATE;
 
 // function signature for pipeline stages that execute after primitive assembly
 typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], 
-    uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
+    uint32_t primMask, simdscalari primID);
 
 #if ENABLE_AVX512_SIMD16
 // function signature for pipeline stages that execute after primitive assembly
 typedef void(SIMDAPI *PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[],
-    uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
+    uint32_t primMask, simd16scalari primID);
 
 #endif
 OSALIGNLINE(struct) API_STATE
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index c11a35a..1cd166d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -950,48 +950,11 @@ static void GeometryShaderStage(
 #if USE_SIMD16_FRONTEND
                                 simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
 
-                                // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
-                                simd16scalari vViewPortIdx;
-                                if (state.gsState.emitsViewportArrayIndex)
-                                {
-                                    simd16vector vpiAttrib[3];
-                                    gsPa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib);
-
-                                    // OOB indices => forced to zero.
-                                    simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
-                                    simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
-                                    simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports);
-                                    vViewPortIdx = _simd16_and_si(vClearMask, vpai);
-                                }
-                                else
-                                {
-                                    vViewPortIdx = _simd16_set1_epi32(0);
-                                }
-
                                 gsPa.useAlternateOffset = false;
-                                pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
+                                pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId);
 #else
                                 simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
-
-                                // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
-                                simdscalari vViewPortIdx;
-                                if (state.gsState.emitsViewportArrayIndex)
-                                {
-                                    simdvector vpiAttrib[3];
-                                    gsPa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
-                                    simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
-
-                                    // OOB indices => forced to zero.
-                                    simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
-                                    simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports);
-                                    vViewPortIdx = _simd_and_si(vClearMask, vpai);
-                                }
-                                else
-                                {
-                                    vViewPortIdx = _simd_set1_epi32(0);
-                                }
-
-                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
+                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId);
 #endif
                             }
                         }
@@ -1340,10 +1303,10 @@ static void TessellationStages(
                     SWR_ASSERT(pfnClipFunc);
 #if USE_SIMD16_FRONTEND
                     tessPa.useAlternateOffset = false;
-                    pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID, _simd16_set1_epi32(0));
+                    pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID);
 #else
                     pfnClipFunc(pDC, tessPa, workerId, prim,
-                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
+                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID));
 #endif
                 }
             }
@@ -1702,7 +1665,7 @@ void ProcessDraw(
                                     SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
 
                                     pa.useAlternateOffset = false;
-                                    pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID, _simd16_setzero_si());
+                                    pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID);
                                 }
                             }
                         }
@@ -1864,7 +1827,7 @@ void ProcessDraw(
                                     SWR_ASSERT(pDC->pState->pfnProcessPrims);
 
                                     pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
-                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
+                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
                                 }
                             }
                         }
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h
index 65b7f02..3c2361e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h
@@ -388,10 +388,10 @@ PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative);
 #endif
 
 struct PA_STATE_BASE;  // forward decl
-void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
-void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
+void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
+void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
 #if USE_SIMD16_FRONTEND
-void SIMDAPI BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
-void SIMDAPI BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
+void SIMDAPI BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
+void SIMDAPI BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
 #endif
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h
index 94a5071..2440d44 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -710,9 +710,6 @@ struct SWR_GS_STATE
     // instance count
     uint32_t instanceCount;
 
-    // geometry shader emits ViewportArrayIndex
-    bool emitsViewportArrayIndex;
-
     // if true, geometry shader emits a single stream, with separate cut buffer.
     // if false, geometry shader emits vertices for multiple streams to the stream buffer, with a separate StreamID buffer
     // to map vertices to streams
@@ -1049,6 +1046,7 @@ struct SWR_BACKEND_STATE
     SWR_ATTRIB_SWIZZLE swizzleMap[32];
 
     bool readRenderTargetArrayIndex;    // Forward render target array index from last FE stage to the backend
+    bool readViewportArrayIndex;        // Read viewport array index from last FE stage during binning
 };
 
 
diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp
index f4029be..dfc54fa 100644
--- a/src/gallium/drivers/swr/swr_shader.cpp
+++ b/src/gallium/drivers/swr/swr_shader.cpp
@@ -547,8 +547,6 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key)
    pGS->maxNumVerts = info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES];
    pGS->instanceCount = info->properties[TGSI_PROPERTY_GS_INVOCATIONS];
 
-   pGS->emitsViewportArrayIndex = info->writes_viewport_index;
-
    // XXX: single stream for now...
    pGS->isSingleStream = true;
    pGS->singleStreamID = 0;
diff --git a/src/gallium/drivers/swr/swr_state.cpp b/src/gallium/drivers/swr/swr_state.cpp
index 0620860..b4f4ce0 100644
--- a/src/gallium/drivers/swr/swr_state.cpp
+++ b/src/gallium/drivers/swr/swr_state.cpp
@@ -1744,12 +1744,12 @@ swr_update_derived(struct pipe_context *pipe,
       (ctx->rasterizer->flatshade ? ctx->fs->flatConstantMask : 0);
    backendState.pointSpriteTexCoordMask = ctx->fs->pointSpriteMask;
 
-   if (ctx->gs)
-      backendState.readRenderTargetArrayIndex =
-         ctx->gs->info.base.writes_layer;
-   else
-      backendState.readRenderTargetArrayIndex =
-         ctx->vs->info.base.writes_layer;
+   struct tgsi_shader_info *pLastFE =
+      ctx->gs ?
+      &ctx->gs->info.base :
+      &ctx->vs->info.base;
+   backendState.readRenderTargetArrayIndex = pLastFE->writes_layer;
+   backendState.readViewportArrayIndex = pLastFE->writes_viewport_index;
 
    SwrSetBackendState(ctx->swrContext, &backendState);
 
-- 
2.7.4



More information about the mesa-dev mailing list