Mesa (master): swr: [rasterizer core] split FE and BE stats

Tim Rowley torowley at kemper.freedesktop.org
Wed Aug 10 18:17:13 UTC 2016


Module: Mesa
Branch: master
Commit: 4e8763cb0904c30d1962cf5ad52fe3a87be7b4bd
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=4e8763cb0904c30d1962cf5ad52fe3a87be7b4bd

Author: Tim Rowley <timothy.o.rowley at intel.com>
Date:   Sat Aug  6 20:10:14 2016 -0600

swr: [rasterizer core] split FE and BE stats

Separated FE stats out into their own structure.  There are 17 FE vs 3 BE
stat fields.  Since there is only one FE thread per DC, we don't have to
loop over all the worker threads and sum up the FE stats.  This also
reduces the size of the DC, since we only need to store one copy of the FE
stats rather than one per worker.  Finally, we can use the new FE callback
mechanism to update these.

Signed-off-by: Tim Rowley <timothy.o.rowley at intel.com>

---

 src/gallium/drivers/swr/rasterizer/core/api.cpp    |  1 +
 src/gallium/drivers/swr/rasterizer/core/api.h      | 21 +++++++++----
 src/gallium/drivers/swr/rasterizer/core/clip.h     |  6 ++--
 src/gallium/drivers/swr/rasterizer/core/context.h  | 15 ++++++----
 .../drivers/swr/rasterizer/core/frontend.cpp       | 18 +++++------
 src/gallium/drivers/swr/rasterizer/core/state.h    | 16 ++++++++--
 .../drivers/swr/rasterizer/core/threads.cpp        | 21 ++++---------
 src/gallium/drivers/swr/swr_context.cpp            | 19 ++++++++++--
 src/gallium/drivers/swr/swr_context.h              |  1 +
 src/gallium/drivers/swr/swr_query.cpp              | 35 ++++++++++++----------
 src/gallium/drivers/swr/swr_query.h                |  1 +
 11 files changed, 95 insertions(+), 59 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 0797c8a..d6aa80d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -144,6 +144,7 @@ HANDLE SwrCreateContext(
     pContext->pfnClearTile = pCreateInfo->pfnClearTile;
     pContext->pfnUpdateSoWriteOffset = pCreateInfo->pfnUpdateSoWriteOffset;
     pContext->pfnUpdateStats = pCreateInfo->pfnUpdateStats;
+    pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE;
 
     // pass pointer to bucket manager back to caller
 #ifdef KNOB_ENABLE_RDTSC
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h
index 4ee04dc..ed18fe0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -95,6 +95,16 @@ typedef void(SWR_API *PFN_UPDATE_SO_WRITE_OFFSET)(HANDLE hPrivateContext,
 typedef void(SWR_API *PFN_UPDATE_STATS)(HANDLE hPrivateContext,
     const SWR_STATS* pStats);
 
+//////////////////////////////////////////////////////////////////////////
+/// @brief Callback to allow driver to update their copy of FE stats.
+/// @note It's optimal to have a separate callback for FE stats since
+///       there is only one FE thread per DC. This means we do not have
+///       to sum up the stats across all of the workers.
+/// @param hPrivateContext - handle to private data
+/// @param pStats - pointer to draw stats
+typedef void(SWR_API *PFN_UPDATE_STATS_FE)(HANDLE hPrivateContext,
+    const SWR_STATS_FE* pStats);
+
 class BucketManager;
 
 //////////////////////////////////////////////////////////////////////////
@@ -121,11 +131,12 @@ struct SWR_CREATECONTEXT_INFO
     uint32_t privateStateSize;
 
     // Callback functions
-    PFN_LOAD_TILE pfnLoadTile;
-    PFN_STORE_TILE pfnStoreTile;
-    PFN_CLEAR_TILE pfnClearTile;
-    PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
-    PFN_UPDATE_STATS pfnUpdateStats;
+    PFN_LOAD_TILE               pfnLoadTile;
+    PFN_STORE_TILE              pfnStoreTile;
+    PFN_CLEAR_TILE              pfnClearTile;
+    PFN_UPDATE_SO_WRITE_OFFSET  pfnUpdateSoWriteOffset;
+    PFN_UPDATE_STATS            pfnUpdateStats;
+    PFN_UPDATE_STATS_FE         pfnUpdateStatsFE;
 
     // Pointer to rdtsc buckets mgr returned to the caller.
     // Only populated when KNOB_ENABLE_RDTSC is set
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h
index b2b3bb4..a2ba769 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -495,7 +495,7 @@ public:
 
         // update global pipeline stat
         SWR_CONTEXT* pContext = this->pDC->pContext;
-        UPDATE_STAT(CPrimitives, numClippedPrims);
+        UPDATE_STAT_FE(CPrimitives, numClippedPrims);
     }
     
     // execute the clipper stage
@@ -523,7 +523,7 @@ public:
         // update clipper invocations pipeline stat
         SWR_CONTEXT* pContext = this->pDC->pContext;
         uint32_t numInvoc = _mm_popcnt_u32(primMask);
-        UPDATE_STAT(CInvocations, numInvoc);
+        UPDATE_STAT_FE(CInvocations, numInvoc);
 
         ComputeClipCodes(prim);
 
@@ -559,7 +559,7 @@ public:
         {
             // update CPrimitives pipeline state
             SWR_CONTEXT* pContext = this->pDC->pContext;
-            UPDATE_STAT(CPrimitives, _mm_popcnt_u32(validMask));
+            UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
 
             // forward valid prims directly to binner
             pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId);
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index c478ee9..144fcef 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -365,7 +365,8 @@ struct DRAW_DYNAMIC_STATE
     uint32_t SoWriteOffset[4];
     bool     SoWriteOffsetDirty[4];
 
-    SWR_STATS stats[KNOB_MAX_NUM_THREADS];
+    SWR_STATS_FE statsFE;   // Only one FE thread per DC.
+    SWR_STATS    stats[KNOB_MAX_NUM_THREADS];
 };
 
 // Draw Context
@@ -470,11 +471,12 @@ struct SWR_CONTEXT
     HotTileMgr *pHotTileMgr;
 
     // Callback functions, passed in at create context time
-    PFN_LOAD_TILE pfnLoadTile;
-    PFN_STORE_TILE pfnStoreTile;
-    PFN_CLEAR_TILE pfnClearTile;
-    PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
-    PFN_UPDATE_STATS pfnUpdateStats;
+    PFN_LOAD_TILE               pfnLoadTile;
+    PFN_STORE_TILE              pfnStoreTile;
+    PFN_CLEAR_TILE              pfnClearTile;
+    PFN_UPDATE_SO_WRITE_OFFSET  pfnUpdateSoWriteOffset;
+    PFN_UPDATE_STATS            pfnUpdateStats;
+    PFN_UPDATE_STATS_FE         pfnUpdateStatsFE;
 
     // Global Stats
     SWR_STATS stats[KNOB_MAX_NUM_THREADS];
@@ -492,3 +494,4 @@ void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId);
 void WakeAllThreads(SWR_CONTEXT *pContext);
 
 #define UPDATE_STAT(name, count) if (GetApiState(pDC).enableStats) { pDC->dynState.stats[workerId].name += count; }
+#define UPDATE_STAT_FE(name, count) if (GetApiState(pDC).enableStats) { pDC->dynState.statsFE.name += count; }
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index e32f743..3014c7d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -580,8 +580,8 @@ static void StreamOut(
         }
     }
 
-    UPDATE_STAT(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
-    UPDATE_STAT(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
+    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
+    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
 
     RDTSC_STOP(FEStreamout, 1, 0);
 }
@@ -843,8 +843,8 @@ static void GeometryShaderStage(
     }
 
     // update GS pipeline stats
-    UPDATE_STAT(GsInvocations, numInputPrims * pState->instanceCount);
-    UPDATE_STAT(GsPrimitives, totalPrimsGenerated);
+    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
+    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
 
     RDTSC_STOP(FEGeometryShader, 1, 0);
 }
@@ -1009,7 +1009,7 @@ static void TessellationStages(
     state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
     RDTSC_STOP(FEHullShader, 0, 0);
 
-    UPDATE_STAT(HsInvocations, numPrims);
+    UPDATE_STAT_FE(HsInvocations, numPrims);
 
     const uint32_t* pPrimId = (const uint32_t*)&primID;
 
@@ -1065,7 +1065,7 @@ static void TessellationStages(
 
             dsInvocations += KNOB_SIMD_WIDTH;
         }
-        UPDATE_STAT(DsInvocations, tsData.NumDomainPoints);
+        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
 
         PA_TESS tessPa(
             pDC,
@@ -1302,7 +1302,7 @@ void ProcessDraw(
                     *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                 }
 
-                UPDATE_STAT(IaVertices, GetNumInvocations(i, endVertex));
+                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
 
 #if KNOB_ENABLE_TOSS_POINTS
                 if (!KNOB_TOSS_FETCH)
@@ -1312,7 +1312,7 @@ void ProcessDraw(
                     state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                     RDTSC_STOP(FEVertexShader, 0, 0);
 
-                    UPDATE_STAT(VsInvocations, GetNumInvocations(i, endVertex));
+                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                 }
             }
 
@@ -1335,7 +1335,7 @@ void ProcessDraw(
                     {
                         if (assemble)
                         {
-                            UPDATE_STAT(IaPrimitives, pa.NumPrims());
+                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
 
                             if (HasTessellationT::value)
                             {
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h
index fdf5d7e..988de75 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -564,17 +564,27 @@ struct SWR_STATS
     uint64_t DepthPassCount; // Number of passing depth tests. Not exact.
 
     // Pipeline Stats
+    uint64_t PsInvocations;  // Number of Pixel Shader invocations
+    uint64_t CsInvocations;  // Number of Compute Shader invocations
+
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// SWR_STATS_FE
+///
+/// @brief All statistics generated by FE.
+/////////////////////////////////////////////////////////////////////////
+struct SWR_STATS_FE
+{
     uint64_t IaVertices;    // Number of Fetch Shader vertices
     uint64_t IaPrimitives;  // Number of PA primitives.
     uint64_t VsInvocations; // Number of Vertex Shader invocations
     uint64_t HsInvocations; // Number of Hull Shader invocations
     uint64_t DsInvocations; // Number of Domain Shader invocations
     uint64_t GsInvocations; // Number of Geometry Shader invocations
-    uint64_t PsInvocations; // Number of Pixel Shader invocations
-    uint64_t CsInvocations; // Number of Compute Shader invocations
+    uint64_t GsPrimitives;  // Number of prims GS outputs.
     uint64_t CInvocations;  // Number of clipper invocations
     uint64_t CPrimitives;   // Number of clipper primitives.
-    uint64_t GsPrimitives;  // Number of prims GS outputs.
 
     // Streamout Stats
     uint64_t SoPrimStorageNeeded[4];
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index fb17af1..dce23b2 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -322,23 +322,9 @@ INLINE void UpdateClientStats(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
     for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
     {
         stats.DepthPassCount += dynState.stats[i].DepthPassCount;
-        stats.IaVertices     += dynState.stats[i].IaVertices;
-        stats.IaPrimitives   += dynState.stats[i].IaPrimitives;
-        stats.VsInvocations  += dynState.stats[i].VsInvocations;
-        stats.HsInvocations  += dynState.stats[i].HsInvocations;
-        stats.DsInvocations  += dynState.stats[i].DsInvocations;
-        stats.GsInvocations  += dynState.stats[i].GsInvocations;
+
         stats.PsInvocations  += dynState.stats[i].PsInvocations;
-        stats.CInvocations   += dynState.stats[i].CInvocations;
         stats.CsInvocations  += dynState.stats[i].CsInvocations;
-        stats.CPrimitives    += dynState.stats[i].CPrimitives;
-        stats.GsPrimitives   += dynState.stats[i].GsPrimitives;
-
-        for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
-        {
-            stats.SoPrimStorageNeeded[stream] += dynState.stats[i].SoPrimStorageNeeded[stream];
-            stats.SoNumPrimsWritten[stream]   += dynState.stats[i].SoNumPrimsWritten[stream];
-        }
     }
 
     pContext->pfnUpdateStats(GetPrivateState(pDC), &stats);
@@ -560,6 +546,11 @@ INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
 {
     _ReadWriteBarrier();
 
+    if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStats)
+    {
+        pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &pDC->dynState.statsFE);
+    }
+
     if (pContext->pfnUpdateSoWriteOffset)
     {
         for (uint32_t i = 0; i < MAX_SO_BUFFERS; ++i)
diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp
index 53d2b93..15e60cd 100644
--- a/src/gallium/drivers/swr/swr_context.cpp
+++ b/src/gallium/drivers/swr/swr_context.cpp
@@ -355,15 +355,29 @@ swr_UpdateStats(HANDLE hPrivateContext, const SWR_STATS *pStats)
    struct swr_context *ctx = (struct swr_context *)pDC->swr_ctx;
 
    SWR_STATS *pSwrStats = &ctx->stats;
+
    pSwrStats->DepthPassCount += pStats->DepthPassCount;
+   pSwrStats->PsInvocations += pStats->PsInvocations;
+   pSwrStats->CsInvocations += pStats->CsInvocations;
+}
+
+static void
+swr_UpdateStatsFE(HANDLE hPrivateContext, const SWR_STATS_FE *pStats)
+{
+   swr_draw_context *pDC = (swr_draw_context*)hPrivateContext;
+
+   if (!pDC)
+      return;
+
+   struct swr_context *ctx = (struct swr_context *)pDC->swr_ctx;
+
+   SWR_STATS_FE *pSwrStats = &ctx->statsFE;
    pSwrStats->IaVertices += pStats->IaVertices;
    pSwrStats->IaPrimitives += pStats->IaPrimitives;
    pSwrStats->VsInvocations += pStats->VsInvocations;
    pSwrStats->HsInvocations += pStats->HsInvocations;
    pSwrStats->DsInvocations += pStats->DsInvocations;
    pSwrStats->GsInvocations += pStats->GsInvocations;
-   pSwrStats->PsInvocations += pStats->PsInvocations;
-   pSwrStats->CsInvocations += pStats->CsInvocations;
    pSwrStats->CInvocations += pStats->CInvocations;
    pSwrStats->CPrimitives += pStats->CPrimitives;
    pSwrStats->GsPrimitives += pStats->GsPrimitives;
@@ -389,6 +403,7 @@ swr_create_context(struct pipe_screen *p_screen, void *priv, unsigned flags)
    createInfo.pfnStoreTile = swr_StoreHotTile;
    createInfo.pfnClearTile = swr_StoreHotTileClear;
    createInfo.pfnUpdateStats = swr_UpdateStats;
+   createInfo.pfnUpdateStatsFE = swr_UpdateStatsFE;
    ctx->swrContext = SwrCreateContext(&createInfo);
 
    /* Init Load/Store/ClearTiles Tables */
diff --git a/src/gallium/drivers/swr/swr_context.h b/src/gallium/drivers/swr/swr_context.h
index 4133720..b4553fb 100644
--- a/src/gallium/drivers/swr/swr_context.h
+++ b/src/gallium/drivers/swr/swr_context.h
@@ -159,6 +159,7 @@ struct swr_context {
    struct swr_draw_context swrDC;
 
    SWR_STATS stats;
+   SWR_STATS_FE statsFE;
 
    unsigned dirty; /**< Mask of SWR_NEW_x flags */
 };
diff --git a/src/gallium/drivers/swr/swr_query.cpp b/src/gallium/drivers/swr/swr_query.cpp
index 35d0e53..c51c529 100644
--- a/src/gallium/drivers/swr/swr_query.cpp
+++ b/src/gallium/drivers/swr/swr_query.cpp
@@ -94,6 +94,7 @@ swr_gather_stats(struct pipe_context *pipe, struct swr_query *pq)
       /* TODO: should fence instead of stalling pipeline */
       SwrWaitForIdle(ctx->swrContext);
       memcpy(&result->core, &ctx->stats, sizeof(result->core));
+      memcpy(&result->coreFE, &ctx->statsFE, sizeof(result->coreFE));
 
 #if 0
       if (!pq->fence) {
@@ -150,17 +151,17 @@ swr_get_query_result(struct pipe_context *pipe,
       result->u64 = end->timestamp - start->timestamp;
       break;
    case PIPE_QUERY_PRIMITIVES_GENERATED:
-      result->u64 = end->core.IaPrimitives - start->core.IaPrimitives;
+      result->u64 = end->coreFE.IaPrimitives - start->coreFE.IaPrimitives;
       break;
    case PIPE_QUERY_PRIMITIVES_EMITTED:
-      result->u64 = end->core.SoNumPrimsWritten[index]
-         - start->core.SoNumPrimsWritten[index];
+      result->u64 = end->coreFE.SoNumPrimsWritten[index]
+         - start->coreFE.SoNumPrimsWritten[index];
       break;
    /* Structures */
    case PIPE_QUERY_SO_STATISTICS: {
       struct pipe_query_data_so_statistics *so_stats = &result->so_statistics;
-      struct SWR_STATS *start = &pq->start.core;
-      struct SWR_STATS *end = &pq->end.core;
+      struct SWR_STATS_FE *start = &pq->start.coreFE;
+      struct SWR_STATS_FE *end = &pq->end.coreFE;
       so_stats->num_primitives_written =
          end->SoNumPrimsWritten[index] - start->SoNumPrimsWritten[index];
       so_stats->primitives_storage_needed =
@@ -176,21 +177,23 @@ swr_get_query_result(struct pipe_context *pipe,
          &result->pipeline_statistics;
       struct SWR_STATS *start = &pq->start.core;
       struct SWR_STATS *end = &pq->end.core;
-      p_stats->ia_vertices = end->IaVertices - start->IaVertices;
-      p_stats->ia_primitives = end->IaPrimitives - start->IaPrimitives;
-      p_stats->vs_invocations = end->VsInvocations - start->VsInvocations;
-      p_stats->gs_invocations = end->GsInvocations - start->GsInvocations;
-      p_stats->gs_primitives = end->GsPrimitives - start->GsPrimitives;
-      p_stats->c_invocations = end->CPrimitives - start->CPrimitives;
-      p_stats->c_primitives = end->CPrimitives - start->CPrimitives;
+      struct SWR_STATS_FE *startFE = &pq->start.coreFE;
+      struct SWR_STATS_FE *endFE = &pq->end.coreFE;
+      p_stats->ia_vertices = endFE->IaVertices - startFE->IaVertices;
+      p_stats->ia_primitives = endFE->IaPrimitives - startFE->IaPrimitives;
+      p_stats->vs_invocations = endFE->VsInvocations - startFE->VsInvocations;
+      p_stats->gs_invocations = endFE->GsInvocations - startFE->GsInvocations;
+      p_stats->gs_primitives = endFE->GsPrimitives - startFE->GsPrimitives;
+      p_stats->c_invocations = endFE->CPrimitives - startFE->CPrimitives;
+      p_stats->c_primitives = endFE->CPrimitives - startFE->CPrimitives;
       p_stats->ps_invocations = end->PsInvocations - start->PsInvocations;
-      p_stats->hs_invocations = end->HsInvocations - start->HsInvocations;
-      p_stats->ds_invocations = end->DsInvocations - start->DsInvocations;
+      p_stats->hs_invocations = endFE->HsInvocations - startFE->HsInvocations;
+      p_stats->ds_invocations = endFE->DsInvocations - startFE->DsInvocations;
       p_stats->cs_invocations = end->CsInvocations - start->CsInvocations;
     } break;
    case PIPE_QUERY_SO_OVERFLOW_PREDICATE: {
-      struct SWR_STATS *start = &pq->start.core;
-      struct SWR_STATS *end = &pq->end.core;
+      struct SWR_STATS_FE *start = &pq->start.coreFE;
+      struct SWR_STATS_FE *end = &pq->end.coreFE;
       uint64_t num_primitives_written =
          end->SoNumPrimsWritten[index] - start->SoNumPrimsWritten[index];
       uint64_t primitives_storage_needed =
diff --git a/src/gallium/drivers/swr/swr_query.h b/src/gallium/drivers/swr/swr_query.h
index 0ab034d..931d687 100644
--- a/src/gallium/drivers/swr/swr_query.h
+++ b/src/gallium/drivers/swr/swr_query.h
@@ -29,6 +29,7 @@
 
 struct swr_query_result {
    SWR_STATS core;
+   SWR_STATS_FE coreFE;
    uint64_t timestamp;
 };
 




More information about the mesa-commit mailing list