Mesa (main): freedreno/pps: Expose same counters as blob

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Thu Feb 10 15:59:45 UTC 2022


Module: Mesa
Branch: main
Commit: b84f0596808574bb0d37355a896eaaf1aafe277f
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=b84f0596808574bb0d37355a896eaaf1aafe277f

Author: Danylo Piliaiev <dpiliaiev at igalia.com>
Date:   Tue Dec 28 21:44:55 2021 +0200

freedreno/pps: Expose same counters as blob

Expose most of the counters exposed by blob. By faking the value of
counters returned from kgsl I found the exact underlying counters and
constant coefficients being used.

Note, coefficients for counters that depend on time are NOT verified.

Signed-off-by: Danylo Piliaiev <dpiliaiev at igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14323>

---

 src/freedreno/ds/fd_pps_driver.cc | 331 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 318 insertions(+), 13 deletions(-)

diff --git a/src/freedreno/ds/fd_pps_driver.cc b/src/freedreno/ds/fd_pps_driver.cc
index 97305c911ae..deadeae11c9 100644
--- a/src/freedreno/ds/fd_pps_driver.cc
+++ b/src/freedreno/ds/fd_pps_driver.cc
@@ -16,6 +16,27 @@
 namespace pps
 {
 
+double
+safe_div(uint64_t a, uint64_t b) /* a / b as a double; 0 when b is 0 */
+{
+   if (b == 0) /* nothing to divide by: report 0 */
+      return 0;
+
+   return a / static_cast<double>(b); /* cast makes this a floating-point division */
+}
+
+float
+percent(uint64_t a, uint64_t b)
+{
+   /* Returns a as a percentage of b. We sometimes get bogus samples, so
+    * b == 0 or a > b yields 0 to keep the timeline from exceeding 100%.
+    */
+   if (b == 0 || a > b)
+      return 0;
+
+   return 100.f * (a / static_cast<double>(b));
+}
+
 uint64_t
 FreedrenoDriver::get_min_sampling_period_ns()
 {
@@ -45,14 +66,58 @@ FreedrenoDriver::setup_a6xx_counters()
    auto PERF_CP_ALWAYS_COUNT = countable("PERF_CP_ALWAYS_COUNT");
    auto PERF_CP_BUSY_CYCLES  = countable("PERF_CP_BUSY_CYCLES");
    auto PERF_RB_3D_PIXELS    = countable("PERF_RB_3D_PIXELS");
-   auto PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS = countable("PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS");
-   auto PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS = countable("PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS");
    auto PERF_TP_L1_CACHELINE_MISSES = countable("PERF_TP_L1_CACHELINE_MISSES");
+   auto PERF_TP_L1_CACHELINE_REQUESTS = countable("PERF_TP_L1_CACHELINE_REQUESTS");
+
+   auto PERF_TP_OUTPUT_PIXELS  = countable("PERF_TP_OUTPUT_PIXELS");
+   auto PERF_TP_OUTPUT_PIXELS_ANISO  = countable("PERF_TP_OUTPUT_PIXELS_ANISO");
+   auto PERF_TP_OUTPUT_PIXELS_BILINEAR = countable("PERF_TP_OUTPUT_PIXELS_BILINEAR");
+   auto PERF_TP_OUTPUT_PIXELS_POINT = countable("PERF_TP_OUTPUT_PIXELS_POINT");
+   auto PERF_TP_OUTPUT_PIXELS_ZERO_LOD = countable("PERF_TP_OUTPUT_PIXELS_ZERO_LOD");
+
+   auto PERF_TSE_INPUT_PRIM  = countable("PERF_TSE_INPUT_PRIM");
+   auto PERF_TSE_CLIPPED_PRIM  = countable("PERF_TSE_CLIPPED_PRIM");
+   auto PERF_TSE_TRIVAL_REJ_PRIM  = countable("PERF_TSE_TRIVAL_REJ_PRIM");
+   auto PERF_TSE_OUTPUT_VISIBLE_PRIM = countable("PERF_TSE_OUTPUT_VISIBLE_PRIM");
+
    auto PERF_SP_BUSY_CYCLES  = countable("PERF_SP_BUSY_CYCLES");
+   auto PERF_SP_ALU_WORKING_CYCLES = countable("PERF_SP_ALU_WORKING_CYCLES");
+   auto PERF_SP_EFU_WORKING_CYCLES = countable("PERF_SP_EFU_WORKING_CYCLES");
+   auto PERF_SP_VS_STAGE_EFU_INSTRUCTIONS = countable("PERF_SP_VS_STAGE_EFU_INSTRUCTIONS");
+   auto PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS = countable("PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS");
+   auto PERF_SP_VS_STAGE_TEX_INSTRUCTIONS = countable("PERF_SP_VS_STAGE_TEX_INSTRUCTIONS");
+   auto PERF_SP_FS_STAGE_EFU_INSTRUCTIONS = countable("PERF_SP_FS_STAGE_EFU_INSTRUCTIONS");
+   auto PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS = countable("PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS");
+   auto PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS = countable("PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS");
+   auto PERF_SP_STALL_CYCLES_TP = countable("PERF_SP_STALL_CYCLES_TP");
+   auto PERF_SP_ANY_EU_WORKING_FS_STAGE = countable("PERF_SP_ANY_EU_WORKING_FS_STAGE");
+   auto PERF_SP_ANY_EU_WORKING_VS_STAGE = countable("PERF_SP_ANY_EU_WORKING_VS_STAGE");
+   auto PERF_SP_ANY_EU_WORKING_CS_STAGE = countable("PERF_SP_ANY_EU_WORKING_CS_STAGE");
+
+   auto PERF_UCHE_STALL_CYCLES_ARBITER = countable("PERF_UCHE_STALL_CYCLES_ARBITER");
+   auto PERF_UCHE_VBIF_READ_BEATS_TP = countable("PERF_UCHE_VBIF_READ_BEATS_TP");
+   auto PERF_UCHE_VBIF_READ_BEATS_VFD = countable("PERF_UCHE_VBIF_READ_BEATS_VFD");
+   auto PERF_UCHE_VBIF_READ_BEATS_SP = countable("PERF_UCHE_VBIF_READ_BEATS_SP");
+   auto PERF_UCHE_READ_REQUESTS_TP = countable("PERF_UCHE_READ_REQUESTS_TP");
+
+   auto PERF_PC_STALL_CYCLES_VFD = countable("PERF_PC_STALL_CYCLES_VFD");
+   auto PERF_PC_VS_INVOCATIONS = countable("PERF_PC_VS_INVOCATIONS");
+   auto PERF_PC_VERTEX_HITS = countable("PERF_PC_VERTEX_HITS");
+
+   auto PERF_HLSQ_QUADS = countable("PERF_HLSQ_QUADS"); /* Quads (fragments / 4) produced */
+
+   auto PERF_CP_NUM_PREEMPTIONS = countable("PERF_CP_NUM_PREEMPTIONS");
+   auto PERF_CP_PREEMPTION_REACTION_DELAY = countable("PERF_CP_PREEMPTION_REACTION_DELAY");
+
+   /* TODO: resolve() reports that there is no PERF_CMPDECMP_VBIF_READ_DATA countable */
+   // auto PERF_CMPDECMP_VBIF_READ_DATA = countable("PERF_CMPDECMP_VBIF_READ_DATA");
 
    /*
     * And then setup the derived counters that we are exporting to
-    * pps based on the captured countable values
+    * pps based on the captured countable values.
+    *
+    * We try to expose the same counters as blob:
+    * https://gpuinspector.dev/docs/gpu-counters/qualcomm
     */
 
    counter("GPU Frequency", Counter::Units::Hertz, [=]() {
@@ -61,29 +126,269 @@ FreedrenoDriver::setup_a6xx_counters()
    );
 
    counter("GPU % Utilization", Counter::Units::Percent, [=]() {
-         return 100.0 * (PERF_CP_BUSY_CYCLES / time) / max_freq;
+         return percent(PERF_CP_BUSY_CYCLES / time, max_freq);
       }
    );
 
-   // This one is a bit of a guess, but seems plausible..
-   counter("ALU / Fragment", Counter::Units::None, [=]() {
+   counter("TP L1 Cache Misses", Counter::Units::None, [=]() {
+         return PERF_TP_L1_CACHELINE_MISSES / time;
+      }
+   );
+
+   counter("Shader Core Utilization", Counter::Units::Percent, [=]() {
+         return percent(PERF_SP_BUSY_CYCLES / time, max_freq * info->num_sp_cores);
+      }
+   );
+
+   /* TODO: verify */
+   counter("(?) % Texture Fetch Stall", Counter::Units::Percent, [=]() {
+         return percent(PERF_SP_STALL_CYCLES_TP / time, max_freq * info->num_sp_cores);
+      }
+   );
+
+   /* TODO: verify */
+   counter("(?) % Vertex Fetch Stall", Counter::Units::Percent, [=]() {
+         return percent(PERF_PC_STALL_CYCLES_VFD / time, max_freq * info->num_sp_cores);
+      }
+   );
+
+   counter("L1 Texture Cache Miss Per Pixel", Counter::Units::None, [=]() {
+         return safe_div(PERF_TP_L1_CACHELINE_MISSES, PERF_HLSQ_QUADS * 4);
+      }
+   );
+
+   counter("% Texture L1 Miss", Counter::Units::Percent, [=]() {
+         return percent(PERF_TP_L1_CACHELINE_MISSES, PERF_TP_L1_CACHELINE_REQUESTS);
+      }
+   );
+
+   counter("% Texture L2 Miss", Counter::Units::Percent, [=]() {
+         return percent(PERF_UCHE_VBIF_READ_BEATS_TP / 2, PERF_UCHE_READ_REQUESTS_TP);
+      }
+   );
+
+   /* TODO: verify */
+   counter("(?) % Stalled on System Memory", Counter::Units::Percent, [=]() {
+         return percent(PERF_UCHE_STALL_CYCLES_ARBITER / time, max_freq * info->num_sp_cores);
+      }
+   );
+
+   counter("Pre-clipped Polygons / Second", Counter::Units::None, [=]() {
+         return PERF_TSE_INPUT_PRIM * (1.f / time);
+      }
+   );
+
+   counter("% Prims Trivially Rejected", Counter::Units::Percent, [=]() {
+         return percent(PERF_TSE_TRIVAL_REJ_PRIM, PERF_TSE_INPUT_PRIM);
+      }
+   );
+
+   counter("% Prims Clipped", Counter::Units::Percent, [=]() {
+         return percent(PERF_TSE_CLIPPED_PRIM, PERF_TSE_INPUT_PRIM);
+      }
+   );
+
+   counter("Average Vertices / Polygon", Counter::Units::None, [=]() {
+         return PERF_PC_VS_INVOCATIONS / PERF_TSE_INPUT_PRIM;
+      }
+   );
+
+   counter("Reused Vertices / Second", Counter::Units::None, [=]() {
+         return PERF_PC_VERTEX_HITS * (1.f / time);
+      }
+   );
+
+   counter("Average Polygon Area", Counter::Units::None, [=]() {
+         return safe_div(PERF_HLSQ_QUADS * 4, PERF_TSE_OUTPUT_VISIBLE_PRIM);
+      }
+   );
+
+   /* TODO: find formula */
+   // counter("% Shaders Busy", Counter::Units::Percent, [=]() {
+   //       return 100.0 * 0;
+   //    }
+   // );
+
+   counter("Vertices Shaded / Second", Counter::Units::None, [=]() {
+         return PERF_PC_VS_INVOCATIONS * (1.f / time);
+      }
+   );
+
+   counter("Fragments Shaded / Second", Counter::Units::None, [=]() {
+         return PERF_HLSQ_QUADS * 4 * (1.f / time);
+      }
+   );
+
+   counter("Vertex Instructions / Second", Counter::Units::None, [=]() {
+         return (PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS +
+                 PERF_SP_VS_STAGE_EFU_INSTRUCTIONS) * (1.f / time);
+      }
+   );
+
+   counter("Fragment Instructions / Second", Counter::Units::None, [=]() {
          return (PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS +
-               PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2) / PERF_RB_3D_PIXELS;
+                 PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2 +
+                 PERF_SP_FS_STAGE_EFU_INSTRUCTIONS) * (1.f / time);
       }
    );
 
-   counter("TP L1 Cache Misses", Counter::Units::None, [=]() {
-         return PERF_TP_L1_CACHELINE_MISSES / time;
+   counter("Fragment ALU Instructions / Sec (Full)", Counter::Units::None, [=]() {
+         return PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS * (1.f / time);
       }
    );
 
-   counter("Shader Core Utilization", Counter::Units::Percent, [=]() {
-         return 100.0 * (PERF_SP_BUSY_CYCLES / time) / (max_freq * info->num_sp_cores);
+   counter("Fragment ALU Instructions / Sec (Half)", Counter::Units::None, [=]() {
+         return PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS * (1.f / time);
+      }
+   );
+
+   counter("Fragment EFU Instructions / Second", Counter::Units::None, [=]() {
+         return PERF_SP_FS_STAGE_EFU_INSTRUCTIONS * (1.f / time);
+      }
+   );
+
+   counter("Textures / Vertex", Counter::Units::None, [=]() {
+         return safe_div(PERF_SP_VS_STAGE_TEX_INSTRUCTIONS, PERF_PC_VS_INVOCATIONS);
+      }
+   );
+
+   counter("Textures / Fragment", Counter::Units::None, [=]() {
+         return safe_div(PERF_TP_OUTPUT_PIXELS, PERF_HLSQ_QUADS * 4);
+      }
+   );
+
+   counter("ALU / Vertex", Counter::Units::None, [=]() {
+         return safe_div(PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS, PERF_PC_VS_INVOCATIONS);
+      }
+   );
+
+   counter("EFU / Vertex", Counter::Units::None, [=]() {
+         return safe_div(PERF_SP_VS_STAGE_EFU_INSTRUCTIONS, PERF_PC_VS_INVOCATIONS);
+      }
+   );
+
+   counter("ALU / Fragment", Counter::Units::None, [=]() {
+         return safe_div(PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS +
+                         PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2, PERF_HLSQ_QUADS);
+      }
+   );
+
+   counter("EFU / Fragment", Counter::Units::None, [=]() {
+         return safe_div(PERF_SP_FS_STAGE_EFU_INSTRUCTIONS, PERF_HLSQ_QUADS);
+      }
+   );
+
+   counter("% Time Shading Vertices", Counter::Units::Percent, [=]() {
+         return percent(PERF_SP_ANY_EU_WORKING_VS_STAGE,
+                        (PERF_SP_ANY_EU_WORKING_VS_STAGE +
+                         PERF_SP_ANY_EU_WORKING_FS_STAGE +
+                         PERF_SP_ANY_EU_WORKING_CS_STAGE));
+      }
+   );
+
+   counter("% Time Shading Fragments", Counter::Units::Percent, [=]() {
+         return percent(PERF_SP_ANY_EU_WORKING_FS_STAGE,
+                        (PERF_SP_ANY_EU_WORKING_VS_STAGE +
+                         PERF_SP_ANY_EU_WORKING_FS_STAGE +
+                         PERF_SP_ANY_EU_WORKING_CS_STAGE));
+      }
+   );
+
+   counter("% Time Compute", Counter::Units::Percent, [=]() {
+         return percent(PERF_SP_ANY_EU_WORKING_CS_STAGE,
+                        (PERF_SP_ANY_EU_WORKING_VS_STAGE +
+                         PERF_SP_ANY_EU_WORKING_FS_STAGE +
+                         PERF_SP_ANY_EU_WORKING_CS_STAGE));
+      }
+   );
+
+   counter("% Shader ALU Capacity Utilized", Counter::Units::Percent, [=]() {
+         return percent((PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS +
+                         PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS +
+                         PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS / 2) / 64,
+                        PERF_SP_BUSY_CYCLES);
+      }
+   );
+
+   counter("% Time ALUs Working", Counter::Units::Percent, [=]() {
+         return percent(PERF_SP_ALU_WORKING_CYCLES / 2, PERF_SP_BUSY_CYCLES);
       }
    );
 
-   // TODO add more.. see https://gpuinspector.dev/docs/gpu-counters/qualcomm
-   // for what blob exposes
+   counter("% Time EFUs Working", Counter::Units::Percent, [=]() {
+         return percent(PERF_SP_EFU_WORKING_CYCLES / 2, PERF_SP_BUSY_CYCLES);
+      }
+   );
+
+   counter("% Anisotropic Filtered", Counter::Units::Percent, [=]() {
+         return percent(PERF_TP_OUTPUT_PIXELS_ANISO, PERF_TP_OUTPUT_PIXELS);
+      }
+   );
+
+   counter("% Linear Filtered", Counter::Units::Percent, [=]() {
+         return percent(PERF_TP_OUTPUT_PIXELS_BILINEAR, PERF_TP_OUTPUT_PIXELS);
+      }
+   );
+
+   counter("% Nearest Filtered", Counter::Units::Percent, [=]() {
+         return percent(PERF_TP_OUTPUT_PIXELS_POINT, PERF_TP_OUTPUT_PIXELS);
+      }
+   );
+
+   counter("% Non-Base Level Textures", Counter::Units::Percent, [=]() {
+         return percent(PERF_TP_OUTPUT_PIXELS_ZERO_LOD, PERF_TP_OUTPUT_PIXELS);
+      }
+   );
+
+   /* Reads from KGSL_PERFCOUNTER_GROUP_VBIF countable=63 */
+   // counter("Read Total (Bytes/sec)", Counter::Units::Byte, [=]() {
+   //       return  * (1.f / time);
+   //    }
+   // );
+
+   /* Reads from KGSL_PERFCOUNTER_GROUP_VBIF countable=84 */
+   // counter("Write Total (Bytes/sec)", Counter::Units::Byte, [=]() {
+   //       return  * (1.f / time);
+   //    }
+   // );
+
+   /* Cannot get PERF_CMPDECMP_VBIF_READ_DATA countable */
+   // counter("Texture Memory Read BW (Bytes/Second)", Counter::Units::Byte, [=]() {
+   //       return (PERF_CMPDECMP_VBIF_READ_DATA + PERF_UCHE_VBIF_READ_BEATS_TP) * (1.f / time);
+   //    }
+   // );
+
+   /* TODO: verify */
+   counter("(?) Vertex Memory Read (Bytes/Second)", Counter::Units::Byte, [=]() {
+         return PERF_UCHE_VBIF_READ_BEATS_VFD * 32 * (1.f / time);
+      }
+   );
+
+   /* TODO: verify */
+   counter("SP Memory Read (Bytes/Second)", Counter::Units::Byte, [=]() {
+         return PERF_UCHE_VBIF_READ_BEATS_SP * 32 * (1.f / time);
+      }
+   );
+
+   counter("Avg Bytes / Fragment", Counter::Units::Byte, [=]() {
+         return safe_div(PERF_UCHE_VBIF_READ_BEATS_TP * 32, PERF_HLSQ_QUADS * 4);
+      }
+   );
+
+   counter("Avg Bytes / Vertex", Counter::Units::Byte, [=]() {
+         return safe_div(PERF_UCHE_VBIF_READ_BEATS_VFD * 32, PERF_PC_VS_INVOCATIONS);
+      }
+   );
+
+   counter("Preemptions / second", Counter::Units::None, [=]() {
+         return PERF_CP_NUM_PREEMPTIONS * (1.f / time);
+      }
+   );
+
+   counter("Avg Preemption Delay", Counter::Units::None, [=]() {
+         return PERF_CP_PREEMPTION_REACTION_DELAY * (1.f / time);
+      }
+   );
 }
 
 /**



More information about the mesa-commit mailing list