Mesa (main): tu: Fix prim gen query and pipeline stats query interaction

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Fri Jul 8 08:40:22 UTC 2022


Module: Mesa
Branch: main
Commit: bf4c160909863fcf37c85cc098da720448a9e4f2
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=bf4c160909863fcf37c85cc098da720448a9e4f2

Author: Danylo Piliaiev <dpiliaiev at igalia.com>
Date:   Mon Jun 27 19:01:08 2022 +0300

tu: Fix prim gen query and pipeline stats query interaction

Fixed:
- VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT was able to stop the prim counter
  while a pipeline stats query was running.
  - This could happen when the prim gen query was in a secondary cmdbuf.
- VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT counting geometry in each tile.
- VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT counting geometry in each tile
  when pipeline stats query is started inside prim gen query and inside
  a renderpass.

The matter of VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT and pipeline stats
interaction is solved by tracking whether pipeline stats query is
running both on CPU (for non secondary cmdbuf case) and on GPU (for
secondary cmdbuf).

Note, prim gen query is not allowed with secondary command buffers, so
only pipeline stats query is tracked on gpu.
See https://gitlab.khronos.org/vulkan/vulkan/-/issues/3142

Counting geometry per each tile is solved by:
- Conditionally executing START/STOP_PRIMITIVE_CTRS to not run in tiling
  pass. Solves the case when prim gen query is inside a renderpass.
- Stopping prim counters before executing `draw_cs` and restarting them
  afterwards. Solves the case when prim gen query is outside a renderpass.

Fixes GL CTS tests with Zink + `TU_DEBUG=gmem`:
 GTF-GL46.gtf30.GL3Tests.transform_feedback.transform_feedback_max_separate
 GTF-GL46.gtf40.GL3Tests.transform_feedback2.transform_feedback2_basic
 GTF-GL46.gtf40.GL3Tests.transform_feedback2.transform_feedback2_framebuffer
 GTF-GL46.gtf40.GL3Tests.transform_feedback3.transform_feedback3_streams_overflow
 GTF-GL46.gtf40.GL3Tests.transform_feedback3.transform_feedback3_streams_queried
 GTF-GL46.gtf40.GL3Tests.transform_feedback2.transform_feedback2_states

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6602

Signed-off-by: Danylo Piliaiev <dpiliaiev at igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17163>

---

 src/freedreno/vulkan/tu_cmd_buffer.c               |  12 ++
 src/freedreno/vulkan/tu_device.c                   |   1 +
 src/freedreno/vulkan/tu_private.h                  |  10 ++
 src/freedreno/vulkan/tu_query.c                    | 127 ++++++++++++++++++---
 .../drivers/zink/ci/zink-tu-a630-flakes.txt        |   3 -
 5 files changed, 136 insertions(+), 17 deletions(-)

diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index 88d6a3d4b50..85bf1f509bb 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -1382,8 +1382,17 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
 
    trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs);
 
+   /* Primitives that passed all tests are still counted in each
+    * tile even with HW binning beforehand. Do not permit it.
+    */
+   if (cmd->state.prim_generated_query_running_before_rp)
+      tu6_emit_event_write(cmd, cs, STOP_PRIMITIVE_CTRS);
+
    tu_cs_emit_call(cs, &cmd->draw_cs);
 
+   if (cmd->state.prim_generated_query_running_before_rp)
+      tu6_emit_event_write(cmd, cs, START_PRIMITIVE_CTRS);
+
    if (use_hw_binning(cmd)) {
       tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
       tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS));
@@ -1747,6 +1756,9 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
    } else if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
       assert(pBeginInfo->pInheritanceInfo);
 
+      cmd_buffer->inherited_pipeline_statistics =
+         pBeginInfo->pInheritanceInfo->pipelineStatistics;
+
       vk_foreach_struct(ext, pBeginInfo->pInheritanceInfo) {
          switch (ext->sType) {
          case VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT: {
diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c
index 9c09bbd647d..45688837f66 100644
--- a/src/freedreno/vulkan/tu_device.c
+++ b/src/freedreno/vulkan/tu_device.c
@@ -1868,6 +1868,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    struct tu6_global *global = device->global_bo->map;
    tu_init_clear_blit_shaders(device);
    global->predicate = 0;
+   global->vtx_stats_query_not_running = 1;
    global->dbg_one = (uint32_t)-1;
    global->dbg_gmem_total_loads = 0;
    global->dbg_gmem_taken_loads = 0;
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index 38735010d83..c3dbd7d795b 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -484,6 +484,8 @@ struct tu6_global
 
    ALIGN16 uint32_t cs_indirect_xyz[3];
 
+   volatile uint32_t vtx_stats_query_not_running;
+
    /* To know when renderpass stats for autotune are valid */
    volatile uint32_t autotune_fence;
 
@@ -1258,6 +1260,12 @@ struct tu_cmd_state
     */
    uint32_t drawcall_bandwidth_per_sample_sum;
 
+   /* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
+    * VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaneously,
+    * but they use the same {START,STOP}_PRIMITIVE_CTRS control.
+    */
+   uint32_t prim_counters_running;
+
    bool prim_generated_query_running_before_rp;
    bool has_prim_generated_query_in_rp;
 
@@ -1304,6 +1312,8 @@ struct tu_cmd_buffer
    VkCommandBufferUsageFlags usage_flags;
    enum tu_cmd_buffer_status status;
 
+   VkQueryPipelineStatisticFlags inherited_pipeline_statistics;
+
    struct tu_cmd_state state;
    uint32_t queue_family_index;
 
diff --git a/src/freedreno/vulkan/tu_query.c b/src/freedreno/vulkan/tu_query.c
index 20d5ee40294..ab003a276e3 100644
--- a/src/freedreno/vulkan/tu_query.c
+++ b/src/freedreno/vulkan/tu_query.c
@@ -426,9 +426,9 @@ statistics_index(uint32_t *statistics)
 }
 
 static bool
-is_pipeline_query_with_vertex_stage(struct tu_query_pool *pool)
+is_pipeline_query_with_vertex_stage(uint32_t pipeline_statistics)
 {
-   return pool->pipeline_statistics &
+   return pipeline_statistics &
           (VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT |
            VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT |
            VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT |
@@ -441,16 +441,16 @@ is_pipeline_query_with_vertex_stage(struct tu_query_pool *pool)
 }
 
 static bool
-is_pipeline_query_with_fragment_stage(struct tu_query_pool *pool)
+is_pipeline_query_with_fragment_stage(uint32_t pipeline_statistics)
 {
-   return pool->pipeline_statistics &
+   return pipeline_statistics &
           VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT;
 }
 
 static bool
-is_pipeline_query_with_compute_stage(struct tu_query_pool *pool)
+is_pipeline_query_with_compute_stage(uint32_t pipeline_statistics)
 {
-   return pool->pipeline_statistics &
+   return pipeline_statistics &
           VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT;
 }
 
@@ -871,15 +871,35 @@ emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
    struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
    uint64_t begin_iova = pipeline_stat_query_iova(pool, query, begin);
 
-   if (is_pipeline_query_with_vertex_stage(pool)) {
+   if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) {
+      bool need_cond_exec = cmdbuf->state.pass && cmdbuf->state.prim_counters_running;
+      cmdbuf->state.prim_counters_running++;
+
+      /* Prevent starting primitive counters when it is supposed to be stopped
+       * for outer VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query.
+       */
+      if (need_cond_exec) {
+         tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
+                        CP_COND_REG_EXEC_0_SYSMEM |
+                        CP_COND_REG_EXEC_0_BINNING);
+      }
+
       tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS);
+
+      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
+      tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
+      tu_cs_emit(cs, 0);
+
+      if (need_cond_exec) {
+         tu_cond_exec_end(cs);
+      }
    }
 
-   if (is_pipeline_query_with_fragment_stage(pool)) {
+   if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) {
       tu6_emit_event_write(cmdbuf, cs, START_FRAGMENT_CTRS);
    }
 
-   if (is_pipeline_query_with_compute_stage(pool)) {
+   if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) {
       tu6_emit_event_write(cmdbuf, cs, START_COMPUTE_CTRS);
    }
 
@@ -1008,6 +1028,17 @@ emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
       cmdbuf->state.prim_generated_query_running_before_rp = true;
    }
 
+   cmdbuf->state.prim_counters_running++;
+
+   if (cmdbuf->state.pass) {
+      /* Primitives that passed all tests are still counted in each
+       * tile even with HW binning beforehand. Do not permit it.
+       */
+      tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
+                           CP_COND_REG_EXEC_0_SYSMEM |
+                           CP_COND_REG_EXEC_0_BINNING);
+   }
+
    tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS);
 
    tu_cs_emit_wfi(cs);
@@ -1017,6 +1048,10 @@ emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
                   CP_REG_TO_MEM_0_CNT(2) |
                   CP_REG_TO_MEM_0_64B);
    tu_cs_emit_qw(cs, begin_iova);
+
+   if (cmdbuf->state.pass) {
+      tu_cond_exec_end(cs);
+   }
 }
 
 VKAPI_ATTR void VKAPI_CALL
@@ -1152,6 +1187,53 @@ emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
    tu_cs_emit_qw(cs, 0x1);
 }
 
+/* PRIMITIVE_CTRS is used for two distinct queries:
+ * - VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT
+ * - VK_QUERY_TYPE_PIPELINE_STATISTICS
+ * If one is nested inside the other - STOP_PRIMITIVE_CTRS should be emitted
+ * only for outer query.
+ *
+ * Also, pipeline stat query could run outside of renderpass and prim gen
+ * query inside of secondary cmd buffer - for such case we ought to track
+ * the status of pipeline stats query.
+ */
+static void
+emit_stop_primitive_ctrs(struct tu_cmd_buffer *cmdbuf,
+                         struct tu_cs *cs,
+                         enum VkQueryType query_type)
+{
+   bool is_secondary = cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY;
+   cmdbuf->state.prim_counters_running--;
+   if (cmdbuf->state.prim_counters_running == 0) {
+      bool need_cond_exec =
+         is_secondary &&
+         query_type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT &&
+         is_pipeline_query_with_vertex_stage(cmdbuf->inherited_pipeline_statistics);
+
+      if (!need_cond_exec) {
+         tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS);
+      } else {
+         tu_cs_reserve(cs, 7 + 2);
+         /* Check that pipeline stats query is not running; only then
+          * do we stop the counter.
+          */
+         tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
+         tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
+         tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
+         tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
+         tu_cs_emit(cs, 2); /* Cond execute the next 2 DWORDS */
+
+         tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS);
+      }
+   }
+
+   if (query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
+      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
+      tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running));
+      tu_cs_emit(cs, 1);
+   }
+}
+
 static void
 emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
                     struct tu_query_pool *pool,
@@ -1164,15 +1246,19 @@ emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
    uint64_t stat_start_iova;
    uint64_t stat_stop_iova;
 
-   if (is_pipeline_query_with_vertex_stage(pool)) {
-      tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS);
+   if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) {
+      /* No need to conditionally execute STOP_PRIMITIVE_CTRS when
+       * we are inside VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT inside of a
+       * renderpass, because it is already stopped.
+       */
+      emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PIPELINE_STATISTICS);
    }
 
-   if (is_pipeline_query_with_fragment_stage(pool)) {
+   if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) {
       tu6_emit_event_write(cmdbuf, cs, STOP_FRAGMENT_CTRS);
    }
 
-   if (is_pipeline_query_with_compute_stage(pool)) {
+   if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) {
       tu6_emit_event_write(cmdbuf, cs, STOP_COMPUTE_CTRS);
    }
 
@@ -1355,7 +1441,11 @@ emit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
    uint64_t result_iova = primitives_generated_query_iova(pool, query, result);
    uint64_t available_iova = query_available_iova(pool, query);
 
-   tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS);
+   if (cmdbuf->state.pass) {
+      tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
+                             CP_COND_REG_EXEC_0_SYSMEM |
+                             CP_COND_REG_EXEC_0_BINNING);
+   }
 
    tu_cs_emit_wfi(cs);
 
@@ -1375,6 +1465,15 @@ emit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf,
 
    tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
 
+   /* Should be after waiting for mem writes to have up to date info
+    * about which query is running.
+    */
+   emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT);
+
+   if (cmdbuf->state.pass) {
+      tu_cond_exec_end(cs);
+   }
+
    if (cmdbuf->state.pass)
       cs = &cmdbuf->draw_epilogue_cs;
 
diff --git a/src/gallium/drivers/zink/ci/zink-tu-a630-flakes.txt b/src/gallium/drivers/zink/ci/zink-tu-a630-flakes.txt
index 8d342317bd0..268c4c5dc19 100644
--- a/src/gallium/drivers/zink/ci/zink-tu-a630-flakes.txt
+++ b/src/gallium/drivers/zink/ci/zink-tu-a630-flakes.txt
@@ -1,6 +1,3 @@
-# #6603
-GTF-GL46.gtf40.GL3Tests.transform_feedback2.transform_feedback2_states
-
 GTF-GL46.gtf32.GL3Tests.packed_pixels.packed_pixels_pixelstore
 KHR-Single-GL46.arrays_of_arrays_gl.ConstructorsAndUnsizedDeclConstructorSizing1
 dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_dynamic_write_dynamic_read_vertex



More information about the mesa-commit mailing list