Mesa (main): turnip: do not re-emit same vs params

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Fri Jun 25 13:54:34 UTC 2021


Module: Mesa
Branch: main
Commit: 815a85dd7c44f4be4cbf5b351836c921a7a923fd
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=815a85dd7c44f4be4cbf5b351836c921a7a923fd

Author: Danylo Piliaiev <dpiliaiev at igalia.com>
Date:   Tue Jun 22 17:24:43 2021 +0300

turnip: do not re-emit same vs params

Improves drawoverhead perf through Zink by up to ~2.6x (260% of the previous score)

Before:
  1, DrawElements ( 1 VBO| 0 UBO|  0    ) w/ no state change, 1518
After:
  1, DrawElements ( 1 VBO| 0 UBO|  0    ) w/ no state change, 3981

This brings it close to Freedreno, which has around 4300.

In vkQuake, vs params re-emission now occurs in 0.23% of draw calls.

Signed-off-by: Danylo Piliaiev <dpiliaiev at igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11556>

---

 src/freedreno/vulkan/tu_cmd_buffer.c | 57 ++++++++++++++++++++++++++----------
 src/freedreno/vulkan/tu_private.h    | 11 ++++++-
 2 files changed, 51 insertions(+), 17 deletions(-)

diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index 9938105229e..2de699ada27 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -1515,6 +1515,10 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
    memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
    cmd_buffer->state.index_size = 0xff; /* dirty restart index */
 
+   cmd_buffer->state.last_vs_params.first_instance = -1;
+   cmd_buffer->state.last_vs_params.params_offset = -1;
+   cmd_buffer->state.last_vs_params.vertex_offset = -1;
+
    tu_cache_init(&cmd_buffer->state.cache);
    tu_cache_init(&cmd_buffer->state.renderpass_cache);
    cmd_buffer->usage_flags = pBeginInfo->flags;
@@ -3738,8 +3742,8 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
          ((cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) ? 2 : 0) +
          ((cmd->state.dirty & TU_CMD_DIRTY_DESC_SETS_LOAD) ? 1 : 0) +
          ((cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) ? 1 : 0) +
-         (dirty_lrz ? 2 : 0) +
-         1; /* vs_params */
+         ((cmd->state.dirty & TU_CMD_DIRTY_VS_PARAMS) ? 1 : 0) +
+         (dirty_lrz ? 2 : 0);
 
       if ((cmd->state.dirty & TU_CMD_DIRTY_VB_STRIDE) &&
           (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE))) {
@@ -3747,7 +3751,8 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
          draw_state_count += 1;
       }
 
-      tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_count);
+      if (draw_state_count > 0)
+         tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_count);
 
       /* We may need to re-emit tess consts if the current draw call is
          * sufficiently larger than the last draw call. */
@@ -3765,7 +3770,8 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
          tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_VB_STRIDE,
                                cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE]);
       }
-      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
+      if (cmd->state.dirty & TU_CMD_DIRTY_VS_PARAMS)
+         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
 
       if (dirty_lrz) {
          tu_cs_emit_draw_state(cs, TU_DRAW_STATE_LRZ, cmd->state.lrz.state);
@@ -3843,22 +3849,35 @@ vs_params_offset(struct tu_cmd_buffer *cmd)
    return const_state->offsets.driver_param;
 }
 
-static struct tu_draw_state
+static void
+tu6_emit_empty_vs_params(struct tu_cmd_buffer *cmd)
+{
+   if (cmd->state.vs_params.iova) {
+      cmd->state.vs_params = (struct tu_draw_state) {};
+      cmd->state.dirty |= TU_CMD_DIRTY_VS_PARAMS;
+   }
+}
+
+static void
 tu6_emit_vs_params(struct tu_cmd_buffer *cmd,
                    uint32_t vertex_offset,
                    uint32_t first_instance)
 {
    uint32_t offset = vs_params_offset(cmd);
 
+   if (offset == cmd->state.last_vs_params.params_offset &&
+       vertex_offset == cmd->state.last_vs_params.vertex_offset &&
+       first_instance == cmd->state.last_vs_params.first_instance) {
+      return;
+   }
+
    struct tu_cs cs;
    VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 3 + (offset ? 8 : 0), &cs);
    if (result != VK_SUCCESS) {
       cmd->record_result = result;
-      return (struct tu_draw_state) {};
+      return;
    }
 
-   /* TODO: don't make a new draw state when it doesn't change */
-
    tu_cs_emit_regs(&cs,
                    A6XX_VFD_INDEX_OFFSET(vertex_offset),
                    A6XX_VFD_INSTANCE_START_OFFSET(first_instance));
@@ -3879,8 +3898,14 @@ tu6_emit_vs_params(struct tu_cmd_buffer *cmd,
       tu_cs_emit(&cs, 0);
    }
 
+   cmd->state.last_vs_params.params_offset = offset;
+   cmd->state.last_vs_params.vertex_offset = vertex_offset;
+   cmd->state.last_vs_params.first_instance = first_instance;
+
    struct tu_cs_entry entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
-   return (struct tu_draw_state) {entry.bo->iova + entry.offset, entry.size / 4};
+   cmd->state.vs_params = (struct tu_draw_state) {entry.bo->iova + entry.offset, entry.size / 4};
+
+   cmd->state.dirty |= TU_CMD_DIRTY_VS_PARAMS;
 }
 
 VKAPI_ATTR void VKAPI_CALL
@@ -3893,7 +3918,7 @@ tu_CmdDraw(VkCommandBuffer commandBuffer,
    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
    struct tu_cs *cs = &cmd->draw_cs;
 
-   cmd->state.vs_params = tu6_emit_vs_params(cmd, firstVertex, firstInstance);
+   tu6_emit_vs_params(cmd, firstVertex, firstInstance);
 
    tu6_draw_common(cmd, cs, false, vertexCount);
 
@@ -3914,7 +3939,7 @@ tu_CmdDrawIndexed(VkCommandBuffer commandBuffer,
    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
    struct tu_cs *cs = &cmd->draw_cs;
 
-   cmd->state.vs_params = tu6_emit_vs_params(cmd, vertexOffset, firstInstance);
+   tu6_emit_vs_params(cmd, vertexOffset, firstInstance);
 
    tu6_draw_common(cmd, cs, true, indexCount);
 
@@ -3952,7 +3977,7 @@ tu_CmdDrawIndirect(VkCommandBuffer commandBuffer,
    TU_FROM_HANDLE(tu_buffer, buf, _buffer);
    struct tu_cs *cs = &cmd->draw_cs;
 
-   cmd->state.vs_params = (struct tu_draw_state) {};
+   tu6_emit_empty_vs_params(cmd);
 
    /* The latest known a630_sqe.fw fails to wait for WFI before reading the
     * indirect buffer when using CP_DRAW_INDIRECT_MULTI, so we have to fall
@@ -3986,7 +4011,7 @@ tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
    TU_FROM_HANDLE(tu_buffer, buf, _buffer);
    struct tu_cs *cs = &cmd->draw_cs;
 
-   cmd->state.vs_params = (struct tu_draw_state) {};
+   tu6_emit_empty_vs_params(cmd);
 
    if (cmd->device->physical_device->gpu_id != 650)
       draw_wfm(cmd);
@@ -4018,7 +4043,7 @@ tu_CmdDrawIndirectCount(VkCommandBuffer commandBuffer,
    TU_FROM_HANDLE(tu_buffer, count_buf, countBuffer);
    struct tu_cs *cs = &cmd->draw_cs;
 
-   cmd->state.vs_params = (struct tu_draw_state) {};
+   tu6_emit_empty_vs_params(cmd);
 
    /* It turns out that the firmware we have for a650 only partially fixed the
     * problem with CP_DRAW_INDIRECT_MULTI not waiting for WFI's to complete
@@ -4053,7 +4078,7 @@ tu_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer,
    TU_FROM_HANDLE(tu_buffer, count_buf, countBuffer);
    struct tu_cs *cs = &cmd->draw_cs;
 
-   cmd->state.vs_params = (struct tu_draw_state) {};
+   tu6_emit_empty_vs_params(cmd);
 
    draw_wfm(cmd);
 
@@ -4091,7 +4116,7 @@ tu_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
     */
    draw_wfm(cmd);
 
-   cmd->state.vs_params = tu6_emit_vs_params(cmd, 0, firstInstance);
+   tu6_emit_vs_params(cmd, 0, firstInstance);
 
    tu6_draw_common(cmd, cs, false, 0);
 
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index 2137d70deb3..90fd45bcff8 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -699,8 +699,9 @@ enum tu_cmd_dirty_bits
    TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6),
    TU_CMD_DIRTY_SHADER_CONSTS = BIT(7),
    TU_CMD_DIRTY_LRZ = BIT(8),
+   TU_CMD_DIRTY_VS_PARAMS = BIT(9),
    /* all draw states were disabled and need to be re-enabled: */
-   TU_CMD_DIRTY_DRAW_STATE = BIT(9)
+   TU_CMD_DIRTY_DRAW_STATE = BIT(10)
 };
 
 /* There are only three cache domains we have to care about: the CCU, or
@@ -882,6 +883,12 @@ struct tu_lrz_state
    enum tu_lrz_direction prev_direction;
 };
 
+struct tu_vs_params {
+   uint32_t params_offset;
+   uint32_t vertex_offset;
+   uint32_t first_instance;
+};
+
 struct tu_cmd_state
 {
    uint32_t dirty;
@@ -955,6 +962,8 @@ struct tu_cmd_state
    struct tu_lrz_state lrz;
 
    struct tu_draw_state depth_plane_state;
+
+   struct tu_vs_params last_vs_params;
 };
 
 struct tu_cmd_pool



More information about the mesa-commit mailing list