Mesa (main): turnip: use SUBDRAW_SIZE and constant sized tess bos

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Wed Dec 1 17:52:19 UTC 2021


Module: Mesa
Branch: main
Commit: fd11d992546a1e4cd176653ce6c4d6afc2665f9d
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=fd11d992546a1e4cd176653ce6c4d6afc2665f9d

Author: Jonathan Marek <jonathan at marek.ca>
Date:   Mon Jul 27 10:06:46 2020 -0400

turnip: use SUBDRAW_SIZE and constant sized tess bos

This fixes the problem of large indirect draws, and at the same time avoids
allocating overly large buffers for tessellation.

Reworked by @anholt to use a separate tess factor BO so we can skip the
WFIs to set the TESSFACTOR_ADDR.

Signed-off-by: Jonathan Marek <jonathan at marek.ca>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6089>

---

 src/freedreno/vulkan/tu_cmd_buffer.c | 155 ++++++++++-------------------------
 src/freedreno/vulkan/tu_pipeline.c   |  35 ++++++--
 src/freedreno/vulkan/tu_private.h    |  11 ++-
 3 files changed, 77 insertions(+), 124 deletions(-)

diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index be11ef839aa..816aba2a5d8 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -64,6 +64,23 @@ tu6_emit_event_write(struct tu_cmd_buffer *cmd,
    }
 }
 
+/* Emits the tessfactor address to the top-level CS if it hasn't been already.
+ * Updating this register requires a WFI if outstanding drawing is using it, but
+ * tu6_init_hardware() will have WFIed before we started and no other draws
+ * could be using the tessfactor address yet since we only emit one per cmdbuf.
+ */
+static void
+tu6_lazy_emit_tessfactor_addr(struct tu_cmd_buffer *cmd)
+{
+   if (cmd->state.tessfactor_addr_set)
+      return;
+
+   assert(cmd->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
+
+   tu_cs_emit_regs(&cmd->cs, A6XX_PC_TESSFACTOR_ADDR(.qword = cmd->device->tess_bo.iova));
+   cmd->state.tessfactor_addr_set = true;
+}
+
 static void
 tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
                  struct tu_cs *cs,
@@ -2215,6 +2232,14 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
          tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i, pipeline->dynamic_state[i]);
    }
 
+   if (cmd->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
+       (pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT)) {
+      /* Set up the tess factor address if this is the first tess pipeline bound
+       * to the primary cmdbuf.
+      */
+      tu6_lazy_emit_tessfactor_addr(cmd);
+   }
+
    if (cmd->state.line_mode != pipeline->line_mode) {
       cmd->state.line_mode = pipeline->line_mode;
 
@@ -2983,8 +3008,13 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
             break;
          }
 
-         if (secondary->state.has_tess)
+         /* Set up the tess factor address if this is the first time a tess
+          * pipeline has been executed on this primary cmdbuf.
+          */
+         if (secondary->state.has_tess) {
+            tu6_lazy_emit_tessfactor_addr(cmd);
             cmd->state.has_tess = true;
+         }
          if (secondary->state.has_subpass_predication)
             cmd->state.has_subpass_predication = true;
          if (secondary->state.disable_gmem)
@@ -3477,103 +3507,17 @@ tu6_emit_consts_geom(struct tu_cmd_buffer *cmd,
    return tu_cs_end_draw_state(&cmd->sub_cs, &cs);
 }
 
-static uint64_t
-get_tess_param_bo_size(const struct tu_pipeline *pipeline,
-                       uint32_t draw_count)
-{
-   /* TODO: For indirect draws, we can't compute the BO size ahead of time.
-    * Still not sure what to do here, so just allocate a reasonably large
-    * BO and hope for the best for now. */
-   if (!draw_count)
-      draw_count = 2048;
-
-   /* the tess param BO is pipeline->tess.param_stride bytes per patch,
-    * which includes both the per-vertex outputs and per-patch outputs
-    * build_primitive_map in ir3 calculates this stride
-    */
-   uint32_t verts_per_patch = pipeline->ia.primtype - DI_PT_PATCHES0;
-   uint32_t num_patches = draw_count / verts_per_patch;
-   return num_patches * pipeline->tess.param_stride;
-}
-
-static uint64_t
-get_tess_factor_bo_size(const struct tu_pipeline *pipeline,
-                        uint32_t draw_count)
-{
-   /* TODO: For indirect draws, we can't compute the BO size ahead of time.
-    * Still not sure what to do here, so just allocate a reasonably large
-    * BO and hope for the best for now. */
-   if (!draw_count)
-      draw_count = 2048;
-
-   /* Each distinct patch gets its own tess factor output. */
-   uint32_t verts_per_patch = pipeline->ia.primtype - DI_PT_PATCHES0;
-   uint32_t num_patches = draw_count / verts_per_patch;
-   uint32_t factor_stride = ir3_tess_factor_stride(pipeline->tess.patch_type);
-   return factor_stride * num_patches;
-}
-
 static VkResult
-tu6_emit_tess_consts(struct tu_cmd_buffer *cmd,
-                     uint32_t draw_count,
-                     const struct tu_pipeline *pipeline,
-                     struct tu_draw_state *state,
-                     uint64_t *factor_iova)
+tu6_setup_tess(struct tu_cmd_buffer *cmd,
+               const struct tu_pipeline *pipeline,
+               uint32_t *subdraw_size)
 {
-   struct tu_cs cs;
-   VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 16, &cs);
-   if (result != VK_SUCCESS)
-      return result;
-
-   const struct tu_program_descriptor_linkage *hs_link =
-      &pipeline->program.link[MESA_SHADER_TESS_CTRL];
-   bool hs_uses_bo = pipeline->tess.hs_bo_regid < hs_link->constlen;
+   /* maximum number of patches that can fit in tess factor/param buffers */
+   *subdraw_size = MIN2(TU_TESS_FACTOR_SIZE / ir3_tess_factor_stride(pipeline->tess.patch_type),
+                        TU_TESS_PARAM_SIZE / pipeline->tess.param_stride);
+   /* convert from # of patches to draw count */
+   *subdraw_size *= (pipeline->ia.primtype - DI_PT_PATCHES0);
 
-   const struct tu_program_descriptor_linkage *ds_link =
-      &pipeline->program.link[MESA_SHADER_TESS_EVAL];
-   bool ds_uses_bo = pipeline->tess.ds_bo_regid < ds_link->constlen;
-
-   uint64_t tess_factor_size = get_tess_factor_bo_size(pipeline, draw_count);
-   uint64_t tess_param_size = get_tess_param_bo_size(pipeline, draw_count);
-   uint64_t tess_bo_size =  tess_factor_size + tess_param_size;
-   if ((hs_uses_bo || ds_uses_bo) && tess_bo_size > 0) {
-      struct tu_bo *tess_bo;
-      result = tu_get_scratch_bo(cmd->device, tess_bo_size, &tess_bo);
-      if (result != VK_SUCCESS)
-         return result;
-
-      uint64_t tess_factor_iova = tess_bo->iova;
-      uint64_t tess_param_iova = tess_factor_iova + tess_factor_size;
-
-      if (hs_uses_bo) {
-         tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
-         tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(pipeline->tess.hs_bo_regid) |
-               CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-               CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
-               CP_LOAD_STATE6_0_STATE_BLOCK(SB6_HS_SHADER) |
-               CP_LOAD_STATE6_0_NUM_UNIT(1));
-         tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
-         tu_cs_emit(&cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
-         tu_cs_emit_qw(&cs, tess_param_iova);
-         tu_cs_emit_qw(&cs, tess_factor_iova);
-      }
-
-      if (ds_uses_bo) {
-         tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
-         tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(pipeline->tess.ds_bo_regid) |
-               CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-               CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
-               CP_LOAD_STATE6_0_STATE_BLOCK(SB6_DS_SHADER) |
-               CP_LOAD_STATE6_0_NUM_UNIT(1));
-         tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
-         tu_cs_emit(&cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
-         tu_cs_emit_qw(&cs, tess_param_iova);
-         tu_cs_emit_qw(&cs, tess_factor_iova);
-      }
-
-      *factor_iova = tess_factor_iova;
-   }
-   *state = tu_cs_end_draw_state(&cmd->sub_cs, &cs);
    return VK_SUCCESS;
 }
 
@@ -3928,25 +3872,16 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
          tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_FRAGMENT);
    }
 
-   struct tu_draw_state tess_consts = {};
    if (has_tess) {
-      uint64_t tess_factor_iova = 0;
+      uint32_t subdraw_size;
 
       cmd->state.has_tess = true;
-      result = tu6_emit_tess_consts(cmd, draw_count, pipeline, &tess_consts, &tess_factor_iova);
+      result = tu6_setup_tess(cmd, pipeline, &subdraw_size);
       if (result != VK_SUCCESS)
          return result;
 
-      /* this sequence matches what the blob does before every tess draw
-       * PC_TESSFACTOR_ADDR_LO is a non-context register and needs a wfi
-       * before writing to it
-       */
-      tu_cs_emit_wfi(cs);
-
-      tu_cs_emit_regs(cs, A6XX_PC_TESSFACTOR_ADDR(.qword = tess_factor_iova));
-
       tu_cs_emit_pkt7(cs, CP_SET_SUBDRAW_SIZE, 1);
-      tu_cs_emit(cs, draw_count);
+      tu_cs_emit(cs, subdraw_size);
    }
 
    /* for the first draw in a renderpass, re-emit all the draw states
@@ -3965,7 +3900,6 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state);
       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state);
       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state);
-      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_TESS, tess_consts);
       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI, pipeline->vi.state);
       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state);
       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast_state);
@@ -3991,7 +3925,6 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
        */
       bool emit_binding_stride = false;
       uint32_t draw_state_count =
-         has_tess +
          ((cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) ? 2 : 0) +
          ((cmd->state.dirty & TU_CMD_DIRTY_DESC_SETS_LOAD) ? 1 : 0) +
          ((cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) ? 1 : 0) +
@@ -4007,10 +3940,6 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
       if (draw_state_count > 0)
          tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_count);
 
-      /* We may need to re-emit tess consts if the current draw call is
-         * sufficiently larger than the last draw call. */
-      if (has_tess)
-         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_TESS, tess_consts);
       if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) {
          tu_cs_emit_draw_state(cs, TU_DRAW_STATE_SHADER_GEOM_CONST, cmd->state.shader_const[0]);
          tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const[1]);
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c
index dc8d663a2c6..136f30224dd 100644
--- a/src/freedreno/vulkan/tu_pipeline.c
+++ b/src/freedreno/vulkan/tu_pipeline.c
@@ -1560,6 +1560,8 @@ tu6_emit_geom_tess_consts(struct tu_cs *cs,
                           const struct ir3_shader_variant *gs,
                           uint32_t cps_per_patch)
 {
+   struct tu_device *dev = cs->device;
+
    uint32_t num_vertices =
          hs ? cps_per_patch : gs->shader->nir->info.gs.vertices_in;
 
@@ -1575,29 +1577,49 @@ tu6_emit_geom_tess_consts(struct tu_cs *cs,
 
    if (hs) {
       assert(ds->type != MESA_SHADER_NONE);
-      uint32_t hs_params[4] = {
+
+      /* Create the shared tess factor BO the first time tess is used on the device. */
+      mtx_lock(&dev->mutex);
+      if (!dev->tess_bo.size)
+         tu_bo_init_new(dev, &dev->tess_bo, TU_TESS_BO_SIZE, TU_BO_ALLOC_NO_FLAGS);
+      mtx_unlock(&dev->mutex);
+
+      uint64_t tess_factor_iova = dev->tess_bo.iova;
+      uint64_t tess_param_iova = tess_factor_iova + TU_TESS_FACTOR_SIZE;
+
+      uint32_t hs_params[8] = {
          vs->output_size * num_vertices * 4,  /* hs primitive stride */
          vs->output_size * 4,                 /* hs vertex stride */
          hs->output_size,
          cps_per_patch,
+         tess_param_iova,
+         tess_param_iova >> 32,
+         tess_factor_iova,
+         tess_factor_iova >> 32,
       };
 
       uint32_t hs_base = hs->const_state->offsets.primitive_param;
+      uint32_t hs_param_dwords = MIN2((hs->constlen - hs_base) * 4, ARRAY_SIZE(hs_params));
       tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, hs_base, SB6_HS_SHADER, 0,
-                     ARRAY_SIZE(hs_params), hs_params);
+                     hs_param_dwords, hs_params);
       if (gs)
          num_vertices = gs->shader->nir->info.gs.vertices_in;
 
-      uint32_t ds_params[4] = {
+      uint32_t ds_params[8] = {
          ds->output_size * num_vertices * 4,  /* ds primitive stride */
          ds->output_size * 4,                 /* ds vertex stride */
          hs->output_size,                     /* hs vertex stride (dwords) */
-         hs->shader->nir->info.tess.tcs_vertices_out
+         hs->shader->nir->info.tess.tcs_vertices_out,
+         tess_param_iova,
+         tess_param_iova >> 32,
+         tess_factor_iova,
+         tess_factor_iova >> 32,
       };
 
       uint32_t ds_base = ds->const_state->offsets.primitive_param;
+      uint32_t ds_param_dwords = MIN2((ds->constlen - ds_base) * 4, ARRAY_SIZE(ds_params));
       tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, ds_base, SB6_DS_SHADER, 0,
-                     ARRAY_SIZE(ds_params), ds_params);
+                     ds_param_dwords, ds_params);
    }
 
    if (gs) {
@@ -2716,10 +2738,7 @@ tu_pipeline_builder_parse_tessellation(struct tu_pipeline_builder *builder,
    pipeline->tess.upper_left_domain_origin = !domain_info ||
          domain_info->domainOrigin == VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT;
    const struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL];
-   const struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL];
    pipeline->tess.param_stride = hs->output_size * 4;
-   pipeline->tess.hs_bo_regid = hs->const_state->offsets.primitive_param + 1;
-   pipeline->tess.ds_bo_regid = ds->const_state->offsets.primitive_param + 1;
 }
 
 static void
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index d88fff2ace1..4982416e069 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -399,6 +399,13 @@ struct tu_device
 
    struct tu_bo global_bo;
 
+   /* the blob seems to always use 8K factor and 128K param sizes, copy them */
+#define TU_TESS_FACTOR_SIZE (8 * 1024)
+#define TU_TESS_PARAM_SIZE (128 * 1024)
+#define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE)
+   /* Lazily allocated, protected by the device mutex. */
+   struct tu_bo tess_bo;
+
    struct ir3_shader_variant *global_shaders[GLOBAL_SH_COUNT];
    uint64_t global_shader_va[GLOBAL_SH_COUNT];
 
@@ -536,7 +543,6 @@ enum tu_draw_state_group_id
    TU_DRAW_STATE_PROGRAM_CONFIG,
    TU_DRAW_STATE_PROGRAM,
    TU_DRAW_STATE_PROGRAM_BINNING,
-   TU_DRAW_STATE_TESS,
    TU_DRAW_STATE_VB,
    TU_DRAW_STATE_VI,
    TU_DRAW_STATE_VI_BINNING,
@@ -1025,6 +1031,7 @@ struct tu_cmd_state
 
    bool xfb_used;
    bool has_tess;
+   bool tessfactor_addr_set;
    bool has_subpass_predication;
    bool predication_active;
    bool disable_gmem;
@@ -1253,8 +1260,6 @@ struct tu_pipeline
    {
       uint32_t patch_type;
       uint32_t param_stride;
-      uint32_t hs_bo_regid;
-      uint32_t ds_bo_regid;
       bool upper_left_domain_origin;
    } tess;
 



More information about the mesa-commit mailing list