Mesa (main): turnip: consider render pass costs in autotune

Wed Jun 8 13:20:11 UTC 2022

Module: Mesa
Branch: main
Commit: 5c17a042826b5cd26b4cd2763f07784d464b5645
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=5c17a042826b5cd26b4cd2763f07784d464b5645

Author: Chia-I Wu <olvaffe at gmail.com>
Date:   Thu May 26 14:40:49 2022 -0700

turnip: consider render pass costs in autotune

To be able to sum drawcall cost and render pass cost, the units of costs
are changed to bytes.  With that, tu_autotune_use_bypass can make
decisions by comparing the costs of sysmem rendering and gmem rendering.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16733>

---

 src/freedreno/vulkan/tu_autotune.c   | 86 +++++++++++++++++++++++-------------
 src/freedreno/vulkan/tu_cmd_buffer.c | 14 +++---
 src/freedreno/vulkan/tu_pipeline.c   | 36 +++++++++++----
 src/freedreno/vulkan/tu_private.h    | 30 ++++++-------
 4 files changed, 107 insertions(+), 59 deletions(-)

diff --git a/src/freedreno/vulkan/tu_autotune.c b/src/freedreno/vulkan/tu_autotune.c
index 57ebd11cda3..2446cc6a129 100644
--- a/src/freedreno/vulkan/tu_autotune.c
+++ b/src/freedreno/vulkan/tu_autotune.c
@@ -491,6 +491,24 @@ fallback_use_bypass(const struct tu_render_pass *pass,
    return true;
 }
 
+static uint32_t
+get_render_pass_pixel_count(const struct tu_cmd_buffer *cmd)
+{
+   const VkExtent2D area = cmd->state.render_area.extent; /* local copy */
+   return area.width * area.height; /* pixels covered by the render area */
+}
+
+static uint64_t
+estimate_drawcall_bandwidth(const struct tu_cmd_buffer *cmd,
+                            uint32_t avg_renderpass_sample_count)
+{
+   const struct tu_cmd_state *state = &cmd->state;
+   /* sample count times drawcall_bandwidth_per_sample; 0 if no draws */
+   return state->drawcall_count == 0 ? 0 :
+      (uint64_t)avg_renderpass_sample_count *
+      state->drawcall_bandwidth_per_sample_sum / state->drawcall_count;
+}
+
 bool
 tu_autotune_use_bypass(struct tu_autotune *at,
                        struct tu_cmd_buffer *cmd_buffer,
@@ -539,40 +557,46 @@ tu_autotune_use_bypass(struct tu_autotune *at,
 
    uint32_t avg_samples = 0;
    if (get_history(at, renderpass_key, &avg_samples)) {
-      /* TODO we should account for load/stores/clears/resolves especially
-       * with low drawcall count and ~fb_size samples passed, in D3D11 games
-       * we are seeing many renderpasses like:
-       *  - color attachment load
-       *  - single fullscreen draw
-       *  - color attachment store
+      const uint32_t pass_pixel_count =
+         get_render_pass_pixel_count(cmd_buffer);
+      uint64_t sysmem_bandwidth =
+         (uint64_t)pass->sysmem_bandwidth_per_pixel * pass_pixel_count;
+      uint64_t gmem_bandwidth =
+         (uint64_t)pass->gmem_bandwidth_per_pixel * pass_pixel_count;
+
+      const uint64_t total_draw_call_bandwidth =
+         estimate_drawcall_bandwidth(cmd_buffer, avg_samples);
+
+      /* drawcalls access the memory in sysmem rendering (ignoring CCU) */
+      sysmem_bandwidth += total_draw_call_bandwidth;
+
+      /* drawcalls access gmem in gmem rendering, but we do not want to ignore
+       * them completely.  The state changes between tiles also have an
+       * overhead.  The magic numbers of 11 and 10 are randomly chosen.
        */
+      gmem_bandwidth = (gmem_bandwidth * 11 + total_draw_call_bandwidth) / 10;
 
-      /* Low sample count could mean there was only a clear.. or there was
-       * a clear plus draws that touch no or few samples
-       */
-      if (avg_samples < 500) {
-         if (TU_AUTOTUNE_DEBUG_LOG) {
-            mesa_logi("%016"PRIx64":%u\t avg_samples=%u selecting sysmem",
-               renderpass_key, cmd_buffer->state.drawcall_count, avg_samples);
-         }
-         return true;
-      }
-
-      /* Cost-per-sample is an estimate for the average number of reads+
-       * writes for a given passed sample.
-       */
-      float sample_cost = cmd_buffer->state.total_drawcalls_cost;
-      sample_cost /= cmd_buffer->state.drawcall_count;
-
-      float single_draw_cost = (avg_samples * sample_cost) / cmd_buffer->state.drawcall_count;
-
-      bool select_sysmem = single_draw_cost < 6000.0;
-
+      const bool select_sysmem = sysmem_bandwidth <= gmem_bandwidth;
       if (TU_AUTOTUNE_DEBUG_LOG) {
-         mesa_logi("%016"PRIx64":%u\t avg_samples=%u, "
-             "sample_cost=%f, single_draw_cost=%f selecting %s",
-             renderpass_key, cmd_buffer->state.drawcall_count, avg_samples,
-             sample_cost, single_draw_cost, select_sysmem ? "sysmem" : "gmem");
+         const VkExtent2D *extent = &cmd_buffer->state.render_area.extent;
+         const float drawcall_bandwidth_per_sample =
+            (float)cmd_buffer->state.drawcall_bandwidth_per_sample_sum /
+            cmd_buffer->state.drawcall_count;
+
+         mesa_logi("autotune %016" PRIx64 ":%u selecting %s",
+               renderpass_key,
+               cmd_buffer->state.drawcall_count,
+               select_sysmem ? "sysmem" : "gmem");
+         mesa_logi("   avg_samples=%u, draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64,
+               avg_samples,
+               drawcall_bandwidth_per_sample,
+               total_draw_call_bandwidth);
+         mesa_logi("   render_area=%ux%u, sysmem_bandwidth_per_pixel=%u, gmem_bandwidth_per_pixel=%u",
+               extent->width, extent->height,
+               pass->sysmem_bandwidth_per_pixel,
+               pass->gmem_bandwidth_per_pixel);
+         mesa_logi("   sysmem_bandwidth=%" PRIu64 ", gmem_bandwidth=%" PRIu64,
+               sysmem_bandwidth, gmem_bandwidth);
       }
 
       return select_sysmem;
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index 4ef64585922..65f15eec0cc 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -3991,17 +3991,21 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
    /* Fill draw stats for autotuner */
    cmd->state.drawcall_count++;
 
-   cmd->state.total_drawcalls_cost += cmd->state.pipeline->drawcall_base_cost;
+   cmd->state.drawcall_bandwidth_per_sample_sum +=
+      cmd->state.pipeline->color_bandwidth_per_sample;
 
    /* add depth memory bandwidth cost */
+   const uint32_t depth_bandwidth = cmd->state.pipeline->depth_cpp_per_sample;
    if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE)
-      cmd->state.total_drawcalls_cost++;
+      cmd->state.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
    if (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE)
-      cmd->state.total_drawcalls_cost++;
+      cmd->state.drawcall_bandwidth_per_sample_sum += depth_bandwidth;
 
    /* add stencil memory bandwidth cost */
+   const uint32_t stencil_bandwidth =
+      cmd->state.pipeline->stencil_cpp_per_sample;
    if (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE)
-      cmd->state.total_drawcalls_cost += 2;
+      cmd->state.drawcall_bandwidth_per_sample_sum += stencil_bandwidth * 2;
 
    tu_emit_cache_flush_renderpass(cmd, cs);
 
@@ -4808,7 +4812,7 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
    cmd_buffer->state.has_subpass_predication = false;
    cmd_buffer->state.disable_gmem = false;
    cmd_buffer->state.drawcall_count = 0;
-   cmd_buffer->state.total_drawcalls_cost = 0;
+   cmd_buffer->state.drawcall_bandwidth_per_sample_sum = 0;
 
    /* LRZ is not valid next time we use it */
    cmd_buffer->state.lrz.valid = false;
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c
index 04f69883154..d7cdc7936ca 100644
--- a/src/freedreno/vulkan/tu_pipeline.c
+++ b/src/freedreno/vulkan/tu_pipeline.c
@@ -2145,10 +2145,10 @@ tu6_emit_rb_mrt_controls(struct tu_cs *cs,
                          const VkPipelineColorBlendStateCreateInfo *blend_info,
                          const VkFormat attachment_formats[MAX_RTS],
                          uint32_t *blend_enable_mask,
-                         uint8_t *drawcall_base_cost)
+                         uint32_t *color_bandwidth_per_sample)
 {
    *blend_enable_mask = 0;
-   *drawcall_base_cost = 0;
+   *color_bandwidth_per_sample = 0;
 
    bool rop_reads_dst = false;
    uint32_t rb_mrt_control_rop = 0;
@@ -2159,7 +2159,7 @@ tu6_emit_rb_mrt_controls(struct tu_cs *cs,
          A6XX_RB_MRT_CONTROL_ROP_CODE(tu6_rop(blend_info->logicOp));
    }
 
-   uint32_t total_comps = 0;
+   uint32_t total_bpp = 0;
    for (uint32_t i = 0; i < blend_info->attachmentCount; i++) {
       const VkPipelineColorBlendAttachmentState *att =
          &blend_info->pAttachments[i];
@@ -2169,17 +2169,29 @@ tu6_emit_rb_mrt_controls(struct tu_cs *cs,
       uint32_t rb_mrt_blend_control = 0;
       if (format != VK_FORMAT_UNDEFINED) {
          const bool has_alpha = vk_format_has_alpha(format);
-         const uint32_t write_comps = util_bitcount(att->colorWriteMask);
 
          rb_mrt_control =
             tu6_rb_mrt_control(att, rb_mrt_control_rop, has_alpha);
          rb_mrt_blend_control = tu6_rb_mrt_blend_control(att, has_alpha);
 
-         total_comps += write_comps;
+         /* calculate bpp based on format and write mask */
+         uint32_t write_bpp = 0;
+         if (att->colorWriteMask == 0xf) {
+            write_bpp = vk_format_get_blocksizebits(format);
+         } else {
+            const enum pipe_format pipe_format = vk_format_to_pipe_format(format);
+            for (uint32_t i = 0; i < 4; i++) {
+               if (att->colorWriteMask & (1 << i)) {
+                  write_bpp += util_format_get_component_bits(pipe_format,
+                        UTIL_FORMAT_COLORSPACE_RGB, i);
+               }
+            }
+         }
+         total_bpp += write_bpp;
 
          if (att->blendEnable || rop_reads_dst) {
             *blend_enable_mask |= 1 << i;
-            total_comps += write_comps;
+            total_bpp += write_bpp;
          }
       }
 
@@ -2188,7 +2200,7 @@ tu6_emit_rb_mrt_controls(struct tu_cs *cs,
       tu_cs_emit(cs, rb_mrt_blend_control);
    }
 
-   *drawcall_base_cost = total_comps / 4;
+   *color_bandwidth_per_sample = total_bpp / 8;
 }
 
 static void
@@ -3364,6 +3376,8 @@ tu_pipeline_builder_parse_depth_stencil(struct tu_pipeline_builder *builder,
       builder->create_info->pDepthStencilState;
    const VkPipelineRasterizationStateCreateInfo *rast_info =
       builder->create_info->pRasterizationState;
+   const enum pipe_format pipe_format =
+      vk_format_to_pipe_format(builder->depth_attachment_format);
    uint32_t rb_depth_cntl = 0, rb_stencil_cntl = 0;
    struct tu_cs cs;
 
@@ -3387,6 +3401,9 @@ tu_pipeline_builder_parse_depth_stencil(struct tu_pipeline_builder *builder,
 
       if (ds_info->depthBoundsTestEnable && !ds_info->depthTestEnable)
          tu6_apply_depth_bounds_workaround(builder->device, &rb_depth_cntl);
+
+      pipeline->depth_cpp_per_sample = util_format_get_component_bits(
+            pipe_format, UTIL_FORMAT_COLORSPACE_ZS, 0) / 8;
    } else {
       /* if RB_DEPTH_CNTL is set dynamically, we need to make sure it is set
        * to 0 when this pipeline is used, as enabling depth test when there
@@ -3416,6 +3433,9 @@ tu_pipeline_builder_parse_depth_stencil(struct tu_pipeline_builder *builder,
             A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
             A6XX_RB_STENCIL_CONTROL_STENCIL_READ;
       }
+
+      pipeline->stencil_cpp_per_sample = util_format_get_component_bits(
+            pipe_format, UTIL_FORMAT_COLORSPACE_ZS, 1) / 8;
    }
 
    if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 2)) {
@@ -3505,7 +3525,7 @@ tu_pipeline_builder_parse_multisample_and_color_blend(
    tu6_emit_rb_mrt_controls(&cs, blend_info,
                             builder->color_attachment_formats,
                             &blend_enable_mask,
-                            &pipeline->drawcall_base_cost);
+                            &pipeline->color_bandwidth_per_sample);
 
    tu6_emit_blend_control(&cs, blend_enable_mask,
                           builder->use_dual_src_blend, msaa_info);
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index 2992d40aeba..2f7444c6d46 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -1223,27 +1223,24 @@ struct tu_cmd_state
     * to:
     *
     *    foreach_draw (...) {
-    *      cost += num_frag_outputs;
-    *      if (blend_enabled)
-    *        cost += num_blend_enabled;
+    *      sum += pipeline->color_bandwidth_per_sample;
     *      if (depth_test_enabled)
-    *        cost++;
+    *        sum += pipeline->depth_cpp_per_sample;
     *      if (depth_write_enabled)
-    *        cost++;
+    *        sum += pipeline->depth_cpp_per_sample;
+    *      if (stencil_enabled)
+    *        sum += pipeline->stencil_cpp_per_sample * 2;
     *    }
+    *    drawcall_bandwidth_per_sample = sum / drawcall_count;
     *
-    * The idea is that each sample-passed minimally does one write
-    * per MRT.  If blend is enabled, the hw will additionally do
-    * a framebuffer read per sample-passed (for each MRT with blend
-    * enabled).  If depth-test is enabled, the hw will additionally
-    * a depth buffer read.  If depth-write is enable, the hw will
-    * additionally do a depth buffer write.
+    * It allows us to estimate the total bandwidth of drawcalls later, by
+    * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
     *
     * This does ignore depth buffer traffic for samples which do not
-    * pass do to depth-test fail, and some other details.  But it is
+    * pass due to depth-test fail, and some other details.  But it is
     * just intended to be a rough estimate that is easy to calculate.
     */
-   uint32_t total_drawcalls_cost;
+   uint32_t drawcall_bandwidth_per_sample_sum;
 
    struct tu_lrz_state lrz;
 
@@ -1506,8 +1503,11 @@ struct tu_pipeline
 
    bool z_negative_one_to_one;
 
-   /* Base drawcall cost for sysmem vs gmem autotuner */
-   uint8_t drawcall_base_cost;
+   /* memory bandwidth cost (in bytes) for color attachments */
+   uint32_t color_bandwidth_per_sample;
+
+   uint32_t depth_cpp_per_sample;
+   uint32_t stencil_cpp_per_sample;
 
    void *executables_mem_ctx;
    /* tu_pipeline_executable */