[Mesa-dev] [PATCH 4/4] OPTIONAL: anv/gen9: Optimize slice and subslice load balancing behavior.

Sat Aug 10 00:20:39 UTC 2019

See "i965/gen9: Optimize slice and subslice load balancing behavior."
for the rationale.  Marked optional because no performance evaluation
has been done on this commit, it is provided to match the hashing
settings of the Iris driver.  Test reports welcome.
---
 src/intel/vulkan/anv_genX.h        |  4 ++
 src/intel/vulkan/anv_private.h     |  6 ++
 src/intel/vulkan/genX_blorp_exec.c |  6 ++
 src/intel/vulkan/genX_cmd_buffer.c | 96 ++++++++++++++++++++++++++++++
 4 files changed, 112 insertions(+)

diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
index a5435e566a3..06c6b467acf 100644
--- a/src/intel/vulkan/anv_genX.h
+++ b/src/intel/vulkan/anv_genX.h
@@ -44,6 +44,10 @@ void genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer);
 
 void genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer);
 
+void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
+                                        unsigned width, unsigned height,
+                                        unsigned scale);
+
 void genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer);
 void genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer);
 
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 2465f264354..b381386a716 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -2421,6 +2421,12 @@ struct anv_cmd_state {
 
    bool                                         conditional_render_enabled;
 
+   /**
+    * Last rendering scale argument provided to
+    * genX(cmd_buffer_emit_hashing_mode)().
+    */
+   unsigned                                     current_hash_scale;
+
    /**
     * Array length is anv_cmd_state::pass::attachment_count. Array content is
     * valid only when recording a render pass instance.
diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c
index 1592e7f7e3d..e9eedc06696 100644
--- a/src/intel/vulkan/genX_blorp_exec.c
+++ b/src/intel/vulkan/genX_blorp_exec.c
@@ -223,6 +223,12 @@ genX(blorp_exec)(struct blorp_batch *batch,
       genX(cmd_buffer_config_l3)(cmd_buffer, cfg);
    }
 
+   const unsigned scale = params->fast_clear_op ? UINT_MAX : 1;
+   if (cmd_buffer->state.current_hash_scale != scale) {
+      genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, params->x1 - params->x0,
+                                         params->y1 - params->y0, scale);
+   }
+
 #if GEN_GEN >= 11
    /* The PIPE_CONTROL command description says:
     *
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index 86ef1663ac4..e9e5570d49f 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -1595,6 +1595,7 @@ genX(CmdExecuteCommands)(
     */
    primary->state.current_pipeline = UINT32_MAX;
    primary->state.current_l3_config = NULL;
+   primary->state.current_hash_scale = 0;
 
    /* Each of the secondary command buffers will use its own state base
     * address.  We need to re-emit state base address for the primary after
@@ -2663,6 +2664,9 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
 
    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->urb.l3_config);
 
+   if (cmd_buffer->state.current_hash_scale != 1)
+      genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
+
    genX(flush_pipeline_select_3d)(cmd_buffer);
 
    if (vb_emit) {
@@ -3925,6 +3929,98 @@ genX(cmd_buffer_emit_gen7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
    }
 }
 
+/**
+ * Update the pixel hashing modes that determine the balancing of PS threads
+ * across subslices and slices.
+ *
+ * \param width Width bound of the rendering area (already scaled down if \p
+ *              scale is greater than 1).
+ * \param height Height bound of the rendering area (already scaled down if \p
+ *               scale is greater than 1).
+ * \param scale The number of framebuffer samples that could potentially be
+ *              affected by an individual channel of the PS thread.  This is
+ *              typically one for single-sampled rendering, but for operations
+ *              like CCS resolves and fast clears a single PS invocation may
+ *              update a huge number of pixels, in which case a finer
+ *              balancing is desirable in order to maximally utilize the
+ *              bandwidth available.  UINT_MAX can be used as shorthand for
+ *              "finest hashing mode available".
+ */
+void
+genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
+                                   unsigned width, unsigned height,
+                                   unsigned scale)
+{
+#if GEN_GEN == 9
+   const struct gen_device_info *devinfo = &cmd_buffer->device->info;
+   const unsigned slice_hashing[] = {
+      /* Because all Gen9 platforms with more than one slice require
+       * three-way subslice hashing, a single "normal" 16x16 slice hashing
+       * block is guaranteed to suffer from substantial imbalance, with one
+       * subslice receiving twice as much work as the other two in the
+       * slice.
+       *
+       * The performance impact of that would be particularly severe when
+       * three-way hashing is also in use for slice balancing (which is the
+       * case for all Gen9 GT4 platforms), because one of the slices
+       * receives one every three 16x16 blocks in either direction, which
+       * is roughly the periodicity of the underlying subslice imbalance
+       * pattern ("roughly" because in reality the hardware's
+       * implementation of three-way hashing doesn't do exact modulo 3
+       * arithmetic, which somewhat decreases the magnitude of this effect
+       * in practice).  This leads to a systematic subslice imbalance
+       * within that slice regardless of the size of the primitive.  The
+       * 32x32 hashing mode guarantees that the subslice imbalance within a
+       * single slice hashing block is minimal, largely eliminating this
+       * effect.
+       */
+      _32x32,
+      /* Finest slice hashing mode available. */
+      NORMAL
+   };
+   const unsigned subslice_hashing[] = {
+      /* 16x16 would provide a slight cache locality benefit especially
+       * visible in the sampler L1 cache efficiency of low-bandwidth
+       * non-LLC platforms, but it comes at the cost of greater subslice
+       * imbalance for primitives of dimensions approximately intermediate
+       * between 16x4 and 16x16.
+       */
+      _16x4,
+      /* Finest subslice hashing mode available. */
+      _8x4
+   };
+   /* Dimensions of the smallest hashing block of a given hashing mode.  If
+    * the rendering area is smaller than this there can't possibly be any
+    * benefit from switching to this mode, so we optimize out the
+    * transition.
+    */
+   const unsigned min_size[][2] = {
+         { 16, 4 },
+         { 8, 4 }
+   };
+   const unsigned idx = scale > 1;
+
+   if (width > min_size[idx][0] || height > min_size[idx][1]) {
+      uint32_t gt_mode;
+
+      anv_pack_struct(&gt_mode, GENX(GT_MODE),
+                      .SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0),
+                      .SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0),
+                      .SubsliceHashing = subslice_hashing[idx],
+                      .SubsliceHashingMask = -1);
+
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+         pc.StallAtPixelScoreboard = true;
+         pc.CommandStreamerStallEnable = true;
+      }
+
+      emit_lri(&cmd_buffer->batch, GENX(GT_MODE_num), gt_mode);
+
+      cmd_buffer->state.current_hash_scale = scale;
+   }
+#endif
+}
+
 static void
 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
 {
-- 
2.22.0