Mesa (main): intel: Only set VectorMaskEnable when needed

Fri May 27 22:13:03 UTC 2022

Module: Mesa
Branch: main
Commit: dfedeccc1395e7a43d41165dc09d9ab4e5f16c3c
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=dfedeccc1395e7a43d41165dc09d9ab4e5f16c3c

Author: Jason Ekstrand <jason at jlekstrand.net>
Date:   Fri Jun  7 18:17:36 2019 -0500

intel: Only set VectorMaskEnable when needed

For cases with lots of very small primitives, this may improve
performance because we're not executing those dead channels all the
time.

Shader-db reports no instruction or cycle-count changes.  However, by
hacking up the driver to report when this optimization triggers, it
appears to affect about 10% of shader-db.

v2 (Kenneth Graunke): Always enable VMask prior to XeHP for now,
because using VMask on those platforms allows us to perform the
eliminate_find_live_channel() optimization.  However, XeHP doesn't
seem to have packed fragment shader dispatch, so we lose that
optimization regardless, and there's no reason not to avoid vmask.

Reviewed-by: Kenneth Graunke <kenneth at whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/1054>

---

 src/gallium/drivers/crocus/crocus_state.c |  2 +-
 src/gallium/drivers/iris/iris_state.c     |  2 +-
 src/intel/compiler/brw_compiler.h         |  5 ++++-
 src/intel/compiler/brw_fs.cpp             | 14 ++++++++++++--
 src/intel/compiler/brw_fs_generator.cpp   | 12 +++++++++---
 src/intel/vulkan/genX_pipeline.c          |  3 ++-
 6 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/crocus/crocus_state.c b/src/gallium/drivers/crocus/crocus_state.c
index 440a7870652..74549291a81 100644
--- a/src/gallium/drivers/crocus/crocus_state.c
+++ b/src/gallium/drivers/crocus/crocus_state.c
@@ -6441,7 +6441,7 @@ crocus_upload_dirty_render_state(struct crocus_context *ice,
           * incorrect for subspans where some of the pixels are unlit.  We believe
           * the bit just didn't take effect in previous generations.
           */
-         ps.VectorMaskEnable = GFX_VER >= 8;
+         ps.VectorMaskEnable = GFX_VER >= 8 && wm_prog_data->uses_vmask;
 
          ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
          ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c
index 24dae802611..00114113062 100644
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -4654,7 +4654,7 @@ iris_store_fs_state(const struct intel_device_info *devinfo,
    uint32_t *psx_state = ps_state + GENX(3DSTATE_PS_length);
 
    iris_pack_command(GENX(3DSTATE_PS), ps_state, ps) {
-      ps.VectorMaskEnable = true;
+      ps.VectorMaskEnable = wm_prog_data->uses_vmask;
       ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
       ps.FloatingPointMode = prog_data->use_alt_mode;
       ps.MaximumNumberofThreadsPerPSD =
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index ef4c8650ede..62433320107 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -887,6 +887,7 @@ struct brw_wm_prog_data {
    bool uses_src_w;
    bool uses_depth_w_coefficients;
    bool uses_sample_mask;
+   bool uses_vmask;
    bool has_render_target_reads;
    bool has_side_effects;
    bool pulls_bary;
@@ -1967,7 +1968,9 @@ brw_stage_has_packed_dispatch(ASSERTED const struct intel_device_info *devinfo,
        */
       const struct brw_wm_prog_data *wm_prog_data =
          (const struct brw_wm_prog_data *)prog_data;
-      return devinfo->verx10 < 125 && !wm_prog_data->persample_dispatch;
+      return devinfo->verx10 < 125 &&
+             !wm_prog_data->persample_dispatch &&
+             wm_prog_data->uses_vmask;
    }
    case MESA_SHADER_COMPUTE:
       /* Compute shaders will be spawned with either a fully enabled dispatch
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index c02faf65021..84167496b99 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -9824,6 +9824,14 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader,
       (prog_data->computed_depth_mode == BRW_PSCDEPTH_OFF) &&
       !prog_data->computed_stencil;
 
+   /* We choose to always enable VMask prior to XeHP, as it would cause
+    * us to lose out on the eliminate_find_live_channel() optimization.
+    */
+   prog_data->uses_vmask = devinfo->verx10 < 125 ||
+                           shader->info.fs.needs_quad_helper_invocations ||
+                           shader->info.fs.needs_all_helper_invocations ||
+                           prog_data->per_coarse_pixel_dispatch;
+
    prog_data->uses_src_w =
       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
    prog_data->uses_src_depth =
@@ -10569,13 +10577,15 @@ static UNUSED void
 brw_fs_test_dispatch_packing(const fs_builder &bld)
 {
    const gl_shader_stage stage = bld.shader->stage;
+   const bool uses_vmask =
+      stage == MESA_SHADER_FRAGMENT &&
+      brw_wm_prog_data(bld.shader->stage_prog_data)->uses_vmask;
 
    if (brw_stage_has_packed_dispatch(bld.shader->devinfo, stage,
                                      bld.shader->stage_prog_data)) {
       const fs_builder ubld = bld.exec_all().group(1, 0);
       const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0);
-      const fs_reg mask = (stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
-                           brw_dmask_reg());
+      const fs_reg mask = uses_vmask ? brw_vmask_reg() : brw_dmask_reg();
 
       ubld.ADD(tmp, mask, brw_imm_ud(1));
       ubld.AND(tmp, mask, tmp);
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index 54db3b6721b..5a4b2fb35e6 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -2411,21 +2411,27 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
          break;
 
       case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
+         const bool uses_vmask =
+            stage == MESA_SHADER_FRAGMENT &&
+            brw_wm_prog_data(this->prog_data)->uses_vmask;
          const struct brw_reg mask =
             brw_stage_has_packed_dispatch(devinfo, stage,
                                           prog_data) ? brw_imm_ud(~0u) :
-            stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
-            brw_dmask_reg();
+            uses_vmask ? brw_vmask_reg() : brw_dmask_reg();
 
          brw_find_live_channel(p, dst, mask, false);
          break;
       }
       case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL: {
+         const bool uses_vmask =
+            stage == MESA_SHADER_FRAGMENT &&
+            brw_wm_prog_data(this->prog_data)->uses_vmask;
+
          /* ce0 doesn't consider the thread dispatch mask, so if we want
           * to find the true last enabled channel, we need to apply that too.
           */
          const struct brw_reg mask =
-            stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() : brw_dmask_reg();
+            uses_vmask ? brw_vmask_reg() : brw_dmask_reg();
 
          brw_find_live_channel(p, dst, mask, true);
          break;
diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
index 0fa95ed86b6..192fec6b6e3 100644
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -2351,7 +2351,8 @@ emit_3dstate_ps(struct anv_graphics_pipeline *pipeline,
                                brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
 
       ps.SingleProgramFlow          = false;
-      ps.VectorMaskEnable           = GFX_VER >= 8;
+      ps.VectorMaskEnable           = GFX_VER >= 8 &&
+                                      wm_prog_data->uses_vmask;
       /* Wa_1606682166 */
       ps.SamplerCount               = GFX_VER == 11 ? 0 : get_sampler_count(fs_bin);
       ps.BindingTableEntryCount     = fs_bin->bind_map.surface_count;