Mesa (master): anv: Emit pushed UBO bounds checking code in the back-end compiler

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Fri Apr 17 14:57:14 UTC 2020


Module: Mesa
Branch: master
Commit: d0d039a4d3f49f3cc89e167b46edae54cf32a6be
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=d0d039a4d3f49f3cc89e167b46edae54cf32a6be

Author: Jason Ekstrand <jason at jlekstrand.net>
Date:   Fri Apr  3 20:20:53 2020 -0500

anv: Emit pushed UBO bounds checking code in the back-end compiler

This commit fixes performance regressions introduced by e03f9652801ad7
in which we started bounds checking our push constants.  This added a
LOT of shader code to shaders which use the robustBufferAccess feature
and led to substantial spilling.  The checking we just added to the FS
back-end is far more efficient for two reasons:

 1. It can be done at whole-register granularity rather than per
    scalar, so we emit one SIMD8 SEL per 32B GRF rather than one
    SIMD16 SEL (executed as two SELs) for each component loaded.

 2. Because we do it with NoMask instructions, we can do it on whole
    pushed GRFs without splatting them out to SIMD8 or SIMD16 values.
    This means that robust buffer access no longer explodes our register
    pressure for no good reason.

As a tiny side-benefit, we can now use AND instead of SEL, which means
no flag register is needed and scheduling is better.
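
As a rough illustration (not the actual fs_visitor code), the zeroing the
back-end now emits is equivalent to the C sketch below.  push_grf and the
memset are illustrative stand-ins; reg_used, zero_push_reg and the pushed
push_reg_mask follow the semantics documented in brw_compiler.h in the
diff, and the real code does the zeroing with one NoMask SIMD8 AND per
32B GRF rather than a memset:

    #include <stdint.h>
    #include <string.h>

    /* Sketch only: zero every pushed 32B GRF that the shader reads
     * (reg_used), that the compiler asked to be checked (zero_push_reg),
     * and that the driver did not mark as actually bound (push_reg_mask,
     * pushed at push_reg_mask_param).
     */
    static void
    zero_unbound_push_regs(uint8_t push_grf[][32], uint64_t reg_used,
                           uint64_t zero_push_reg, uint64_t push_reg_mask)
    {
       for (unsigned i = 0; i < 64; i++) {
          if (reg_used & zero_push_reg & ~push_reg_mask & (1ull << i))
             memset(push_grf[i], 0, 32);
       }
    }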

Vulkan pipeline database results on ICL:

    Instructions in all programs: 293586059 -> 238009118 (-18.9%)
    SENDs in all programs: 13568515 -> 13568515 (+0.0%)
    Loops in all programs: 149720 -> 149720 (+0.0%)
    Cycles in all programs: 88499234498 -> 84348917496 (-4.7%)
    Spills in all programs: 1229018 -> 184339 (-85.0%)
    Fills in all programs: 1348397 -> 246061 (-81.8%)

This also improves the performance of a few apps:

 - Shadow of the Tomb Raider: +4%
 - Witcher 3: +3.5%
 - UE4 Shooter demo: +2%

Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira at intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4447>

---

 src/intel/compiler/brw_compiler.h              |  13 ++
 src/intel/compiler/brw_fs.cpp                  |  43 ++++++
 src/intel/vulkan/anv_nir_compute_push_layout.c | 190 +++++++------------------
 src/intel/vulkan/anv_private.h                 |   4 +-
 src/intel/vulkan/genX_cmd_buffer.c             |  25 +++-
 5 files changed, 131 insertions(+), 144 deletions(-)

diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 62c7e85e55f..ab39af22684 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -657,6 +657,19 @@ struct brw_stage_prog_data {
    GLuint nr_params;       /**< number of float params/constants */
    GLuint nr_pull_params;
 
+   /* zero_push_reg is a bitfield which indicates what push registers (if any)
+    * should be zeroed by SW at the start of the shader.  The corresponding
+    * push_reg_mask_param specifies the param index (in 32-bit units) where
+    * the actual runtime 64-bit mask will be pushed.  The shader will zero
+    * push reg i if
+    *
+    *    reg_used & zero_push_reg & ~*push_reg_mask_param & (1ull << i)
+    *
+    * If this field is set, brw_compiler::compact_params must be false.
+    */
+   uint64_t zero_push_reg;
+   unsigned push_reg_mask_param;
+
    unsigned curb_read_length;
    unsigned total_scratch;
    unsigned total_shared;
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 4e13dcca54a..b578d82a252 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1604,6 +1604,8 @@ fs_visitor::assign_curb_setup()
 
    prog_data->curb_read_length = uniform_push_length + ubo_push_length;
 
+   uint64_t used = 0;
+
    /* Map the offsets in the UNIFORM file to fixed HW regs. */
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
       for (unsigned int i = 0; i < inst->sources; i++) {
@@ -1625,6 +1627,9 @@ fs_visitor::assign_curb_setup()
                constant_nr = 0;
             }
 
+            assert(constant_nr / 8 < 64);
+            used |= BITFIELD64_BIT(constant_nr / 8);
+
 	    struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
 						  constant_nr / 8,
 						  constant_nr % 8);
@@ -1639,6 +1644,44 @@ fs_visitor::assign_curb_setup()
       }
    }
 
+   uint64_t want_zero = used & stage_prog_data->zero_push_reg;
+   if (want_zero) {
+      assert(!compiler->compact_params);
+      fs_builder ubld = bld.exec_all().group(8, 0).at(
+         cfg->first_block(), cfg->first_block()->start());
+
+      /* push_reg_mask_param is in 32-bit units */
+      unsigned mask_param = stage_prog_data->push_reg_mask_param;
+      struct brw_reg mask = brw_vec1_grf(payload.num_regs + mask_param / 8,
+                                                            mask_param % 8);
+
+      fs_reg b32;
+      for (unsigned i = 0; i < 64; i++) {
+         if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
+            fs_reg shifted = ubld.vgrf(BRW_REGISTER_TYPE_W, 2);
+            ubld.SHL(horiz_offset(shifted, 8),
+                     byte_offset(retype(mask, BRW_REGISTER_TYPE_W), i / 8),
+                     brw_imm_v(0x01234567));
+            ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));
+
+            fs_builder ubld16 = ubld.group(16, 0);
+            b32 = ubld16.vgrf(BRW_REGISTER_TYPE_D);
+            ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
+         }
+
+         if (want_zero & BITFIELD64_BIT(i)) {
+            assert(i < prog_data->curb_read_length);
+            struct brw_reg push_reg =
+               retype(brw_vec8_grf(payload.num_regs + i, 0),
+                      BRW_REGISTER_TYPE_D);
+
+            ubld.AND(push_reg, push_reg, component(b32, i % 16));
+         }
+      }
+
+      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
+   }
+
    /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
    this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length;
 }
diff --git a/src/intel/vulkan/anv_nir_compute_push_layout.c b/src/intel/vulkan/anv_nir_compute_push_layout.c
index 960fb1ca4a3..9fcc2f74e22 100644
--- a/src/intel/vulkan/anv_nir_compute_push_layout.c
+++ b/src/intel/vulkan/anv_nir_compute_push_layout.c
@@ -80,13 +80,15 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
    if (push_ubo_ranges && robust_buffer_access) {
       /* We can't on-the-fly adjust our push ranges because doing so would
        * mess up the layout in the shader.  When robustBufferAccess is
-       * enabled, we have to manually bounds check our pushed UBO accesses.
+       * enabled, we push a mask into the shader indicating which pushed
+       * registers are valid and we zero out the invalid ones at the top of
+       * the shader.
        */
-      const uint32_t ubo_size_start =
-         offsetof(struct anv_push_constants, push_ubo_sizes);
-      const uint32_t ubo_size_end = ubo_size_start + (4 * sizeof(uint32_t));
-      push_start = MIN2(push_start, ubo_size_start);
-      push_end = MAX2(push_end, ubo_size_end);
+      const uint32_t push_reg_mask_start =
+         offsetof(struct anv_push_constants, push_reg_mask);
+      const uint32_t push_reg_mask_end = push_reg_mask_start + sizeof(uint64_t);
+      push_start = MIN2(push_start, push_reg_mask_start);
+      push_end = MAX2(push_end, push_reg_mask_end);
    }
 
    if (nir->info.stage == MESA_SHADER_COMPUTE) {
@@ -121,8 +123,32 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
       .length = DIV_ROUND_UP(push_end - push_start, 32),
    };
 
-   /* Mapping from brw_ubo_range to anv_push_range */
-   int push_range_idx_map[4] = { -1, -1, -1, -1 };
+   if (has_push_intrinsic) {
+      nir_foreach_function(function, nir) {
+         if (!function->impl)
+            continue;
+
+         nir_foreach_block(block, function->impl) {
+            nir_foreach_instr_safe(instr, block) {
+               if (instr->type != nir_instr_type_intrinsic)
+                  continue;
+
+               nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+               switch (intrin->intrinsic) {
+               case nir_intrinsic_load_push_constant:
+                  intrin->intrinsic = nir_intrinsic_load_uniform;
+                  nir_intrinsic_set_base(intrin,
+                                         nir_intrinsic_base(intrin) -
+                                         push_start);
+                  break;
+
+               default:
+                  break;
+               }
+            }
+         }
+      }
+   }
 
    if (push_ubo_ranges) {
       brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
@@ -144,6 +170,16 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
       if (push_constant_range.length > 0)
          map->push_ranges[n++] = push_constant_range;
 
+      if (robust_buffer_access) {
+         const uint32_t push_reg_mask_offset =
+            offsetof(struct anv_push_constants, push_reg_mask);
+         assert(push_reg_mask_offset >= push_start);
+         prog_data->push_reg_mask_param =
+            (push_reg_mask_offset - push_start) / 4;
+      }
+
+      unsigned range_start_reg = push_constant_range.length;
+
       for (int i = 0; i < 4; i++) {
          struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i];
          if (ubo_range->length == 0)
@@ -157,7 +193,6 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
          const struct anv_pipeline_binding *binding =
             &map->surface_to_descriptor[ubo_range->block];
 
-         push_range_idx_map[i] = n;
          map->push_ranges[n++] = (struct anv_push_range) {
             .set = binding->set,
             .index = binding->index,
@@ -165,6 +200,14 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
             .start = ubo_range->start,
             .length = ubo_range->length,
          };
+
+         /* We only bother to shader-zero pushed client UBOs */
+         if (binding->set < MAX_SETS && robust_buffer_access) {
+            prog_data->zero_push_reg |= BITFIELD64_RANGE(range_start_reg,
+                                                         ubo_range->length);
+         }
+
+         range_start_reg += ubo_range->length;
       }
    } else {
       /* For Ivy Bridge, the push constants packets have a different
@@ -178,135 +221,6 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
       map->push_ranges[0] = push_constant_range;
    }
 
-   if (has_push_intrinsic || (push_ubo_ranges && robust_buffer_access)) {
-      nir_foreach_function(function, nir) {
-         if (!function->impl)
-            continue;
-
-         nir_builder b;
-         nir_builder_init(&b, function->impl);
-
-         nir_foreach_block(block, function->impl) {
-            nir_foreach_instr_safe(instr, block) {
-               if (instr->type != nir_instr_type_intrinsic)
-                  continue;
-
-               nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-               switch (intrin->intrinsic) {
-               case nir_intrinsic_load_ubo: {
-                  if (!robust_buffer_access)
-                     break;
-
-                  if (!nir_src_is_const(intrin->src[0]) ||
-                      !nir_src_is_const(intrin->src[1]))
-                     break;
-
-                  uint32_t index = nir_src_as_uint(intrin->src[0]);
-                  uint64_t offset = nir_src_as_uint(intrin->src[1]);
-                  uint32_t size = intrin->num_components *
-                                  (intrin->dest.ssa.bit_size / 8);
-
-                  int ubo_range_idx = -1;
-                  for (unsigned i = 0; i < 4; i++) {
-                     const struct brw_ubo_range *range =
-                        &prog_data->ubo_ranges[i];
-                     if (range->block == index &&
-                         offset + size > range->start * 32 &&
-                         offset < (range->start + range->length) * 32) {
-                        ubo_range_idx = i;
-                        break;
-                     }
-                  }
-
-                  if (ubo_range_idx < 0)
-                     break;
-
-                  b.cursor = nir_after_instr(&intrin->instr);
-
-                  assert(push_range_idx_map[ubo_range_idx] >= 0);
-                  const uint32_t ubo_size_offset =
-                     offsetof(struct anv_push_constants, push_ubo_sizes) +
-                     push_range_idx_map[ubo_range_idx] * sizeof(uint32_t);
-
-                  nir_intrinsic_instr *load_size =
-                     nir_intrinsic_instr_create(b.shader,
-                                                nir_intrinsic_load_uniform);
-                  load_size->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
-                  nir_intrinsic_set_base(load_size,
-                                         ubo_size_offset - push_start);
-                  nir_intrinsic_set_range(load_size, 4);
-                  nir_intrinsic_set_type(load_size, nir_type_uint32);
-                  load_size->num_components = 1;
-                  nir_ssa_dest_init(&load_size->instr, &load_size->dest,
-                                    1, 32, NULL);
-                  nir_builder_instr_insert(&b, &load_size->instr);
-
-                  /* Do the size checks per-component.  Thanks to scalar block
-                   * layout, we could end up with a single vector straddling a
-                   * 32B boundary.
-                   *
-                   * We intentionally push a size starting from the UBO
-                   * binding in the descriptor set rather than starting from
-                   * the started of the pushed range.  This prevents us from
-                   * accidentally flagging things as out-of-bounds due to
-                   * roll-over if a vector access crosses the push range
-                   * boundary.
-                   *
-                   * We align up to 32B so that we can get better CSE.
-                   *
-                   * We check
-                   *
-                   *    offset + size - 1 < push_ubo_sizes[i]
-                   *
-                   * rather than
-                   *
-                   *    offset + size <= push_ubo_sizes[i]
-                   *
-                   * because it properly returns OOB for the case where
-                   * offset + size == 0.
-                   */
-                  nir_const_value last_byte_const[NIR_MAX_VEC_COMPONENTS];
-                  for (unsigned c = 0; c < intrin->dest.ssa.num_components; c++) {
-                     assert(intrin->dest.ssa.bit_size % 8 == 0);
-                     const unsigned comp_size_B = intrin->dest.ssa.bit_size / 8;
-                     const uint32_t comp_last_byte =
-                        align_u32(offset + (c + 1) * comp_size_B,
-                                  ANV_UBO_BOUNDS_CHECK_ALIGNMENT) - 1;
-                     last_byte_const[c] =
-                        nir_const_value_for_uint(comp_last_byte, 32);
-                  }
-                  nir_ssa_def *last_byte =
-                     nir_build_imm(&b, intrin->dest.ssa.num_components, 32,
-                                   last_byte_const);
-                  nir_ssa_def *in_bounds =
-                     nir_ult(&b, last_byte, &load_size->dest.ssa);
-
-                  nir_ssa_def *zero =
-                     nir_imm_zero(&b, intrin->dest.ssa.num_components,
-                                      intrin->dest.ssa.bit_size);
-                  nir_ssa_def *value =
-                     nir_bcsel(&b, in_bounds, &intrin->dest.ssa, zero);
-                  nir_ssa_def_rewrite_uses_after(&intrin->dest.ssa,
-                                                 nir_src_for_ssa(value),
-                                                 value->parent_instr);
-                  break;
-               }
-
-               case nir_intrinsic_load_push_constant:
-                  intrin->intrinsic = nir_intrinsic_load_uniform;
-                  nir_intrinsic_set_base(intrin,
-                                         nir_intrinsic_base(intrin) -
-                                         push_start);
-                  break;
-
-               default:
-                  break;
-               }
-            }
-         }
-      }
-   }
-
    /* Now that we're done computing the push constant portion of the
     * bind map, hash it.  This lets us quickly determine if the actual
     * mapping has changed and not just a no-op pipeline change.
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index af07a1d203a..20ed281d062 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -2477,8 +2477,10 @@ struct anv_push_constants {
    /** Dynamic offsets for dynamic UBOs and SSBOs */
    uint32_t dynamic_offsets[MAX_DYNAMIC_BUFFERS];
 
+   uint64_t push_reg_mask;
+
    /** Pad out to a multiple of 32 bytes */
-   uint32_t push_ubo_sizes[4];
+   uint32_t pad[2];
 
    struct {
       /** Base workgroup ID
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index 13ad1ced3bc..4375b99f873 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -3152,16 +3152,31 @@ cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer,
             &cmd_buffer->state.push_constants[stage];
 
          if (cmd_buffer->device->robust_buffer_access) {
+            push->push_reg_mask = 0;
+            /* Start of the current range in the shader, relative to the start
+             * of push constants in the shader.
+             */
+            unsigned range_start_reg = 0;
             for (unsigned i = 0; i < 4; i++) {
                const struct anv_push_range *range = &bind_map->push_ranges[i];
-               if (range->length == 0) {
-                  push->push_ubo_sizes[i] = 0;
-               } else {
-                  push->push_ubo_sizes[i] =
-                     get_push_range_bound_size(cmd_buffer, stage, range);
+               if (range->length == 0)
+                  continue;
+
+               unsigned bound_size =
+                  get_push_range_bound_size(cmd_buffer, stage, range);
+               if (bound_size >= range->start * 32) {
+                  unsigned bound_regs =
+                     MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
+                          range->length);
+                  assert(range_start_reg + bound_regs <= 64);
+                  push->push_reg_mask |= BITFIELD64_RANGE(range_start_reg,
+                                                          bound_regs);
                }
+
                cmd_buffer->state.push_constants_dirty |=
                   mesa_to_vk_shader_stage(stage);
+
+               range_start_reg += range->length;
             }
          }
 


