Mesa (main): radv,aco: use all attributes in a binding to obtain an alignment for fetch

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Mon Jun 14 10:04:21 UTC 2021


Module: Mesa
Branch: main
Commit: 91f8f82806b60100f4eed186ed19170ca7086ce8
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=91f8f82806b60100f4eed186ed19170ca7086ce8

Author: Rhys Perry <pendingchaos02 at gmail.com>
Date:   Fri Feb 12 10:48:58 2021 +0000

radv,aco: use all attributes in a binding to obtain an alignment for fetch

Instead of assuming scalar alignment for an attribute, we can use the
required alignment of other attributes in a binding to infer a higher
one.

This uses the alignment of all attributes in the pipeline, not just the
ones loaded. This can create slightly better code, but could break
pipelines which relied on unused (and unaligned) attributes not being
loaded. I don't think such pipelines are allowed by the spec.

fossil-db (Sienna Cichlid):
Totals from 44350 (30.32% of 146267) affected shaders:
VGPRs: 1694464 -> 1700616 (+0.36%); split: -0.08%, +0.44%
CodeSize: 60207184 -> 58093836 (-3.51%); split: -3.51%, +0.00%
MaxWaves: 1175998 -> 1174948 (-0.09%); split: +0.02%, -0.11%
Instrs: 11763444 -> 11458952 (-2.59%); split: -2.60%, +0.01%
Latency: 70679612 -> 67062215 (-5.12%); split: -5.27%, +0.15%
InvThroughput: 11482495 -> 11362911 (-1.04%); split: -1.20%, +0.16%
VClause: 359459 -> 343248 (-4.51%); split: -6.36%, +1.85%
SClause: 422404 -> 419229 (-0.75%); split: -1.17%, +0.42%
Copies: 754384 -> 764368 (+1.32%); split: -1.74%, +3.06%
Branches: 197472 -> 197474 (+0.00%); split: -0.03%, +0.03%
PreVGPRs: 1215348 -> 1215503 (+0.01%)

Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9007>

---

 src/amd/compiler/aco_instruction_selection.cpp | 22 ++++++++++++++--------
 src/amd/vulkan/radv_pipeline.c                 | 13 +++++++++++++
 src/amd/vulkan/radv_private.h                  |  1 +
 src/amd/vulkan/radv_shader.h                   |  1 +
 4 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 66ac6854eff..112e4c3f227 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -4592,23 +4592,25 @@ void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr
 }
 
 bool check_vertex_fetch_size(isel_context *ctx, const ac_data_format_info *vtx_info,
-                             unsigned offset, unsigned stride, unsigned channels)
+                             unsigned offset, unsigned binding_align, unsigned channels)
 {
+   unsigned vertex_byte_size = vtx_info->chan_byte_size * channels;
    if (vtx_info->chan_byte_size != 4 && channels == 3)
       return false;
 
-   /* Always split typed vertex buffer loads on GFX6 and GFX10+ to avoid any
+   /* Split typed vertex buffer loads on GFX6 and GFX10+ to avoid any
     * alignment issues that triggers memory violations and eventually a GPU
     * hang. This can happen if the stride (static or dynamic) is unaligned and
     * also if the VBO offset is aligned to a scalar (eg. stride is 8 and VBO
     * offset is 2 for R16G16B16A16_SNORM).
     */
    return (ctx->options->chip_class >= GFX7 && ctx->options->chip_class <= GFX9) ||
-          (channels == 1);
+          (offset % vertex_byte_size == 0 && MAX2(binding_align, 1) % vertex_byte_size == 0);
 }
 
 uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_info,
-                              unsigned offset, unsigned stride, unsigned *channels)
+                              unsigned offset, unsigned stride, unsigned *channels,
+                              unsigned binding_align)
 {
    if (!vtx_info->chan_byte_size) {
       *channels = vtx_info->num_channels;
@@ -4616,10 +4618,11 @@ uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_
    }
 
    unsigned num_channels = *channels;
-   if (!check_vertex_fetch_size(ctx, vtx_info, offset, stride, *channels)) {
+   if (!check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, *channels)) {
       unsigned new_channels = num_channels + 1;
       /* first, assume more loads is worse and try using a larger data format */
-      while (new_channels <= 4 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels)) {
+      while (new_channels <= 4 &&
+             !check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels)) {
          new_channels++;
          /* don't make the attribute potentially out-of-bounds */
          if (offset + new_channels * vtx_info->chan_byte_size > stride)
@@ -4629,7 +4632,8 @@ uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_
       if (new_channels == 5) {
          /* then try decreasing load size (at the cost of more loads) */
          new_channels = *channels;
-         while (new_channels > 1 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels))
+         while (new_channels > 1 &&
+                !check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels))
             new_channels--;
       }
 
@@ -4708,6 +4712,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
       uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
       uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
       unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
+      unsigned binding_align = ctx->options->key.vs.vertex_binding_align[attrib_binding];
       enum ac_fetch_format alpha_adjust = ctx->options->key.vs.alpha_adjust[location];
 
       unsigned dfmt = attrib_format & 0xf;
@@ -4776,7 +4781,8 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
                           vtx_info->chan_byte_size == 4;
          unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID;
          if (!use_mubuf) {
-            fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_component);
+            fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_component,
+                                               binding_align);
          } else {
             if (fetch_component == 3 && ctx->options->chip_class == GFX6) {
                /* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 36019505948..8ebe0cd9f81 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -2585,6 +2585,17 @@ radv_generate_graphics_pipeline_key(const struct radv_pipeline *pipeline,
       key.vertex_attribute_bindings[location] = desc->binding;
       key.vertex_attribute_offsets[location] = desc->offset;
 
+      const struct ac_data_format_info *dfmt_info = ac_get_data_format_info(data_format);
+      unsigned attrib_align =
+         dfmt_info->chan_byte_size ? dfmt_info->chan_byte_size : dfmt_info->element_size;
+
+      /* If desc->offset is misaligned, then the buffer offset must be too. Just
+       * skip updating vertex_binding_align in this case.
+       */
+      if (desc->offset % attrib_align == 0)
+         key.vertex_binding_align[desc->binding] =
+            MAX2(key.vertex_binding_align[desc->binding], attrib_align);
+
       if (!uses_dynamic_stride) {
          /* From the Vulkan spec 1.2.157:
           *
@@ -2708,6 +2719,8 @@ radv_fill_shader_keys(struct radv_device *device, struct radv_shader_variant_key
       keys[MESA_SHADER_VERTEX].vs.vertex_attribute_strides[i] = key->vertex_attribute_strides[i];
       keys[MESA_SHADER_VERTEX].vs.alpha_adjust[i] = key->vertex_alpha_adjust[i];
    }
+   for (unsigned i = 0; i < MAX_VBS; ++i)
+      keys[MESA_SHADER_VERTEX].vs.vertex_binding_align[i] = key->vertex_binding_align[i];
    keys[MESA_SHADER_VERTEX].vs.outprim = si_conv_prim_to_gs_out(key->topology);
    keys[MESA_SHADER_VERTEX].vs.provoking_vtx_last = key->provoking_vtx_last;
 
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index c14b2babf7a..e3feb84e613 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -357,6 +357,7 @@ struct radv_pipeline_key {
    uint32_t vertex_attribute_bindings[MAX_VERTEX_ATTRIBS];
    uint32_t vertex_attribute_offsets[MAX_VERTEX_ATTRIBS];
    uint32_t vertex_attribute_strides[MAX_VERTEX_ATTRIBS];
+   uint8_t vertex_binding_align[MAX_VBS];
    enum ac_fetch_format vertex_alpha_adjust[MAX_VERTEX_ATTRIBS];
    uint32_t vertex_post_shuffle;
    unsigned tess_input_vertices;
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index 86aa86e902d..33614e68b7b 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -66,6 +66,7 @@ struct radv_vs_variant_key {
    uint32_t vertex_attribute_bindings[MAX_VERTEX_ATTRIBS];
    uint32_t vertex_attribute_offsets[MAX_VERTEX_ATTRIBS];
    uint32_t vertex_attribute_strides[MAX_VERTEX_ATTRIBS];
+   uint8_t vertex_binding_align[MAX_VBS];
 
    /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
     * so we may need to fix it up. */



More information about the mesa-commit mailing list