Mesa (main): radv: add pre-compiled vertex shader prologs for common states

Wed Oct 13 05:33:39 UTC 2021

Module: Mesa
Branch: main
Commit: f6f6f18e55537d6e3240d8c14e3f756e8d8fd75a
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=f6f6f18e55537d6e3240d8c14e3f756e8d8fd75a

Author: Rhys Perry <pendingchaos02 at gmail.com>
Date:   Mon May 17 19:17:15 2021 +0100

radv: add pre-compiled vertex shader prologs for common states

This lets us pre-compile prologs for the most common vertex input states, so that
most of the time we avoid a hash table lookup during command buffer recording.

Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11717>

---

 src/amd/vulkan/radv_cmd_buffer.c | 50 +++++++++++++++++++++++++++++++-
 src/amd/vulkan/radv_device.c     | 61 ++++++++++++++++++++++++++++++++++++++++
 src/amd/vulkan/radv_private.h    |  4 +++
 3 files changed, 114 insertions(+), 1 deletion(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index da3285874c2..959ba95e039 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -2649,6 +2649,35 @@ radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
    cmd_buffer->state.context_roll_without_scissor_emitted = true;
 }
 
+unsigned
+radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs)
+{
+   /* Returns the index into instance_rate_vs_prologs for num_attributes (1..16) attributes whose
+    * instanced ones form the non-empty, contiguous bit range instance_rate_inputs. The array is a
+    * flattened array of arrays of arrays of different sizes, i.e. a single array sorted in
+    * ascending order by total number of attributes, then by number of instanced attributes, then
+    * by index of the first instanced attribute.
+    */
+
+   /* First index for each total attribute count: sum of k * (k + 1) / 2 for k = 1..total-1. */
+   static const uint16_t total_to_offset[16] = {0,   1,   4,   10,  20,  35,  56,  84,
+                                                120, 165, 220, 286, 364, 455, 560, 680};
+   unsigned start_index = total_to_offset[num_attributes - 1];
+
+   /* From number of instanced attributes to offset. Each total would need its own LUT, but with
+    * fewer than 16 attributes every smaller count has (16 - num_attributes) fewer start positions,
+    * so the LUT for 16 total attributes minus (16 - num_attributes) * (count - 1) gives the offset.
+    */
+   static const uint8_t count_to_offset_total16[16] = {0,   16,  31,  45,  58,  70,  81,  91,
+                                                       100, 108, 115, 121, 126, 130, 133, 135};
+   unsigned count = util_bitcount(instance_rate_inputs);
+   unsigned offset_from_start_index =
+      count_to_offset_total16[count - 1] - ((16 - num_attributes) * (count - 1));
+
+   unsigned first = ffs(instance_rate_inputs) - 1;
+   return start_index + offset_from_start_index + first;
+}
+
 union vs_prolog_key_header {
    struct {
       uint32_t key_size : 8;
@@ -2734,6 +2763,25 @@ lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant
    else if (pipeline->shaders[MESA_SHADER_GEOMETRY] == vs_shader)
       key.next_stage = MESA_SHADER_GEOMETRY;
 
+   /* try to use a pre-compiled prolog first */
+   struct radv_shader_prolog *prolog = NULL;
+   if (!key.as_ls && key.next_stage == MESA_SHADER_VERTEX &&
+       key.is_ngg == device->physical_device->use_ngg && !misaligned_mask &&
+       !state->alpha_adjust_lo && !state->alpha_adjust_hi &&
+       vs_shader->info.wave_size == device->physical_device->ge_wave_size) {
+      if (!instance_rate_inputs) {
+         prolog = device->simple_vs_prologs[num_attributes - 1];
+      } else if (num_attributes <= 16 && !*nontrivial_divisors &&
+                 util_bitcount(instance_rate_inputs) ==
+                    (util_last_bit(instance_rate_inputs) - ffs(instance_rate_inputs) + 1)) {
+         unsigned index = radv_instance_rate_prolog_index(num_attributes, instance_rate_inputs);
+         prolog = device->instance_rate_vs_prologs[index];
+      }
+   }
+   if (prolog)
+      return prolog;
+
+   /* if we couldn't use a pre-compiled prolog, find one in the cache or create one */
    uint32_t key_words[16];
    unsigned key_size = 1;
 
@@ -2801,7 +2849,7 @@ lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant
          return prolog_entry->data;
       }
 
-      struct radv_shader_prolog *prolog = radv_create_vs_prolog(device, &key);
+      prolog = radv_create_vs_prolog(device, &key);
       uint32_t *key2 = malloc(key_size * 4);
       if (!prolog || !key2) {
          free(key2);
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index a866812f9fd..3246ffeefe3 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -2675,6 +2675,61 @@ radv_device_init_vs_prologs(struct radv_device *device)
    if (!device->vs_prologs)
       return vk_error(device->physical_device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
+   /* don't pre-compile prologs if we want to print them */
+   if (device->instance->debug_flags & RADV_DEBUG_DUMP_PROLOGS)
+      return VK_SUCCESS;
+
+   struct radv_vs_input_state state;
+   state.nontrivial_divisors = 0;
+   memset(state.offsets, 0, sizeof(state.offsets));
+   state.alpha_adjust_lo = 0;
+   state.alpha_adjust_hi = 0;
+   memset(state.formats, 0, sizeof(state.formats));
+
+   struct radv_vs_prolog_key key;
+   key.state = &state;
+   key.misaligned_mask = 0;
+   key.as_ls = false;
+   key.is_ngg = device->physical_device->use_ngg;
+   key.next_stage = MESA_SHADER_VERTEX;
+   key.wave32 = device->physical_device->ge_wave_size == 32;
+
+   for (unsigned i = 1; i <= MAX_VERTEX_ATTRIBS; i++) {
+      state.attribute_mask = BITFIELD_MASK(i);
+      state.instance_rate_inputs = 0;
+
+      key.num_attributes = i;
+
+      device->simple_vs_prologs[i - 1] = radv_create_vs_prolog(device, &key);
+      if (!device->simple_vs_prologs[i - 1])
+         return vk_error(device->physical_device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+   }
+
+   unsigned idx = 0;
+   for (unsigned num_attributes = 1; num_attributes <= 16; num_attributes++) {
+      state.attribute_mask = BITFIELD_MASK(num_attributes);
+
+      for (unsigned i = 0; i < num_attributes; i++)
+         state.divisors[i] = 1;
+
+      for (unsigned count = 1; count <= num_attributes; count++) {
+         for (unsigned start = 0; start <= (num_attributes - count); start++) {
+            state.instance_rate_inputs = u_bit_consecutive(start, count);
+
+            key.num_attributes = num_attributes;
+
+            struct radv_shader_prolog *prolog = radv_create_vs_prolog(device, &key);
+            if (!prolog)
+               return vk_error(device->physical_device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+
+            assert(idx ==
+                   radv_instance_rate_prolog_index(num_attributes, state.instance_rate_inputs));
+            device->instance_rate_vs_prologs[idx++] = prolog;
+         }
+      }
+   }
+   assert(idx == ARRAY_SIZE(device->instance_rate_vs_prologs));
+
    return VK_SUCCESS;
 }
 
@@ -2689,6 +2744,12 @@ radv_device_finish_vs_prologs(struct radv_device *device)
       }
       _mesa_hash_table_destroy(device->vs_prologs, NULL);
    }
+
+   for (unsigned i = 0; i < ARRAY_SIZE(device->simple_vs_prologs); i++)
+      radv_prolog_destroy(device, device->simple_vs_prologs[i]);
+
+   for (unsigned i = 0; i < ARRAY_SIZE(device->instance_rate_vs_prologs); i++)
+      radv_prolog_destroy(device, device->instance_rate_vs_prologs[i]);
 }
 
 VkResult
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 60ea3b3c2aa..f4cf110df05 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -835,6 +835,9 @@ struct radv_device {
 
    struct u_rwlock vs_prologs_lock;
    struct hash_table *vs_prologs;
+
+   struct radv_shader_prolog *simple_vs_prologs[MAX_VERTEX_ATTRIBS];
+   struct radv_shader_prolog *instance_rate_vs_prologs[816];
 };
 
 VkResult _radv_device_set_lost(struct radv_device *device, const char *file, int line,
@@ -1543,6 +1546,7 @@ void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer);
 
 void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer);
 
+unsigned radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs);
 uint32_t radv_hash_vs_prolog(const void *key_);
 bool radv_cmp_vs_prolog(const void *a_, const void *b_);