Mesa (main): radv: implement dynamic vertex input state using vertex shader prologs
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Wed Oct 13 05:33:39 UTC 2021
Module: Mesa
Branch: main
Commit: 80841196b2dc921db38d9f3403e67d57749bc1d8
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=80841196b2dc921db38d9f3403e67d57749bc1d8
Author: Rhys Perry <pendingchaos02 at gmail.com>
Date: Fri Apr 16 11:55:59 2021 +0100
radv: implement dynamic vertex input state using vertex shader prologs
This doesn't actually use the functionality or implement prolog
compilation yet.
Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11717>
---
docs/envvars.rst | 2 +
src/amd/compiler/aco_interface.cpp | 7 +
src/amd/compiler/aco_interface.h | 3 +
src/amd/compiler/aco_ir.h | 1 +
src/amd/vulkan/radv_cmd_buffer.c | 418 ++++++++++++++++++++++++++++++++++---
src/amd/vulkan/radv_debug.h | 1 +
src/amd/vulkan/radv_device.c | 34 +++
src/amd/vulkan/radv_pipeline.c | 9 +-
src/amd/vulkan/radv_private.h | 29 ++-
src/amd/vulkan/radv_shader.c | 84 +++++++-
src/amd/vulkan/radv_shader.h | 59 +++++-
src/amd/vulkan/radv_shader_args.c | 21 ++
src/amd/vulkan/radv_shader_args.h | 3 +
src/amd/vulkan/radv_shader_info.c | 13 +-
14 files changed, 646 insertions(+), 38 deletions(-)
diff --git a/docs/envvars.rst b/docs/envvars.rst
index 0a0a8e5877a..92e568715e2 100644
--- a/docs/envvars.rst
+++ b/docs/envvars.rst
@@ -670,6 +670,8 @@ RADV driver environment variables
disable VRS for flat shading (only on GFX10.3+)
``preoptir``
dump LLVM IR before any optimizations
+ ``prologs``
+ dump vertex shader prologs
``shaders``
dump shaders
``shaderstats``
diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp
index 60499f2aa5e..b70dc530d08 100644
--- a/src/amd/compiler/aco_interface.cpp
+++ b/src/amd/compiler/aco_interface.cpp
@@ -263,3 +263,10 @@ aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders,
*binary = (radv_shader_binary*)legacy_binary;
}
+
+/* Entry point for compiling a vertex shader prolog described by `key`.
+ * Not implemented yet: the body is only a placeholder that hits unreachable("TODO"),
+ * so `binary` is never written by this version.
+ */
+void
+aco_compile_vs_prolog(const struct radv_vs_prolog_key* key, struct radv_prolog_binary** binary,
+ const struct radv_shader_args* args)
+{
+ unreachable("TODO");
+}
diff --git a/src/amd/compiler/aco_interface.h b/src/amd/compiler/aco_interface.h
index 1575cf59478..33d2762ba69 100644
--- a/src/amd/compiler/aco_interface.h
+++ b/src/amd/compiler/aco_interface.h
@@ -44,6 +44,9 @@ extern const struct aco_compiler_statistic_info* aco_statistic_infos;
void aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders,
struct radv_shader_binary** binary, const struct radv_shader_args* args);
+void aco_compile_vs_prolog(const struct radv_vs_prolog_key* key, struct radv_prolog_binary** binary,
+ const struct radv_shader_args* args);
+
#ifdef __cplusplus
}
#endif
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 3ab71a8ccca..5998a527e4f 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -38,6 +38,7 @@
struct radv_shader_args;
struct radv_shader_info;
+struct radv_vs_prolog_key;
namespace aco {
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 2353ab68f51..da3285874c2 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -37,6 +37,8 @@
#include "ac_debug.h"
+#include "util/fast_idiv_by_const.h"
+
enum {
RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0),
RADV_PREFETCH_VS = (1 << 1),
@@ -2647,8 +2649,300 @@ radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
cmd_buffer->state.context_roll_without_scissor_emitted = true;
}
+/* First 32-bit word of a serialized vertex shader prolog key (see lookup_vs_prolog()).
+ *
+ * key_size is the size of the whole key in bytes, header word included; the hash and compare
+ * callbacks read it back from word 0 to know how much data to process. The one-bit fields after
+ * next_stage are set by lookup_vs_prolog() whenever it appends the corresponding optional
+ * word(s) to the key. `v` aliases all bitfields as a single uint32_t so the header can be
+ * zeroed, stored into and loaded from key word 0 in one access.
+ */
+union vs_prolog_key_header {
+ struct {
+ uint32_t key_size : 8;
+ uint32_t num_attributes : 6;
+ uint32_t as_ls : 1;
+ uint32_t is_ngg : 1;
+ uint32_t wave32 : 1;
+ uint32_t next_stage : 3;
+ uint32_t instance_rate_inputs : 1;
+ uint32_t alpha_adjust_lo : 1;
+ uint32_t alpha_adjust_hi : 1;
+ uint32_t misaligned_mask : 1;
+ uint32_t post_shuffle : 1;
+ uint32_t nontrivial_divisors : 1;
+ /* We need this to ensure the padding is zero. It's useful even if it's unused. */
+ uint32_t padding0 : 6;
+ };
+ uint32_t v;
+};
+
+/* Hash callback for the device's vertex shader prolog table.
+ *
+ * `key_` points at a serialized prolog key whose first word is a vs_prolog_key_header; the
+ * header's key_size field gives the number of bytes to hash.
+ */
+uint32_t
+radv_hash_vs_prolog(const void *key_)
+{
+   const uint32_t *words = key_;
+   const union vs_prolog_key_header hdr = {.v = words[0]};
+
+   return _mesa_hash_data(words, hdr.key_size);
+}
+
+/* Equality callback for the device's vertex shader prolog table.
+ *
+ * Both arguments point at serialized prolog keys. Returns true when the header words match and
+ * the key_size bytes described by that header are identical.
+ */
+bool
+radv_cmp_vs_prolog(const void *a_, const void *b_)
+{
+   const uint32_t *lhs = a_;
+   const uint32_t *rhs = b_;
+   const union vs_prolog_key_header hdr = {.v = lhs[0]};
+
+   /* The header contains the size, so the payload is only compared when word 0 matches. */
+   return lhs[0] == rhs[0] && memcmp(lhs, rhs, hdr.key_size) == 0;
+}
+
+/* Finds, or creates and caches, the vertex shader prolog that matches the command buffer's
+ * current dynamic vertex input state for `vs_shader`.
+ *
+ * A compact key is serialized into key_words: word 0 is a vs_prolog_key_header, followed only by
+ * the optional words whose header flag is set. The key is first compared against the prolog
+ * last emitted on this command buffer, then looked up in device->vs_prologs under the read
+ * lock, and finally re-checked and inserted under the write lock.
+ *
+ * \param nontrivial_divisors  out: state->nontrivial_divisors restricted to the attributes
+ *                             covered by the shader's vb_desc_usage_mask.
+ * \return the prolog, or NULL if creating the prolog or copying its key failed.
+ */
+static struct radv_shader_prolog *
+lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader,
+                 uint32_t *nontrivial_divisors)
+{
+   STATIC_ASSERT(sizeof(union vs_prolog_key_header) == 4);
+   assert(vs_shader->info.vs.dynamic_inputs);
+
+   struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
+   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
+   struct radv_device *device = cmd_buffer->device;
+   enum chip_class chip = device->physical_device->rad_info.chip_class;
+
+   unsigned num_attributes = util_last_bit(vs_shader->info.vs.vb_desc_usage_mask);
+   uint32_t attribute_mask = BITFIELD_MASK(num_attributes);
+
+   uint32_t instance_rate_inputs = state->instance_rate_inputs & attribute_mask;
+   *nontrivial_divisors = state->nontrivial_divisors & attribute_mask;
+   uint32_t misaligned_mask = 0;
+   if (chip == GFX6 || chip >= GFX10) {
+      /* Flag attributes whose effective offset or binding stride violates the format's
+       * alignment requirement.
+       */
+      u_foreach_bit(index, state->attribute_mask & attribute_mask)
+      {
+         uint8_t req = state->format_align_req_minus_1[index];
+         struct radv_vertex_binding *vb = &cmd_buffer->vertex_bindings[state->bindings[index]];
+         VkDeviceSize offset = vb->offset + state->offsets[index];
+         if (vb->buffer && ((offset & req) || (vb->stride & req)))
+            misaligned_mask |= 1u << index;
+      }
+   }
+
+   struct radv_vs_prolog_key key;
+   key.state = state;
+   key.num_attributes = num_attributes;
+   key.misaligned_mask = misaligned_mask;
+   /* The instance ID input VGPR is placed differently when as_ls=true. */
+   key.as_ls = vs_shader->info.vs.as_ls && instance_rate_inputs;
+   key.is_ngg = vs_shader->info.is_ngg;
+   key.wave32 = vs_shader->info.wave_size == 32;
+   key.next_stage = MESA_SHADER_VERTEX;
+   if (pipeline->shaders[MESA_SHADER_TESS_CTRL] == vs_shader)
+      key.next_stage = MESA_SHADER_TESS_CTRL;
+   else if (pipeline->shaders[MESA_SHADER_GEOMETRY] == vs_shader)
+      key.next_stage = MESA_SHADER_GEOMETRY;
+
+   /* Word 0 is reserved for the header, which is written last once the size is known. */
+   uint32_t key_words[16];
+   unsigned key_size = 1;
+
+   union vs_prolog_key_header header;
+   header.v = 0;
+   header.num_attributes = num_attributes;
+   header.as_ls = key.as_ls;
+   header.is_ngg = key.is_ngg;
+   header.wave32 = key.wave32;
+   header.next_stage = key.next_stage;
+
+   if (instance_rate_inputs & ~*nontrivial_divisors) {
+      header.instance_rate_inputs = true;
+      key_words[key_size++] = instance_rate_inputs;
+   }
+   if (*nontrivial_divisors) {
+      header.nontrivial_divisors = true;
+      key_words[key_size++] = *nontrivial_divisors;
+   }
+   if (misaligned_mask) {
+      header.misaligned_mask = true;
+      key_words[key_size++] = misaligned_mask;
+
+      /* One format byte per misaligned attribute, zero-padded to a whole number of words. */
+      uint8_t *formats = (uint8_t *)&key_words[key_size];
+      unsigned num_formats = 0;
+      u_foreach_bit(index, misaligned_mask)
+         formats[num_formats++] = state->formats[index];
+      while (num_formats & 0x3)
+         formats[num_formats++] = 0;
+      key_size += num_formats / 4u;
+
+      if (state->post_shuffle & attribute_mask) {
+         header.post_shuffle = true;
+         key_words[key_size++] = state->post_shuffle & attribute_mask;
+      }
+   }
+   if (state->alpha_adjust_lo & attribute_mask) {
+      header.alpha_adjust_lo = true;
+      key_words[key_size++] = state->alpha_adjust_lo & attribute_mask;
+   }
+   if (state->alpha_adjust_hi & attribute_mask) {
+      header.alpha_adjust_hi = true;
+      key_words[key_size++] = state->alpha_adjust_hi & attribute_mask;
+   }
+
+   header.key_size = key_size * sizeof(key_words[0]);
+   key_words[0] = header.v;
+
+   uint32_t hash = radv_hash_vs_prolog(key_words);
+
+   /* Fast path: same key as the prolog last emitted on this command buffer. The stored key
+    * pointer is checked first because radv_cmp_vs_prolog() dereferences it unconditionally.
+    * NOTE(review): no assignment to emitted_vs_prolog_key is visible in this change - confirm
+    * where it is meant to be set.
+    */
+   if (cmd_buffer->state.emitted_vs_prolog && cmd_buffer->state.emitted_vs_prolog_key &&
+       cmd_buffer->state.emitted_vs_prolog_key_hash == hash &&
+       radv_cmp_vs_prolog(key_words, cmd_buffer->state.emitted_vs_prolog_key))
+      return cmd_buffer->state.emitted_vs_prolog;
+
+   u_rwlock_rdlock(&device->vs_prologs_lock);
+   struct hash_entry *prolog_entry =
+      _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
+   u_rwlock_rdunlock(&device->vs_prologs_lock);
+
+   if (!prolog_entry) {
+      u_rwlock_wrlock(&device->vs_prologs_lock);
+      /* Search again: another thread may have inserted the prolog between the two locks. */
+      prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
+      if (prolog_entry) {
+         u_rwlock_wrunlock(&device->vs_prologs_lock);
+         return prolog_entry->data;
+      }
+
+      struct radv_shader_prolog *prolog = radv_create_vs_prolog(device, &key);
+      uint32_t *key2 = malloc(key_size * 4);
+      if (!prolog || !key2) {
+         /* Either allocation may have succeeded on its own; release both so that a prolog
+          * created just before the key copy failed is not leaked.
+          */
+         if (prolog)
+            radv_prolog_destroy(device, prolog);
+         free(key2);
+         u_rwlock_wrunlock(&device->vs_prologs_lock);
+         return NULL;
+      }
+      /* The table keeps key2; it is freed together with the prolog at device teardown. */
+      memcpy(key2, key_words, key_size * 4);
+      _mesa_hash_table_insert_pre_hashed(device->vs_prologs, hash, key2, prolog);
+
+      u_rwlock_wrunlock(&device->vs_prologs_lock);
+      return prolog;
+   }
+
+   return prolog_entry->data;
+}
+
+/* Programs the shader program-address registers so that the hardware stage running the vertex
+ * shader starts at `prolog` instead, and makes the prolog's buffer resident for this command
+ * stream.
+ *
+ * Nothing is emitted when `prolog` is the one already emitted on this command buffer and the
+ * pipeline is not dirty. On chips older than GFX10 the RSRC1 register is also rewritten, using
+ * the prolog's SGPR count when it is larger than the main shader's.
+ */
+static void
+emit_prolog_regs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader,
+ struct radv_shader_prolog *prolog, bool pipeline_is_dirty)
+{
+ /* no need to re-emit anything in this case */
+ if (cmd_buffer->state.emitted_vs_prolog == prolog && !pipeline_is_dirty)
+ return;
+
+ enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
+ struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
+ uint64_t prolog_va = radv_buffer_get_va(prolog->bo) + prolog->alloc->offset;
+
+ assert(cmd_buffer->state.emitted_pipeline == cmd_buffer->state.pipeline);
+ assert(vs_shader->info.num_input_sgprs <= prolog->num_preserved_sgprs);
+
+ /* Start from the main shader's RSRC1 and, before GFX10 only, substitute the prolog's SGPRS
+ * field when the prolog needs more SGPRs than the main shader.
+ */
+ uint32_t rsrc1 = vs_shader->config.rsrc1;
+ if (chip < GFX10 && G_00B228_SGPRS(prolog->rsrc1) > G_00B228_SGPRS(vs_shader->config.rsrc1))
+ rsrc1 = (rsrc1 & C_00B228_SGPRS) | (prolog->rsrc1 & ~C_00B228_SGPRS);
+
+ /* The main shader must not use less VGPRs than the prolog, otherwise shared vgprs might not
+ * work.
+ */
+ assert(G_00B848_VGPRS(vs_shader->config.rsrc1) >= G_00B848_VGPRS(prolog->rsrc1));
+
+ /* Pick the register pair according to how the vertex shader is bound in the pipeline: as
+ * NGG or the geometry-stage shader, as the tess-control-stage shader, as LS, as ES, or
+ * (default) as a plain hardware VS.
+ */
+ unsigned pgm_lo_reg = R_00B120_SPI_SHADER_PGM_LO_VS;
+ unsigned rsrc1_reg = R_00B128_SPI_SHADER_PGM_RSRC1_VS;
+ if (vs_shader->info.is_ngg || pipeline->shaders[MESA_SHADER_GEOMETRY] == vs_shader) {
+ pgm_lo_reg = chip >= GFX10 ? R_00B320_SPI_SHADER_PGM_LO_ES : R_00B210_SPI_SHADER_PGM_LO_ES;
+ rsrc1_reg = R_00B228_SPI_SHADER_PGM_RSRC1_GS;
+ } else if (pipeline->shaders[MESA_SHADER_TESS_CTRL] == vs_shader) {
+ pgm_lo_reg = chip >= GFX10 ? R_00B520_SPI_SHADER_PGM_LO_LS : R_00B410_SPI_SHADER_PGM_LO_LS;
+ rsrc1_reg = R_00B428_SPI_SHADER_PGM_RSRC1_HS;
+ } else if (vs_shader->info.vs.as_ls) {
+ pgm_lo_reg = R_00B520_SPI_SHADER_PGM_LO_LS;
+ rsrc1_reg = R_00B528_SPI_SHADER_PGM_RSRC1_LS;
+ } else if (vs_shader->info.vs.as_es) {
+ pgm_lo_reg = R_00B320_SPI_SHADER_PGM_LO_ES;
+ rsrc1_reg = R_00B328_SPI_SHADER_PGM_RSRC1_ES;
+ }
+
+ /* Two consecutive registers: prolog address bits 8 and up, then the bits from 40 up. */
+ radeon_set_sh_reg_seq(cmd_buffer->cs, pgm_lo_reg, 2);
+ radeon_emit(cmd_buffer->cs, prolog_va >> 8);
+ radeon_emit(cmd_buffer->cs, S_00B124_MEM_BASE(prolog_va >> 40));
+
+ /* RSRC1 is only rewritten before GFX10; on GFX10+ it must still equal the main shader's. */
+ if (chip < GFX10)
+ radeon_set_sh_reg(cmd_buffer->cs, rsrc1_reg, rsrc1);
+ else
+ assert(rsrc1 == vs_shader->config.rsrc1);
+
+ radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, prolog->bo);
+}
+
static void
-radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer)
+emit_prolog_inputs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader,
+ uint32_t nontrivial_divisors, bool pipeline_is_dirty)
+{
+ /* Emits the AC_UD_VS_PROLOG_INPUTS user SGPR pair. Without nontrivial divisors the pointer
+ * is the main vertex shader's address; otherwise it points at an upload that begins with
+ * that address and is followed by two dwords per nontrivial divisor.
+ */
+
+ /* no need to re-emit anything in this case */
+ if (!nontrivial_divisors && !pipeline_is_dirty)
+ return;
+
+ struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
+ uint64_t input_va = radv_shader_variant_get_va(vs_shader);
+
+ if (nontrivial_divisors) {
+ unsigned inputs_offset;
+ uint32_t *inputs;
+ /* 8 bytes for the shader address plus 8 bytes per nontrivial divisor. */
+ unsigned size = 8 + util_bitcount(nontrivial_divisors) * 8;
+ /* On upload allocation failure nothing is emitted. */
+ if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &inputs_offset, (void **)&inputs))
+ return;
+
+ *(inputs++) = input_va;
+ *(inputs++) = input_va >> 32;
+
+ /* Per divisor, write a packed shift/increment word followed by a multiplier word:
+ * - 0: (0, 1)
+ * - power of two: (log2(div) | 1 << 8, 0xffffffff)
+ * - otherwise: the values computed by util_compute_fast_udiv_info().
+ */
+ u_foreach_bit(index, nontrivial_divisors)
+ {
+ uint32_t div = state->divisors[index];
+ if (div == 0) {
+ *(inputs++) = 0;
+ *(inputs++) = 1;
+ } else if (util_is_power_of_two_or_zero(div)) {
+ *(inputs++) = util_logbase2(div) | (1 << 8);
+ *(inputs++) = 0xffffffffu;
+ } else {
+ struct util_fast_udiv_info info = util_compute_fast_udiv_info(div, 32, 32);
+ *(inputs++) = info.pre_shift | (info.increment << 8) | (info.post_shift << 16);
+ *(inputs++) = info.multiplier;
+ }
+ }
+
+ /* From here on the SGPR pair points at the upload instead of the shader itself. */
+ input_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + inputs_offset;
+ }
+
+ struct radv_userdata_info *loc =
+ &vs_shader->info.user_sgprs_locs.shader_data[AC_UD_VS_PROLOG_INPUTS];
+ uint32_t base_reg = cmd_buffer->state.pipeline->user_data_0[MESA_SHADER_VERTEX];
+ assert(loc->sgpr_idx != -1);
+ assert(loc->num_sgprs == 2);
+ radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
+ input_va, true);
+}
+
+/* Looks up the prolog for the bound vertex shader and emits its registers and inputs.
+ *
+ * Does nothing when the vertex shader has no prolog. If no prolog can be obtained, the command
+ * buffer's record_result is set to VK_ERROR_OUT_OF_HOST_MEMORY and nothing is emitted.
+ */
+static void
+radv_emit_vertex_state(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
+{
+   struct radv_shader_variant *vs_shader =
+      radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX);
+   struct radv_shader_prolog *prolog;
+   uint32_t nontrivial_divisors;
+
+   if (!vs_shader->info.vs.has_prolog)
+      return;
+
+   prolog = lookup_vs_prolog(cmd_buffer, vs_shader, &nontrivial_divisors);
+   if (prolog) {
+      emit_prolog_regs(cmd_buffer, vs_shader, prolog, pipeline_is_dirty);
+      emit_prolog_inputs(cmd_buffer, vs_shader, nontrivial_divisors, pipeline_is_dirty);
+
+      /* Remember what was emitted so an identical prolog can be skipped next time. */
+      cmd_buffer->state.emitted_vs_prolog = prolog;
+   } else {
+      cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
+   }
+}
+
+static void
+radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
{
uint64_t states =
cmd_buffer->state.dirty & cmd_buffer->state.emitted_pipeline->graphics.needed_dynamic_state;
@@ -2717,6 +3011,9 @@ radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer)
if (states & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE)
radv_emit_color_write_enable(cmd_buffer);
+ if (states & RADV_CMD_DIRTY_VERTEX_STATE)
+ radv_emit_vertex_state(cmd_buffer, pipeline_is_dirty);
+
cmd_buffer->state.dirty &= ~states;
}
@@ -2923,33 +3220,105 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stag
cmd_buffer->push_constant_stages |= dirty_stages;
}
+/* Pre-combined DST_SEL_X/Y/Z/W field values for word 3 of a vertex buffer descriptor. The
+ * four characters of each name give, in X, Y, Z, W order, what every destination component
+ * selects: a source component (X/Y/Z/W) or the constant 0 or 1.
+ */
+enum radv_dst_sel {
+ DST_SEL_0001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) |
+ S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
+ DST_SEL_X001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) |
+ S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
+ DST_SEL_XY01 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+ S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
+ DST_SEL_XYZ1 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+ S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
+ DST_SEL_XYZW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+ S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W),
+ /* X and Z swapped; used for attributes flagged in post_shuffle. */
+ DST_SEL_ZYXW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+ S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W),
+};
+
+/* Destination swizzle for each buffer data format, indexed by V_008F0C_BUF_DATA_FORMAT_*.
+ * Formats with fewer than four components select 0 for the missing Y/Z components and 1 for
+ * a missing W component.
+ */
+static const uint32_t data_format_dst_sel[] = {
+ [V_008F0C_BUF_DATA_FORMAT_INVALID] = DST_SEL_0001,
+ [V_008F0C_BUF_DATA_FORMAT_8] = DST_SEL_X001,
+ [V_008F0C_BUF_DATA_FORMAT_16] = DST_SEL_X001,
+ [V_008F0C_BUF_DATA_FORMAT_8_8] = DST_SEL_XY01,
+ [V_008F0C_BUF_DATA_FORMAT_32] = DST_SEL_X001,
+ [V_008F0C_BUF_DATA_FORMAT_16_16] = DST_SEL_XY01,
+ [V_008F0C_BUF_DATA_FORMAT_10_11_11] = DST_SEL_XYZ1,
+ [V_008F0C_BUF_DATA_FORMAT_11_11_10] = DST_SEL_XYZ1,
+ [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = DST_SEL_XYZW,
+ [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = DST_SEL_XYZW,
+ [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = DST_SEL_XYZW,
+ [V_008F0C_BUF_DATA_FORMAT_32_32] = DST_SEL_XY01,
+ [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = DST_SEL_XYZW,
+ [V_008F0C_BUF_DATA_FORMAT_32_32_32] = DST_SEL_XYZ1,
+ [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = DST_SEL_XYZW,
+};
+
static void
radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
{
if ((pipeline_is_dirty || (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
cmd_buffer->state.pipeline->vb_desc_usage_mask) {
struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
+ struct radv_shader_variant *vs_shader = radv_get_shader(pipeline, MESA_SHADER_VERTEX);
+ enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
unsigned vb_offset;
void *vb_ptr;
unsigned desc_index = 0;
uint32_t mask = pipeline->vb_desc_usage_mask;
uint64_t va;
+ struct radv_vs_input_state *vs_state =
+ vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL;
/* allocate some descriptor state for vertex buffers */
if (!radv_cmd_buffer_upload_alloc(cmd_buffer, pipeline->vb_desc_alloc_size, &vb_offset, &vb_ptr))
return;
+ assert(!vs_state || pipeline->use_per_attribute_vb_descs);
+
while (mask) {
unsigned i = u_bit_scan(&mask);
uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4];
- uint32_t offset;
- unsigned binding = pipeline->use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i;
+ uint32_t offset, rsrc_word3;
+ unsigned binding =
+ vs_state ? cmd_buffer->state.dynamic_vs_input.bindings[i]
+ : (pipeline->use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i);
struct radv_buffer *buffer = cmd_buffer->vertex_bindings[binding].buffer;
unsigned num_records;
unsigned stride;
+ if (vs_state) {
+ unsigned format = vs_state->formats[i];
+ unsigned dfmt = format & 0xf;
+ unsigned nfmt = (format >> 4) & 0x7;
+
+ rsrc_word3 =
+ vs_state->post_shuffle & (1u << i) ? DST_SEL_ZYXW : data_format_dst_sel[dfmt];
+
+ if (chip >= GFX10)
+ rsrc_word3 |= S_008F0C_FORMAT(ac_get_tbuffer_format(chip, dfmt, nfmt));
+ else
+ rsrc_word3 |= S_008F0C_NUM_FORMAT(nfmt) | S_008F0C_DATA_FORMAT(dfmt);
+ } else {
+ if (chip >= GFX10)
+ rsrc_word3 = DST_SEL_XYZW | S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT);
+ else
+ rsrc_word3 = DST_SEL_XYZW | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
+ S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+ }
+
if (!buffer) {
- memset(desc, 0, 4 * 4);
+ if (vs_state) {
+ /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need
+ * to include the format/word3 so that the alpha channel is 1 for formats without an
+ * alpha channel.
+ */
+ desc[0] = 0;
+ desc[1] = S_008F04_STRIDE(16);
+ desc[2] = 0;
+ desc[3] = rsrc_word3;
+ } else {
+ memset(desc, 0, 4 * 4);
+ }
continue;
}
@@ -2957,6 +3326,8 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_
offset = cmd_buffer->vertex_bindings[binding].offset;
va += offset + buffer->offset;
+ if (vs_state)
+ va += vs_state->offsets[i];
if (cmd_buffer->vertex_bindings[binding].size) {
num_records = cmd_buffer->vertex_bindings[binding].size;
@@ -2970,9 +3341,9 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_
stride = pipeline->binding_stride[binding];
}
- enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
if (pipeline->use_per_attribute_vb_descs) {
- uint32_t attrib_end = pipeline->attrib_ends[i];
+ uint32_t attrib_end = vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i]
+ : pipeline->attrib_ends[i];
if (num_records < attrib_end) {
num_records = 0; /* not enough space for one vertex */
@@ -2997,7 +3368,14 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_
* num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
* GFX10.3 but it doesn't hurt.
*/
- memset(desc, 0, 16);
+ if (vs_state) {
+ desc[0] = 0;
+ desc[1] = S_008F04_STRIDE(16);
+ desc[2] = 0;
+ desc[3] = rsrc_word3;
+ } else {
+ memset(desc, 0, 16);
+ }
continue;
}
} else {
@@ -3005,22 +3383,13 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_
num_records = DIV_ROUND_UP(num_records, stride);
}
- uint32_t rsrc_word3 =
- S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
- S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
-
if (chip >= GFX10) {
/* OOB_SELECT chooses the out-of-bounds check:
* - 1: index >= NUM_RECORDS (Structured)
* - 3: offset >= NUM_RECORDS (Raw)
*/
int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW;
-
- rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT) |
- S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(1);
- } else {
- rsrc_word3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
- S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+ rsrc_word3 |= S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(1);
}
desc[0] = va;
@@ -4009,7 +4378,7 @@ radv_CmdBindVertexBuffers2EXT(VkCommandBuffer commandBuffer, uint32_t firstBindi
return;
}
- cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
+ cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_STATE;
}
static uint32_t
@@ -4397,7 +4766,7 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline
if (!pipeline)
break;
- cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE;
+ cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
cmd_buffer->push_constant_stages |= pipeline->active_stages;
/* the new vertex shader might not have the same user regs */
@@ -5712,7 +6081,7 @@ radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
/* Index, vertex and streamout buffers don't change context regs, and
* pipeline is already handled.
*/
- used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER |
+ used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_STATE |
RADV_CMD_DIRTY_STREAMOUT_BUFFER | RADV_CMD_DIRTY_PIPELINE);
if (cmd_buffer->state.dirty & used_states)
@@ -5918,7 +6287,8 @@ radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer, const struct rad
}
static void
-radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
+radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
+ bool pipeline_is_dirty)
{
bool late_scissor_emission;
@@ -5955,7 +6325,7 @@ radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct r
}
}
- radv_cmd_buffer_flush_dynamic_state(cmd_buffer);
+ radv_cmd_buffer_flush_dynamic_state(cmd_buffer, pipeline_is_dirty);
radv_emit_draw_registers(cmd_buffer, info);
@@ -6004,7 +6374,7 @@ radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info
* the CUs are idle is very short. (there are only SET_SH
* packets between the wait and the draw)
*/
- radv_emit_all_graphics_states(cmd_buffer, info);
+ radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty);
si_emit_cache_flush(cmd_buffer);
/* <-- CUs are idle here --> */
@@ -6024,7 +6394,7 @@ radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info
radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
- radv_emit_all_graphics_states(cmd_buffer, info);
+ radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty);
}
radv_describe_draw(cmd_buffer);
diff --git a/src/amd/vulkan/radv_debug.h b/src/amd/vulkan/radv_debug.h
index 0bfdd4889ce..5c0dd14220e 100644
--- a/src/amd/vulkan/radv_debug.h
+++ b/src/amd/vulkan/radv_debug.h
@@ -63,6 +63,7 @@ enum {
RADV_DEBUG_NO_VRS_FLAT_SHADING = 1ull << 32,
RADV_DEBUG_NO_ATOC_DITHERING = 1ull << 33,
RADV_DEBUG_NO_NGGC = 1ull << 34,
+ RADV_DEBUG_DUMP_PROLOGS = 1ull << 35,
};
enum {
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 6cc96e00d9b..a866812f9fd 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -853,6 +853,7 @@ static const struct debug_control radv_debug_options[] = {
{"novrsflatshading", RADV_DEBUG_NO_VRS_FLAT_SHADING},
{"noatocdithering", RADV_DEBUG_NO_ATOC_DITHERING},
{"nonggc", RADV_DEBUG_NO_NGGC},
+ {"prologs", RADV_DEBUG_DUMP_PROLOGS},
{NULL, 0}};
const char *
@@ -2666,6 +2667,30 @@ radv_device_finish_border_color(struct radv_device *device)
}
}
+/* Sets up the device-wide vertex shader prolog cache: the lock guarding it and the hash table
+ * keyed by serialized prolog keys.
+ *
+ * Returns VK_ERROR_OUT_OF_HOST_MEMORY if the hash table cannot be created.
+ */
+static VkResult
+radv_device_init_vs_prologs(struct radv_device *device)
+{
+   u_rwlock_init(&device->vs_prologs_lock);
+
+   device->vs_prologs = _mesa_hash_table_create(NULL, &radv_hash_vs_prolog, &radv_cmp_vs_prolog);
+   if (device->vs_prologs)
+      return VK_SUCCESS;
+
+   return vk_error(device->physical_device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+}
+
+/* Tears down the vertex shader prolog cache: frees every stored key, destroys every cached
+ * prolog, then destroys the table itself. Safe to call when the cache was never created.
+ */
+static void
+radv_device_finish_vs_prologs(struct radv_device *device)
+{
+   if (!device->vs_prologs)
+      return;
+
+   hash_table_foreach(device->vs_prologs, entry)
+   {
+      /* Keys are heap copies owned by the table; the cast drops the const of entry->key. */
+      free((void *)entry->key);
+      radv_prolog_destroy(device, entry->data);
+   }
+
+   _mesa_hash_table_destroy(device->vs_prologs, NULL);
+}
+
VkResult
radv_device_init_vrs_state(struct radv_device *device)
{
@@ -2799,6 +2824,7 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
bool custom_border_colors = false;
bool attachment_vrs_enabled = false;
bool image_float32_atomics = false;
+ bool vs_prologs = false;
/* Check enabled features */
if (pCreateInfo->pEnabledFeatures) {
@@ -3090,6 +3116,12 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
goto fail;
}
+ if (vs_prologs) {
+ result = radv_device_init_vs_prologs(device);
+ if (result != VK_SUCCESS)
+ goto fail;
+ }
+
for (int family = 0; family < RADV_MAX_QUEUE_FAMILIES; ++family) {
device->empty_cs[family] = device->ws->cs_create(device->ws, family);
if (!device->empty_cs[family])
@@ -3156,6 +3188,7 @@ fail:
if (device->gfx_init)
device->ws->buffer_destroy(device->ws, device->gfx_init);
+ radv_device_finish_vs_prologs(device);
radv_device_finish_border_color(device);
for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) {
@@ -3186,6 +3219,7 @@ radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
if (device->gfx_init)
device->ws->buffer_destroy(device->ws, device->gfx_init);
+ radv_device_finish_vs_prologs(device);
radv_device_finish_border_color(device);
radv_device_finish_vrs_image(device);
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index a266fd01dfc..d18943b0ecb 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -2742,8 +2742,8 @@ radv_determine_ngg_settings(struct radv_pipeline *pipeline,
: nir[es_stage]->info.tess.primitive_mode == GL_ISOLINES ? 2
: 3;
- infos[es_stage].has_ngg_culling =
- radv_consider_culling(device, nir[es_stage], ps_inputs_read, num_vertices_per_prim);
+ infos[es_stage].has_ngg_culling = radv_consider_culling(
+ device, nir[es_stage], ps_inputs_read, num_vertices_per_prim, &infos[es_stage]);
nir_function_impl *impl = nir_shader_get_entrypoint(nir[es_stage]);
infos[es_stage].has_ngg_early_prim_export = exec_list_is_singular(&impl->body);
@@ -5386,7 +5386,10 @@ radv_pipeline_init_vertex_input_state(struct radv_pipeline *pipeline,
}
pipeline->use_per_attribute_vb_descs = info->vs.use_per_attribute_vb_descs;
- pipeline->vb_desc_usage_mask = info->vs.vb_desc_usage_mask;
+ if (info->vs.dynamic_inputs)
+ pipeline->vb_desc_usage_mask = BITFIELD_MASK(util_last_bit(info->vs.vb_desc_usage_mask));
+ else
+ pipeline->vb_desc_usage_mask = info->vs.vb_desc_usage_mask;
pipeline->vb_desc_alloc_size = util_bitcount(pipeline->vb_desc_usage_mask) * 16;
}
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index c65acb1cd64..60ea3b3c2aa 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -832,6 +832,9 @@ struct radv_device {
struct radv_buffer *buffer; /* HTILE */
struct radv_device_memory *mem;
} vrs;
+
+ struct u_rwlock vs_prologs_lock;
+ struct hash_table *vs_prologs;
};
VkResult _radv_device_set_lost(struct radv_device *device, const char *file, int line,
@@ -997,7 +1000,8 @@ enum radv_dynamic_state_bits {
RADV_DYNAMIC_LOGIC_OP = 1ull << 26,
RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE = 1ull << 27,
RADV_DYNAMIC_COLOR_WRITE_ENABLE = 1ull << 28,
- RADV_DYNAMIC_ALL = (1ull << 29) - 1,
+ RADV_DYNAMIC_VERTEX_INPUT = 1ull << 29,
+ RADV_DYNAMIC_ALL = (1ull << 30) - 1,
};
enum radv_cmd_dirty_bits {
@@ -1032,12 +1036,14 @@ enum radv_cmd_dirty_bits {
RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP = 1ull << 26,
RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE = 1ull << 27,
RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE = 1ull << 28,
- RADV_CMD_DIRTY_DYNAMIC_ALL = (1ull << 29) - 1,
- RADV_CMD_DIRTY_PIPELINE = 1ull << 29,
- RADV_CMD_DIRTY_INDEX_BUFFER = 1ull << 30,
- RADV_CMD_DIRTY_FRAMEBUFFER = 1ull << 31,
- RADV_CMD_DIRTY_VERTEX_BUFFER = 1ull << 32,
- RADV_CMD_DIRTY_STREAMOUT_BUFFER = 1ull << 33
+ RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT = 1ull << 29,
+ RADV_CMD_DIRTY_DYNAMIC_ALL = (1ull << 30) - 1,
+ RADV_CMD_DIRTY_PIPELINE = 1ull << 30,
+ RADV_CMD_DIRTY_INDEX_BUFFER = 1ull << 31,
+ RADV_CMD_DIRTY_FRAMEBUFFER = 1ull << 32,
+ RADV_CMD_DIRTY_VERTEX_BUFFER = 1ull << 33,
+ RADV_CMD_DIRTY_STREAMOUT_BUFFER = 1ull << 34,
+ RADV_CMD_DIRTY_VERTEX_STATE = RADV_CMD_DIRTY_VERTEX_BUFFER | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT,
};
enum radv_cmd_flush_bits {
@@ -1349,6 +1355,7 @@ struct radv_cmd_state {
struct radv_render_pass *pass;
const struct radv_subpass *subpass;
struct radv_dynamic_state dynamic;
+ struct radv_vs_input_state dynamic_vs_input;
struct radv_attachment_state *attachments;
struct radv_streamout_state streamout;
VkRect2D render_area;
@@ -1414,6 +1421,10 @@ struct radv_cmd_state {
bool uses_draw_indirect_multi;
uint32_t rt_stack_size;
+
+ struct radv_shader_prolog *emitted_vs_prolog;
+ uint32_t *emitted_vs_prolog_key;
+ uint32_t emitted_vs_prolog_key_hash;
};
struct radv_cmd_pool {
@@ -1531,6 +1542,10 @@ void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uin
void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer);
void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer);
+
+uint32_t radv_hash_vs_prolog(const void *key_);
+bool radv_cmp_vs_prolog(const void *a_, const void *b_);
+
bool radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, unsigned size,
unsigned *out_offset, void **ptr);
void radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer,
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index defa4298a0d..e610b92b0cf 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -888,8 +888,8 @@ radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
}
bool
-radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
- uint64_t ps_inputs_read, unsigned num_vertices_per_primitive)
+radv_consider_culling(struct radv_device *device, struct nir_shader *nir, uint64_t ps_inputs_read,
+ unsigned num_vertices_per_primitive, const struct radv_shader_info *info)
{
/* Culling doesn't make sense for meta shaders. */
if (!!nir->info.name)
@@ -899,6 +899,10 @@ radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
if (nir->info.outputs_written & (VARYING_BIT_VIEWPORT | VARYING_BIT_VIEWPORT_MASK))
return false;
+ /* We don't support culling with vertex shader prologs. */
+ if (info->vs.has_prolog)
+ return false;
+
if (!device->physical_device->use_ngg_culling)
return false;
@@ -1910,6 +1914,72 @@ radv_create_trap_handler_shader(struct radv_device *device)
return shader;
}
+static struct radv_shader_prolog *
+upload_vs_prolog(struct radv_device *device, struct radv_prolog_binary *bin, unsigned wave_size)
+{ /* Copy bin's code into shader memory; returns a malloc'd prolog, or NULL on failure. */
+ struct radv_shader_prolog *prolog = malloc(sizeof(struct radv_shader_prolog));
+ if (!prolog)
+ return NULL;
+ /* Reserve bin->code_size bytes of shader memory; undo the malloc if that fails. */
+ prolog->alloc = alloc_shader_memory(device, bin->code_size, NULL);
+ if (!prolog->alloc) {
+ free(prolog);
+ return NULL;
+ }
+ /* The code is written at the block's offset from its arena's ptr. */
+ prolog->bo = prolog->alloc->arena->bo;
+ char *dest_ptr = prolog->alloc->arena->ptr + prolog->alloc->offset;
+
+ memcpy(dest_ptr, bin->data, bin->code_size);
+ /* VGPR field: (n - 1) / 8 for wave32, / 4 otherwise; SGPR field: (n - 1) / 8. */
+ prolog->rsrc1 = S_00B848_VGPRS((bin->num_vgprs - 1) / (wave_size == 32 ? 8 : 4)) |
+ S_00B228_SGPRS((bin->num_sgprs - 1) / 8);
+ prolog->num_preserved_sgprs = bin->num_preserved_sgprs;
+
+ return prolog; /* released with radv_prolog_destroy() */
+}
+
+struct radv_shader_prolog *
+radv_create_vs_prolog(struct radv_device *device, const struct radv_vs_prolog_key *key)
+{ /* Compile and upload the VS prolog described by key; returns NULL if the upload fails. */
+ struct radv_nir_compiler_options options = {0};
+ options.explicit_scratch_args = true;
+ options.family = device->physical_device->rad_info.family;
+ options.chip_class = device->physical_device->rad_info.chip_class;
+ options.info = &device->physical_device->rad_info;
+ options.address32_hi = device->physical_device->rad_info.address32_hi;
+ options.dump_shader = device->instance->debug_flags & RADV_DEBUG_DUMP_PROLOGS;
+ /* Shader info for the prolog; radv_nir_shader_info_pass forces the same needs_* flags. */
+ struct radv_shader_info info = {0};
+ info.wave_size = key->wave32 ? 32 : 64;
+ info.vs.needs_instance_id = true;
+ info.vs.needs_base_instance = true;
+ info.vs.needs_draw_id = true;
+ info.vs.use_per_attribute_vb_descs = true;
+ info.vs.vb_desc_usage_mask = BITFIELD_MASK(key->num_attributes);
+ info.vs.has_prolog = true;
+ info.vs.as_ls = key->as_ls;
+ info.is_ngg = key->is_ngg;
+ /* Args are declared for next_stage, with VERTEX presumably as previous stage (confirm). */
+ struct radv_shader_args args = {0};
+ args.options = &options;
+ args.shader_info = &info;
+ radv_declare_shader_args(&args, key->next_stage, key->next_stage != MESA_SHADER_VERTEX,
+ MESA_SHADER_VERTEX);
+ /* LLVM is only initialised when prolog dumping (RADV_DEBUG_DUMP_PROLOGS) is enabled. */
+#ifdef LLVM_AVAILABLE
+ if (options.dump_shader)
+ ac_init_llvm_once();
+#endif
+ /* NOTE(review): binary is not NULL-checked, yet upload_vs_prolog() reads bin->code_size. */
+ struct radv_prolog_binary *binary = NULL;
+ aco_compile_vs_prolog(key, &binary, &args);
+ struct radv_shader_prolog *prolog = upload_vs_prolog(device, binary, info.wave_size);
+ free(binary); /* the binary is released once the upload call has returned */
+
+ return prolog;
+}
+
void
radv_shader_variant_destroy(struct radv_device *device, struct radv_shader_variant *variant)
{
@@ -1926,6 +1996,16 @@ radv_shader_variant_destroy(struct radv_device *device, struct radv_shader_varia
free(variant);
}
+void
+radv_prolog_destroy(struct radv_device *device, struct radv_shader_prolog *prolog)
+{ /* Release a prolog's shader memory and the prolog itself; a NULL prolog is a no-op. */
+ if (!prolog)
+ return;
+
+ free_shader_memory(device, prolog->alloc); /* block obtained in upload_vs_prolog() */
+ free(prolog);
+}
+
uint64_t
radv_shader_variant_get_va(const struct radv_shader_variant *variant)
{
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index c7fc550f49b..105ccfd2d05 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -46,6 +46,7 @@ struct radv_device;
struct radv_pipeline;
struct radv_pipeline_cache;
struct radv_pipeline_key;
+struct radv_vs_input_state;
enum radv_vs_input_alpha_adjust {
ALPHA_ADJUST_NONE = 0,
@@ -71,6 +72,7 @@ struct radv_pipeline_key {
enum radv_vs_input_alpha_adjust vertex_alpha_adjust[MAX_VERTEX_ATTRIBS];
uint32_t vertex_post_shuffle;
uint32_t provoking_vtx_last : 1;
+ uint32_t dynamic_input_state : 1;
uint8_t topology;
} vs;
@@ -145,6 +147,7 @@ enum radv_ud_index {
AC_UD_SHADER_START = 9,
AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
AC_UD_VS_BASE_VERTEX_START_INSTANCE,
+ AC_UD_VS_PROLOG_INPUTS,
AC_UD_VS_MAX_UD,
AC_UD_PS_MAX_UD,
AC_UD_CS_GRID_SIZE = AC_UD_SHADER_START,
@@ -259,6 +262,8 @@ struct radv_shader_info {
bool needs_base_instance;
bool use_per_attribute_vb_descs;
uint32_t vb_desc_usage_mask;
+ bool has_prolog;
+ bool dynamic_inputs;
} vs;
struct {
uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1];
@@ -353,6 +358,37 @@ struct radv_shader_info {
struct gfx10_ngg_info ngg_info;
};
+struct radv_vs_input_state { /* Vertex input state; arrays are sized MAX_VERTEX_ATTRIBS. */
+ uint32_t attribute_mask; /* presumably one bit per attribute in use -- TODO confirm */
+ uint8_t bindings[MAX_VERTEX_ATTRIBS];
+
+ uint32_t instance_rate_inputs; /* presumably a per-attribute bitmask -- TODO confirm */
+ uint32_t nontrivial_divisors; /* presumably a per-attribute bitmask -- TODO confirm */
+ uint32_t divisors[MAX_VERTEX_ATTRIBS];
+
+ uint32_t offsets[MAX_VERTEX_ATTRIBS];
+
+ uint32_t post_shuffle;
+ /* Having two separate fields instead of a single uint64_t makes it easier to remove attributes
+ * using bitwise arithmetic.
+ */
+ uint32_t alpha_adjust_lo;
+ uint32_t alpha_adjust_hi;
+ uint8_t formats[MAX_VERTEX_ATTRIBS];
+ uint8_t format_align_req_minus_1[MAX_VERTEX_ATTRIBS];
+ uint8_t format_sizes[MAX_VERTEX_ATTRIBS];
+};
+
+struct radv_vs_prolog_key { /* Describes the prolog that radv_create_vs_prolog() builds. */
+ struct radv_vs_input_state *state;
+ unsigned num_attributes; /* prolog's vb_desc_usage_mask is BITFIELD_MASK(num_attributes) */
+ uint32_t misaligned_mask;
+ bool as_ls; /* copied to the prolog's shader info (vs.as_ls) */
+ bool is_ngg; /* copied to the prolog's shader info (is_ngg) */
+ bool wave32; /* true selects wave size 32, false selects 64 */
+ gl_shader_stage next_stage; /* stage passed to radv_declare_shader_args() */
+};
+
enum radv_shader_binary_type { RADV_BINARY_TYPE_LEGACY, RADV_BINARY_TYPE_RTLD };
struct radv_shader_binary {
@@ -387,6 +423,14 @@ struct radv_shader_binary_rtld {
uint8_t data[0];
};
+struct radv_prolog_binary { /* Compiled prolog: register counts plus code_size bytes of code. */
+ uint8_t num_sgprs; /* feeds the SGPR field of rsrc1 in upload_vs_prolog() */
+ uint8_t num_vgprs; /* feeds the VGPR field of rsrc1 in upload_vs_prolog() */
+ uint8_t num_preserved_sgprs; /* copied unchanged into radv_shader_prolog */
+ unsigned code_size; /* number of bytes of data[] copied to shader memory */
+ uint8_t data[0]; /* code bytes, trailing the fixed-size fields */
+};
+
struct radv_shader_arena {
struct list_head list;
struct list_head entries;
@@ -429,6 +473,13 @@ struct radv_shader_variant {
uint32_t *statistics;
};
+struct radv_shader_prolog { /* An uploaded prolog, as filled in by upload_vs_prolog(). */
+ struct radeon_winsys_bo *bo; /* bo of the arena that contains alloc */
+ union radv_shader_arena_block *alloc; /* shader memory block; freed by radv_prolog_destroy() */
+ uint32_t rsrc1; /* packed VGPR/SGPR fields derived from the binary's register counts */
+ uint8_t num_preserved_sgprs; /* taken from radv_prolog_binary */
+};
+
void radv_optimize_nir(const struct radv_device *device, struct nir_shader *shader,
bool optimize_conservatively, bool allow_copies);
void radv_optimize_nir_algebraic(nir_shader *shader, bool opt_offsets);
@@ -469,8 +520,13 @@ radv_create_gs_copy_shader(struct radv_device *device, struct nir_shader *nir,
struct radv_shader_variant *radv_create_trap_handler_shader(struct radv_device *device);
+struct radv_shader_prolog *radv_create_vs_prolog(struct radv_device *device,
+ const struct radv_vs_prolog_key *key);
+
void radv_shader_variant_destroy(struct radv_device *device, struct radv_shader_variant *variant);
+void radv_prolog_destroy(struct radv_device *device, struct radv_shader_prolog *prolog);
+
uint64_t radv_shader_variant_get_va(const struct radv_shader_variant *variant);
struct radv_shader_variant *radv_find_shader_variant(struct radv_device *device, uint64_t pc);
@@ -577,7 +633,8 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
const struct radv_pipeline_key *pl_key);
bool radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
- uint64_t ps_inputs_read, unsigned num_vertices_per_primitive);
+ uint64_t ps_inputs_read, unsigned num_vertices_per_primitive,
+ const struct radv_shader_info *info);
void radv_get_nir_options(struct radv_physical_device *device);
diff --git a/src/amd/vulkan/radv_shader_args.c b/src/amd/vulkan/radv_shader_args.c
index a1ff13fe217..e2f1b4a0600 100644
--- a/src/amd/vulkan/radv_shader_args.c
+++ b/src/amd/vulkan/radv_shader_args.c
@@ -184,6 +184,10 @@ allocate_user_sgprs(struct radv_shader_args *args, gl_shader_stage stage, bool h
/* 2 user sgprs will always be allocated for scratch/rings */
user_sgpr_count += 2;
+ /* prolog inputs */
+ if (args->shader_info->vs.has_prolog)
+ user_sgpr_count += 2;
+
switch (stage) {
case MESA_SHADER_COMPUTE:
if (args->shader_info->cs.uses_sbt)
@@ -281,6 +285,9 @@ static void
declare_vs_specific_input_sgprs(struct radv_shader_args *args, gl_shader_stage stage,
bool has_previous_stage, gl_shader_stage previous_stage)
{
+ if (args->shader_info->vs.has_prolog)
+ ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_INT, &args->prolog_inputs);
+
if (!args->is_gs_copy_shader && (stage == MESA_SHADER_VERTEX ||
(has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
if (args->shader_info->vs.vb_desc_usage_mask) {
@@ -328,6 +335,17 @@ declare_vs_input_vgprs(struct radv_shader_args *args)
}
}
}
+
+ if (args->shader_info->vs.dynamic_inputs) {
+ assert(args->shader_info->vs.use_per_attribute_vb_descs);
+ unsigned num_attributes = util_last_bit(args->shader_info->vs.vb_desc_usage_mask);
+ for (unsigned i = 0; i < num_attributes; i++)
+ ac_add_arg(&args->ac, AC_ARG_VGPR, 4, AC_ARG_INT, &args->vs_inputs[i]);
+ /* Ensure the main shader doesn't use less vgprs than the prolog. The prolog requires one
+ * VGPR more than the number of shader arguments in the case of non-trivial divisors on GFX8.
+ */
+ ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
+ }
}
static void
@@ -463,6 +481,9 @@ set_vs_specific_input_locs(struct radv_shader_args *args, gl_shader_stage stage,
bool has_previous_stage, gl_shader_stage previous_stage,
uint8_t *user_sgpr_idx)
{
+ if (args->prolog_inputs.used)
+ set_loc_shader(args, AC_UD_VS_PROLOG_INPUTS, user_sgpr_idx, 2);
+
if (!args->is_gs_copy_shader && (stage == MESA_SHADER_VERTEX ||
(has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
if (args->ac.vertex_buffers.used) {
diff --git a/src/amd/vulkan/radv_shader_args.h b/src/amd/vulkan/radv_shader_args.h
index a7c13152fcb..a6d4b8b7be7 100644
--- a/src/amd/vulkan/radv_shader_args.h
+++ b/src/amd/vulkan/radv_shader_args.h
@@ -45,6 +45,9 @@ struct radv_shader_args {
struct ac_arg ngg_viewport_scale[2];
struct ac_arg ngg_viewport_translate[2];
+ struct ac_arg prolog_inputs;
+ struct ac_arg vs_inputs[MAX_VERTEX_ATTRIBS];
+
bool is_gs_copy_shader;
bool is_trap_handler_shader;
};
diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c
index 10069a4cc42..40042e3b4fe 100644
--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@@ -608,12 +608,23 @@ radv_nir_shader_info_pass(struct radv_device *device, const struct nir_shader *n
}
if (nir->info.stage == MESA_SHADER_VERTEX) {
+ if (pipeline_key->vs.dynamic_input_state && nir->info.inputs_read) {
+ info->vs.has_prolog = true;
+ info->vs.dynamic_inputs = true;
+ }
+
/* Use per-attribute vertex descriptors to prevent faults and
* for correct bounds checking.
*/
- info->vs.use_per_attribute_vb_descs = device->robust_buffer_access;
+ info->vs.use_per_attribute_vb_descs = device->robust_buffer_access || info->vs.dynamic_inputs;
}
+ /* We have to ensure consistent input register assignments between the main shader and the
+ * prolog. */
+ info->vs.needs_instance_id |= info->vs.has_prolog;
+ info->vs.needs_base_instance |= info->vs.has_prolog;
+ info->vs.needs_draw_id |= info->vs.has_prolog;
+
nir_foreach_shader_in_variable (variable, nir)
gather_info_input_decl(nir, variable, pipeline_key, info);
More information about the mesa-commit
mailing list