[Mesa-dev] [PATCH 3/3] radv: use typed buffer loads for vertex input fetches
Samuel Pitoiset
samuel.pitoiset at gmail.com
Tue Feb 26 12:42:28 UTC 2019
This drastically reduces the number of SGPRs because the driver
now uses descriptors per vertex binding, instead of per vertex
attribute format.
29077 shaders in 15096 tests
Totals:
SGPRS: 1354285 -> 1282109 (-5.33 %)
VGPRS: 909896 -> 908800 (-0.12 %)
Spilled SGPRs: 24840 -> 24811 (-0.12 %)
Code Size: 49221144 -> 48986628 (-0.48 %) bytes
Max Waves: 243930 -> 244229 (0.12 %)
Totals from affected shaders:
SGPRS: 390648 -> 318472 (-18.48 %)
VGPRS: 288432 -> 287336 (-0.38 %)
Spilled SGPRs: 94 -> 65 (-30.85 %)
Code Size: 11548412 -> 11313896 (-2.03 %) bytes
Max Waves: 86460 -> 86759 (0.35 %)
This gives a really tiny boost.
Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
---
src/amd/vulkan/radv_cmd_buffer.c | 21 +++++++++-----
src/amd/vulkan/radv_nir_to_llvm.c | 47 +++++++++++++++++++++++++------
src/amd/vulkan/radv_pipeline.c | 37 ++----------------------
src/amd/vulkan/radv_private.h | 5 +---
4 files changed, 57 insertions(+), 53 deletions(-)
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index ad0b934ddfc..5ab93d11d68 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -1985,13 +1985,13 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
{
if ((pipeline_is_dirty ||
(cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
- cmd_buffer->state.pipeline->vertex_elements.count &&
+ cmd_buffer->state.pipeline->num_vertex_bindings &&
radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.info.vs.has_vertex_buffers) {
struct radv_vertex_elements_info *velems = &cmd_buffer->state.pipeline->vertex_elements;
unsigned vb_offset;
void *vb_ptr;
uint32_t i = 0;
- uint32_t count = velems->count;
+ uint32_t count = cmd_buffer->state.pipeline->num_vertex_bindings;
uint64_t va;
/* allocate some descriptor state for vertex buffers */
@@ -2002,13 +2002,15 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
for (i = 0; i < count; i++) {
uint32_t *desc = &((uint32_t *)vb_ptr)[i * 4];
uint32_t offset;
- int vb = velems->binding[i];
- struct radv_buffer *buffer = cmd_buffer->vertex_bindings[vb].buffer;
- uint32_t stride = cmd_buffer->state.pipeline->binding_stride[vb];
+ struct radv_buffer *buffer = cmd_buffer->vertex_bindings[i].buffer;
+ uint32_t stride = cmd_buffer->state.pipeline->binding_stride[i];
+
+ if (!buffer)
+ continue;
va = radv_buffer_get_va(buffer->bo);
- offset = cmd_buffer->vertex_bindings[vb].offset + velems->offset[i];
+ offset = cmd_buffer->vertex_bindings[i].offset;
va += offset + buffer->offset;
desc[0] = va;
desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
@@ -2016,7 +2018,12 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
desc[2] = (buffer->size - offset - velems->format_size[i]) / stride + 1;
else
desc[2] = buffer->size - offset;
- desc[3] = velems->rsrc_word3[i];
+ desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+ S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+ S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+ S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+ S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
+ S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
}
va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
index 36f499be212..e6c8f3ecb92 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -2008,6 +2008,8 @@ adjust_vertex_fetch_alpha(struct radv_shader_context *ctx,
LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);
+ alpha = LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.f32, "");
+
if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
alpha = LLVMBuildFPToUI(ctx->ac.builder, alpha, ctx->ac.i32, "");
else
@@ -2035,7 +2037,7 @@ adjust_vertex_fetch_alpha(struct radv_shader_context *ctx,
alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");
}
- return alpha;
+ return LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.i32, "");
}
static unsigned
@@ -2096,7 +2098,7 @@ radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx,
for (unsigned i = num_channels; i < 4; i++) {
chan[i] = i == 3 ? one : zero;
- chan[i] = ac_to_float(&ctx->ac, chan[i]);
+ chan[i] = ac_to_integer(&ctx->ac, chan[i]);
}
return ac_build_gather_values(&ctx->ac, chan, 4);
@@ -2154,20 +2156,49 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
} else
buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
ctx->abi.base_vertex, "");
- t_offset = LLVMConstInt(ctx->ac.i32, attrib_index, false);
-
- t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
/* Adjust the number of channels to load based on the vertex
* attribute format.
*/
unsigned num_format_channels = get_num_channels_from_data_format(data_format);
unsigned num_channels = MIN2(num_input_channels, num_format_channels);
+ unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[attrib_index];
+ unsigned attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[attrib_index];
+ unsigned attrib_stride = ctx->options->key.vs.vertex_attribute_strides[attrib_index];
- input = ac_build_buffer_load_format(&ctx->ac, t_list,
+ if (attrib_stride != 0 && attrib_offset > attrib_stride) {
+ LLVMValueRef buffer_offset =
+ LLVMConstInt(ctx->ac.i32,
+ attrib_offset / attrib_stride, false);
+
+ buffer_index = LLVMBuildAdd(ctx->ac.builder,
buffer_index,
- ctx->ac.i32_0,
- num_channels, false, true);
+ buffer_offset, "");
+
+ attrib_offset = attrib_offset % attrib_stride;
+ }
+
+ t_offset = LLVMConstInt(ctx->ac.i32, attrib_binding, false);
+ t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
+
+ input = ac_build_tbuffer_load(&ctx->ac, t_list, buffer_index,
+ LLVMConstInt(ctx->ac.i32, attrib_offset, false),
+ ctx->ac.i32_0, ctx->ac.i32_0,
+ num_channels,
+ data_format, num_format,
+ false, false, true);
+
+ if (ctx->options->key.vs.post_shuffle & (1 << attrib_index)) {
+ if (num_channels > 1) {
+ LLVMValueRef c[4];
+ c[0] = ac_llvm_extract_elem(&ctx->ac, input, 2);
+ c[1] = ac_llvm_extract_elem(&ctx->ac, input, 1);
+ c[2] = ac_llvm_extract_elem(&ctx->ac, input, 0);
+ c[3] = ac_llvm_extract_elem(&ctx->ac, input, 3);
+
+ input = ac_build_gather_values(&ctx->ac, c, 4);
+ }
+ }
input = radv_fixup_vertex_input_fetches(ctx, input, num_channels,
is_float);
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 5fd57932102..30c3f60790e 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -1244,25 +1244,6 @@ si_conv_prim_to_gs_out(enum VkPrimitiveTopology topology)
}
}
-static unsigned si_map_swizzle(unsigned swizzle)
-{
- switch (swizzle) {
- case VK_SWIZZLE_Y:
- return V_008F0C_SQ_SEL_Y;
- case VK_SWIZZLE_Z:
- return V_008F0C_SQ_SEL_Z;
- case VK_SWIZZLE_W:
- return V_008F0C_SQ_SEL_W;
- case VK_SWIZZLE_0:
- return V_008F0C_SQ_SEL_0;
- case VK_SWIZZLE_1:
- return V_008F0C_SQ_SEL_1;
- default: /* VK_SWIZZLE_X */
- return V_008F0C_SQ_SEL_X;
- }
-}
-
-
static unsigned radv_dynamic_state_mask(VkDynamicState state)
{
switch(state) {
@@ -3557,24 +3538,10 @@ radv_compute_vertex_input_state(struct radv_pipeline *pipeline,
&vi_info->pVertexAttributeDescriptions[i];
unsigned loc = desc->location;
const struct vk_format_description *format_desc;
- int first_non_void;
- uint32_t num_format, data_format;
- format_desc = vk_format_description(desc->format);
- first_non_void = vk_format_get_first_non_void_channel(desc->format);
- num_format = radv_translate_buffer_numformat(format_desc, first_non_void);
- data_format = radv_translate_buffer_dataformat(format_desc, first_non_void);
+ format_desc = vk_format_description(desc->format);
- velems->rsrc_word3[loc] = S_008F0C_DST_SEL_X(si_map_swizzle(format_desc->swizzle[0])) |
- S_008F0C_DST_SEL_Y(si_map_swizzle(format_desc->swizzle[1])) |
- S_008F0C_DST_SEL_Z(si_map_swizzle(format_desc->swizzle[2])) |
- S_008F0C_DST_SEL_W(si_map_swizzle(format_desc->swizzle[3])) |
- S_008F0C_NUM_FORMAT(num_format) |
- S_008F0C_DATA_FORMAT(data_format);
velems->format_size[loc] = format_desc->block.bits / 8;
- velems->offset[loc] = desc->offset;
- velems->binding[loc] = desc->binding;
- velems->count = MAX2(velems->count, loc + 1);
}
for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
@@ -3582,6 +3549,8 @@ radv_compute_vertex_input_state(struct radv_pipeline *pipeline,
&vi_info->pVertexBindingDescriptions[i];
pipeline->binding_stride[desc->binding] = desc->stride;
+ pipeline->num_vertex_bindings =
+ MAX2(pipeline->num_vertex_bindings, desc->binding + 1);
}
}
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index c73bdaca0a3..39fa6110fde 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -1342,11 +1342,7 @@ struct radv_prim_vertex_count {
};
struct radv_vertex_elements_info {
- uint32_t rsrc_word3[MAX_VERTEX_ATTRIBS];
uint32_t format_size[MAX_VERTEX_ATTRIBS];
- uint32_t binding[MAX_VERTEX_ATTRIBS];
- uint32_t offset[MAX_VERTEX_ATTRIBS];
- uint32_t count;
};
struct radv_ia_multi_vgt_param_helpers {
@@ -1378,6 +1374,7 @@ struct radv_pipeline {
struct radv_vertex_elements_info vertex_elements;
uint32_t binding_stride[MAX_VBS];
+ uint8_t num_vertex_bindings;
uint32_t user_data_0[MESA_SHADER_STAGES];
union {
--
2.21.0
More information about the mesa-dev
mailing list