[Mesa-dev] [PATCH 3/3] radv: use typed buffer loads for vertex input fetches

Samuel Pitoiset samuel.pitoiset at gmail.com
Tue Feb 26 12:42:28 UTC 2019


This drastically reduces the number of SGPRs because the driver
now uses descriptors per vertex binding, instead of per vertex
attribute format.

29077 shaders in 15096 tests
Totals:
SGPRS: 1354285 -> 1282109 (-5.33 %)
VGPRS: 909896 -> 908800 (-0.12 %)
Spilled SGPRs: 24840 -> 24811 (-0.12 %)
Code Size: 49221144 -> 48986628 (-0.48 %) bytes
Max Waves: 243930 -> 244229 (0.12 %)

Totals from affected shaders:
SGPRS: 390648 -> 318472 (-18.48 %)
VGPRS: 288432 -> 287336 (-0.38 %)
Spilled SGPRs: 94 -> 65 (-30.85 %)
Code Size: 11548412 -> 11313896 (-2.03 %) bytes
Max Waves: 86460 -> 86759 (0.35 %)

This gives a really tiny boost.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
---
 src/amd/vulkan/radv_cmd_buffer.c  | 21 +++++++++-----
 src/amd/vulkan/radv_nir_to_llvm.c | 47 +++++++++++++++++++++++++------
 src/amd/vulkan/radv_pipeline.c    | 37 ++----------------------
 src/amd/vulkan/radv_private.h     |  5 +---
 4 files changed, 57 insertions(+), 53 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index ad0b934ddfc..5ab93d11d68 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -1985,13 +1985,13 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
 {
 	if ((pipeline_is_dirty ||
 	    (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
-	    cmd_buffer->state.pipeline->vertex_elements.count &&
+	    cmd_buffer->state.pipeline->num_vertex_bindings &&
 	    radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.info.vs.has_vertex_buffers) {
 		struct radv_vertex_elements_info *velems = &cmd_buffer->state.pipeline->vertex_elements;
 		unsigned vb_offset;
 		void *vb_ptr;
 		uint32_t i = 0;
-		uint32_t count = velems->count;
+		uint32_t count = cmd_buffer->state.pipeline->num_vertex_bindings;
 		uint64_t va;
 
 		/* allocate some descriptor state for vertex buffers */
@@ -2002,13 +2002,15 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
 		for (i = 0; i < count; i++) {
 			uint32_t *desc = &((uint32_t *)vb_ptr)[i * 4];
 			uint32_t offset;
-			int vb = velems->binding[i];
-			struct radv_buffer *buffer = cmd_buffer->vertex_bindings[vb].buffer;
-			uint32_t stride = cmd_buffer->state.pipeline->binding_stride[vb];
+			struct radv_buffer *buffer = cmd_buffer->vertex_bindings[i].buffer;
+			uint32_t stride = cmd_buffer->state.pipeline->binding_stride[i];
+
+			if (!buffer)
+				continue;
 
 			va = radv_buffer_get_va(buffer->bo);
 
-			offset = cmd_buffer->vertex_bindings[vb].offset + velems->offset[i];
+			offset = cmd_buffer->vertex_bindings[i].offset;
 			va += offset + buffer->offset;
 			desc[0] = va;
 			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
@@ -2016,7 +2018,12 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
 				desc[2] = (buffer->size - offset - velems->format_size[i]) / stride + 1;
 			else
 				desc[2] = buffer->size - offset;
-			desc[3] = velems->rsrc_word3[i];
+			desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+				  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+				  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+				  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+				  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
+				  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
 		}
 
 		va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
index 36f499be212..e6c8f3ecb92 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -2008,6 +2008,8 @@ adjust_vertex_fetch_alpha(struct radv_shader_context *ctx,
 
 	LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);
 
+	alpha = LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.f32, "");
+
 	if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
 		alpha = LLVMBuildFPToUI(ctx->ac.builder, alpha, ctx->ac.i32, "");
 	else
@@ -2035,7 +2037,7 @@ adjust_vertex_fetch_alpha(struct radv_shader_context *ctx,
 		alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, "");
 	}
 
-	return alpha;
+	return LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.i32, "");
 }
 
 static unsigned
@@ -2096,7 +2098,7 @@ radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx,
 
 	for (unsigned i = num_channels; i < 4; i++) {
 		chan[i] = i == 3 ? one : zero;
-		chan[i] = ac_to_float(&ctx->ac, chan[i]);
+		chan[i] = ac_to_integer(&ctx->ac, chan[i]);
 	}
 
 	return ac_build_gather_values(&ctx->ac, chan, 4);
@@ -2154,20 +2156,49 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
 		} else
 			buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
 			                            ctx->abi.base_vertex, "");
-		t_offset = LLVMConstInt(ctx->ac.i32, attrib_index, false);
-
-		t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
 
 		/* Adjust the number of channels to load based on the vertex
 		 * attribute format.
 		 */
 		unsigned num_format_channels = get_num_channels_from_data_format(data_format);
 		unsigned num_channels = MIN2(num_input_channels, num_format_channels);
+		unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[attrib_index];
+		unsigned attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[attrib_index];
+		unsigned attrib_stride = ctx->options->key.vs.vertex_attribute_strides[attrib_index];
 
-		input = ac_build_buffer_load_format(&ctx->ac, t_list,
+		if (attrib_stride != 0 && attrib_offset > attrib_stride) {
+			LLVMValueRef buffer_offset =
+				LLVMConstInt(ctx->ac.i32,
+					     attrib_offset / attrib_stride, false);
+
+			buffer_index = LLVMBuildAdd(ctx->ac.builder,
 						    buffer_index,
-						    ctx->ac.i32_0,
-						    num_channels, false, true);
+						    buffer_offset, "");
+
+			attrib_offset = attrib_offset % attrib_stride;
+		}
+
+		t_offset = LLVMConstInt(ctx->ac.i32, attrib_binding, false);
+		t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
+
+		input = ac_build_tbuffer_load(&ctx->ac, t_list, buffer_index,
+					      LLVMConstInt(ctx->ac.i32, attrib_offset, false),
+					      ctx->ac.i32_0, ctx->ac.i32_0,
+					      num_channels,
+					      data_format, num_format,
+					      false, false, true);
+
+		if (ctx->options->key.vs.post_shuffle & (1 << attrib_index)) {
+			if (num_channels > 1) {
+				LLVMValueRef c[4];
+				c[0] = ac_llvm_extract_elem(&ctx->ac, input, 2);
+				c[1] = ac_llvm_extract_elem(&ctx->ac, input, 1);
+				c[2] = ac_llvm_extract_elem(&ctx->ac, input, 0);
+				c[3] = ac_llvm_extract_elem(&ctx->ac, input, 3);
+
+				input = ac_build_gather_values(&ctx->ac, c, 4);
+			}
+		}
 
 		input = radv_fixup_vertex_input_fetches(ctx, input, num_channels,
 							is_float);
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 5fd57932102..30c3f60790e 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -1244,25 +1244,6 @@ si_conv_prim_to_gs_out(enum VkPrimitiveTopology topology)
 	}
 }
 
-static unsigned si_map_swizzle(unsigned swizzle)
-{
-	switch (swizzle) {
-	case VK_SWIZZLE_Y:
-		return V_008F0C_SQ_SEL_Y;
-	case VK_SWIZZLE_Z:
-		return V_008F0C_SQ_SEL_Z;
-	case VK_SWIZZLE_W:
-		return V_008F0C_SQ_SEL_W;
-	case VK_SWIZZLE_0:
-		return V_008F0C_SQ_SEL_0;
-	case VK_SWIZZLE_1:
-		return V_008F0C_SQ_SEL_1;
-	default: /* VK_SWIZZLE_X */
-		return V_008F0C_SQ_SEL_X;
-	}
-}
-
-
 static unsigned radv_dynamic_state_mask(VkDynamicState state)
 {
 	switch(state) {
@@ -3557,24 +3538,10 @@ radv_compute_vertex_input_state(struct radv_pipeline *pipeline,
 			&vi_info->pVertexAttributeDescriptions[i];
 		unsigned loc = desc->location;
 		const struct vk_format_description *format_desc;
-		int first_non_void;
-		uint32_t num_format, data_format;
-		format_desc = vk_format_description(desc->format);
-		first_non_void = vk_format_get_first_non_void_channel(desc->format);
 
-		num_format = radv_translate_buffer_numformat(format_desc, first_non_void);
-		data_format = radv_translate_buffer_dataformat(format_desc, first_non_void);
+		format_desc = vk_format_description(desc->format);
 
-		velems->rsrc_word3[loc] = S_008F0C_DST_SEL_X(si_map_swizzle(format_desc->swizzle[0])) |
-			S_008F0C_DST_SEL_Y(si_map_swizzle(format_desc->swizzle[1])) |
-			S_008F0C_DST_SEL_Z(si_map_swizzle(format_desc->swizzle[2])) |
-			S_008F0C_DST_SEL_W(si_map_swizzle(format_desc->swizzle[3])) |
-			S_008F0C_NUM_FORMAT(num_format) |
-			S_008F0C_DATA_FORMAT(data_format);
 		velems->format_size[loc] = format_desc->block.bits / 8;
-		velems->offset[loc] = desc->offset;
-		velems->binding[loc] = desc->binding;
-		velems->count = MAX2(velems->count, loc + 1);
 	}
 
 	for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
@@ -3582,6 +3549,8 @@ radv_compute_vertex_input_state(struct radv_pipeline *pipeline,
 			&vi_info->pVertexBindingDescriptions[i];
 
 		pipeline->binding_stride[desc->binding] = desc->stride;
+		pipeline->num_vertex_bindings =
+			MAX2(pipeline->num_vertex_bindings, desc->binding + 1);
 	}
 }
 
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index c73bdaca0a3..39fa6110fde 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -1342,11 +1342,7 @@ struct radv_prim_vertex_count {
 };
 
 struct radv_vertex_elements_info {
-	uint32_t rsrc_word3[MAX_VERTEX_ATTRIBS];
 	uint32_t format_size[MAX_VERTEX_ATTRIBS];
-	uint32_t binding[MAX_VERTEX_ATTRIBS];
-	uint32_t offset[MAX_VERTEX_ATTRIBS];
-	uint32_t count;
 };
 
 struct radv_ia_multi_vgt_param_helpers {
@@ -1378,6 +1374,7 @@ struct radv_pipeline {
 	struct radv_vertex_elements_info             vertex_elements;
 
 	uint32_t                                     binding_stride[MAX_VBS];
+	uint8_t                                      num_vertex_bindings;
 
 	uint32_t user_data_0[MESA_SHADER_STAGES];
 	union {
-- 
2.21.0



More information about the mesa-dev mailing list