[Mesa-dev] [PATCH 3/5] radeonsi: use faster integer division for instance divisors

Marek Olšák maraeo at gmail.com
Sun Sep 23 16:57:53 UTC 2018


From: Marek Olšák <marek.olsak at amd.com>

We know the divisors when we upload them, so instead of dividing in the
shader we can precompute and upload division factors derived from each divisor.

This fast division consists of add, mul_hi, and two shifts,
and we have to load 4 dwords instead of 1.

This probably won't affect any apps.
---
 src/gallium/drivers/radeonsi/si_shader.c | 46 +++++++++++++++-----------------
 src/gallium/drivers/radeonsi/si_state.c  | 42 ++++++++++++++++++++++++-----
 src/gallium/drivers/radeonsi/si_state.h  |  2 +-
 3 files changed, 57 insertions(+), 33 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 36f58e2..90cb059 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -421,34 +421,20 @@ static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
 			return LLVMConstInt(ctx->i32, stride, 0);
 		}
 		return si_unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
 
 	default:
 		assert(0);
 		return NULL;
 	}
 }
 
-static LLVMValueRef get_instance_index_for_fetch(
-	struct si_shader_context *ctx,
-	unsigned param_start_instance, LLVMValueRef divisor)
-{
-	LLVMValueRef result = ctx->abi.instance_id;
-
-	/* The division must be done before START_INSTANCE is added. */
-	if (divisor != ctx->i32_1)
-		result = LLVMBuildUDiv(ctx->ac.builder, result, divisor, "");
-
-	return LLVMBuildAdd(ctx->ac.builder, result,
-			    LLVMGetParam(ctx->main_fn, param_start_instance), "");
-}
-
 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
  * to float. */
 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
 					    LLVMValueRef vec4,
 					    unsigned double_index)
 {
 	LLVMBuilderRef builder = ctx->ac.builder;
 	LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->ac.context);
 	LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
 					      LLVMVectorType(f64, 2), "");
@@ -7294,34 +7280,44 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
 			ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
 	}
 
 	for (i = 0; i <= key->vs_prolog.last_input; i++) {
 		bool divisor_is_one =
 			key->vs_prolog.states.instance_divisor_is_one & (1u << i);
 		bool divisor_is_fetched =
 			key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
 		LLVMValueRef index;
 
-		if (divisor_is_one || divisor_is_fetched) {
-			LLVMValueRef divisor = ctx->i32_1;
+		if (divisor_is_one) {
+			index = ctx->abi.instance_id;
+		} else if (divisor_is_fetched) {
+			LLVMValueRef udiv_factors[4];
 
-			if (divisor_is_fetched) {
-				divisor = buffer_load_const(ctx, instance_divisor_constbuf,
-							    LLVMConstInt(ctx->i32, i * 4, 0));
-				divisor = ac_to_integer(&ctx->ac, divisor);
+			for (unsigned j = 0; j < 4; j++) {
+				udiv_factors[j] =
+					buffer_load_const(ctx, instance_divisor_constbuf,
+							  LLVMConstInt(ctx->i32, i*16 + j*4, 0));
+				udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
 			}
+			/* The faster NUW version doesn't work when InstanceID == UINT_MAX.
+			 * Such InstanceID might not be achievable in a reasonable time though.
+			 */
+			index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id,
+						       udiv_factors[0], udiv_factors[1],
+						       udiv_factors[2], udiv_factors[3]);
+		}
 
-			/* InstanceID / Divisor + StartInstance */
-			index = get_instance_index_for_fetch(ctx,
-							     user_sgpr_base +
-							     SI_SGPR_START_INSTANCE,
-							     divisor);
+		if (divisor_is_one || divisor_is_fetched) {
+			/* Add StartInstance. */
+			index = LLVMBuildAdd(ctx->ac.builder, index,
+					     LLVMGetParam(ctx->main_fn, user_sgpr_base +
+							  SI_SGPR_START_INSTANCE), "");
 		} else {
 			/* VertexID + BaseVertex */
 			index = LLVMBuildAdd(ctx->ac.builder,
 					     ctx->abi.vertex_id,
 					     LLVMGetParam(func, user_sgpr_base +
 								SI_SGPR_BASE_VERTEX), "");
 		}
 
 		index = ac_to_float(&ctx->ac, index);
 		ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index,
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index bc1417a..aa57b3f 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -25,20 +25,21 @@
 #include "si_build_pm4.h"
 #include "gfx9d.h"
 #include "si_query.h"
 
 #include "util/u_dual_blend.h"
 #include "util/u_format.h"
 #include "util/u_format_s3tc.h"
 #include "util/u_memory.h"
 #include "util/u_resource.h"
 #include "util/u_upload_mgr.h"
+#include "util/fast_idiv_by_const.h"
 
 static unsigned si_map_swizzle(unsigned swizzle)
 {
 	switch (swizzle) {
 	case PIPE_SWIZZLE_Y:
 		return V_008F0C_SQ_SEL_Y;
 	case PIPE_SWIZZLE_Z:
 		return V_008F0C_SQ_SEL_Z;
 	case PIPE_SWIZZLE_W:
 		return V_008F0C_SQ_SEL_W;
@@ -4348,20 +4349,26 @@ static void si_delete_sampler_state(struct pipe_context *ctx, void *state)
  * Vertex elements & buffers
  */
 
 static void *si_create_vertex_elements(struct pipe_context *ctx,
 				       unsigned count,
 				       const struct pipe_vertex_element *elements)
 {
 	struct si_screen *sscreen = (struct si_screen*)ctx->screen;
 	struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements);
 	bool used[SI_NUM_VERTEX_BUFFERS] = {};
+	struct util_fast_udiv_info divisor_factors[SI_MAX_ATTRIBS] = {};
+	STATIC_ASSERT(sizeof(struct util_fast_udiv_info) == 16);
+	STATIC_ASSERT(sizeof(divisor_factors[0].multiplier) == 4);
+	STATIC_ASSERT(sizeof(divisor_factors[0].pre_shift) == 4);
+	STATIC_ASSERT(sizeof(divisor_factors[0].post_shift) == 4);
+	STATIC_ASSERT(sizeof(divisor_factors[0].increment) == 4);
 	int i;
 
 	assert(count <= SI_MAX_ATTRIBS);
 	if (!v)
 		return NULL;
 
 	v->count = count;
 	v->desc_list_byte_size = align(count * 16, SI_CPDMA_ALIGNMENT);
 
 	for (i = 0; i < count; ++i) {
@@ -4370,28 +4377,31 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
 		unsigned data_format, num_format;
 		int first_non_void;
 		unsigned vbo_index = elements[i].vertex_buffer_index;
 		unsigned char swizzle[4];
 
 		if (vbo_index >= SI_NUM_VERTEX_BUFFERS) {
 			FREE(v);
 			return NULL;
 		}
 
-		if (elements[i].instance_divisor) {
+		unsigned instance_divisor = elements[i].instance_divisor;
+		if (instance_divisor) {
 			v->uses_instance_divisors = true;
-			v->instance_divisors[i] = elements[i].instance_divisor;
 
-			if (v->instance_divisors[i] == 1)
+			if (instance_divisor == 1) {
 				v->instance_divisor_is_one |= 1u << i;
-			else
+			} else {
 				v->instance_divisor_is_fetched |= 1u << i;
+				divisor_factors[i] =
+					util_compute_fast_udiv_info(instance_divisor, 32);
+			}
 		}
 
 		if (!used[vbo_index]) {
 			v->first_vb_use_mask |= 1 << i;
 			used[vbo_index] = true;
 		}
 
 		desc = util_format_description(elements[i].src_format);
 		first_non_void = util_format_get_first_non_void_channel(elements[i].src_format);
 		data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
@@ -4487,20 +4497,36 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
 			}
 		}
 
 		v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
 				   S_008F0C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
 				   S_008F0C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
 				   S_008F0C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
 				   S_008F0C_NUM_FORMAT(num_format) |
 				   S_008F0C_DATA_FORMAT(data_format);
 	}
+
+	if (v->instance_divisor_is_fetched) {
+		unsigned num_divisors = util_last_bit(v->instance_divisor_is_fetched);
+
+		v->instance_divisor_factor_buffer =
+			(struct r600_resource*)
+			pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
+					   num_divisors * sizeof(divisor_factors[0]));
+		if (!v->instance_divisor_factor_buffer) {
+			FREE(v);
+			return NULL;
+		}
+		void *map = sscreen->ws->buffer_map(v->instance_divisor_factor_buffer->buf,
+						    NULL, PIPE_TRANSFER_WRITE);
+		memcpy(map , divisor_factors, num_divisors * sizeof(divisor_factors[0]));
+	}
 	return v;
 }
 
 static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_vertex_elements *old = sctx->vertex_elements;
 	struct si_vertex_elements *v = (struct si_vertex_elements*)state;
 
 	sctx->vertex_elements = v;
@@ -4510,34 +4536,36 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
 	    (!old ||
 	     old->count != v->count ||
 	     old->uses_instance_divisors != v->uses_instance_divisors ||
 	     v->uses_instance_divisors || /* we don't check which divisors changed */
 	     memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count)))
 		sctx->do_update_shaders = true;
 
 	if (v && v->instance_divisor_is_fetched) {
 		struct pipe_constant_buffer cb;
 
-		cb.buffer = NULL;
-		cb.user_buffer = v->instance_divisors;
+		cb.buffer = &v->instance_divisor_factor_buffer->b.b;
+		cb.user_buffer = NULL;
 		cb.buffer_offset = 0;
-		cb.buffer_size = sizeof(uint32_t) * v->count;
+		cb.buffer_size = 0xffffffff;
 		si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb);
 	}
 }
 
 static void si_delete_vertex_element(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_vertex_elements *v = (struct si_vertex_elements*)state;
 
 	if (sctx->vertex_elements == state)
 		sctx->vertex_elements = NULL;
+	r600_resource_reference(&v->instance_divisor_factor_buffer, NULL);
 	FREE(state);
 }
 
 static void si_set_vertex_buffers(struct pipe_context *ctx,
 				  unsigned start_slot, unsigned count,
 				  const struct pipe_vertex_buffer *buffers)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot;
 	int i;
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 89bb5b6..d9c3e70 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -131,21 +131,21 @@ struct si_state_dsa {
 
 };
 
 struct si_stencil_ref {
 	struct pipe_stencil_ref		state;
 	struct si_dsa_stencil_ref_part	dsa_part;
 };
 
 struct si_vertex_elements
 {
-	uint32_t			instance_divisors[SI_MAX_ATTRIBS];
+	struct r600_resource		*instance_divisor_factor_buffer;
 	uint32_t			rsrc_word3[SI_MAX_ATTRIBS];
 	uint16_t			src_offset[SI_MAX_ATTRIBS];
 	uint8_t				fix_fetch[SI_MAX_ATTRIBS];
 	uint8_t				format_size[SI_MAX_ATTRIBS];
 	uint8_t				vertex_buffer_index[SI_MAX_ATTRIBS];
 
 	uint8_t				count;
 	bool				uses_instance_divisors;
 
 	uint16_t			first_vb_use_mask;
-- 
2.7.4



More information about the mesa-dev mailing list