Mesa (master): radeonsi: use faster integer division for instance divisors

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Tue Oct 16 22:58:08 UTC 2018


Module: Mesa
Branch: master
Commit: 0b40fbc8796b8e204e7af45b3d39b67d20fb3da7
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=0b40fbc8796b8e204e7af45b3d39b67d20fb3da7

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Sat Sep 22 22:02:32 2018 -0400

radeonsi: use faster integer division for instance divisors

We know the divisors when we upload them, so instead we can precompute
and upload division factors derived from each divisor.

This fast division consists of add, mul_hi, and two shifts,
and we have to load 4 dwords instead of 1.

This probably won't affect any apps.

---

 src/gallium/drivers/radeonsi/si_shader.c | 52 ++++++++++++-------------
 src/gallium/drivers/radeonsi/si_state.c  | 65 ++++++++++++++++++++++++++++----
 src/gallium/drivers/radeonsi/si_state.h  |  2 +-
 3 files changed, 83 insertions(+), 36 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index acd4d34f89..19522cc97b 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -428,20 +428,6 @@ static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
 	}
 }
 
-static LLVMValueRef get_instance_index_for_fetch(
-	struct si_shader_context *ctx,
-	unsigned param_start_instance, LLVMValueRef divisor)
-{
-	LLVMValueRef result = ctx->abi.instance_id;
-
-	/* The division must be done before START_INSTANCE is added. */
-	if (divisor != ctx->i32_1)
-		result = LLVMBuildUDiv(ctx->ac.builder, result, divisor, "");
-
-	return LLVMBuildAdd(ctx->ac.builder, result,
-			    LLVMGetParam(ctx->main_fn, param_start_instance), "");
-}
-
 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
  * to float. */
 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
@@ -7302,22 +7288,32 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
 			key->vs_prolog.states.instance_divisor_is_one & (1u << i);
 		bool divisor_is_fetched =
 			key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
-		LLVMValueRef index;
-
-		if (divisor_is_one || divisor_is_fetched) {
-			LLVMValueRef divisor = ctx->i32_1;
-
-			if (divisor_is_fetched) {
-				divisor = buffer_load_const(ctx, instance_divisor_constbuf,
-							    LLVMConstInt(ctx->i32, i * 4, 0));
-				divisor = ac_to_integer(&ctx->ac, divisor);
+		LLVMValueRef index = NULL;
+
+		if (divisor_is_one) {
+			index = ctx->abi.instance_id;
+		} else if (divisor_is_fetched) {
+			LLVMValueRef udiv_factors[4];
+
+			for (unsigned j = 0; j < 4; j++) {
+				udiv_factors[j] =
+					buffer_load_const(ctx, instance_divisor_constbuf,
+							  LLVMConstInt(ctx->i32, i*16 + j*4, 0));
+				udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
 			}
+			/* The faster NUW version doesn't work when InstanceID == UINT_MAX.
+			 * Such InstanceID might not be achievable in a reasonable time though.
+			 */
+			index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id,
+						       udiv_factors[0], udiv_factors[1],
+						       udiv_factors[2], udiv_factors[3]);
+		}
 
-			/* InstanceID / Divisor + StartInstance */
-			index = get_instance_index_for_fetch(ctx,
-							     user_sgpr_base +
-							     SI_SGPR_START_INSTANCE,
-							     divisor);
+		if (divisor_is_one || divisor_is_fetched) {
+			/* Add StartInstance. */
+			index = LLVMBuildAdd(ctx->ac.builder, index,
+					     LLVMGetParam(ctx->main_fn, user_sgpr_base +
+							  SI_SGPR_START_INSTANCE), "");
 		} else {
 			/* VertexID + BaseVertex */
 			index = LLVMBuildAdd(ctx->ac.builder,
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 827d849500..8e4cdddf0b 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -32,6 +32,7 @@
 #include "util/u_memory.h"
 #include "util/u_resource.h"
 #include "util/u_upload_mgr.h"
+#include "util/fast_idiv_by_const.h"
 
 static unsigned si_map_swizzle(unsigned swizzle)
 {
@@ -4372,6 +4373,29 @@ static void si_delete_sampler_state(struct pipe_context *ctx, void *state)
  * Vertex elements & buffers
  */
 
+struct util_fast_udiv_info32 {
+   unsigned multiplier; /* the "magic number" multiplier */
+   unsigned pre_shift; /* shift for the dividend before multiplying */
+   unsigned post_shift; /* shift for the dividend after multiplying */
+   int increment; /* 0 or 1; if set then increment the numerator, using one of
+                     the two strategies */
+};
+
+static struct util_fast_udiv_info32
+util_compute_fast_udiv_info32(uint32_t D, unsigned num_bits)
+{
+	struct util_fast_udiv_info info =
+		util_compute_fast_udiv_info(D, num_bits, 32);
+
+	struct util_fast_udiv_info32 result = {
+		info.multiplier,
+		info.pre_shift,
+		info.post_shift,
+		info.increment,
+	};
+	return result;
+}
+
 static void *si_create_vertex_elements(struct pipe_context *ctx,
 				       unsigned count,
 				       const struct pipe_vertex_element *elements)
@@ -4379,6 +4403,12 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
 	struct si_screen *sscreen = (struct si_screen*)ctx->screen;
 	struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements);
 	bool used[SI_NUM_VERTEX_BUFFERS] = {};
+	struct util_fast_udiv_info32 divisor_factors[SI_MAX_ATTRIBS] = {};
+	STATIC_ASSERT(sizeof(struct util_fast_udiv_info32) == 16);
+	STATIC_ASSERT(sizeof(divisor_factors[0].multiplier) == 4);
+	STATIC_ASSERT(sizeof(divisor_factors[0].pre_shift) == 4);
+	STATIC_ASSERT(sizeof(divisor_factors[0].post_shift) == 4);
+	STATIC_ASSERT(sizeof(divisor_factors[0].increment) == 4);
 	int i;
 
 	assert(count <= SI_MAX_ATTRIBS);
@@ -4401,14 +4431,17 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
 			return NULL;
 		}
 
-		if (elements[i].instance_divisor) {
+		unsigned instance_divisor = elements[i].instance_divisor;
+		if (instance_divisor) {
 			v->uses_instance_divisors = true;
-			v->instance_divisors[i] = elements[i].instance_divisor;
 
-			if (v->instance_divisors[i] == 1)
+			if (instance_divisor == 1) {
 				v->instance_divisor_is_one |= 1u << i;
-			else
+			} else {
 				v->instance_divisor_is_fetched |= 1u << i;
+				divisor_factors[i] =
+					util_compute_fast_udiv_info32(instance_divisor, 32);
+			}
 		}
 
 		if (!used[vbo_index]) {
@@ -4518,6 +4551,22 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
 				   S_008F0C_NUM_FORMAT(num_format) |
 				   S_008F0C_DATA_FORMAT(data_format);
 	}
+
+	if (v->instance_divisor_is_fetched) {
+		unsigned num_divisors = util_last_bit(v->instance_divisor_is_fetched);
+
+		v->instance_divisor_factor_buffer =
+			(struct r600_resource*)
+			pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
+					   num_divisors * sizeof(divisor_factors[0]));
+		if (!v->instance_divisor_factor_buffer) {
+			FREE(v);
+			return NULL;
+		}
+		void *map = sscreen->ws->buffer_map(v->instance_divisor_factor_buffer->buf,
+						    NULL, PIPE_TRANSFER_WRITE);
+		memcpy(map , divisor_factors, num_divisors * sizeof(divisor_factors[0]));
+	}
 	return v;
 }
 
@@ -4541,10 +4590,10 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
 	if (v && v->instance_divisor_is_fetched) {
 		struct pipe_constant_buffer cb;
 
-		cb.buffer = NULL;
-		cb.user_buffer = v->instance_divisors;
+		cb.buffer = &v->instance_divisor_factor_buffer->b.b;
+		cb.user_buffer = NULL;
 		cb.buffer_offset = 0;
-		cb.buffer_size = sizeof(uint32_t) * v->count;
+		cb.buffer_size = 0xffffffff;
 		si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb);
 	}
 }
@@ -4552,9 +4601,11 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
 static void si_delete_vertex_element(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_vertex_elements *v = (struct si_vertex_elements*)state;
 
 	if (sctx->vertex_elements == state)
 		sctx->vertex_elements = NULL;
+	r600_resource_reference(&v->instance_divisor_factor_buffer, NULL);
 	FREE(state);
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 16fd223d00..f52296d111 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -132,7 +132,7 @@ struct si_stencil_ref {
 
 struct si_vertex_elements
 {
-	uint32_t			instance_divisors[SI_MAX_ATTRIBS];
+	struct r600_resource		*instance_divisor_factor_buffer;
 	uint32_t			rsrc_word3[SI_MAX_ATTRIBS];
 	uint16_t			src_offset[SI_MAX_ATTRIBS];
 	uint8_t				fix_fetch[SI_MAX_ATTRIBS];




More information about the mesa-commit mailing list