Mesa (master): freedreno/ir3: rework location of driver constants

Rob Clark robclark at kemper.freedesktop.org
Tue Dec 27 22:03:30 UTC 2016


Module: Mesa
Branch: master
Commit: fc10dc9fdea6ad7d04dfcdb8fd2e2d59ea67f68b
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=fc10dc9fdea6ad7d04dfcdb8fd2e2d59ea67f68b

Author: Rob Clark <robdclark at gmail.com>
Date:   Wed Dec 21 22:43:52 2016 -0500

freedreno/ir3: rework location of driver constants

Rework how we lay out driver constants (driver-params, UBO/TFBO buffer
addresses, immediates) for more flexibility.  For a5xx+ we need to deal
with the fact that gpu ptrs are 64b instead of 32b, which makes the
fixed offset scheme not work so well.  While we are dealing with that
we might also make the layout more dynamic to account for varying # of
UBOs, etc.

Signed-off-by: Rob Clark <robdclark at gmail.com>

---

 src/gallium/drivers/freedreno/a3xx/fd3_emit.c      |  9 ++--
 src/gallium/drivers/freedreno/a4xx/fd4_emit.c      |  9 ++--
 .../drivers/freedreno/ir3/ir3_compiler_nir.c       | 54 +++++++++++++++-------
 src/gallium/drivers/freedreno/ir3/ir3_cp.c         |  2 +-
 src/gallium/drivers/freedreno/ir3/ir3_shader.c     | 16 +++----
 src/gallium/drivers/freedreno/ir3/ir3_shader.h     | 38 +++++++--------
 6 files changed, 75 insertions(+), 53 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
index 95e6d26..6c3458a 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -96,16 +96,16 @@ static void
 fd3_emit_const_bo(struct fd_ringbuffer *ring, enum shader_t type, boolean write,
 		uint32_t regid, uint32_t num, struct pipe_resource **prscs, uint32_t *offsets)
 {
+	uint32_t anum = align(num, 4);
 	uint32_t i;
 
 	debug_assert((regid % 4) == 0);
-	debug_assert((num % 4) == 0);
 
-	OUT_PKT3(ring, CP_LOAD_STATE, 2 + num);
+	OUT_PKT3(ring, CP_LOAD_STATE, 2 + anum);
 	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/2) |
 			CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
 			CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) |
-			CP_LOAD_STATE_0_NUM_UNIT(num/2));
+			CP_LOAD_STATE_0_NUM_UNIT(anum/2));
 	OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
 			CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS));
 
@@ -120,6 +120,9 @@ fd3_emit_const_bo(struct fd_ringbuffer *ring, enum shader_t type, boolean write,
 			OUT_RING(ring, 0xbad00000 | (i << 16));
 		}
 	}
+
+	for (; i < anum; i++)
+		OUT_RING(ring, 0xffffffff);
 }
 
 #define VERT_TEX_OFF    0
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index 9231823..2f3e0a6 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -96,16 +96,16 @@ static void
 fd4_emit_const_bo(struct fd_ringbuffer *ring, enum shader_t type, boolean write,
 		uint32_t regid, uint32_t num, struct pipe_resource **prscs, uint32_t *offsets)
 {
+	uint32_t anum = align(num, 4);
 	uint32_t i;
 
 	debug_assert((regid % 4) == 0);
-	debug_assert((num % 4) == 0);
 
-	OUT_PKT3(ring, CP_LOAD_STATE, 2 + num);
+	OUT_PKT3(ring, CP_LOAD_STATE, 2 + anum);
 	OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/4) |
 			CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
 			CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) |
-			CP_LOAD_STATE_0_NUM_UNIT(num/4));
+			CP_LOAD_STATE_0_NUM_UNIT(anum/4));
 	OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
 			CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS));
 
@@ -120,6 +120,9 @@ fd4_emit_const_bo(struct fd_ringbuffer *ring, enum shader_t type, boolean write,
 			OUT_RING(ring, 0xbad00000 | (i << 16));
 		}
 	}
+
+	for (; i < anum; i++)
+		OUT_RING(ring, 0xffffffff);
 }
 
 static void
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index ac6840c..e0fc2aa 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -119,6 +119,11 @@ struct ir3_compile {
 	bool error;
 };
 
+/* gpu pointer size in units of 32bit registers/slots */
+static unsigned pointer_size(struct ir3_compile *ctx)
+{
+	return (ctx->compiler->gpu_id >= 500) ? 2 : 1;
+}
 
 static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
 static struct ir3_block * get_block(struct ir3_compile *ctx, nir_block *nblock);
@@ -181,31 +186,46 @@ compile_init(struct ir3_compiler *compiler,
 		nir_print_shader(ctx->s, stdout);
 	}
 
-	so->first_driver_param = so->first_immediate = align(ctx->s->num_uniforms, 4);
+	so->num_uniforms = ctx->s->num_uniforms;
+	so->num_ubos = ctx->s->info->num_ubos;
 
-	/* Layout of constant registers:
+	/* Layout of constant registers, each section aligned to vec4.  Note
+	 * that pointer size (ubo, etc) changes depending on generation.
 	 *
-	 *    num_uniform * vec4  -  user consts
-	 *    4 * vec4            -  UBO addresses
+	 *    user consts
+	 *    UBO addresses
 	 *    if (vertex shader) {
-	 *        N * vec4        -  driver params (IR3_DP_*)
-	 *        1 * vec4        -  stream-out addresses
+	 *        driver params (IR3_DP_*)
+	 *        if (stream_output.num_outputs > 0)
+	 *           stream-out addresses
 	 *    }
+	 *    immediates
 	 *
-	 * TODO this could be made more dynamic, to at least skip sections
-	 * that we don't need..
+	 * Immediates go last mostly because they are inserted in the CP pass
+	 * after the nir -> ir3 frontend.
 	 */
+	unsigned constoff = align(ctx->s->num_uniforms, 4);
+	unsigned ptrsz = pointer_size(ctx);
 
-	/* reserve 4 (vec4) slots for ubo base addresses: */
-	so->first_immediate += 4;
+	memset(&so->constbase, ~0, sizeof(so->constbase));
+
+	if (so->num_ubos > 0) {
+		so->constbase.ubo = constoff;
+		constoff += align(ctx->s->info->num_ubos * ptrsz, 4) / 4;
+	}
 
 	if (so->type == SHADER_VERTEX) {
-		/* driver params (see ir3_driver_param): */
-		so->first_immediate += IR3_DP_COUNT/4;  /* convert to vec4 */
-		/* one (vec4) slot for stream-output base addresses: */
-		so->first_immediate++;
+		so->constbase.driver_param = constoff;
+		constoff += align(IR3_DP_COUNT, 4) / 4;
+
+		if (so->shader->stream_output.num_outputs > 0) {
+			so->constbase.tfbo = constoff;
+			constoff += align(PIPE_MAX_SO_BUFFERS * ptrsz, 4) / 4;
+		}
 	}
 
+	so->constbase.immediate = constoff;
+
 	return ctx;
 }
 
@@ -576,7 +596,7 @@ create_driver_param(struct ir3_compile *ctx, enum ir3_driver_param dp)
 {
 	/* first four vec4 sysval's reserved for UBOs: */
 	/* NOTE: dp is in scalar, but there can be >4 dp components: */
-	unsigned n = ctx->so->first_driver_param + IR3_DRIVER_PARAM_OFF;
+	unsigned n = ctx->so->constbase.driver_param;
 	unsigned r = regid(n + dp / 4, dp % 4);
 	return create_uniform(ctx, r);
 }
@@ -975,7 +995,7 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 	struct ir3_instruction *addr, *src0, *src1;
 	nir_const_value *const_offset;
 	/* UBO addresses are the first driver params: */
-	unsigned ubo = regid(ctx->so->first_driver_param + IR3_UBOS_OFF, 0);
+	unsigned ubo = regid(ctx->so->constbase.ubo, 0);
 	int off = 0;
 
 	/* First src is ubo index, which could either be an immed or not: */
@@ -1905,7 +1925,7 @@ emit_stream_out(struct ir3_compile *ctx)
 		unsigned stride = strmout->stride[i];
 		struct ir3_instruction *base, *off;
 
-		base = create_uniform(ctx, regid(v->first_driver_param + IR3_TFBOS_OFF, i));
+		base = create_uniform(ctx, regid(v->constbase.tfbo, i));
 
 		/* 24-bit should be enough: */
 		off = ir3_MUL_U(ctx->block, vtxcnt, 0,
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index 57c37e2..71e0261 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -296,7 +296,7 @@ lower_immed(struct ir3_cp_ctx *ctx, struct ir3_register *reg, unsigned new_flags
 	new_flags &= ~IR3_REG_IMMED;
 	new_flags |= IR3_REG_CONST;
 	reg->flags = new_flags;
-	reg->num = i + (4 * ctx->so->first_immediate);
+	reg->num = i + (4 * ctx->so->constbase.immediate);
 
 	return reg;
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index 8920225..4da7246 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -366,7 +366,7 @@ ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin)
 	}
 
 	for (i = 0; i < so->immediates_count; i++) {
-		debug_printf("@const(c%d.x)\t", so->first_immediate + i);
+		debug_printf("@const(c%d.x)\t", so->constbase.immediate + i);
 		debug_printf("0x%08x, 0x%08x, 0x%08x, 0x%08x\n",
 				so->immediates[i].val[0],
 				so->immediates[i].val[1],
@@ -503,7 +503,7 @@ emit_user_consts(struct fd_context *ctx, const struct ir3_shader_variant *v,
 		 * the user consts early to avoid HLSQ lockup caused by
 		 * writing too many consts
 		 */
-		uint32_t max_const = MIN2(v->first_driver_param, v->constlen);
+		uint32_t max_const = MIN2(v->num_uniforms, v->constlen);
 
 		// I expect that size should be a multiple of vec4's:
 		assert(size == align(size, 4));
@@ -527,9 +527,9 @@ static void
 emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v,
 		struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf)
 {
-	uint32_t offset = v->first_driver_param + IR3_UBOS_OFF;
+	uint32_t offset = v->constbase.ubo;
 	if (v->constlen > offset) {
-		uint32_t params = MIN2(4, v->constlen - offset) * 4;
+		uint32_t params = v->num_ubos;
 		uint32_t offsets[params];
 		struct pipe_resource *prscs[params];
 
@@ -557,7 +557,7 @@ emit_immediates(struct fd_context *ctx, const struct ir3_shader_variant *v,
 		struct fd_ringbuffer *ring)
 {
 	int size = v->immediates_count;
-	uint32_t base = v->first_immediate;
+	uint32_t base = v->constbase.immediate;
 
 	/* truncate size to avoid writing constants that shader
 	 * does not use:
@@ -581,7 +581,7 @@ emit_tfbos(struct fd_context *ctx, const struct ir3_shader_variant *v,
 		struct fd_ringbuffer *ring)
 {
 	/* streamout addresses after driver-params: */
-	uint32_t offset = v->first_driver_param + IR3_TFBOS_OFF;
+	uint32_t offset = v->constbase.tfbo;
 	if (v->constlen > offset) {
 		struct fd_streamout_stateobj *so = &ctx->streamout;
 		struct pipe_stream_output_info *info = &v->shader->stream_output;
@@ -680,8 +680,8 @@ ir3_emit_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
 	/* emit driver params every time: */
 	/* TODO skip emit if shader doesn't use driver params to avoid WFI.. */
 	if (info && (v->type == SHADER_VERTEX)) {
-		uint32_t offset = v->first_driver_param + IR3_DRIVER_PARAM_OFF;
-		if (v->constlen >= offset) {
+		uint32_t offset = v->constbase.driver_param;
+		if (v->constlen > offset) {
 			uint32_t vertex_params[IR3_DP_COUNT] = {
 				[IR3_DP_VTXID_BASE] = info->indexed ?
 						info->index_bias : info->start,
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index c603168..7a0ff98 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -47,22 +47,6 @@ enum ir3_driver_param {
 	IR3_DP_COUNT      = 36   /* must be aligned to vec4 */
 };
 
-/* Layout of constant registers:
- *
- *    num_uniform * vec4  -  user consts
- *    4 * vec4            -  UBO addresses
- *    if (vertex shader) {
- *        N * vec4        -  driver params (IR3_DP_*)
- *        1 * vec4        -  stream-out addresses
- *    }
- *
- * TODO this could be made more dynamic, to at least skip sections
- * that we don't need..
- */
-#define IR3_UBOS_OFF         0  /* UBOs after user consts */
-#define IR3_DRIVER_PARAM_OFF 4  /* driver params after UBOs */
-#define IR3_TFBOS_OFF       (IR3_DRIVER_PARAM_OFF + IR3_DP_COUNT/4)
-
 /* Configuration key used to identify a shader variant.. different
  * shader variants can be used to implement features not supported
  * in hw (two sided color), binning-pass vertex shader, etc.
@@ -143,6 +127,12 @@ struct ir3_shader_variant {
 	 */
 	unsigned constlen;
 
+	/* number of uniforms (in vec4), not including built-in compiler
+	 * constants, etc.
+	 */
+	unsigned num_uniforms;
+	unsigned num_ubos;
+
 	/* About Linkage:
 	 *   + Let the frag shader determine the position/compmask for the
 	 *     varyings, since it is the place where we know if the varying
@@ -211,12 +201,18 @@ struct ir3_shader_variant {
 	/* do we have kill instructions: */
 	bool has_kill;
 
-	/* const reg # of first immediate, ie. 1 == c1
-	 * (not regid, because TGSI thinks in terms of vec4 registers,
-	 * not scalar registers)
+	/* Layout of constant registers, each section (in vec4). Pointer size
+	 * is 32b (a3xx, a4xx), or 64b (a5xx+), which effects the size of the
+	 * UBO and stream-out consts.
 	 */
-	unsigned first_driver_param;
-	unsigned first_immediate;
+	struct {
+		/* user const start at zero */
+		unsigned ubo;
+		unsigned driver_param;
+		unsigned tfbo;
+		unsigned immediate;
+	} constbase;
+
 	unsigned immediates_count;
 	struct {
 		uint32_t val[4];




More information about the mesa-commit mailing list