Mesa (master): freedreno/ir3: assign varying locations later

Thu Nov 26 17:49:51 UTC 2015

Module: Mesa
Branch: master
Commit: 57fc0dd8d5610a0a25cece53b172b0c992421db0
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=57fc0dd8d5610a0a25cece53b172b0c992421db0

Author: Rob Clark <robclark at freedesktop.org>
Date:   Thu Nov 26 12:26:53 2015 -0500

freedreno/ir3: assign varying locations later

Rather than assigning inloc up front, when we don't yet know whether the
input will end up unused, assign it as the last step before the legalize pass.

Also, detect when inputs are unused (since for frag shaders we can't
rely on them being removed from ir->inputs[]).  This only makes sense
if we also dynamically assign the inlocs, since otherwise we could end up
telling the hw the wrong # of varyings (we currently assume that
the # of varyings and max-inloc are related).

Signed-off-by: Rob Clark <robclark at freedesktop.org>

---

 src/gallium/drivers/freedreno/a3xx/fd3_program.c   |    2 +-
 src/gallium/drivers/freedreno/a4xx/fd4_program.c   |    2 +-
 .../drivers/freedreno/ir3/ir3_compiler_nir.c       |   52 ++++++++++----------
 src/gallium/drivers/freedreno/ir3/ir3_shader.h     |   10 +++-
 4 files changed, 37 insertions(+), 29 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
index 01daa0f..7361516 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
@@ -264,7 +264,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
 			A3XX_SP_VS_CTRL_REG1_CONSTFOOTPRINT(MAX2(vp->constlen + 1, 0)));
 	OUT_RING(ring, A3XX_SP_VS_PARAM_REG_POSREGID(pos_regid) |
 			A3XX_SP_VS_PARAM_REG_PSIZEREGID(psize_regid) |
-			A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(align(fp->total_in, 4) / 4));
+			A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(fp->varying_in));
 
 	for (i = 0, j = -1; (i < 8) && (j < (int)fp->inputs_count); i++) {
 		uint32_t reg = 0;
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
index ffa53f5..0e861b9 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
@@ -326,7 +326,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
 			A4XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(s[VS].v->total_in));
 	OUT_RING(ring, A4XX_SP_VS_PARAM_REG_POSREGID(pos_regid) |
 			A4XX_SP_VS_PARAM_REG_PSIZEREGID(psize_regid) |
-			A4XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(align(s[FS].v->total_in, 4) / 4));
+			A4XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(s[FS].v->varying_in));
 
 	for (i = 0, j = -1; (i < 16) && (j < (int)s[FS].v->inputs_count); i++) {
 		uint32_t reg = 0;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 156bb0b..8617704 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -96,9 +96,6 @@ struct ir3_compile {
 	 */
 	struct hash_table *block_ht;
 
-	/* for calculating input/output positions/linkages: */
-	unsigned next_inloc;
-
 	/* a4xx (at least patchlevel 0) cannot seem to flat-interpolate
 	 * so we need to use ldlv.u32 to load the varying directly:
 	 */
@@ -235,7 +232,6 @@ compile_init(struct ir3_compiler *compiler,
 	ctx->compiler = compiler;
 	ctx->ir = so->ir;
 	ctx->so = so;
-	ctx->next_inloc = 8;
 	ctx->def_ht = _mesa_hash_table_create(ctx,
 			_mesa_hash_pointer, _mesa_key_pointer_equal);
 	ctx->var_ht = _mesa_hash_table_create(ctx,
@@ -722,11 +718,12 @@ create_input(struct ir3_block *block, unsigned n)
 }
 
 static struct ir3_instruction *
-create_frag_input(struct ir3_compile *ctx, unsigned n, bool use_ldlv)
+create_frag_input(struct ir3_compile *ctx, bool use_ldlv)
 {
 	struct ir3_block *block = ctx->block;
 	struct ir3_instruction *instr;
-	struct ir3_instruction *inloc = create_immed(block, n);
+	/* actual inloc is assigned and fixed up later: */
+	struct ir3_instruction *inloc = create_immed(block, 0);
 
 	if (use_ldlv) {
 		instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
@@ -2185,8 +2182,6 @@ setup_input(struct ir3_compile *ctx, nir_variable *in)
 
 	so->inputs[n].slot = slot;
 	so->inputs[n].compmask = (1 << ncomp) - 1;
-	so->inputs[n].inloc = ctx->next_inloc;
-	so->inputs[n].interpolate = INTERP_QUALIFIER_NONE;
 	so->inputs_count = MAX2(so->inputs_count, n + 1);
 	so->inputs[n].interpolate = in->data.interpolation;
 
@@ -2231,8 +2226,7 @@ setup_input(struct ir3_compile *ctx, nir_variable *in)
 
 				so->inputs[n].bary = true;
 
-				instr = create_frag_input(ctx,
-						so->inputs[n].inloc + i - 8, use_ldlv);
+				instr = create_frag_input(ctx, use_ldlv);
 			}
 
 			ctx->ir->inputs[idx] = instr;
@@ -2247,7 +2241,6 @@ setup_input(struct ir3_compile *ctx, nir_variable *in)
 	}
 
 	if (so->inputs[n].bary || (ctx->so->type == SHADER_VERTEX)) {
-		ctx->next_inloc += ncomp;
 		so->total_in += ncomp;
 	}
 }
@@ -2471,7 +2464,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 	struct ir3_compile *ctx;
 	struct ir3 *ir;
 	struct ir3_instruction **inputs;
-	unsigned i, j, actual_in;
+	unsigned i, j, actual_in, inloc;
 	int ret = 0, max_bary;
 
 	assert(!so->ir);
@@ -2591,13 +2584,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 		ir3_print(ir);
 	}
 
-	ir3_legalize(ir, &so->has_samp, &max_bary);
-
-	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
-		printf("AFTER LEGALIZE:\n");
-		ir3_print(ir);
-	}
-
 	/* fixup input/outputs: */
 	for (i = 0; i < so->outputs_count; i++) {
 		so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num;
@@ -2611,32 +2597,46 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 
 	/* Note that some or all channels of an input may be unused: */
 	actual_in = 0;
+	inloc = 0;
 	for (i = 0; i < so->inputs_count; i++) {
 		unsigned j, regid = ~0, compmask = 0;
 		so->inputs[i].ncomp = 0;
+		so->inputs[i].inloc = inloc + 8;
 		for (j = 0; j < 4; j++) {
 			struct ir3_instruction *in = inputs[(i*4) + j];
-			if (in) {
+			if (in && !(in->flags & IR3_INSTR_UNUSED)) {
 				compmask |= (1 << j);
 				regid = in->regs[0]->num - j;
 				actual_in++;
 				so->inputs[i].ncomp++;
+				if ((so->type == SHADER_FRAGMENT) && so->inputs[i].bary) {
+					/* assign inloc: */
+					assert(in->regs[1]->flags & IR3_REG_IMMED);
+					in->regs[1]->iim_val = inloc++;
+				}
 			}
 		}
+		if ((so->type == SHADER_FRAGMENT) && compmask && so->inputs[i].bary)
+			so->varying_in++;
 		so->inputs[i].regid = regid;
 		so->inputs[i].compmask = compmask;
 	}
 
-	/* fragment shader always gets full vec4's even if it doesn't
-	 * fetch all components, but vertex shader we need to update
-	 * with the actual number of components fetch, otherwise thing
-	 * will hang due to mismaptch between VFD_DECODE's and
-	 * TOTALATTRTOVS
+	/* We need to do legalize after (for frag shader's) the "bary.f"
+	 * offsets (inloc) have been assigned.
 	 */
+	ir3_legalize(ir, &so->has_samp, &max_bary);
+
+	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
+		printf("AFTER LEGALIZE:\n");
+		ir3_print(ir);
+	}
+
+	/* Note that actual_in counts inputs that are not bary.f'd for FS: */
 	if (so->type == SHADER_VERTEX)
 		so->total_in = actual_in;
 	else
-		so->total_in = align(max_bary + 1, 4);
+		so->total_in = max_bary + 1;
 
 out:
 	if (ret) {
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index 5d1cccb..cf99a4c 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -195,7 +195,15 @@ struct ir3_shader_variant {
 		enum glsl_interp_qualifier interpolate;
 	} inputs[16 + 2];  /* +POSITION +FACE */
 
-	unsigned total_in;       /* sum of inputs (scalar) */
+	/* sum of input components (scalar).  For frag shaders, it only counts
+	 * the varying inputs:
+	 */
+	unsigned total_in;
+
+	/* For frag shaders, the total number of inputs (not scalar,
+	 * ie. SP_VS_PARAM_REG.TOTALVSOUTVAR)
+	 */
+	unsigned varying_in;
 
 	/* do we have one or more texture sample instructions: */
 	bool has_samp;