Mesa (master): freedreno/a3xx: binning-pass vertex shader variant

Rob Clark robclark at kemper.freedesktop.org
Sun Mar 2 16:32:32 UTC 2014


Module: Mesa
Branch: master
Commit: cb540c21f24720436356ab34f15e440a58e3e55d
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=cb540c21f24720436356ab34f15e440a58e3e55d

Author: Rob Clark <robclark at freedesktop.org>
Date:   Sat Feb 22 10:47:27 2014 -0500

freedreno/a3xx: binning-pass vertex shader variant

Now that we have the infrastructure for shader variants, add support to
generate an optimized shader for hw binning pass (with varyings/outputs
other than position/pointsize removed).  This exposes the possibility
that the shader uses fewer constants than what is bound, so we have to
take care to not emit consts beyond what the shader uses, lest we
provoke the wrath of the HLSQ lockup!

Signed-off-by: Rob Clark <robclark at freedesktop.org>

---

 src/gallium/drivers/freedreno/a3xx/fd3_compiler.c |   25 ++++++++++++-
 src/gallium/drivers/freedreno/a3xx/fd3_emit.c     |   41 +++++++++++++++++----
 src/gallium/drivers/freedreno/a3xx/fd3_program.c  |    9 -----
 3 files changed, 57 insertions(+), 18 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
index 54b3626..905af54 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
@@ -2214,7 +2214,7 @@ fd3_compile_shader(struct fd3_shader_variant *so,
 {
 	struct fd3_compile_context ctx;
 	struct ir3_block *block;
-	unsigned i, actual_in;
+	unsigned i, j, actual_in;
 	int ret = 0;
 
 	assert(!so->ir);
@@ -2232,6 +2232,29 @@ fd3_compile_shader(struct fd3_shader_variant *so,
 
 	block = ctx.block;
 
+	/* at this point, for binning pass, throw away unneeded outputs: */
+	if (key.binning_pass) {
+		for (i = 0, j = 0; i < so->outputs_count; i++) {
+			unsigned name = sem2name(so->outputs[i].semantic);
+			unsigned idx = sem2idx(so->outputs[i].semantic);
+
+			/* keep only index-0 position/psize, compacting the arrays in place */
+			if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) ||
+					(name == TGSI_SEMANTIC_PSIZE))) {
+				if (i != j) {
+					so->outputs[j] = so->outputs[i];
+					block->outputs[(j*4)+0] = block->outputs[(i*4)+0];
+					block->outputs[(j*4)+1] = block->outputs[(i*4)+1];
+					block->outputs[(j*4)+2] = block->outputs[(i*4)+2];
+					block->outputs[(j*4)+3] = block->outputs[(i*4)+3];
+				}
+				j++;
+			}
+		}
+		so->outputs_count = j;
+		block->noutputs = j * 4;
+	}
+
 	/* at this point, we want the kill's in the outputs array too,
 	 * so that they get scheduled (since they have no dst).. we've
 	 * already ensured that the array is big enough in push_block():
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
index 5bfd976..50271fa 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -90,6 +90,7 @@ emit_constants(struct fd_ringbuffer *ring,
 		struct fd3_shader_variant *shader)
 {
 	uint32_t enabled_mask = constbuf->enabled_mask;
+	uint32_t first_immediate;
 	uint32_t base = 0;
 	unsigned i;
 
@@ -97,6 +98,13 @@ emit_constants(struct fd_ringbuffer *ring,
 	// they are clobbered by a clear, gmem2mem, or mem2gmem..
 	constbuf->dirty_mask = enabled_mask;
 
+	/* in particular, with the binning shader and unneeded consts no
+	 * longer referenced, we could end up w/ constlen that is smaller
+	 * than first_immediate.  In that case truncate the user consts
+	 * early to avoid HLSQ lockup caused by writing too many consts
+	 */
+	first_immediate = MIN2(shader->first_immediate, shader->constlen);
+
 	/* emit user constants: */
 	while (enabled_mask) {
 		unsigned index = ffs(enabled_mask) - 1;
@@ -109,10 +117,14 @@ emit_constants(struct fd_ringbuffer *ring,
 		/* gallium could leave const buffers bound above what the
 		 * current shader uses.. don't let that confuse us.
 		 */
-		if (base >= (4 * shader->first_immediate))
+		if (base >= (4 * first_immediate))
 			break;
 
 		if (constbuf->dirty_mask & (1 << index)) {
+			/* and even if the start of the const buffer is before
+			 * first_immediate, the end may not be:
+			 */
+			size = MIN2(size, (4 * first_immediate) - base);
 			fd3_emit_constant(ring, sb, base,
 					cb->buffer_offset, size,
 					cb->user_buffer, cb->buffer);
@@ -332,6 +344,15 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring,
 			j++;
 		}
 	}
+
+	OUT_PKT0(ring, REG_A3XX_VFD_CONTROL_0, 2);
+	OUT_RING(ring, A3XX_VFD_CONTROL_0_TOTALATTRTOVS(vp->total_in) |
+			A3XX_VFD_CONTROL_0_PACKETSIZE(2) |
+			A3XX_VFD_CONTROL_0_STRMDECINSTRCNT(j) |
+			A3XX_VFD_CONTROL_0_STRMFETCHINSTRCNT(j));
+	OUT_RING(ring, A3XX_VFD_CONTROL_1_MAXSTORAGE(1) | // XXX
+			A3XX_VFD_CONTROL_1_REGID4VTX(regid(63,0)) |
+			A3XX_VFD_CONTROL_1_REGID4INST(regid(63,0)));
 }
 
 void
@@ -429,11 +450,13 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	if (dirty & (FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) {
 		struct fd3_rasterizer_stateobj *rasterizer =
 				fd3_rasterizer_stateobj(ctx->rasterizer);
-		uint32_t stride_in_vpc;
+		uint32_t stride_in_vpc = 0;
 
-		stride_in_vpc = align(fp->total_in, 4) / 4;
-		if (stride_in_vpc > 0)
-			stride_in_vpc = MAX2(stride_in_vpc, 2);
+		if (!key.binning_pass) {
+			stride_in_vpc = align(fp->total_in, 4) / 4;
+			if (stride_in_vpc > 0)
+				stride_in_vpc = MAX2(stride_in_vpc, 2);
+		}
 
 		OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1);
 		OUT_RING(ring, rasterizer->pc_prim_vtx_cntl |
@@ -480,9 +503,11 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		emit_constants(ring,  SB_VERT_SHADER,
 				&ctx->constbuf[PIPE_SHADER_VERTEX],
 				(prog->dirty & FD_SHADER_DIRTY_VP) ? vp : NULL);
-		emit_constants(ring, SB_FRAG_SHADER,
-				&ctx->constbuf[PIPE_SHADER_FRAGMENT],
-				(prog->dirty & FD_SHADER_DIRTY_FP) ? fp : NULL);
+		if (!key.binning_pass) {
+			emit_constants(ring, SB_FRAG_SHADER,
+					&ctx->constbuf[PIPE_SHADER_FRAGMENT],
+					(prog->dirty & FD_SHADER_DIRTY_FP) ? fp : NULL);
+		}
 	}
 
 	if ((dirty & FD_DIRTY_BLEND) && ctx->blend) {
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
index 01502ce..6fc39a9 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
@@ -532,15 +532,6 @@ fd3_program_emit(struct fd_ringbuffer *ring,
 		OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1);
 		OUT_RING(ring, 0x00000000);        /* VFD_PERFCOUNTER0_SELECT */
 	}
-
-	OUT_PKT0(ring, REG_A3XX_VFD_CONTROL_0, 2);
-	OUT_RING(ring, A3XX_VFD_CONTROL_0_TOTALATTRTOVS(vp->total_in) |
-			A3XX_VFD_CONTROL_0_PACKETSIZE(2) |
-			A3XX_VFD_CONTROL_0_STRMDECINSTRCNT(vp->inputs_count) |
-			A3XX_VFD_CONTROL_0_STRMFETCHINSTRCNT(vp->inputs_count));
-	OUT_RING(ring, A3XX_VFD_CONTROL_1_MAXSTORAGE(1) | // XXX
-			A3XX_VFD_CONTROL_1_REGID4VTX(regid(63,0)) |
-			A3XX_VFD_CONTROL_1_REGID4INST(regid(63,0)));
 }
 
 /* hack.. until we figure out how to deal w/ vpsrepl properly.. */




More information about the mesa-commit mailing list