Mesa (master): freedreno/a3xx: handle large shader program sizes

Rob Clark robclark at kemper.freedesktop.org
Thu Oct 2 18:09:03 UTC 2014


Module: Mesa
Branch: master
Commit: 7309c6126f2987d703b2f30b3cb56edf97d437d3
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=7309c6126f2987d703b2f30b3cb56edf97d437d3

Author: Rob Clark <robclark at freedesktop.org>
Date:   Wed Oct  1 15:26:26 2014 -0400

freedreno/a3xx: handle large shader program sizes

Above a certain size limit, use CACHE mode instead of BUFFER mode.  This
should solve GPU hangs with large shader programs.

Signed-off-by: Rob Clark <robclark at freedesktop.org>

---

 src/gallium/drivers/freedreno/a3xx/fd3_program.c |   74 ++++++++++++++++++----
 1 file changed, 63 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
index d7fe42e..d2a3248 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
@@ -179,6 +179,8 @@ fd3_program_emit(struct fd_ringbuffer *ring,
 {
 	const struct ir3_shader_variant *vp, *fp;
 	const struct ir3_info *vsi, *fsi;
+	enum a3xx_instrbuffermode fpbuffer, vpbuffer;
+	uint32_t fpbuffersz, vpbuffersz, fsoff;
 	uint32_t pos_regid, posz_regid, psize_regid, color_regid;
 	int i, j, k;
 
@@ -195,6 +197,46 @@ fd3_program_emit(struct fd_ringbuffer *ring,
 	vsi = &vp->info;
 	fsi = &fp->info;
 
+	fpbuffer = BUFFER;
+	vpbuffer = BUFFER;
+	fpbuffersz = fp->instrlen;
+	vpbuffersz = vp->instrlen;
+
+	/*
+	 * Decide whether to use BUFFER or CACHE mode for VS and FS.  It
+	 * appears that 256 is the hard limit, but when the combined size
+	 * exceeds 128 the blob will try to keep FS in BUFFER mode and
+	 * switch to CACHE for VS until VS is too large.  The blob seems
+	 * to switch FS out of BUFFER mode at slightly under 128.  The
+	 * exact decision tree is a bit fuzzy, so use slightly conservative
+	 * limits.
+	 *
+	 * TODO check if these thresholds for BUFFER vs CACHE mode are the
+	 *      same for all a3xx or whether we need to consider the gpuid
+	 */
+
+	if ((fpbuffersz + vpbuffersz) > 128) {
+		if (fpbuffersz < 112) {
+			/* FP:BUFFER   VP:CACHE  */
+			vpbuffer = CACHE;
+			vpbuffersz = 256 - fpbuffersz;
+		} else if (vpbuffersz < 112) {
+			/* FP:CACHE    VP:BUFFER */
+			fpbuffer = CACHE;
+			fpbuffersz = 256 - vpbuffersz;
+		} else {
+			/* FP:CACHE    VP:CACHE  */
+			vpbuffer = fpbuffer = CACHE;
+			vpbuffersz = fpbuffersz = 192;
+		}
+	}
+
+	if (fpbuffer == BUFFER) {
+		fsoff = 128 - fpbuffersz;
+	} else {
+		fsoff = 256 - fpbuffersz;
+	}
+
 	pos_regid = find_output_regid(vp,
 		ir3_semantic_name(TGSI_SEMANTIC_POSITION, 0));
 	posz_regid = find_output_regid(fp,
@@ -223,10 +265,10 @@ fd3_program_emit(struct fd_ringbuffer *ring,
 	OUT_RING(ring, A3XX_HLSQ_CONTROL_3_REG_REGID(fp->pos_regid));
 	OUT_RING(ring, A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(vp->constlen) |
 			A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET(0) |
-			A3XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(vp->instrlen));
+			A3XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(vpbuffersz));
 	OUT_RING(ring, A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH(fp->constlen) |
 			A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET(128) |
-			A3XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(fp->instrlen));
+			A3XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(fpbuffersz));
 
 	OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1);
 	OUT_RING(ring, A3XX_SP_SP_CTRL_REG_CONSTMODE(0) |
@@ -239,15 +281,15 @@ fd3_program_emit(struct fd_ringbuffer *ring,
 
 	OUT_PKT0(ring, REG_A3XX_SP_VS_CTRL_REG0, 3);
 	OUT_RING(ring, A3XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) |
-			A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) |
-			A3XX_SP_VS_CTRL_REG0_CACHEINVALID |
+			A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(vpbuffer) |
+			COND(vpbuffer == CACHE, A3XX_SP_VS_CTRL_REG0_CACHEINVALID) |
 			A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vsi->max_half_reg + 1) |
 			A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vsi->max_reg + 1) |
 			A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) |
 			A3XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS) |
 			A3XX_SP_VS_CTRL_REG0_SUPERTHREADMODE |
 			COND(vp->has_samp, A3XX_SP_VS_CTRL_REG0_PIXLODENABLE) |
-			A3XX_SP_VS_CTRL_REG0_LENGTH(vp->instrlen));
+			A3XX_SP_VS_CTRL_REG0_LENGTH(vpbuffersz));
 	OUT_RING(ring, A3XX_SP_VS_CTRL_REG1_CONSTLENGTH(vp->constlen) |
 			A3XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(vp->total_in) |
 			A3XX_SP_VS_CTRL_REG1_CONSTFOOTPRINT(MAX2(vp->constlen + 1, 0)));
@@ -311,28 +353,36 @@ fd3_program_emit(struct fd_ringbuffer *ring,
 		OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
 				A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER));
 		OUT_RING(ring, 0x00000000);
+
+		OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 1);
+		OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) |
+				A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0));
 	} else {
 		OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1);
 		OUT_RING(ring, A3XX_SP_FS_LENGTH_REG_SHADERLENGTH(fp->instrlen));
 
 		OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2);
 		OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
-				A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) |
-				A3XX_SP_FS_CTRL_REG0_CACHEINVALID |
+				A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(fpbuffer) |
+				COND(fpbuffer == CACHE, A3XX_SP_FS_CTRL_REG0_CACHEINVALID) |
 				A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) |
 				A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) |
 				A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
 				A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) |
 				A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE |
 				COND(fp->has_samp > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) |
-				A3XX_SP_FS_CTRL_REG0_LENGTH(fp->instrlen));
+				A3XX_SP_FS_CTRL_REG0_LENGTH(fpbuffersz));
 		OUT_RING(ring, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fp->constlen) |
 				A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(fp->total_in) |
 				A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(MAX2(fp->constlen + 1, 0)) |
 				A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(63));
+
+		/* NOTE: I believe VS.CONSTLEN should be <= FS.CONSTOBJOFFSET */
+		debug_assert(vp->constlen <= 128);
+
 		OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 2);
 		OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) |
-				A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0));
+				A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(fsoff));
 		OUT_RELOC(ring, fp->bo, 0, 0, 0);  /* SP_FS_OBJ_START_REG */
 	}
 
@@ -411,13 +461,15 @@ fd3_program_emit(struct fd_ringbuffer *ring,
 	OUT_RING(ring, A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) |
 			A3XX_VFD_VS_THREADING_THRESHOLD_REGID_VTXCNT(252));
 
-	emit_shader(ring, vp);
+	if (vpbuffer == BUFFER)
+		emit_shader(ring, vp);
 
 	OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1);
 	OUT_RING(ring, 0x00000000);        /* VFD_PERFCOUNTER0_SELECT */
 
 	if (!key.binning_pass) {
-		emit_shader(ring, fp);
+		if (fpbuffer == BUFFER)
+			emit_shader(ring, fp);
 
 		OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1);
 		OUT_RING(ring, 0x00000000);        /* VFD_PERFCOUNTER0_SELECT */




More information about the mesa-commit mailing list