[Mesa-dev] [PATCH 05/11] freedreno: a2xx: implement a20x binning shader

Mon Oct 8 04:06:05 UTC 2018

Writes to the position export are now mapped to a temp reg, and code is
inserted at the end of vertex shaders to export the position and compute
the memory exports for hw binning on a20x. C64 is the offset in the
binning data, C65/C66 are the viewport parameters (translate/scale), and
C67+2*i/C68+2*i are the binning view parameters for view i. C3+i is the
binning data "pointer" - relative_addr=1 (in ir-a2xx) makes it not
interfere with the other shader constants.

Signed-off-by: Jonathan Marek <jonathan at marek.ca>
---
 .../drivers/freedreno/a2xx/fd2_compiler.c     | 72 +++++++++++++++++--
 src/gallium/drivers/freedreno/a2xx/fd2_emit.c | 14 ++++
 .../drivers/freedreno/a2xx/fd2_program.c      |  6 +-
 src/gallium/drivers/freedreno/a2xx/ir-a2xx.c  | 62 +++++++++++++---
 src/gallium/drivers/freedreno/a2xx/ir-a2xx.h  |  4 +-
 5 files changed, 141 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
index 54f0df54da..1ce3bc4f82 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
@@ -294,7 +294,7 @@ get_temp_gpr(struct fd2_compile_context *ctx, int idx)
 {
 	unsigned num = idx + ctx->num_regs[TGSI_FILE_INPUT];
 	if (ctx->type == PIPE_SHADER_VERTEX)
-		num++;
+		num += 2; /* vertex fetch input / position temp */
 	return num;
 }
 
@@ -310,12 +310,19 @@ add_dst_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
 		flags |= IR2_REG_EXPORT;
 		if (ctx->type == PIPE_SHADER_VERTEX) {
 			if (dst->Index == ctx->position) {
-				num = 62;
+				/* position is needed for fragcoord / a20x hw binning,
+				 * so write it to a temp reg instead of exporting it here
+				 */
+				num = ctx->num_regs[TGSI_FILE_INPUT] + 1;
+				flags &= ~IR2_REG_EXPORT;
 			} else if (dst->Index == ctx->psize) {
 				num = 63;
 			} else {
-				num = export_linkage(ctx,
-						ctx->output_export_idx[dst->Index]);
+				num = ctx->prog->export_linkage[
+						ctx->output_export_idx[dst->Index]];
+				/* not used by fragment shader - ir-a2xx will clean it up */
+				if (num == 0xff)
+					num = ctx->prog->num_exports;
 			}
 		} else {
 			num = dst->Index;
@@ -1091,6 +1098,60 @@ compile_instructions(struct fd2_compile_context *ctx)
 	}
 }
 
+static void
+compile_extra_exports(struct fd2_compile_context *ctx)
+{
+	struct ir2_shader *shader = ctx->so->ir;
+	struct ir2_instruction *instr;
+	int position = ctx->num_regs[TGSI_FILE_INPUT] + 1;
+	unsigned i;
+	/* XXX hacky way to get new temporaries */
+	unsigned tmp = shader->max_reg + 1;
+
+	instr = ir2_instr_create_alu_v(shader, MAXv);
+	ir2_reg_create(instr, position, "xyzw", 0);
+	ir2_reg_create(instr, position, "xyzw", 0);
+	ir2_dst_create(instr, 62, "xyzw", IR2_REG_EXPORT);
+
+	instr = ir2_instr_create_alu_s(shader, RECIP_CLAMP);
+	ir2_reg_create(instr, position, "xyzw", 0);
+	ir2_dst_create(instr, tmp, "___w", 0);
+
+	instr = ir2_instr_create_alu_v(shader, MULv);
+	ir2_reg_create(instr, position, "xyzw", 0);
+	ir2_reg_create(instr, tmp, "wwww", 0);
+	ir2_dst_create(instr, tmp + 1, "xyzw", 0);
+
+	/* these two instructions could be avoided with constant folding
+	 * but it would be hard to implement..
+	 */
+	instr = ir2_instr_create_alu_v(shader, MULADDv);
+	ir2_reg_create(instr, 66, "xyzw", IR2_REG_CONST);
+	ir2_reg_create(instr, tmp + 1, "xyzw", 0);
+	ir2_reg_create(instr, 65, "xyzw", IR2_REG_CONST);
+	ir2_dst_create(instr, tmp + 2, "xyzw", 0);
+
+	instr = ir2_instr_create_alu_v(shader, ADDv);
+	ir2_reg_create(instr, 64, "xxxx", IR2_REG_CONST);
+	ir2_reg_create(instr, 15, "xxxx", IR2_REG_INPUT);
+	ir2_dst_create(instr, tmp + 3, "x___", 0);
+
+	/* max of 8 is set in freedreno_screen; unneeded instrs are patched out */
+	for (i = 0; i < 8; i++) {
+		instr = ir2_instr_create_alu_v(shader, MULADDv);
+		ir2_reg_create(instr, 1, "wyww", IR2_REG_CONST);
+		ir2_reg_create(instr, tmp + 3, "xxxx", 0);
+		ir2_reg_create(instr, 3 + i, "xyzw", IR2_REG_CONST);
+		ir2_dst_create(instr, 32, "xyzw", IR2_REG_EXPORT);
+
+		instr = ir2_instr_create_alu_v(shader, MULADDv);
+		ir2_reg_create(instr, 68 + i * 2, "xyzw", IR2_REG_CONST);
+		ir2_reg_create(instr, tmp + 2, "xyzw", 0);
+		ir2_reg_create(instr, 67 + i * 2, "xyzw", IR2_REG_CONST);
+		ir2_dst_create(instr, 33, "xyzw", IR2_REG_EXPORT);
+	}
+}
+
 int
 fd2_compile_shader(struct fd_program_stateobj *prog,
 		struct fd2_shader_stateobj *so)
@@ -1114,6 +1175,9 @@ fd2_compile_shader(struct fd_program_stateobj *prog,
 
 	compile_instructions(&ctx);
 
+	if (ctx.type == PIPE_SHADER_VERTEX)
+		compile_extra_exports(&ctx);
+
 	compile_free(&ctx);
 
 	return 0;
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c
index ba136deb1d..17a75b595e 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c
@@ -277,6 +277,20 @@ fd2_emit_state(struct fd_context *ctx, const enum fd_dirty_3d_state dirty)
 				A2XX_PA_CL_VTE_CNTL_VPORT_Y_OFFSET_ENA |
 				A2XX_PA_CL_VTE_CNTL_VPORT_Z_SCALE_ENA |
 				A2XX_PA_CL_VTE_CNTL_VPORT_Z_OFFSET_ENA);
+
+		/* set C65/C66, for viewport calculation in shader */
+		OUT_PKT3(ring, CP_SET_CONSTANT, 9);
+		OUT_RING(ring, 0x00000184);
+
+		OUT_RING(ring, fui(ctx->viewport.translate[0]));
+		OUT_RING(ring, fui(ctx->viewport.translate[1]));
+		OUT_RING(ring, fui(ctx->viewport.translate[2]));
+		OUT_RING(ring, fui(0.0f));
+
+		OUT_RING(ring, fui(ctx->viewport.scale[0]));
+		OUT_RING(ring, fui(ctx->viewport.scale[1]));
+		OUT_RING(ring, fui(ctx->viewport.scale[2]));
+		OUT_RING(ring, fui(0.0f));
 	}
 
 	if (dirty & (FD_DIRTY_PROG | FD_DIRTY_VTXSTATE | FD_DIRTY_TEXSTATE)) {
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_program.c b/src/gallium/drivers/freedreno/a2xx/fd2_program.c
index 34622eaba0..74b3da5895 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_program.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_program.c
@@ -48,6 +48,8 @@ create_shader(enum shader_t type)
 	if (!so)
 		return NULL;
 	so->type = type;
+	so->info.num_exports = 1;
+	so->info.cf_export32 = -1;
 	return so;
 }
 
@@ -66,7 +68,7 @@ static struct fd2_shader_stateobj *
 assemble(struct fd2_shader_stateobj *so)
 {
 	free(so->bin);
-	so->bin = ir2_shader_assemble(so->ir, &so->info);
+	so->bin = ir2_shader_assemble(so->ir, &so->info, 0);
 	if (!so->bin)
 		goto fail;
 
@@ -103,6 +105,8 @@ compile(struct fd_program_stateobj *prog, struct fd2_shader_stateobj *so)
 	 */
 
 	so->info.sizedwords = 0;
+	/* prog->num_exports only applies to the vertex shader; others use 32 */
+	so->info.num_exports = so->type == SHADER_VERTEX ? prog->num_exports : 32;
 
 	return so;
 
diff --git a/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c b/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c
index 57625b2110..f8e056e424 100644
--- a/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c
+++ b/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c
@@ -177,10 +177,8 @@ static bool sets_pred(struct ir2_instruction *instr)
 		instr->alu_scalar.opc <= PRED_SET_RESTOREs;
 }
 
-
-
 void* ir2_shader_assemble(struct ir2_shader *shader,
-		struct ir2_shader_info *info)
+		struct ir2_shader_info *info, bool a20x_binning)
 {
 	/* NOTES
 	 * blob compiler seems to always puts PRED_* instrs in a CF by
@@ -201,9 +199,13 @@ void* ir2_shader_assemble(struct ir2_shader *shader,
 	 */
 
 	/* mask of exports that must be generated
-	 * used to avoid calculating ps exports with hw binning
-	*/
-	uint64_t export = ~0ull;
+	 * low 32 exports are regular exports, disabled for binning shader
+	 * exports 32/33 are only used in the a20x binning shader
+	 * exports 62/63 are position/size outputs
+	 */
+	uint64_t export = 3ull << 62 | (a20x_binning ?
+		3ull << 32 : ((1ull << info->num_exports) - 1));
+
 	/* bitmask of variables required for exports defined by "export" */
 	uint32_t export_mask[REG_MASK/32+1] = {};
 
@@ -253,11 +255,18 @@ void* ir2_shader_assemble(struct ir2_shader *shader,
 			}
 
 			/* update dependencies */
-			uint32_t *mask = (dst_reg.flags & IR2_REG_EXPORT) ?
-					export_mask : shader->reg[dst_reg.num].regmask;
+			uint32_t *mask = shader->reg[dst_reg.num].regmask;
+			if (dst_reg.flags & IR2_REG_EXPORT) {
+				mask = export_mask;
+				if (!(export & (1ull << dst_reg.num)))
+					continue;
+			}
+
 			mask_set(mask, reg, num);
-			if (sets_pred(instr))
+			if (sets_pred(instr)) {
 				mask_set(export_mask, reg, num);
+				mask_set(export_mask, reg, dst_reg.num);
+			}
 		}
 	}
 
@@ -272,6 +281,13 @@ void* ir2_shader_assemble(struct ir2_shader *shader,
 		shader->reg[reg_idx].reg = reg_idx;
 	info->max_reg = max_input;
 
+	/* R2 contains the vertex index */
+	if (a20x_binning) {
+		shader->reg[IR2_REG_VERT_ID].reg = 2;
+		info->max_reg = MAX2(info->max_reg, 2);
+		export_size = 0;
+	}
+
 	/* CF instr state */
 	instr_cf_exec_t exec = { .opc = EXEC };
 	instr_cf_alloc_t alloc = { .opc = ALLOC };
@@ -312,6 +328,14 @@ void* ir2_shader_assemble(struct ir2_shader *shader,
 				need_alloc = export_size >= 0;
 				export_size = -1;
 			} else if (num == 32 || num == 33) {
+				if (info->cf_export32 == -1) {
+					/* current idx (cf - cfs) + possible exec clause
+					 * + alloc clause + pixel parameter alloc,
+					 * as a dword offset rounded down to a pair of cfs
+					 */
+					info->cf_export32 =
+						(cf - cfs + !!exec.count + 1 + !export_size) / 2 * 3;
+				}
 				alloc.size = 0;
 				alloc.buffer_select = SQ_MEMORY;
 				need_alloc = num != 33;
@@ -343,7 +367,25 @@ void* ir2_shader_assemble(struct ir2_shader *shader,
 		}
 
 		if (need_alloc) {
+			/* need to emit the required pixel output before memory exports
+			 * since the shader will be cut off with patching
+			 */
+			if (alloc.buffer_select == SQ_MEMORY && !export_size) {
+				instr_cf_alloc_t alloc = {};
+				alloc.opc = ALLOC;
+				alloc.size = 0;
+				alloc.buffer_select = SQ_PARAMETER_PIXEL;
+				*cf++ = *(instr_cf_t*) &alloc;
+				export_size = -1;
+			}
+
 			*cf++ = *(instr_cf_t*) &alloc;
+
+			if (alloc.buffer_select == SQ_MEMORY && info->cf_export32 == -1) {
+				/* dword offset, rounded down to a pair of cfs */
+				info->cf_export32 = (cf - cfs) / 2 * 3;
+			}
+
 			need_alloc = false;
 		}
 
@@ -357,7 +399,6 @@ void* ir2_shader_assemble(struct ir2_shader *shader,
 		 exec.count += 1;
 	}
 
-
 	exec.opc = !export_size ? EXEC : EXEC_END;
 	*cf++ = *(instr_cf_t*) &exec;
 	exec.address += exec.count;
@@ -420,7 +461,6 @@ struct ir2_instruction * ir2_instr_create(struct ir2_shader *shader,
 	return instr;
 }
 
-
 /*
  * FETCH instructions:
  */
diff --git a/src/gallium/drivers/freedreno/a2xx/ir-a2xx.h b/src/gallium/drivers/freedreno/a2xx/ir-a2xx.h
index 27fe16f944..7ed31ce8e3 100644
--- a/src/gallium/drivers/freedreno/a2xx/ir-a2xx.h
+++ b/src/gallium/drivers/freedreno/a2xx/ir-a2xx.h
@@ -38,6 +38,8 @@ struct ir2_shader;
 struct ir2_shader_info {
 	uint16_t sizedwords;
 	int8_t   max_reg;   /* highest GPR # used by shader */
+	unsigned num_exports;
+	int cf_export32; /* dword offset to patch # of pipes */
 };
 
 struct ir2_register {
@@ -139,7 +141,7 @@ struct ir2_shader {
 struct ir2_shader * ir2_shader_create(void);
 void ir2_shader_destroy(struct ir2_shader *shader);
 void * ir2_shader_assemble(struct ir2_shader *shader,
-		struct ir2_shader_info *info);
+		struct ir2_shader_info *info, bool a20x_binning);
 
 struct ir2_instruction * ir2_instr_create(struct ir2_shader *shader,
 		int instr_type);
-- 
2.17.1