[Mesa-dev] [PATCH] r600: initial attempt at gl_HelperInvocation (v3)

Thu Feb 1 08:21:56 UTC 2018

From: Dave Airlie <airlied at redhat.com>

This passes the CTS and piglit tests.

This also disable sb for helper invocations until it doesn't
mess up the VPM flags.

Thanks to Ilia and Glenn for advice, and Roland for working
out the working evergreen path.
---
 src/gallium/drivers/r600/r600_asm.c    |   7 +-
 src/gallium/drivers/r600/r600_isa.c    |   1 +
 src/gallium/drivers/r600/r600_isa.h    |   5 +-
 src/gallium/drivers/r600/r600_shader.c | 113 +++++++++++++++++++++++++++++++++
 src/gallium/drivers/r600/r600_shader.h |   1 +
 src/gallium/drivers/r600/r600_sq.h     |   2 +
 6 files changed, 126 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 21d069d..ec2d34e 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -2099,9 +2099,12 @@ void r600_bytecode_disasm(struct r600_bytecode *bc)
 				fprintf(stderr, "%04d %08X %08X  %s ", id, bc->bytecode[id],
 						bc->bytecode[id + 1], cfop->name);
 				fprintf(stderr, "%d @%d ", cf->ndw / 4, cf->addr);
-				fprintf(stderr, "\n");
+				if (cf->vpm)
+					fprintf(stderr, "VPM ");
 				if (cf->end_of_program)
 					fprintf(stderr, "EOP ");
+				fprintf(stderr, "\n");
+
 			} else if (cfop->flags & CF_EXP) {
 				int o = 0;
 				const char *exp_type[] = {"PIXEL", "POS  ", "PARAM"};
@@ -2198,6 +2201,8 @@ void r600_bytecode_disasm(struct r600_bytecode *bc)
 					fprintf(stderr, "POP:%X ", cf->pop_count);
 				if (cf->count && (cfop->flags & CF_EMIT))
 					fprintf(stderr, "STREAM%d ", cf->count);
+				if (cf->vpm)
+					fprintf(stderr, "VPM ");
 				if (cf->end_of_program)
 					fprintf(stderr, "EOP ");
 				fprintf(stderr, "\n");
diff --git a/src/gallium/drivers/r600/r600_isa.c b/src/gallium/drivers/r600/r600_isa.c
index 2633cdc..611b370 100644
--- a/src/gallium/drivers/r600/r600_isa.c
+++ b/src/gallium/drivers/r600/r600_isa.c
@@ -506,6 +506,7 @@ static const struct cf_op_info cf_op_table[] = {
 		{"ALU_EXT",                       {   -1,   -1, 0x0C, 0x0C },  CF_CLAUSE | CF_ALU | CF_ALU_EXT  },
 		{"ALU_CONTINUE",                  { 0x0D, 0x0D, 0x0D,   -1 },  CF_CLAUSE | CF_ALU  },
 		{"ALU_BREAK",                     { 0x0E, 0x0E, 0x0E,   -1 },  CF_CLAUSE | CF_ALU  },
+		{"ALU_VALID_PIXEL_MODE",          {   -1,   -1,   -1, 0x0E },  CF_CLAUSE | CF_ALU  },
 		{"ALU_ELSE_AFTER",                { 0x0F, 0x0F, 0x0F, 0x0F },  CF_CLAUSE | CF_ALU  },
 		{"CF_NATIVE",                     { 0x00, 0x00, 0x00, 0x00 },  0  }
 };
diff --git a/src/gallium/drivers/r600/r600_isa.h b/src/gallium/drivers/r600/r600_isa.h
index f6e2697..fcaf1f7 100644
--- a/src/gallium/drivers/r600/r600_isa.h
+++ b/src/gallium/drivers/r600/r600_isa.h
@@ -646,10 +646,11 @@ struct cf_op_info
 #define CF_OP_ALU_EXT                      84
 #define CF_OP_ALU_CONTINUE                 85
 #define CF_OP_ALU_BREAK                    86
-#define CF_OP_ALU_ELSE_AFTER               87
+#define CF_OP_ALU_VALID_PIXEL_MODE         87
+#define CF_OP_ALU_ELSE_AFTER               88
 
 /* CF_NATIVE means that r600_bytecode_cf contains pre-encoded native data */
-#define CF_NATIVE                          88
+#define CF_NATIVE                          89
 
 enum r600_chip_class {
 	ISA_CC_R600,
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index a462691..9388db9 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -197,6 +197,7 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
 
 	use_sb &= !shader->shader.uses_atomics;
 	use_sb &= !shader->shader.uses_images;
+	use_sb &= !shader->shader.uses_helper_invocation;
 
 	/* Check if the bytecode has already been built. */
 	if (!shader->shader.bc.bytecode) {
@@ -346,6 +347,7 @@ struct r600_shader_ctx {
 	boolean                 clip_vertex_write;
 	unsigned                cv_output;
 	unsigned		edgeflag_output;
+	int					helper_invoc_reg;
 	int                                     cs_block_size_reg;
 	int                                     cs_grid_size_reg;
 	bool cs_block_size_loaded, cs_grid_size_loaded;
@@ -1295,6 +1297,93 @@ static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_
 	return t1;
 }
 
+static int eg_load_helper_invocation(struct r600_shader_ctx *ctx)
+{
+	int r;
+	struct r600_bytecode_alu alu;
+
+	/* do a vtx fetch with wqm set on the vtx fetch */
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = ALU_OP1_MOV;
+	alu.dst.sel = ctx->helper_invoc_reg;
+	alu.dst.chan = 0;
+	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
+	alu.src[0].value = 0xffffffff;
+	alu.dst.write = 1;
+	alu.last = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+
+	/* do a vtx fetch in VPM mode */
+	struct r600_bytecode_vtx vtx;
+	memset(&vtx, 0, sizeof(vtx));
+	vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
+	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
+	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
+	vtx.src_gpr = 0;
+	vtx.mega_fetch_count = 16; /* no idea here really... */
+	vtx.dst_gpr = ctx->helper_invoc_reg;
+	vtx.dst_sel_x = 4;
+	vtx.dst_sel_y = 7;		/* SEL_Y */
+	vtx.dst_sel_z = 7;		/* SEL_Z */
+	vtx.dst_sel_w = 7;		/* SEL_W */
+	vtx.data_format = FMT_32;
+	if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
+		return r;
+	ctx->bc->cf_last->vpm = 1;
+
+	/* compare the result with 0 */
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = ALU_OP3_CNDE_INT;
+	alu.is_op3 = 1;
+	alu.dst.sel = ctx->helper_invoc_reg;
+	alu.dst.chan = 0;
+	alu.dst.write = 1;
+	alu.src[0].sel = ctx->helper_invoc_reg;
+	alu.src[0].chan = 0;
+	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
+	alu.src[1].value = 0x0;
+	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
+	alu.src[2].value = 0xffffffff;
+	alu.last = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+	return 0;
+}
+
+static int cm_load_helper_invocation(struct r600_shader_ctx *ctx)
+{
+	int r;
+	struct r600_bytecode_alu alu;
+
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = ALU_OP1_MOV;
+	alu.dst.sel = ctx->helper_invoc_reg;
+	alu.dst.chan = 0;
+	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
+	alu.src[0].value = 0xffffffff;
+	alu.dst.write = 1;
+	alu.last = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = ALU_OP1_MOV;
+	alu.dst.sel = ctx->helper_invoc_reg;
+	alu.dst.chan = 0;
+	alu.src[0].sel = V_SQ_ALU_SRC_0;
+	alu.dst.write = 1;
+	alu.last = 1;
+	r = r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_VALID_PIXEL_MODE);
+	if (r)
+		return r;
+
+	return ctx->helper_invoc_reg;
+}
+
 static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block)
 {
 	struct r600_bytecode_vtx vtx;
@@ -1458,6 +1547,12 @@ static void tgsi_src(struct r600_shader_ctx *ctx,
 			r600_src->sel = load_block_grid_size(ctx, false);
 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) {
 			r600_src->sel = load_block_grid_size(ctx, true);
+		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_HELPER_INVOCATION) {
+			r600_src->sel = ctx->helper_invoc_reg;
+			r600_src->swizzle[0] = 0;
+			r600_src->swizzle[1] = 0;
+			r600_src->swizzle[2] = 0;
+			r600_src->swizzle[3] = 0;
 		}
 	} else {
 		if (tgsi_src->Register.Indirect)
@@ -3120,6 +3215,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	tgsi_scan_shader(tokens, &ctx.info);
 	shader->indirect_files = ctx.info.indirect_files;
 
+	shader->uses_helper_invocation = false;
 	shader->uses_doubles = ctx.info.uses_doubles;
 	shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC];
 	shader->nsys_inputs = 0;
@@ -3193,6 +3289,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	ctx.clip_vertex_write = 0;
 	ctx.thread_id_gpr_loaded = false;
 
+	ctx.helper_invoc_reg = -1;
 	ctx.cs_block_size_reg = -1;
 	ctx.cs_grid_size_reg = -1;
 	ctx.cs_block_size_loaded = false;
@@ -3238,6 +3335,13 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 			ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
 		else
 			ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
+
+		for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
+			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_HELPER_INVOCATION) {
+				ctx.helper_invoc_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
+				shader->uses_helper_invocation = true;
+			}
+		}
 	}
 	if (ctx.type == PIPE_SHADER_GEOMETRY) {
 		/* FIXME 1 would be enough in some cases (3 or less input vertices) */
@@ -3439,6 +3543,15 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
 		shader->nr_ps_max_color_exports = 8;
 
+	if (ctx.shader->uses_helper_invocation) {
+		if (ctx.bc->chip_class == CAYMAN)
+			r = cm_load_helper_invocation(&ctx);
+		else
+			r = eg_load_helper_invocation(&ctx);
+		if (r)
+			return r;
+
+	}
 	if (ctx.fragcoord_input >= 0) {
 		if (ctx.bc->chip_class == CAYMAN) {
 			for (j = 0 ; j < 4; j++) {
diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h
index 8444907..da96688 100644
--- a/src/gallium/drivers/r600/r600_shader.h
+++ b/src/gallium/drivers/r600/r600_shader.h
@@ -119,6 +119,7 @@ struct r600_shader {
 	boolean			uses_doubles;
 	boolean                 uses_atomics;
 	boolean			uses_images;
+	boolean			uses_helper_invocation;
 	uint8_t                 atomic_base;
 	uint8_t			rat_base;
 	uint8_t                 image_size_const_offset;
diff --git a/src/gallium/drivers/r600/r600_sq.h b/src/gallium/drivers/r600/r600_sq.h
index f51ffcf..6b07dc1 100644
--- a/src/gallium/drivers/r600/r600_sq.h
+++ b/src/gallium/drivers/r600/r600_sq.h
@@ -198,6 +198,8 @@
 #define     EG_V_SQ_ALU_SRC_LDS_DIRECT_B                             0x000000E0
 #define     EG_V_SQ_ALU_SRC_TIME_HI                                  0x000000E3
 #define     EG_V_SQ_ALU_SRC_TIME_LO                                  0x000000E4
+#define     EG_V_SQ_ALU_SRC_MASK_HI                                  0x000000E5
+#define     EG_V_SQ_ALU_SRC_MASK_LO                                  0x000000E6
 #define     EG_V_SQ_ALU_SRC_HW_WAVE_ID                               0x000000E7
 #define     EG_V_SQ_ALU_SRC_SIMD_ID                                  0x000000E8
 #define     EG_V_SQ_ALU_SRC_SE_ID                                    0x000000E9
-- 
2.9.5