[Mesa-dev] [PATCH] r600/cayman: initial attempt at gl_HelperInvocation

Wed Jan 31 07:31:09 UTC 2018

From: Dave Airlie <airlied at redhat.com>

This is a Cayman-only patch; Evergreen does not appear to support the
ALU valid pixel mode (VPM) clause. I'll try to figure that out later.

All I can say for this patch is it passes the piglit test and
the CTS tests.

This also disables sb for shaders that use helper invocations until sb
no longer messes up the MBCNTs.

TODO : non-cayman
---
 src/gallium/drivers/r600/r600_isa.c        |   1 +
 src/gallium/drivers/r600/r600_isa.h        |   5 +-
 src/gallium/drivers/r600/r600_shader.c     | 185 +++++++++++++++++++++++++++++
 src/gallium/drivers/r600/r600_shader.h     |   1 +
 src/gallium/drivers/r600/r600_sq.h         |   2 +
 src/gallium/drivers/r600/sb/sb_bc_dump.cpp |  15 +++
 6 files changed, 207 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_isa.c b/src/gallium/drivers/r600/r600_isa.c
index 2633cdcdb9..611b370bf5 100644
--- a/src/gallium/drivers/r600/r600_isa.c
+++ b/src/gallium/drivers/r600/r600_isa.c
@@ -506,6 +506,7 @@ static const struct cf_op_info cf_op_table[] = {
 		{"ALU_EXT",                       {   -1,   -1, 0x0C, 0x0C },  CF_CLAUSE | CF_ALU | CF_ALU_EXT  },
 		{"ALU_CONTINUE",                  { 0x0D, 0x0D, 0x0D,   -1 },  CF_CLAUSE | CF_ALU  },
 		{"ALU_BREAK",                     { 0x0E, 0x0E, 0x0E,   -1 },  CF_CLAUSE | CF_ALU  },
+		{"ALU_VALID_PIXEL_MODE",          {   -1,   -1,   -1, 0x0E },  CF_CLAUSE | CF_ALU  },
 		{"ALU_ELSE_AFTER",                { 0x0F, 0x0F, 0x0F, 0x0F },  CF_CLAUSE | CF_ALU  },
 		{"CF_NATIVE",                     { 0x00, 0x00, 0x00, 0x00 },  0  }
 };
diff --git a/src/gallium/drivers/r600/r600_isa.h b/src/gallium/drivers/r600/r600_isa.h
index f6e26976c5..fcaf1f766b 100644
--- a/src/gallium/drivers/r600/r600_isa.h
+++ b/src/gallium/drivers/r600/r600_isa.h
@@ -646,10 +646,11 @@ struct cf_op_info
 #define CF_OP_ALU_EXT                      84
 #define CF_OP_ALU_CONTINUE                 85
 #define CF_OP_ALU_BREAK                    86
-#define CF_OP_ALU_ELSE_AFTER               87
+#define CF_OP_ALU_VALID_PIXEL_MODE         87
+#define CF_OP_ALU_ELSE_AFTER               88
 
 /* CF_NATIVE means that r600_bytecode_cf contains pre-encoded native data */
-#define CF_NATIVE                          88
+#define CF_NATIVE                          89
 
 enum r600_chip_class {
 	ISA_CC_R600,
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 3344bcb76a..d0ae508a8b 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -197,6 +197,7 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
 
 	use_sb &= !shader->shader.uses_atomics;
 	use_sb &= !shader->shader.uses_images;
+	use_sb &= !shader->shader.uses_helper_invocation;
 
 	/* Check if the bytecode has already been built. */
 	if (!shader->shader.bc.bytecode) {
@@ -346,9 +347,11 @@ struct r600_shader_ctx {
 	boolean                 clip_vertex_write;
 	unsigned                cv_output;
 	unsigned		edgeflag_output;
+	int					helper_invoc_reg;
 	int                                     cs_block_size_reg;
 	int                                     cs_grid_size_reg;
 	bool cs_block_size_loaded, cs_grid_size_loaded;
+	bool helper_invoc_loaded;
 	int					fragcoord_input;
 	int					next_ring_offset;
 	int					gs_out_ring_offset;
@@ -1295,6 +1298,176 @@ static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_
 	return t1;
 }
 
+/* Emit, once per shader, the ALU code that computes gl_HelperInvocation into helper_invoc_reg.x.
+ * Returns helper_invoc_reg, -1 on non-Cayman chips, or the r600_bytecode_add_alu*() error code. */
+static int load_helper_invocation(struct r600_shader_ctx *ctx)
+{
+	struct r600_bytecode_alu alu;
+	int r;
+
+	if (ctx->helper_invoc_loaded)
+		return ctx->helper_invoc_reg;
+	if (ctx->bc->chip_class != CAYMAN)
+		return -1;
+
+	/* h.z = MASK_HI, h.w = MASK_LO, both emitted as ALU_VALID_PIXEL_MODE clauses */
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = ALU_OP1_MOV;
+	alu.dst.sel = ctx->helper_invoc_reg;
+	alu.dst.chan = 2;
+	alu.src[0].sel = EG_V_SQ_ALU_SRC_MASK_HI;
+	alu.dst.write = 1;
+	r = r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_VALID_PIXEL_MODE);
+	if (r)
+		return r;
+
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = ALU_OP1_MOV;
+	alu.dst.sel = ctx->helper_invoc_reg;
+	alu.dst.chan = 3;
+	alu.src[0].sel = EG_V_SQ_ALU_SRC_MASK_LO;
+	alu.dst.write = 1;
+	alu.last = 1;
+	r = r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_VALID_PIXEL_MODE);
+	if (r)
+		return r;
+
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT;
+	alu.dst.sel = ctx->temp_reg;
+	alu.dst.chan = 0;
+	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
+	alu.src[0].value = -1;
+	alu.dst.write = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+
+	ctx->bc->cf_last->vpm = 1;
+
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = ALU_OP1_MBCNT_32HI_INT;
+	alu.dst.sel = ctx->temp_reg;
+	alu.dst.chan = 1;
+	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
+	alu.src[0].value = -1;
+	alu.dst.write = 1;
+	alu.last = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+
+	/* temp_reg.x is now the thread id from 0..63 */
+
+	/* t.z = thread_id - 32 */
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = ALU_OP2_SUB_INT;
+	alu.dst.sel = ctx->temp_reg;
+	alu.dst.chan = 2;
+	alu.src[0].sel = ctx->temp_reg;
+	alu.src[0].chan = 0;
+	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
+	alu.src[1].value = 32; /* the literal goes in .value, not .chan */
+	alu.dst.write = 1;
+	alu.last = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+
+	/* t.w = 1 << thread_id; expected to be 0 when thread_id >= 32 */
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = ALU_OP2_LSHL_INT;
+	alu.dst.sel = ctx->temp_reg;
+	alu.dst.chan = 3;
+	alu.src[0].sel = V_SQ_ALU_SRC_1_INT;
+	alu.src[1].sel = ctx->temp_reg;
+	alu.src[1].chan = 0;
+	alu.dst.write = 1;
+	alu.last = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+
+	/* t.z = 1 << (thread_id - 32); expected to be 0 when thread_id < 32 */
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = ALU_OP2_LSHL_INT;
+	alu.dst.sel = ctx->temp_reg;
+	alu.dst.chan = 2;
+	alu.src[0].sel = V_SQ_ALU_SRC_1_INT;
+	alu.src[1].sel = ctx->temp_reg;
+	alu.src[1].chan = 2;
+	alu.dst.write = 1;
+	alu.last = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+
+	/* t.w = mask_lo & (1 << thread_id) */
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = ALU_OP2_AND_INT;
+	alu.dst.sel = ctx->temp_reg;
+	alu.dst.chan = 3;
+	alu.src[0].sel = ctx->temp_reg;
+	alu.src[0].chan = 3;
+	alu.src[1].sel = ctx->helper_invoc_reg;
+	alu.src[1].chan = 3;
+	alu.dst.write = 1;
+	alu.last = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+
+	/* t.z = mask_hi & (1 << (thread_id - 32)) */
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = ALU_OP2_AND_INT;
+	alu.dst.sel = ctx->temp_reg;
+	alu.dst.chan = 2;
+	alu.src[0].sel = ctx->temp_reg;
+	alu.src[0].chan = 2;
+	alu.src[1].sel = ctx->helper_invoc_reg;
+	alu.src[1].chan = 2;
+	alu.dst.write = 1;
+	alu.last = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+
+	/* t.z = t.z | t.w */
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = ALU_OP2_OR_INT;
+	alu.dst.sel = ctx->temp_reg;
+	alu.dst.chan = 2;
+	alu.src[0].sel = ctx->temp_reg;
+	alu.src[0].chan = 2;
+	alu.src[1].sel = ctx->temp_reg;
+	alu.src[1].chan = 3;
+	alu.dst.write = 1;
+	alu.last = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+
+	/* if t.z == 0 h.x = 0xffffffff, else h.x = 0 */
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = ALU_OP3_CNDE_INT;
+	alu.is_op3 = true;
+	alu.dst.sel = ctx->helper_invoc_reg;
+	alu.dst.chan = 0;
+	alu.src[0].sel = ctx->temp_reg;
+	alu.src[0].chan = 2;
+	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
+	alu.src[1].value = 0xffffffff;
+	alu.src[2].sel = V_SQ_ALU_SRC_0;
+	alu.last = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+
+	ctx->bc->force_add_cf = 1;
+	ctx->helper_invoc_loaded = true;
+	return ctx->helper_invoc_reg;
+}
+
 static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block)
 {
 	struct r600_bytecode_vtx vtx;
@@ -1458,6 +1631,8 @@ static void tgsi_src(struct r600_shader_ctx *ctx,
 			r600_src->sel = load_block_grid_size(ctx, false);
 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) {
 			r600_src->sel = load_block_grid_size(ctx, true);
+		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_HELPER_INVOCATION) {
+			r600_src->sel = load_helper_invocation(ctx);
 		}
 	} else {
 		if (tgsi_src->Register.Indirect)
@@ -3120,6 +3295,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	tgsi_scan_shader(tokens, &ctx.info);
 	shader->indirect_files = ctx.info.indirect_files;
 
+	shader->uses_helper_invocation = false;
 	shader->uses_doubles = ctx.info.uses_doubles;
 	shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC];
 	shader->nsys_inputs = 0;
@@ -3193,6 +3369,8 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	ctx.clip_vertex_write = 0;
 	ctx.thread_id_gpr_loaded = false;
 
+	ctx.helper_invoc_reg = -1;
+	ctx.helper_invoc_loaded = false;
 	ctx.cs_block_size_reg = -1;
 	ctx.cs_grid_size_reg = -1;
 	ctx.cs_block_size_loaded = false;
@@ -3238,6 +3416,13 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 			ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
 		else
 			ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
+
+		for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
+			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_HELPER_INVOCATION) {
+				ctx.helper_invoc_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
+				shader->uses_helper_invocation = true;
+			}
+		}
 	}
 	if (ctx.type == PIPE_SHADER_GEOMETRY) {
 		/* FIXME 1 would be enough in some cases (3 or less input vertices) */
diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h
index 8444907883..da96688e54 100644
--- a/src/gallium/drivers/r600/r600_shader.h
+++ b/src/gallium/drivers/r600/r600_shader.h
@@ -119,6 +119,7 @@ struct r600_shader {
 	boolean			uses_doubles;
 	boolean                 uses_atomics;
 	boolean			uses_images;
+	boolean			uses_helper_invocation;
 	uint8_t                 atomic_base;
 	uint8_t			rat_base;
 	uint8_t                 image_size_const_offset;
diff --git a/src/gallium/drivers/r600/r600_sq.h b/src/gallium/drivers/r600/r600_sq.h
index f51ffcf9e2..6b07dc1ecf 100644
--- a/src/gallium/drivers/r600/r600_sq.h
+++ b/src/gallium/drivers/r600/r600_sq.h
@@ -198,6 +198,8 @@
 #define     EG_V_SQ_ALU_SRC_LDS_DIRECT_B                             0x000000E0
 #define     EG_V_SQ_ALU_SRC_TIME_HI                                  0x000000E3
 #define     EG_V_SQ_ALU_SRC_TIME_LO                                  0x000000E4
+#define     EG_V_SQ_ALU_SRC_MASK_HI                                  0x000000E5
+#define     EG_V_SQ_ALU_SRC_MASK_LO                                  0x000000E6
 #define     EG_V_SQ_ALU_SRC_HW_WAVE_ID                               0x000000E7
 #define     EG_V_SQ_ALU_SRC_SIMD_ID                                  0x000000E8
 #define     EG_V_SQ_ALU_SRC_SE_ID                                    0x000000E9
diff --git a/src/gallium/drivers/r600/sb/sb_bc_dump.cpp b/src/gallium/drivers/r600/sb/sb_bc_dump.cpp
index 3b5d9e77b2..9093531fb3 100644
--- a/src/gallium/drivers/r600/sb/sb_bc_dump.cpp
+++ b/src/gallium/drivers/r600/sb/sb_bc_dump.cpp
@@ -330,6 +330,21 @@ static void print_src(sb_ostream &s, bc_alu &alu, unsigned idx)
 		case ALU_SRC_0:
 			s << "0";
 			break;
+		case ALU_SRC_MASK_LO:
+			s << "MASK_LO";
+			break;
+		case ALU_SRC_MASK_HI:
+			s << "MASK_HI";
+			break;
+		case ALU_SRC_HW_WAVE_ID:
+			s << "HW_WAVE_ID";
+			break;
+		case ALU_SRC_SIMD_ID:
+			s << "SIMD_ID";
+			break;
+		case ALU_SRC_SE_ID:
+			s << "SE_ID";
+			break;
 		default:
 			s << "??IMM_" <<  sel;
 			break;
-- 
2.14.3