[Mesa-dev] [PATCH 38/53] r600/shader: handle fetching tcs/tes inputs and tcs outputs

Sun Nov 29 22:20:47 PST 2015

From: Dave Airlie <airlied at redhat.com>

This handles the logic for doing fetches from LDS for
TCS and TES. For TCS we need to fetch both inputs and outputs,
for TES only inputs need to be fetched.

Signed-off-by: Dave Airlie <airlied at redhat.com>
---
 src/gallium/drivers/r600/r600_shader.c | 298 ++++++++++++++++++++++++++++++++-
 1 file changed, 297 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index a5fb80d..69675b2 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -1386,6 +1386,295 @@ static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
 	return 0;
 }
 
+
+/* Tessellation shaders pass outputs to the next shader using LDS.
+ *
+ * LS outputs = TCS(HS) inputs
+ * TCS(HS) outputs = TES(DS) inputs
+ *
+ * The LDS layout is:
+ * - TCS inputs for patch 0
+ * - TCS inputs for patch 1
+ * - TCS inputs for patch 2		= get_tcs_in_current_patch_offset (if RelPatchID==2)
+ * - ...
+ * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
+ * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
+ * - TCS outputs for patch 1
+ * - Per-patch TCS outputs for patch 1
+ * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
+ * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
+ * - ...
+ *
+ * All three shaders VS(LS), TCS, TES share the same LDS space.
+ */
+/* this will return with the dw address in temp_reg.x */
+static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
+				 const struct tgsi_full_dst_register *dst,
+				 const struct tgsi_full_src_register *src,
+				 int stride_bytes_reg, int stride_bytes_chan)
+{
+	struct tgsi_full_dst_register reg;
+	ubyte *name, *index, *array_first;
+	int r;
+	int param;
+	struct tgsi_shader_info *info = &ctx->info;
+	/* Set the register description. The address computation is the same
+	 * for sources and destinations. */
+	if (src) {
+		reg.Register.File = src->Register.File;
+		reg.Register.Index = src->Register.Index;
+		reg.Register.Indirect = src->Register.Indirect;
+		reg.Register.Dimension = src->Register.Dimension;
+		reg.Indirect = src->Indirect;
+		reg.Dimension = src->Dimension;
+		reg.DimIndirect = src->DimIndirect;
+	} else
+		reg = *dst;
+
+	/* If the register is 2-dimensional (e.g. an array of vertices
+	 * in a primitive), calculate the base address of the vertex. */
+	if (reg.Register.Dimension) {
+		int sel, chan;
+		if (reg.Dimension.Indirect) {
+			unsigned addr_reg;
+			assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);
+
+			addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
+			/* pull the value from index_reg */
+			sel = addr_reg;
+			chan = 0;
+		} else {
+			sel = V_SQ_ALU_SRC_LITERAL;
+			chan = reg.Dimension.Index;
+		}
+		r = single_alu_op2(ctx, ALU_OP2_MULLO_INT,
+				   temp_reg, 1,
+				   stride_bytes_reg, stride_bytes_chan,
+				   sel, chan);
+		if (r)
+			return r;
+
+		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
+				   temp_reg, 0,
+				   temp_reg, 0,
+				   temp_reg, 1);
+		if (r)
+			return r;
+	}
+
+	if (reg.Register.File == TGSI_FILE_INPUT) {
+		name = info->input_semantic_name;
+		index = info->input_semantic_index;
+		array_first = info->input_array_first;
+	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
+		name = info->output_semantic_name;
+		index = info->output_semantic_index;
+		array_first = info->output_array_first;
+	} else {
+		assert(0);
+		return -1;
+	}
+	if (reg.Register.Indirect) {
+		int addr_reg;
+		int first;
+		/* Add the relative address of the element. */
+		if (reg.Indirect.ArrayID)
+			first = array_first[reg.Indirect.ArrayID];
+		else
+			first = reg.Register.Index;
+
+		addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);
+		/* pull the value from index_reg */
+		r = single_alu_op2(ctx, ALU_OP2_MULLO_INT,
+				   temp_reg, 1,
+				   V_SQ_ALU_SRC_LITERAL, 16,
+				   addr_reg, 0);
+		if (r)
+			return r;
+
+		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
+				   temp_reg, 0,
+				   temp_reg, 0,
+				   temp_reg, 1);
+		if (r)
+			return r;
+		param = r600_get_lds_unique_index(name[first],
+						  index[first]);
+
+	} else {
+		param = r600_get_lds_unique_index(name[reg.Register.Index],
+						  index[reg.Register.Index]);
+	}
+
+	/* add to base_addr - passed in temp_reg.x */
+	if (param) {
+		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
+				   temp_reg, 0,
+				   temp_reg, 0,
+				   V_SQ_ALU_SRC_LITERAL, param * 16);
+		if (r)
+			return r;
+
+	}
+	return 0;
+}
+
+static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
+			       unsigned dst_reg)
+{
+	struct r600_bytecode_alu alu;
+	int r, i;
+
+	if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
+		ctx->bc->force_add_cf = 1;
+	for (i = 1; i < 4; i++) {
+		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
+				   temp_reg, i,
+				   temp_reg, 0,
+				   V_SQ_ALU_SRC_LITERAL, 4 * i);
+	}
+	for (i = 0; i < 4; i++) {
+		/* emit an LDS_READ_RET */
+		memset(&alu, 0, sizeof(alu));
+		alu.op = LDS_OP1_LDS_READ_RET;
+		alu.src[0].sel = temp_reg;
+		alu.src[0].chan = i;
+		alu.src[1].sel = V_SQ_ALU_SRC_0;
+		alu.src[2].sel = V_SQ_ALU_SRC_0;
+		alu.dst.chan = 0;
+		alu.is_lds_idx_op = true;
+		alu.last = 1;
+		r = r600_bytecode_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+	}
+	for (i = 0; i < 4; i++) {
+		memset(&alu, 0, sizeof(alu));
+		alu.op = ALU_OP0_NOP;
+		alu.dst.chan = i;
+		alu.last = i == 3;
+		r = r600_bytecode_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+	}
+	for (i = 0; i < 4; i++) {
+		/* then read from LDS_OQ_A_POP */
+		memset(&alu, 0, sizeof(alu));
+
+		alu.op = ALU_OP1_MOV;
+		alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
+		alu.src[0].chan = 0;
+		alu.dst.sel = dst_reg;
+		alu.dst.chan = i;
+		alu.dst.write = 1;
+		alu.last = 1;
+		r = r600_bytecode_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+	}
+	return 0;
+}
+
+static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
+{
+	int r;
+	unsigned temp_reg = r600_get_temp(ctx);
+
+	r = get_lds_offset0(ctx, 2, temp_reg,
+			    src->Register.Dimension ? false : true);
+	if (r)
+		return r;
+
+	/* the base address is now in temp.x */
+	r = r600_get_byte_address(ctx, temp_reg,
+				  NULL, src, ctx->tess_output_info, 1);
+	if (r)
+		return r;
+
+	r = do_lds_fetch_values(ctx, temp_reg, dst_reg);
+	if (r)
+		return r;
+	return 0;
+}
+
+static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
+{
+	int r;
+	unsigned temp_reg = r600_get_temp(ctx);
+
+	/* t.x = ips * r0.y */
+	r = single_alu_op2(ctx, ALU_OP2_MULLO_INT,
+			   temp_reg, 0,
+			   ctx->tess_input_info, 0,
+			   0, 1);
+
+	if (r)
+		return r;
+
+	/* the base address is now in temp.x */
+	r = r600_get_byte_address(ctx, temp_reg,
+				  NULL, src, ctx->tess_input_info, 1);
+	if (r)
+		return r;
+
+	r = do_lds_fetch_values(ctx, temp_reg, dst_reg);
+	if (r)
+		return r;
+	return 0;
+}
+
+static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
+{
+	int r;
+	unsigned temp_reg = r600_get_temp(ctx);
+
+	r = get_lds_offset0(ctx, 1, temp_reg,
+			    src->Register.Dimension ? false : true);
+	if (r)
+		return r;
+		/* the base address is now in temp.x */
+	r = r600_get_byte_address(ctx, temp_reg,
+				  NULL, src,
+				  ctx->tess_output_info, 1);
+	if (r)
+		return r;
+
+	r = do_lds_fetch_values(ctx, temp_reg, dst_reg);
+	if (r)
+		return r;
+	return 0;
+}
+
+static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	int i;
+
+	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
+		struct tgsi_full_src_register *src = &inst->Src[i];
+
+		if (ctx->type == TGSI_PROCESSOR_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
+			int treg = r600_get_temp(ctx);
+			fetch_tes_input(ctx, src, treg);
+			ctx->src[i].sel = treg;
+			ctx->src[i].rel = 0;
+		}
+		if (ctx->type == TGSI_PROCESSOR_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
+			int treg = r600_get_temp(ctx);
+			fetch_tcs_input(ctx, src, treg);
+			ctx->src[i].sel = treg;
+			ctx->src[i].rel = 0;
+		}
+		if (ctx->type == TGSI_PROCESSOR_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
+			int treg = r600_get_temp(ctx);
+			fetch_tcs_output(ctx, src, treg);
+			ctx->src[i].sel = treg;
+			ctx->src[i].rel = 0;
+		}
+	}
+	return 0;
+}
+
 static int tgsi_split_constant(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
@@ -2116,6 +2405,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	bool use_llvm = false;
 	bool indirect_gprs;
 	bool ring_outputs = false;
+	bool lds_inputs = false;
 	bool pos_emitted = false;
 
 #ifdef R600_USE_LLVM
@@ -2153,9 +2443,11 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 		break;
 	case TGSI_PROCESSOR_TESS_CTRL:
 		shader->tcs_prim_mode = key.tcs.prim_mode;
+		lds_inputs = true;
 		break;
 	case TGSI_PROCESSOR_TESS_EVAL:
 		shader->tes_as_es = key.tes.as_es;
+		lds_inputs = true;
 		if (shader->tes_as_es)
 			ring_outputs = true;
 		break;
@@ -2504,9 +2796,13 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 					goto out_err;
 				if ((r = tgsi_split_literal_constant(&ctx)))
 					goto out_err;
-				if (ctx.type == TGSI_PROCESSOR_GEOMETRY)
+				if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
 					if ((r = tgsi_split_gs_inputs(&ctx)))
 						goto out_err;
+				} else if (lds_inputs) {
+					if ((r = tgsi_split_lds_inputs(&ctx)))
+						goto out_err;
+				}
 				if (ctx.bc->chip_class == CAYMAN)
 					ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
 				else if (ctx.bc->chip_class >= EVERGREEN)
-- 
2.5.0