[Mesa-dev] [RFC PATCH 5/6] r600_shader.c: Pre-calculate some offsets for LDS access

Gert Wollny gw.fossdev at gmail.com
Wed Nov 15 09:29:15 UTC 2017


Some offsets used for LDS access are recalculated repeatedly.
Since tessellation shaders are not optimized by the SB, manually pre-evaluate
these offsets once to speed up this type of shader.

Signed-off-by: Gert Wollny <gw.fossdev at gmail.com>
---
 src/gallium/drivers/r600/r600_shader.c | 253 ++++++++++++++++++++++-----------
 1 file changed, 172 insertions(+), 81 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 873b525449..163ae75eb5 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -183,6 +183,7 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
 		R600_ERR("translation from TGSI failed !\n");
 		goto error;
 	}
+
 	if (shader->shader.processor_type == PIPE_SHADER_VERTEX) {
 		/* only disable for vertex shaders in tess paths */
 		if (key.vs.as_ls)
@@ -329,6 +330,7 @@ struct  r600_tess_input_cache_entry {
 struct r600_tess_input_cache {
 	struct r600_tess_input_cache_entry data[32];
 	int fill;
+	int uses_lds_io;
 };
 
 struct r600_shader_ctx {
@@ -367,7 +369,8 @@ struct r600_shader_ctx {
 	unsigned				enabled_stream_buffers_mask;
 	unsigned                                tess_input_info; /* temp with tess input offsets */
 	unsigned                                tess_output_info; /* temp with tess input offsets */
-	struct r600_tess_input_cache            tess_input_cache;
+	unsigned          tess_io_info_precalc; /* temp with precalcuated offsets */
+	struct r600_tess_input_cache tess_input_cache;
 };
 
 struct r600_shader_tgsi_instruction {
@@ -392,7 +395,8 @@ static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
 			const struct r600_shader_src *shader_src,
 			unsigned chan);
 static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
-			       unsigned dst_reg, unsigned mask, int param);
+			       unsigned temp_chan, unsigned dst_reg,
+			       unsigned mask, int param);
 
 static int tgsi_last_instruction(unsigned writemask)
 {
@@ -1027,13 +1031,8 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
 			 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
 			int param = r600_get_lds_unique_index(d->Semantic.Name, 0);
 			int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;
-			unsigned temp_reg = r600_get_temp(ctx);
-
-			r = get_lds_offset0(ctx, 2, temp_reg, true);
-			if (r)
-				return r;
 
-			do_lds_fetch_values(ctx, temp_reg, dreg, 0xF, param);
+			do_lds_fetch_values(ctx, ctx->tess_io_info_precalc, 3, dreg, 0xF, param);
 		}
 		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
 			/* MOV r1.x, r0.x;
@@ -1648,7 +1647,9 @@ static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
  * All three shaders VS(LS), TCS, TES share the same LDS space.
  */
 /* this will return with the dw address in temp_reg.x */
-static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
+static int r600_get_byte_address(struct r600_shader_ctx *ctx,
+				 unsigned *result_reg, unsigned *result_chan,
+				 int base_offset_reg, int base_offset_chan,
 				 const struct tgsi_full_dst_register *dst,
 				 const struct tgsi_full_src_register *src,
 				 int stride_bytes_reg, int stride_bytes_chan, int *param)
@@ -1656,7 +1657,11 @@ static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
 	struct tgsi_full_dst_register reg;
 	ubyte *name, *index, *array_first;
 	int r;
+	int temp_reg = -1;
 	struct tgsi_shader_info *info = &ctx->info;
+	*result_reg = base_offset_reg;
+	*result_chan = base_offset_chan;
+
 	/* Set the register description. The address computation is the same
 	 * for sources and destinations. */
 	if (src) {
@@ -1686,14 +1691,18 @@ static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
 			sel = V_SQ_ALU_SRC_LITERAL;
 			chan = reg.Dimension.Index;
 		}
-
+		temp_reg = r600_get_temp(ctx);
+		*result_reg = temp_reg;
+		*result_chan = 0;
 		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
 				   temp_reg, 0,
 				   stride_bytes_reg, stride_bytes_chan,
 				   sel, chan,
-				   temp_reg, 0);
+				   base_offset_reg, base_offset_chan);
 		if (r)
 			return r;
+	} else {
+
 	}
 
 	if (reg.Register.File == TGSI_FILE_INPUT) {
@@ -1719,15 +1728,20 @@ static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
 
 		addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);
 
-		/* pull the value from index_reg */
+		if (temp_reg < 0)
+				temp_reg = r600_get_temp(ctx);
+
 		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
 				   temp_reg, 0,
 				   V_SQ_ALU_SRC_LITERAL, 16,
 				   addr_reg, 0,
-				   temp_reg, 0);
+				   *result_reg, *result_chan);
 		if (r)
 			return r;
 
+		*result_reg = temp_reg;
+		*result_chan = 0;
+
 		*param = r600_get_lds_unique_index(name[first],
 						  index[first]);
 
@@ -1739,14 +1753,17 @@ static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
 	return 0;
 }
 
-static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
+static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned offs_reg,
+			       unsigned offs_chan,
 			       unsigned dst_reg, unsigned mask, int param)
+	
 {
 	struct r600_bytecode_alu alu;
 	int r, i;
 	int lasti = tgsi_last_instruction(mask);
 	int firsti = param > 0 ? 0 : 1;
 
+
 	if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
 		ctx->bc->force_add_cf = 1;
 
@@ -1756,12 +1773,12 @@ static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
 			continue;
 
 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
-		alu.dst.sel = temp_reg;
+		alu.dst.sel = ctx->temp_reg;
 		alu.dst.chan = i;
 		alu.dst.write = 1;
 		alu.op = ALU_OP2_ADD_INT;
-		alu.src[0].sel = temp_reg;
-		alu.src[0].chan = 0;
+		alu.src[0].sel = offs_reg;
+		alu.src[0].chan = offs_chan;
 		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
 		alu.src[1].value = 4 * i + 16 * param;
 
@@ -1779,8 +1796,13 @@ static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
 		/* emit an LDS_READ_RET */
 		memset(&alu, 0, sizeof(alu));
 		alu.op = LDS_OP1_LDS_READ_RET;
-		alu.src[0].sel = temp_reg;
-		alu.src[0].chan = i;
+		if (i > 0 || firsti == 0) {
+			alu.src[0].sel = ctx->temp_reg;
+			alu.src[0].chan = i;
+		} else {
+			alu.src[0].sel = offs_reg;
+			alu.src[0].chan = offs_chan;
+		}
 		alu.src[1].sel = V_SQ_ALU_SRC_0;
 		alu.src[2].sel = V_SQ_ALU_SRC_0;
 		alu.dst.chan = 0;
@@ -1824,20 +1846,18 @@ static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_reg
 			   unsigned int dst_reg, unsigned mask)
 {
 	int r, param;
-	unsigned temp_reg = r600_get_temp(ctx);
-
-	r = get_lds_offset0(ctx, 2, temp_reg,
-			    src->Register.Dimension ? false : true);
-	if (r)
-		return r;
+	unsigned temp_reg;
+	unsigned temp_chan;
 
 	/* the base address is now in temp.x */
-	r = r600_get_byte_address(ctx, temp_reg,
+	r = r600_get_byte_address(ctx, &temp_reg, &temp_chan,
+				  ctx->tess_io_info_precalc,
+				  src->Register.Dimension ? 2:3,
 				  NULL, src, ctx->tess_output_info, 1, &param);
 	if (r)
 		return r;
 
-	r = do_lds_fetch_values(ctx, temp_reg, dst_reg, mask, param);
+	r = do_lds_fetch_values(ctx, temp_reg, temp_chan, dst_reg, mask, param);
 	if (r)
 		return r;
 	return 0;
@@ -1848,23 +1868,16 @@ static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_reg
 {
 	int r,param;
 	unsigned temp_reg = r600_get_temp(ctx);
-
-	/* t.x = ips * r0.y */
-	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
-			   temp_reg, 0,
-			   ctx->tess_input_info, 0,
-			   0, 1);
-
-	if (r)
-		return r;
+	unsigned temp_chan = 0;
 
 	/* the base address is now in temp.x */
-	r = r600_get_byte_address(ctx, temp_reg,
+	r = r600_get_byte_address(ctx, &temp_reg, &temp_chan,
+				  ctx->tess_io_info_precalc, 3,
 				  NULL, src, ctx->tess_input_info, 1, &param);
 	if (r)
 		return r;
 
-	r = do_lds_fetch_values(ctx, temp_reg, dst_reg, mask, param);
+	r = do_lds_fetch_values(ctx, temp_reg, temp_chan, dst_reg, mask, param);
 	if (r)
 		return r;
 	return 0;
@@ -1874,20 +1887,18 @@ static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_re
 			    unsigned int dst_reg, unsigned mask)
 {
 	int r, param;
-	unsigned temp_reg = r600_get_temp(ctx);
+	unsigned temp_reg;
+	unsigned temp_chan;
 
-	r = get_lds_offset0(ctx, 1, temp_reg,
-			    src->Register.Dimension ? false : true);
-	if (r)
-		return r;
-	/* the base address is now in temp.x */
-	r = r600_get_byte_address(ctx, temp_reg,
+	r = r600_get_byte_address(ctx, &temp_reg, &temp_chan,
+				  ctx->tess_io_info_precalc,
+				  src->Register.Dimension ? 0:1,
 				  NULL, src,
 				  ctx->tess_output_info, 1, &param);
 	if (r)
 		return r;
 
-	r = do_lds_fetch_values(ctx, temp_reg, dst_reg, mask, param);
+	r = do_lds_fetch_values(ctx, temp_reg, temp_chan, dst_reg, mask, param);
 	if (r)
 		return r;
 	return 0;
@@ -1896,11 +1907,12 @@ static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_re
 static int tgsi_full_src_register_equal_for_cache(struct tgsi_full_src_register *lhs,
 					struct tgsi_full_src_register *rhs)
 {
+	if (lhs->Register.File != rhs->Register.File)
+		return 0;
+	
 	if (lhs->Register.Index != rhs->Register.Index)
 		return 0;
 
-	if (lhs->Register.File != rhs->Register.File)
-
 	if (lhs->Register.Indirect || rhs->Register.Indirect)
 		return 0;
 
@@ -2028,9 +2040,10 @@ static void count_tess_inputs(struct r600_shader_ctx *ctx)
 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
 		struct tgsi_full_src_register *src = &inst->Src[i];
 		if (((src->Register.File == TGSI_FILE_INPUT) && (ctx->type == PIPE_SHADER_TESS_EVAL)) ||
-		    (ctx->type == PIPE_SHADER_TESS_CTRL &&
-		     (src->Register.File == TGSI_FILE_INPUT || src->Register.File == TGSI_FILE_OUTPUT)))
+		    (ctx->type == PIPE_SHADER_TESS_CTRL)) {
 			tess_input_cache_check(&ctx->tess_input_cache, src);
+			ctx->tess_input_cache.uses_lds_io = 1;
+		}
 	}
 }
 
@@ -2729,7 +2742,7 @@ static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
 			   0, 0);
 	if (r)
 		return r;
-
+	
 	/* used by VS/TCS */
 	if (ctx->tess_input_info) {
 		/* fetch tcs input values into resv space */
@@ -2752,12 +2765,13 @@ static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
 		vtx.dst_sel_w = 3;
 		vtx.src_gpr = temp_val;
 		vtx.src_sel_x = 0;
-
+		
 		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
 		if (r)
 			return r;
+		
 	}
-
+	
 	/* used by TCS/TES */
 	if (ctx->tess_output_info) {
 		/* fetch tcs output values into resv space */
@@ -2784,6 +2798,64 @@ static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
 		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
 		if (r)
 			return r;
+		
+		if (ctx->tess_input_cache.uses_lds_io) {
+			
+			/* Precalc some offsets, after this we have
+			 
+			 */
+			
+			/* tess_io_info_precalc.x = tess_output_info.x * R0.y + tess_output_info.z */
+			r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
+					   ctx->tess_io_info_precalc, 0,
+					   ctx->tess_output_info, 0,
+					   0, 1,
+					   ctx->tess_output_info, 2);
+			if (r)
+				return r;
+			
+			/* tess_io_info_precalc.y = tess_output_info.x * R0.y + tess_output_info.w */
+			
+			r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
+					   ctx->tess_io_info_precalc, 1,
+					   ctx->tess_output_info, 0,
+					   0, 1,
+					   ctx->tess_output_info, 3);
+			if (r)
+				return r;
+
+
+			/* tess_io_info_precalc.z = tess_output_info.x * R0.z + tess_output_info.z */
+			r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
+					   ctx->tess_io_info_precalc, 2,
+					   ctx->tess_output_info, 0,
+					   0, 2,
+					   ctx->tess_output_info, 2);
+			if (r)
+				return r;
+
+			/* This is a TCS shader */  
+			if (ctx->tess_input_info) {
+				
+				/* t.x = ips * r0.y */
+				r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
+						   ctx->tess_io_info_precalc, 3,
+						   ctx->tess_input_info, 0,
+						   0, 1);
+				if (r)
+					return r;
+			} else {
+				
+				/* tess_io_info_precalc.w = tess_output_info.x * R0.z + tess_output_info.w */
+				r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
+						   ctx->tess_io_info_precalc, 3,
+						   ctx->tess_output_info, 0,
+						   0, 2,
+						   ctx->tess_output_info, 3);
+				if (r)
+					return r;
+			}
+		}
 	}
 	return 0;
 }
@@ -2858,8 +2930,10 @@ static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
 	const struct tgsi_full_dst_register *dst = &inst->Dst[0];
-	int i, r, lasti;
+	int i, r, lasti, firsti;
 	int temp_reg = r600_get_temp(ctx);
+	unsigned offs_reg;
+	unsigned offs_chan;
 	struct r600_bytecode_alu alu;
 	unsigned write_mask = dst->Register.WriteMask;
 	int param;
@@ -2867,19 +2941,18 @@ static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
 	if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
 		return 0;
 
-	r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true);
-	if (r)
-		return r;
-
 	/* the base address is now in temp.x */
-	r = r600_get_byte_address(ctx, temp_reg,
+	r = r600_get_byte_address(ctx, &offs_reg, &offs_chan,
+				  ctx->tess_io_info_precalc,
+				  dst->Register.Dimension ? 0:1,
 				  &inst->Dst[0], NULL, ctx->tess_output_info, 1, &param);
 	if (r)
 		return r;
 
+	firsti = param > 0 ? 0 : 1;
 	/* LDS write */
 	lasti = tgsi_last_instruction(write_mask);
-	for (i = (param > 0 ? 0: 1); i <= lasti; i++) {
+	for (i = firsti; i <= lasti; i++) {
 
 		if (!(write_mask & (1 << i)))
 			continue;
@@ -2888,8 +2961,8 @@ static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
 		alu.dst.chan = i;
 		alu.dst.write = 1;
 		alu.op = ALU_OP2_ADD_INT;
-		alu.src[0].sel = temp_reg;
-		alu.src[0].chan = 0;
+		alu.src[0].sel = offs_reg;
+		alu.src[0].chan = offs_chan;
 		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
 		alu.src[1].value = 4 * i + 16 * param;
 
@@ -2909,8 +2982,14 @@ static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
 		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 			alu.op = LDS_OP3_LDS_WRITE_REL;
-			alu.src[0].sel = temp_reg;
-			alu.src[0].chan = i;
+
+			if (firsti == 0 || i > 0) {
+				alu.src[0].sel = temp_reg;
+				alu.src[0].chan = i;
+			} else {
+				alu.src[0].sel = offs_reg;
+				alu.src[0].chan = offs_chan;
+			}
 
 			alu.src[1].sel = dst->Register.Index;
 			alu.src[1].sel += ctx->file_offset[dst->Register.File];
@@ -2931,8 +3010,14 @@ static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
 		}
 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 		alu.op = LDS_OP2_LDS_WRITE;
-		alu.src[0].sel = temp_reg;
-		alu.src[0].chan = i;
+
+		if (firsti == 0 || i > 0) {
+			alu.src[0].sel = temp_reg;
+			alu.src[0].chan = i;
+		} else {
+			alu.src[0].sel = offs_reg;
+			alu.src[0].chan = offs_chan;
+		}
 
 		alu.src[1].sel = dst->Register.Index;
 		alu.src[1].sel += ctx->file_offset[dst->Register.File];
@@ -2953,17 +3038,12 @@ static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
 				 int output_idx)
 {
 	int param;
-	unsigned temp_reg = r600_get_temp(ctx);
 	unsigned name = ctx->shader->output[output_idx].name;
 	int dreg = ctx->shader->output[output_idx].gpr;
-	int r;
 
 	param = r600_get_lds_unique_index(name, 0);
-	r = get_lds_offset0(ctx, 1, temp_reg, true);
-	if (r)
-		return r;
-
-	do_lds_fetch_values(ctx, temp_reg, dreg, 0xf, param);
+	
+	do_lds_fetch_values(ctx, ctx->tess_io_info_precalc, 1, dreg, 0xf, param);
 	return 0;
 }
 
@@ -3293,11 +3373,13 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	if (ctx.type == PIPE_SHADER_TESS_CTRL) {
 		ctx.tess_input_info = ctx.bc->ar_reg + 3;
 		ctx.tess_output_info = ctx.bc->ar_reg + 4;
-		ctx.temp_reg = ctx.bc->ar_reg + 5;
+		ctx.tess_io_info_precalc = ctx.bc->ar_reg + 5;
+		ctx.temp_reg = ctx.bc->ar_reg + 6;
 	} else if (ctx.type == PIPE_SHADER_TESS_EVAL) {
 		ctx.tess_input_info = 0;
 		ctx.tess_output_info = ctx.bc->ar_reg + 3;
-		ctx.temp_reg = ctx.bc->ar_reg + 4;
+		ctx.tess_io_info_precalc = ctx.bc->ar_reg + 4;
+		ctx.temp_reg = ctx.bc->ar_reg + 5;
 	} else if (ctx.type == PIPE_SHADER_GEOMETRY) {
 		ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3;
 		ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4;
@@ -3316,18 +3398,27 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 		ctx.temp_reg = ctx.bc->ar_reg + 3;
 	}
 
-	if (lds_inputs) {
+	ctx.tess_input_cache.uses_lds_io = 0;
+	if (lds_inputs || lds_outputs) {
 		tgsi_parse_init(&ctx.parse, tokens);
+
 		while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
 			tgsi_parse_token(&ctx.parse);
-
-			if (ctx.parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION)
-				continue;
-
-			count_tess_inputs(&ctx);
+			if (ctx.parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION)
+				count_tess_inputs(&ctx);
+			else if (ctx.parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
+				struct tgsi_full_declaration *d = &ctx.parse.FullToken.FullDeclaration;
+				if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE &&
+				    (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
+				     d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER))
+					ctx.tess_input_cache.uses_lds_io = 1;
+				
+			}
 		}
 		ctx.temp_reg += tess_input_cache_count_multiused(&ctx.tess_input_cache, ctx.temp_reg);
 		tgsi_parse_init(&ctx.parse, tokens);
+	} else {
+
 	}
 
 	shader->max_arrays = 0;
-- 
2.13.6



More information about the mesa-dev mailing list