[Mesa-dev] [RFC PATCH 5/6] r600_shader.c: Pre-calculate some offsets for LDS access
Gert Wollny
gw.fossdev at gmail.com
Wed Nov 15 09:29:15 UTC 2017
Some offsets used for the LDS access are recalculated quite regularly.
Since tessellation shaders are not optimized by the SB, manually pre-evaluate
some offsets to speed up this type of shader.
Signed-off-by: Gert Wollny <gw.fossdev at gmail.com>
---
src/gallium/drivers/r600/r600_shader.c | 253 ++++++++++++++++++++++-----------
1 file changed, 172 insertions(+), 81 deletions(-)
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 873b525449..163ae75eb5 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -183,6 +183,7 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
R600_ERR("translation from TGSI failed !\n");
goto error;
}
+
if (shader->shader.processor_type == PIPE_SHADER_VERTEX) {
/* only disable for vertex shaders in tess paths */
if (key.vs.as_ls)
@@ -329,6 +330,7 @@ struct r600_tess_input_cache_entry {
struct r600_tess_input_cache {
struct r600_tess_input_cache_entry data[32];
int fill;
+ int uses_lds_io;
};
struct r600_shader_ctx {
@@ -367,7 +369,8 @@ struct r600_shader_ctx {
unsigned enabled_stream_buffers_mask;
unsigned tess_input_info; /* temp with tess input offsets */
unsigned tess_output_info; /* temp with tess input offsets */
- struct r600_tess_input_cache tess_input_cache;
+ unsigned tess_io_info_precalc; /* temp with precalcuated offsets */
+ struct r600_tess_input_cache tess_input_cache;
};
struct r600_shader_tgsi_instruction {
@@ -392,7 +395,8 @@ static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
const struct r600_shader_src *shader_src,
unsigned chan);
static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
- unsigned dst_reg, unsigned mask, int param);
+ unsigned temp_chan, unsigned dst_reg,
+ unsigned mask, int param);
static int tgsi_last_instruction(unsigned writemask)
{
@@ -1027,13 +1031,8 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
int param = r600_get_lds_unique_index(d->Semantic.Name, 0);
int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;
- unsigned temp_reg = r600_get_temp(ctx);
-
- r = get_lds_offset0(ctx, 2, temp_reg, true);
- if (r)
- return r;
- do_lds_fetch_values(ctx, temp_reg, dreg, 0xF, param);
+ do_lds_fetch_values(ctx, ctx->tess_io_info_precalc, 3, dreg, 0xF, param);
}
else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
/* MOV r1.x, r0.x;
@@ -1648,7 +1647,9 @@ static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
* All three shaders VS(LS), TCS, TES share the same LDS space.
*/
/* this will return with the dw address in temp_reg.x */
-static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
+static int r600_get_byte_address(struct r600_shader_ctx *ctx,
+ unsigned *result_reg, unsigned *result_chan,
+ int base_offset_reg, int base_offset_chan,
const struct tgsi_full_dst_register *dst,
const struct tgsi_full_src_register *src,
int stride_bytes_reg, int stride_bytes_chan, int *param)
@@ -1656,7 +1657,11 @@ static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
struct tgsi_full_dst_register reg;
ubyte *name, *index, *array_first;
int r;
+ int temp_reg = -1;
struct tgsi_shader_info *info = &ctx->info;
+ *result_reg = base_offset_reg;
+ *result_chan = base_offset_chan;
+
/* Set the register description. The address computation is the same
* for sources and destinations. */
if (src) {
@@ -1686,14 +1691,18 @@ static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
sel = V_SQ_ALU_SRC_LITERAL;
chan = reg.Dimension.Index;
}
-
+ temp_reg = r600_get_temp(ctx);
+ *result_reg = temp_reg;
+ *result_chan = 0;
r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
temp_reg, 0,
stride_bytes_reg, stride_bytes_chan,
sel, chan,
- temp_reg, 0);
+ base_offset_reg, base_offset_chan);
if (r)
return r;
+ } else {
+
}
if (reg.Register.File == TGSI_FILE_INPUT) {
@@ -1719,15 +1728,20 @@ static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);
- /* pull the value from index_reg */
+ if (temp_reg < 0)
+ temp_reg = r600_get_temp(ctx);
+
r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
temp_reg, 0,
V_SQ_ALU_SRC_LITERAL, 16,
addr_reg, 0,
- temp_reg, 0);
+ *result_reg, *result_chan);
if (r)
return r;
+ *result_reg = temp_reg;
+ *result_chan = 0;
+
*param = r600_get_lds_unique_index(name[first],
index[first]);
@@ -1739,14 +1753,17 @@ static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
return 0;
}
-static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
+static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned offs_reg,
+ unsigned offs_chan,
unsigned dst_reg, unsigned mask, int param)
+
{
struct r600_bytecode_alu alu;
int r, i;
int lasti = tgsi_last_instruction(mask);
int firsti = param > 0 ? 0 : 1;
+
if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
ctx->bc->force_add_cf = 1;
@@ -1756,12 +1773,12 @@ static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
continue;
memset(&alu, 0, sizeof(struct r600_bytecode_alu));
- alu.dst.sel = temp_reg;
+ alu.dst.sel = ctx->temp_reg;
alu.dst.chan = i;
alu.dst.write = 1;
alu.op = ALU_OP2_ADD_INT;
- alu.src[0].sel = temp_reg;
- alu.src[0].chan = 0;
+ alu.src[0].sel = offs_reg;
+ alu.src[0].chan = offs_chan;
alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
alu.src[1].value = 4 * i + 16 * param;
@@ -1779,8 +1796,13 @@ static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
/* emit an LDS_READ_RET */
memset(&alu, 0, sizeof(alu));
alu.op = LDS_OP1_LDS_READ_RET;
- alu.src[0].sel = temp_reg;
- alu.src[0].chan = i;
+ if (i > 0 || firsti == 0) {
+ alu.src[0].sel = ctx->temp_reg;
+ alu.src[0].chan = i;
+ } else {
+ alu.src[0].sel = offs_reg;
+ alu.src[0].chan = offs_chan;
+ }
alu.src[1].sel = V_SQ_ALU_SRC_0;
alu.src[2].sel = V_SQ_ALU_SRC_0;
alu.dst.chan = 0;
@@ -1824,20 +1846,18 @@ static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_reg
unsigned int dst_reg, unsigned mask)
{
int r, param;
- unsigned temp_reg = r600_get_temp(ctx);
-
- r = get_lds_offset0(ctx, 2, temp_reg,
- src->Register.Dimension ? false : true);
- if (r)
- return r;
+ unsigned temp_reg;
+ unsigned temp_chan;
/* the base address is now in temp.x */
- r = r600_get_byte_address(ctx, temp_reg,
+ r = r600_get_byte_address(ctx, &temp_reg, &temp_chan,
+ ctx->tess_io_info_precalc,
+ src->Register.Dimension ? 2:3,
NULL, src, ctx->tess_output_info, 1, ¶m);
if (r)
return r;
- r = do_lds_fetch_values(ctx, temp_reg, dst_reg, mask, param);
+ r = do_lds_fetch_values(ctx, temp_reg, temp_chan, dst_reg, mask, param);
if (r)
return r;
return 0;
@@ -1848,23 +1868,16 @@ static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_reg
{
int r,param;
unsigned temp_reg = r600_get_temp(ctx);
-
- /* t.x = ips * r0.y */
- r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
- temp_reg, 0,
- ctx->tess_input_info, 0,
- 0, 1);
-
- if (r)
- return r;
+ unsigned temp_chan = 0;
/* the base address is now in temp.x */
- r = r600_get_byte_address(ctx, temp_reg,
+ r = r600_get_byte_address(ctx, &temp_reg, &temp_chan,
+ ctx->tess_io_info_precalc, 3,
NULL, src, ctx->tess_input_info, 1, ¶m);
if (r)
return r;
- r = do_lds_fetch_values(ctx, temp_reg, dst_reg, mask, param);
+ r = do_lds_fetch_values(ctx, temp_reg, temp_chan, dst_reg, mask, param);
if (r)
return r;
return 0;
@@ -1874,20 +1887,18 @@ static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_re
unsigned int dst_reg, unsigned mask)
{
int r, param;
- unsigned temp_reg = r600_get_temp(ctx);
+ unsigned temp_reg;
+ unsigned temp_chan;
- r = get_lds_offset0(ctx, 1, temp_reg,
- src->Register.Dimension ? false : true);
- if (r)
- return r;
- /* the base address is now in temp.x */
- r = r600_get_byte_address(ctx, temp_reg,
+ r = r600_get_byte_address(ctx, &temp_reg, &temp_chan,
+ ctx->tess_io_info_precalc,
+ src->Register.Dimension ? 0:1,
NULL, src,
ctx->tess_output_info, 1, ¶m);
if (r)
return r;
- r = do_lds_fetch_values(ctx, temp_reg, dst_reg, mask, param);
+ r = do_lds_fetch_values(ctx, temp_reg, temp_chan, dst_reg, mask, param);
if (r)
return r;
return 0;
@@ -1896,11 +1907,12 @@ static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_re
static int tgsi_full_src_register_equal_for_cache(struct tgsi_full_src_register *lhs,
struct tgsi_full_src_register *rhs)
{
+ if (lhs->Register.File != rhs->Register.File)
+ return 0;
+
if (lhs->Register.Index != rhs->Register.Index)
return 0;
- if (lhs->Register.File != rhs->Register.File)
-
if (lhs->Register.Indirect || rhs->Register.Indirect)
return 0;
@@ -2028,9 +2040,10 @@ static void count_tess_inputs(struct r600_shader_ctx *ctx)
for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
struct tgsi_full_src_register *src = &inst->Src[i];
if (((src->Register.File == TGSI_FILE_INPUT) && (ctx->type == PIPE_SHADER_TESS_EVAL)) ||
- (ctx->type == PIPE_SHADER_TESS_CTRL &&
- (src->Register.File == TGSI_FILE_INPUT || src->Register.File == TGSI_FILE_OUTPUT)))
+ (ctx->type == PIPE_SHADER_TESS_CTRL)) {
tess_input_cache_check(&ctx->tess_input_cache, src);
+ ctx->tess_input_cache.uses_lds_io = 1;
+ }
}
}
@@ -2729,7 +2742,7 @@ static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
0, 0);
if (r)
return r;
-
+
/* used by VS/TCS */
if (ctx->tess_input_info) {
/* fetch tcs input values into resv space */
@@ -2752,12 +2765,13 @@ static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
vtx.dst_sel_w = 3;
vtx.src_gpr = temp_val;
vtx.src_sel_x = 0;
-
+
r = r600_bytecode_add_vtx(ctx->bc, &vtx);
if (r)
return r;
+
}
-
+
/* used by TCS/TES */
if (ctx->tess_output_info) {
/* fetch tcs output values into resv space */
@@ -2784,6 +2798,64 @@ static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
r = r600_bytecode_add_vtx(ctx->bc, &vtx);
if (r)
return r;
+
+ if (ctx->tess_input_cache.uses_lds_io) {
+
+ /* Precalc some offsets, after this we have
+
+ */
+
+ /* tess_io_info_precalc.x = tess_output_info.x * R0.y + tess_output_info.z */
+ r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
+ ctx->tess_io_info_precalc, 0,
+ ctx->tess_output_info, 0,
+ 0, 1,
+ ctx->tess_output_info, 2);
+ if (r)
+ return r;
+
+ /* tess_io_info_precalc.y = tess_output_info.x * R0.y + tess_output_info.w */
+
+ r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
+ ctx->tess_io_info_precalc, 1,
+ ctx->tess_output_info, 0,
+ 0, 1,
+ ctx->tess_output_info, 3);
+ if (r)
+ return r;
+
+
+ /* tess_io_info_precalc.z = tess_output_info.x * R0.z + tess_output_info.z */
+ r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
+ ctx->tess_io_info_precalc, 2,
+ ctx->tess_output_info, 0,
+ 0, 2,
+ ctx->tess_output_info, 2);
+ if (r)
+ return r;
+
+ /* This is a TCS shader */
+ if (ctx->tess_input_info) {
+
+ /* t.x = ips * r0.y */
+ r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
+ ctx->tess_io_info_precalc, 3,
+ ctx->tess_input_info, 0,
+ 0, 1);
+ if (r)
+ return r;
+ } else {
+
+ /* tess_io_info_precalc.w = tess_output_info.x * R0.z + tess_output_info.w */
+ r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
+ ctx->tess_io_info_precalc, 3,
+ ctx->tess_output_info, 0,
+ 0, 2,
+ ctx->tess_output_info, 3);
+ if (r)
+ return r;
+ }
+ }
}
return 0;
}
@@ -2858,8 +2930,10 @@ static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
{
struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
const struct tgsi_full_dst_register *dst = &inst->Dst[0];
- int i, r, lasti;
+ int i, r, lasti, firsti;
int temp_reg = r600_get_temp(ctx);
+ unsigned offs_reg;
+ unsigned offs_chan;
struct r600_bytecode_alu alu;
unsigned write_mask = dst->Register.WriteMask;
int param;
@@ -2867,19 +2941,18 @@ static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
return 0;
- r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true);
- if (r)
- return r;
-
/* the base address is now in temp.x */
- r = r600_get_byte_address(ctx, temp_reg,
+ r = r600_get_byte_address(ctx, &offs_reg, &offs_chan,
+ ctx->tess_io_info_precalc,
+ dst->Register.Dimension ? 0:1,
&inst->Dst[0], NULL, ctx->tess_output_info, 1, ¶m);
if (r)
return r;
+ firsti = param > 0 ? 0 : 1;
/* LDS write */
lasti = tgsi_last_instruction(write_mask);
- for (i = (param > 0 ? 0: 1); i <= lasti; i++) {
+ for (i = firsti; i <= lasti; i++) {
if (!(write_mask & (1 << i)))
continue;
@@ -2888,8 +2961,8 @@ static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
alu.dst.chan = i;
alu.dst.write = 1;
alu.op = ALU_OP2_ADD_INT;
- alu.src[0].sel = temp_reg;
- alu.src[0].chan = 0;
+ alu.src[0].sel = offs_reg;
+ alu.src[0].chan = offs_chan;
alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
alu.src[1].value = 4 * i + 16 * param;
@@ -2909,8 +2982,14 @@ static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
(i == 2 && ((write_mask & 0xc) == 0xc))) {
memset(&alu, 0, sizeof(struct r600_bytecode_alu));
alu.op = LDS_OP3_LDS_WRITE_REL;
- alu.src[0].sel = temp_reg;
- alu.src[0].chan = i;
+
+ if (firsti == 0 || i > 0) {
+ alu.src[0].sel = temp_reg;
+ alu.src[0].chan = i;
+ } else {
+ alu.src[0].sel = offs_reg;
+ alu.src[0].chan = offs_chan;
+ }
alu.src[1].sel = dst->Register.Index;
alu.src[1].sel += ctx->file_offset[dst->Register.File];
@@ -2931,8 +3010,14 @@ static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
}
memset(&alu, 0, sizeof(struct r600_bytecode_alu));
alu.op = LDS_OP2_LDS_WRITE;
- alu.src[0].sel = temp_reg;
- alu.src[0].chan = i;
+
+ if (firsti == 0 || i > 0) {
+ alu.src[0].sel = temp_reg;
+ alu.src[0].chan = i;
+ } else {
+ alu.src[0].sel = offs_reg;
+ alu.src[0].chan = offs_chan;
+ }
alu.src[1].sel = dst->Register.Index;
alu.src[1].sel += ctx->file_offset[dst->Register.File];
@@ -2953,17 +3038,12 @@ static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
int output_idx)
{
int param;
- unsigned temp_reg = r600_get_temp(ctx);
unsigned name = ctx->shader->output[output_idx].name;
int dreg = ctx->shader->output[output_idx].gpr;
- int r;
param = r600_get_lds_unique_index(name, 0);
- r = get_lds_offset0(ctx, 1, temp_reg, true);
- if (r)
- return r;
-
- do_lds_fetch_values(ctx, temp_reg, dreg, 0xf, param);
+
+ do_lds_fetch_values(ctx, ctx->tess_io_info_precalc, 1, dreg, 0xf, param);
return 0;
}
@@ -3293,11 +3373,13 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
if (ctx.type == PIPE_SHADER_TESS_CTRL) {
ctx.tess_input_info = ctx.bc->ar_reg + 3;
ctx.tess_output_info = ctx.bc->ar_reg + 4;
- ctx.temp_reg = ctx.bc->ar_reg + 5;
+ ctx.tess_io_info_precalc = ctx.bc->ar_reg + 5;
+ ctx.temp_reg = ctx.bc->ar_reg + 6;
} else if (ctx.type == PIPE_SHADER_TESS_EVAL) {
ctx.tess_input_info = 0;
ctx.tess_output_info = ctx.bc->ar_reg + 3;
- ctx.temp_reg = ctx.bc->ar_reg + 4;
+ ctx.tess_io_info_precalc = ctx.bc->ar_reg + 4;
+ ctx.temp_reg = ctx.bc->ar_reg + 5;
} else if (ctx.type == PIPE_SHADER_GEOMETRY) {
ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3;
ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4;
@@ -3316,18 +3398,27 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
ctx.temp_reg = ctx.bc->ar_reg + 3;
}
- if (lds_inputs) {
+ ctx.tess_input_cache.uses_lds_io = 0;
+ if (lds_inputs || lds_outputs) {
tgsi_parse_init(&ctx.parse, tokens);
+
while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
tgsi_parse_token(&ctx.parse);
-
- if (ctx.parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION)
- continue;
-
- count_tess_inputs(&ctx);
+ if (ctx.parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION)
+ count_tess_inputs(&ctx);
+ else if (ctx.parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
+ struct tgsi_full_declaration *d = &ctx.parse.FullToken.FullDeclaration;
+ if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE &&
+ (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
+ d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER))
+ ctx.tess_input_cache.uses_lds_io = 1;
+
+ }
}
ctx.temp_reg += tess_input_cache_count_multiused(&ctx.tess_input_cache, ctx.temp_reg);
tgsi_parse_init(&ctx.parse, tokens);
+ } else {
+
}
shader->max_arrays = 0;
--
2.13.6
More information about the mesa-dev
mailing list