[Mesa-dev] [PATCH 3/9] radeonsi: optimize TCS epilog when invocation 0 writes tess factors
Marek Olšák
maraeo at gmail.com
Wed Sep 6 17:03:54 UTC 2017
From: Marek Olšák <marek.olsak at amd.com>
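This removes the barrier and the LDS stores and loads for tess factors
when possible, i.e. when invocation 0 defines the tess factors, which are
then passed to the epilog in VGPRs instead. Of the two, the removal of the
barrier seems more important to me.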
In one shader, it removes 17 * 4 bytes from the shader binary.
---
src/gallium/drivers/radeonsi/si_shader.c | 111 ++++++++++++++++------
src/gallium/drivers/radeonsi/si_shader.h | 2 +
src/gallium/drivers/radeonsi/si_shader_internal.h | 1 +
src/gallium/drivers/radeonsi/si_state_shaders.c | 3 +
4 files changed, 89 insertions(+), 28 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 861d82f..de58737 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1084,21 +1084,21 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
{
struct si_shader_context *ctx = si_shader_context(bld_base);
struct gallivm_state *gallivm = &ctx->gallivm;
const struct tgsi_full_dst_register *reg = &inst->Dst[0];
const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
unsigned chan_index;
LLVMValueRef dw_addr, stride;
LLVMValueRef buffer, base, buf_addr;
LLVMValueRef values[4];
bool skip_lds_store;
- bool is_tess_factor = false;
+ bool is_tess_factor = false, is_tess_inner = false;
/* Only handle per-patch and per-vertex outputs here.
* Vectors will be lowered to scalars and this function will be called again.
*/
if (reg->Register.File != TGSI_FILE_OUTPUT ||
(dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
si_llvm_emit_store(bld_base, inst, info, dst);
return;
}
@@ -1111,22 +1111,25 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
dw_addr = get_tcs_out_current_patch_data_offset(ctx);
dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
skip_lds_store = !sh_info->reads_perpatch_outputs;
if (!reg->Register.Indirect) {
int name = sh_info->output_semantic_name[reg->Register.Index];
/* Always write tess factors into LDS for the TCS epilog. */
if (name == TGSI_SEMANTIC_TESSINNER ||
name == TGSI_SEMANTIC_TESSOUTER) {
- skip_lds_store = false;
+ /* The epilog doesn't read LDS if invocation 0 defines tess factors. */
+ skip_lds_store = !sh_info->reads_tessfactor_outputs &&
+ ctx->shader->selector->tcs_info.invoc0_tessfactors_are_def;
is_tess_factor = true;
+ is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
}
}
}
buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);
@@ -1141,20 +1144,32 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
lds_store(bld_base, chan_index, dw_addr, value);
value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
values[chan_index] = value;
if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) {
ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
buf_addr, base,
4 * chan_index, 1, 0, true, false);
}
+
+ /* Write tess factors into VGPRs for the epilog. */
+ if (is_tess_factor &&
+ ctx->shader->selector->tcs_info.invoc0_tessfactors_are_def) {
+ if (!is_tess_inner) {
+ LLVMBuildStore(gallivm->builder, value, /* outer */
+ ctx->invoc0_tess_factors[chan_index]);
+ } else if (chan_index < 2) {
+ LLVMBuildStore(gallivm->builder, value, /* inner */
+ ctx->invoc0_tess_factors[4 + chan_index]);
+ }
+ }
}
if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
LLVMValueRef value = lp_build_gather_values(gallivm,
values, 4);
ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
base, 0, 1, 0, true, false);
}
}
@@ -2605,32 +2620,36 @@ static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
lds_ptr);
ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
buffer_offset, 0, 1, 0, true, false);
}
}
static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
LLVMValueRef rel_patch_id,
LLVMValueRef invocation_id,
- LLVMValueRef tcs_out_current_patch_data_offset)
+ LLVMValueRef tcs_out_current_patch_data_offset,
+ LLVMValueRef invoc0_tf_outer[4],
+ LLVMValueRef invoc0_tf_inner[2])
{
struct si_shader_context *ctx = si_shader_context(bld_base);
struct gallivm_state *gallivm = &ctx->gallivm;
struct si_shader *shader = ctx->shader;
unsigned tess_inner_index, tess_outer_index;
LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
unsigned stride, outer_comps, inner_comps, i, offset;
struct lp_build_if_state if_ctx, inner_if_ctx;
- si_llvm_emit_barrier(NULL, bld_base, NULL);
+ /* Add a barrier before loading tess factors from LDS. */
+ if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def)
+ si_llvm_emit_barrier(NULL, bld_base, NULL);
/* Do this only for invocation 0, because the tess levels are per-patch,
* not per-vertex.
*
* This can't jump, because invocation 0 executes this. It should
* at least mask out the loads and stores for other invocations.
*/
lp_build_if(&if_ctx, gallivm,
LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
invocation_id, ctx->i32_0, ""));
@@ -2650,56 +2669,65 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
case PIPE_PRIM_QUADS:
stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
outer_comps = 4;
inner_comps = 2;
break;
default:
assert(0);
return;
}
- /* Load tess_inner and tess_outer from LDS.
- * Any invocation can write them, so we can't get them from a temporary.
- */
- tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
- tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
-
- lds_base = tcs_out_current_patch_data_offset;
- lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
- LLVMConstInt(ctx->i32,
- tess_inner_index * 4, 0), "");
- lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
- LLVMConstInt(ctx->i32,
- tess_outer_index * 4, 0), "");
-
for (i = 0; i < 4; i++) {
inner[i] = LLVMGetUndef(ctx->i32);
outer[i] = LLVMGetUndef(ctx->i32);
}
- if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
- /* For isolines, the hardware expects tess factors in the
- * reverse order from what GLSL / TGSI specify.
- */
- outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
- outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
+ if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) {
+ /* Tess factors are in VGPRs. */
+ for (i = 0; i < outer_comps; i++)
+ outer[i] = out[i] = invoc0_tf_outer[i];
+ for (i = 0; i < inner_comps; i++)
+ inner[i] = out[outer_comps+i] = invoc0_tf_inner[i];
} else {
+ /* Load tess_inner and tess_outer from LDS.
+ * Any invocation can write them, so we can't get them from a temporary.
+ */
+ tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
+ tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
+
+ lds_base = tcs_out_current_patch_data_offset;
+ lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
+ LLVMConstInt(ctx->i32,
+ tess_inner_index * 4, 0), "");
+ lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
+ LLVMConstInt(ctx->i32,
+ tess_outer_index * 4, 0), "");
+
for (i = 0; i < outer_comps; i++) {
outer[i] = out[i] =
lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
}
for (i = 0; i < inner_comps; i++) {
inner[i] = out[outer_comps+i] =
lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
}
}
+ if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
+ /* For isolines, the hardware expects tess factors in the
+ * reverse order from what GLSL / TGSI specify.
+ */
+ LLVMValueRef tmp = out[0];
+ out[0] = out[1];
+ out[1] = tmp;
+ }
+
/* Convert the outputs to vectors for stores. */
vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
vec1 = NULL;
if (stride > 4)
vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
/* Get the buffer. */
buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_factor_addr_base64k);
@@ -2880,21 +2908,32 @@ static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
/* Leave a hole corresponding to the two input VGPRs. This ensures that
* the invocation_id output does not alias the param_tcs_rel_ids input,
* which saves a V_MOV on gfx9.
*/
vgpr += 2;
ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
- ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
+
+ if (ctx->shader->selector->tcs_info.invoc0_tessfactors_are_def) {
+ vgpr++; /* skip the tess factor LDS offset */
+ for (unsigned i = 0; i < 6; i++) {
+ LLVMValueRef value =
+ LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], "");
+ value = bitcast(bld_base, TGSI_TYPE_FLOAT, value);
+ ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, "");
+ }
+ } else {
+ ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
+ }
ctx->return_value = ret;
}
/* Pass TCS inputs from LS to TCS on GFX9. */
static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
{
LLVMValueRef ret = ctx->return_value;
ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
@@ -4373,21 +4412,21 @@ static void create_function(struct si_shader_context *ctx)
/* VGPRs */
ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32);
/* param_tcs_offchip_offset and param_tcs_factor_offset are
* placed after the user SGPRs.
*/
for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
returns[num_returns++] = ctx->i32; /* SGPRs */
- for (i = 0; i < 5; i++)
+ for (i = 0; i < 11; i++)
returns[num_returns++] = ctx->f32; /* VGPRs */
break;
case SI_SHADER_MERGED_VERTEX_TESSCTRL:
/* Merged stages have 8 system SGPRs at the beginning. */
ctx->param_rw_buffers = /* SPI_SHADER_USER_DATA_ADDR_LO_HS */
add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
@@ -4430,21 +4469,21 @@ static void create_function(struct si_shader_context *ctx)
returns[num_returns++] = ctx->f32; /* VGPRs */
} else {
/* TCS return values are inputs to the TCS epilog.
*
* param_tcs_offchip_offset, param_tcs_factor_offset,
* param_tcs_offchip_layout, and param_rw_buffers
* should be passed to the epilog.
*/
for (i = 0; i <= 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K; i++)
returns[num_returns++] = ctx->i32; /* SGPRs */
- for (i = 0; i < 5; i++)
+ for (i = 0; i < 11; i++)
returns[num_returns++] = ctx->f32; /* VGPRs */
}
break;
case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
/* Merged stages have 8 system SGPRs at the beginning. */
ctx->param_rw_buffers = /* SPI_SHADER_USER_DATA_ADDR_LO_GS */
add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
@@ -5735,20 +5774,28 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx,
si_llvm_emit_barrier(NULL, bld_base, NULL);
LLVMValueRef num_threads = unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
LLVMValueRef ena =
LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
ac_get_thread_id(&ctx->ac), num_threads, "");
lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena);
}
}
+ if (ctx->type == PIPE_SHADER_TESS_CTRL &&
+ sel->tcs_info.invoc0_tessfactors_are_def) {
+ for (unsigned i = 0; i < 6; i++) {
+ ctx->invoc0_tess_factors[i] =
+ lp_build_alloca_undef(&ctx->gallivm, ctx->i32, "");
+ }
+ }
+
if (ctx->type == PIPE_SHADER_GEOMETRY) {
int i;
for (i = 0; i < 4; i++) {
ctx->gs_next_vertex[i] =
lp_build_alloca(&ctx->gallivm,
ctx->i32, "");
}
}
if (ctx->type == PIPE_SHADER_FRAGMENT && sel->info.uses_kill &&
@@ -6969,30 +7016,38 @@ static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
}
add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
unsigned tess_factors_idx =
add_arg(&fninfo, ARG_VGPR, ctx->i32); /* patch index within the wave (REL_PATCH_ID) */
add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch */
add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors should be loaded from */
+ for (unsigned i = 0; i < 6; i++)
+ add_arg(&fninfo, ARG_VGPR, ctx->i32); /* tess factors */
+
/* Create the function. */
si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo,
ctx->screen->b.chip_class >= CIK ? 128 : 64);
declare_lds_as_pointer(ctx);
func = ctx->main_fn;
+ LLVMValueRef invoc0_tess_factors[6];
+ for (unsigned i = 0; i < 6; i++)
+ invoc0_tess_factors[i] = LLVMGetParam(func, tess_factors_idx + 3 + i);
+
si_write_tess_factors(bld_base,
LLVMGetParam(func, tess_factors_idx),
LLVMGetParam(func, tess_factors_idx + 1),
- LLVMGetParam(func, tess_factors_idx + 2));
+ LLVMGetParam(func, tess_factors_idx + 2),
+ invoc0_tess_factors, invoc0_tess_factors + 4);
LLVMBuildRetVoid(gallivm->builder);
}
/**
* Select and compile (or reuse) TCS parts (epilog).
*/
static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
LLVMTargetMachineRef tm,
struct si_shader *shader,
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index ee6b0c1..4592ac5 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -320,20 +320,21 @@ struct si_shader_selector {
struct si_shader *main_shader_part;
struct si_shader *main_shader_part_ls; /* as_ls is set in the key */
struct si_shader *main_shader_part_es; /* as_es is set in the key */
struct si_shader *gs_copy_shader;
struct tgsi_token *tokens;
struct nir_shader *nir;
struct pipe_stream_output_info so;
struct tgsi_shader_info info;
+ struct tgsi_tessctrl_info tcs_info;
/* PIPE_SHADER_[VERTEX|FRAGMENT|...] */
unsigned type;
bool vs_needs_prolog;
unsigned pa_cl_vs_out_cntl;
ubyte clipdist_mask;
ubyte culldist_mask;
/* GS parameters. */
unsigned esgs_itemsize;
@@ -397,20 +398,21 @@ struct si_vs_prolog_bits {
* from the constant buffer.
*/
uint16_t instance_divisor_is_one; /* bitmask of inputs */
uint16_t instance_divisor_is_fetched; /* bitmask of inputs */
unsigned ls_vgpr_fix:1;
};
/* Common TCS bits between the shader key and the epilog key. */
struct si_tcs_epilog_bits {
unsigned prim_mode:3;
+ unsigned invoc0_tess_factors_are_def:1;
unsigned tes_reads_tess_factors:1;
};
struct si_gs_prolog_bits {
unsigned tri_strip_adj_fix:1;
};
/* Common PS bits between the shader key and the prolog key. */
struct si_ps_prolog_bits {
unsigned color_two_side:1;
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index 1231ef4..4ae8d85 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -204,20 +204,21 @@ struct si_shader_context {
unsigned range_md_kind;
unsigned fpmath_md_kind;
LLVMValueRef fpmath_md_2p5_ulp;
/* Preloaded descriptors. */
LLVMValueRef esgs_ring;
LLVMValueRef gsvs_ring[4];
LLVMValueRef lds;
+ LLVMValueRef invoc0_tess_factors[6]; /* outer[4], inner[2] */
LLVMValueRef gs_next_vertex[4];
LLVMValueRef postponed_kill;
LLVMValueRef return_value;
LLVMTypeRef voidt;
LLVMTypeRef i1;
LLVMTypeRef i8;
LLVMTypeRef i32;
LLVMTypeRef i64;
LLVMTypeRef i128;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index d8791a2..25fcead 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1289,20 +1289,22 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
* - avoid initializing EXEC in both the LS prolog
* and the LS main part when !vs_needs_prolog
* - remove the fixup for unused input VGPRs
*/
key->part.tcs.ls_prolog.ls_vgpr_fix = sctx->ls_vgpr_fix;
key->opt.prefer_mono = sctx->ls_vgpr_fix;
}
key->part.tcs.epilog.prim_mode =
sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
+ key->part.tcs.epilog.invoc0_tess_factors_are_def =
+ sel->tcs_info.invoc0_tessfactors_are_def;
key->part.tcs.epilog.tes_reads_tess_factors =
sctx->tes_shader.cso->info.reads_tess_factors;
if (sel == sctx->fixed_func_tcs_shader.cso)
key->mono.u.ff_tcs_inputs_to_copy = sctx->vs_shader.cso->outputs_written;
break;
case PIPE_SHADER_TESS_EVAL:
if (sctx->gs_shader.cso)
key->as_es = 1;
else {
@@ -1992,20 +1994,21 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
sel->so = state->stream_output;
if (state->type == PIPE_SHADER_IR_TGSI) {
sel->tokens = tgsi_dup_tokens(state->tokens);
if (!sel->tokens) {
FREE(sel);
return NULL;
}
tgsi_scan_shader(state->tokens, &sel->info);
+ tgsi_scan_tess_ctrl(state->tokens, &sel->info, &sel->tcs_info);
} else {
assert(state->type == PIPE_SHADER_IR_NIR);
sel->nir = state->ir.nir;
si_nir_scan_shader(sel->nir, &sel->info);
si_lower_nir(sel);
}
--
2.7.4
More information about the mesa-dev
mailing list