[Mesa-dev] [PATCH 10/10] radeonsi/gfx9: add workarounds to avoid VGPR indexing completely
Nicolai Hähnle
nhaehnle at gmail.com
Sun Jul 16 09:27:16 UTC 2017
Patches 2 - 10:
Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>
On 08.07.2017 02:42, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> For inputs and outputs, indirect indexing is lowered by the GLSL compiler.
> For temporaries, use alloca and disable the "promote-alloca" pass.
>
> In the future, we could switch all codepaths to alloca permanently and
> just rely on the "promote-alloca" pass.
> ---
> src/gallium/drivers/radeonsi/si_pipe.c | 25 ++++++++++++++++------
> src/gallium/drivers/radeonsi/si_pipe.h | 1 +
> .../drivers/radeonsi/si_shader_tgsi_setup.c | 3 +--
> 3 files changed, 21 insertions(+), 8 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
> index afb2bcb..8a4bc41 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.c
> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> @@ -134,22 +134,23 @@ static void si_emit_string_marker(struct pipe_context *ctx,
> dd_parse_apitrace_marker(string, len, &sctx->apitrace_call_number);
> }
>
> static LLVMTargetMachineRef
> si_create_llvm_target_machine(struct si_screen *sscreen)
> {
> const char *triple = "amdgcn--";
> char features[256];
>
> snprintf(features, sizeof(features),
> - "+DumpCode,+vgpr-spilling,-fp32-denormals,+fp64-denormals%s%s",
> + "+DumpCode,+vgpr-spilling,-fp32-denormals,+fp64-denormals%s%s%s",
> sscreen->b.chip_class >= GFX9 ? ",+xnack" : ",-xnack",
> + sscreen->llvm_has_working_vgpr_indexing ? "" : ",-promote-alloca",
> sscreen->b.debug_flags & DBG_SI_SCHED ? ",+si-scheduler" : "");
>
> return LLVMCreateTargetMachine(ac_get_llvm_target(triple), triple,
> r600_get_llvm_processor_name(sscreen->b.family),
> features,
> LLVMCodeGenLevelDefault,
> LLVMRelocDefault,
> LLVMCodeModelDefault);
> }
>
> @@ -750,34 +751,41 @@ static int si_get_shader_param(struct pipe_screen* pscreen,
> case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
> return 32;
> case PIPE_SHADER_CAP_PREFERRED_IR:
> return PIPE_SHADER_IR_TGSI;
> case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
> return 3;
>
> /* Supported boolean features. */
> case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
> case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
> - case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
> case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
> case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
> case PIPE_SHADER_CAP_INTEGERS:
> case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
> case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
> case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
> return 1;
>
> case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
> - /* TODO: Indirection of geometry shader input dimension is not
> - * handled yet
> - */
> - return shader != PIPE_SHADER_GEOMETRY;
> + /* TODO: Indirect indexing of GS inputs is unimplemented. */
> + return shader != PIPE_SHADER_GEOMETRY &&
> + (sscreen->llvm_has_working_vgpr_indexing ||
> + /* TCS and TES load inputs directly from LDS or
> + * offchip memory, so indirect indexing is trivial. */
> + shader == PIPE_SHADER_TESS_CTRL ||
> + shader == PIPE_SHADER_TESS_EVAL);
> +
> + case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
> + return sscreen->llvm_has_working_vgpr_indexing ||
> + /* TCS stores outputs directly to memory. */
> + shader == PIPE_SHADER_TESS_CTRL;
>
> /* Unsupported boolean features. */
> case PIPE_SHADER_CAP_SUBROUTINES:
> case PIPE_SHADER_CAP_SUPPORTED_IRS:
> case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
> case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
> return 0;
> }
> return 0;
> }
> @@ -999,20 +1007,25 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
> sscreen->b.info.me_fw_version >= 173) ||
> (sscreen->b.chip_class == SI &&
> sscreen->b.info.pfp_fw_version >= 121 &&
> sscreen->b.info.me_fw_version >= 87);
>
> sscreen->has_ds_bpermute = sscreen->b.chip_class >= VI;
> sscreen->has_msaa_sample_loc_bug = (sscreen->b.family >= CHIP_POLARIS10 &&
> sscreen->b.family <= CHIP_POLARIS12) ||
> sscreen->b.family == CHIP_VEGA10 ||
> sscreen->b.family == CHIP_RAVEN;
> + /* While it would be nice not to have this flag, we are constrained
> + * by the reality that LLVM 5.0 doesn't have working VGPR indexing
> + * on GFX9.
> + */
> + sscreen->llvm_has_working_vgpr_indexing = sscreen->b.chip_class <= VI;
>
> sscreen->b.has_cp_dma = true;
> sscreen->b.has_streamout = true;
>
> /* Some chips have RB+ registers, but don't support RB+. Those must
> * always disable it.
> */
> if (sscreen->b.family == CHIP_STONEY ||
> sscreen->b.chip_class >= GFX9) {
> sscreen->b.has_rbplus = true;
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index bd724e8..c028aba 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -76,20 +76,21 @@ struct hash_table;
> struct u_suballocator;
>
> struct si_screen {
> struct r600_common_screen b;
> unsigned gs_table_depth;
> unsigned tess_offchip_block_dw_size;
> bool has_distributed_tess;
> bool has_draw_indirect_multi;
> bool has_ds_bpermute;
> bool has_msaa_sample_loc_bug;
> + bool llvm_has_working_vgpr_indexing;
>
> /* Whether shaders are monolithic (1-part) or separate (3-part). */
> bool use_monolithic_shaders;
> bool record_llvm_ir;
>
> mtx_t shader_parts_mutex;
> struct si_shader_part *vs_prologs;
> struct si_shader_part *tcs_epilogs;
> struct si_shader_part *gs_prologs;
> struct si_shader_part *ps_prologs;
> diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c
> index b37d4b2..9c4a234 100644
> --- a/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c
> +++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c
> @@ -748,22 +748,21 @@ static void emit_declaration(struct lp_build_tgsi_context *bld_base,
> * LLVM will store in a register, so theoretically an
> * array with up to 4 * 16 = 64 elements could be
> * handled this way, but whether that's a good idea
> * depends on VGPR register pressure elsewhere.
> *
> * FIXME: We shouldn't need to have the non-alloca
> * code path for arrays. LLVM should be smart enough to
> * promote allocas into registers when profitable.
> */
> if (array_size > 16 ||
> - /* TODO: VGPR indexing is buggy on GFX9. */
> - ctx->screen->b.chip_class == GFX9) {
> + !ctx->screen->llvm_has_working_vgpr_indexing) {
> array_alloca = LLVMBuildAlloca(builder,
> LLVMArrayType(ctx->f32,
> array_size), "array");
> ctx->temp_array_allocas[id] = array_alloca;
> }
> }
>
> if (!ctx->temps_count) {
> ctx->temps_count = bld_base->info->file_max[TGSI_FILE_TEMPORARY] + 1;
> ctx->temps = MALLOC(TGSI_NUM_CHANNELS * ctx->temps_count * sizeof(LLVMValueRef));
>
--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
More information about the mesa-dev mailing list.