[Mesa-dev] [PATCH 6/6] radeonsi: reload PS inputs with direct indexing at each use
Nicolai Hähnle
nhaehnle at gmail.com
Tue Sep 13 18:11:43 UTC 2016
On 13.09.2016 19:13, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> The LLVM compiler can CSE interp intrinsics thanks to
> LLVMReadNoneAttribute.
>
> 26011 shaders in 14651 tests
> Totals:
> SGPRS: 1146340 -> 1132676 (-1.19 %)
> VGPRS: 727371 -> 711730 (-2.15 %)
> Spilled SGPRs: 2218 -> 2078 (-6.31 %)
> Spilled VGPRs: 369 -> 369 (0.00 %)
> Scratch VGPRs: 1344 -> 1344 (0.00 %) dwords per thread
> Code Size: 35841268 -> 36009732 (0.47 %) bytes
> LDS: 767 -> 767 (0.00 %) blocks
> Max Waves: 222559 -> 224779 (1.00 %)
> Wait states: 0 -> 0 (0.00 %)
> ---
> src/gallium/drivers/radeon/radeon_llvm.h | 6 ++++-
> .../drivers/radeon/radeon_setup_tgsi_llvm.c | 28 ++++++++++++++++++----
> src/gallium/drivers/radeonsi/si_shader.c | 27 +++++++++------------
> 3 files changed, 39 insertions(+), 22 deletions(-)
>
> diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
> index da5b7f5..f508d32 100644
> --- a/src/gallium/drivers/radeon/radeon_llvm.h
> +++ b/src/gallium/drivers/radeon/radeon_llvm.h
> @@ -23,21 +23,23 @@
> * Authors: Tom Stellard <thomas.stellard at amd.com>
> *
> */
>
> #ifndef RADEON_LLVM_H
> #define RADEON_LLVM_H
>
> #include <llvm-c/Core.h>
> #include "gallivm/lp_bld_init.h"
> #include "gallivm/lp_bld_tgsi.h"
> +#include "tgsi/tgsi_parse.h"
>
> +#define RADEON_LLVM_MAX_INPUT_SLOTS 32
> #define RADEON_LLVM_MAX_INPUTS 32 * 4
> #define RADEON_LLVM_MAX_OUTPUTS 32 * 4
>
> #define RADEON_LLVM_INITIAL_CF_DEPTH 4
>
> #define RADEON_LLVM_MAX_SYSTEM_VALUES 4
>
> struct radeon_llvm_branch {
> LLVMBasicBlockRef endif_block;
> LLVMBasicBlockRef if_block;
> @@ -55,33 +57,35 @@ struct radeon_llvm_context {
>
> /*=== Front end configuration ===*/
>
> /* Instructions that are not described by any of the TGSI opcodes. */
>
> /** This function is responsible for initilizing the inputs array and will be
> * called once for each input declared in the TGSI shader.
> */
> void (*load_input)(struct radeon_llvm_context *,
> unsigned input_index,
> - const struct tgsi_full_declaration *decl);
> + const struct tgsi_full_declaration *decl,
> + LLVMValueRef out[4]);
>
> void (*load_system_value)(struct radeon_llvm_context *,
> unsigned index,
> const struct tgsi_full_declaration *decl);
>
> void (*declare_memory_region)(struct radeon_llvm_context *,
> const struct tgsi_full_declaration *decl);
>
> /** This array contains the input values for the shader. Typically these
> * values will be in the form of a target intrinsic that will inform the
> * backend how to load the actual inputs to the shader.
> */
> + struct tgsi_full_declaration input_decls[RADEON_LLVM_MAX_INPUT_SLOTS];
> LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS];
> LLVMValueRef outputs[RADEON_LLVM_MAX_OUTPUTS][TGSI_NUM_CHANNELS];
>
> /** This pointer is used to contain the temporary values.
> * The amount of temporary used in tgsi can't be bound to a max value and
> * thus we must allocate this array at runtime.
> */
> LLVMValueRef *temps;
> unsigned temps_count;
> LLVMValueRef system_values[RADEON_LLVM_MAX_SYSTEM_VALUES];
> diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
> index 4643e6d..11f0cf2 100644
> --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
> +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
> @@ -439,28 +439,43 @@ LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base,
> bld_base->int_bld.zero);
> result = LLVMConstInsertElement(result,
> bld->immediates[reg->Register.Index][swizzle + 1],
> bld_base->int_bld.one);
> return LLVMConstBitCast(result, ctype);
> } else {
> return LLVMConstBitCast(bld->immediates[reg->Register.Index][swizzle], ctype);
> }
> }
>
> - case TGSI_FILE_INPUT:
> - result = ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle)];
> + case TGSI_FILE_INPUT: {
> + unsigned index = reg->Register.Index;
> + LLVMValueRef input[4];
> +
> + /* I don't think doing this for vertex shaders is beneficial.
> + * For those, we want to make sure the VMEM loads are executed
> + * only once. Fragment shaders don't care much, because
> + * v_interp instructions are much cheaper than VMEM loads.
> + */
> + if (ctx->soa.bld_base.info->processor == PIPE_SHADER_FRAGMENT)
> + ctx->load_input(ctx, index, &ctx->input_decls[index], input);
> + else
> + memcpy(input, &ctx->inputs[index * 4], sizeof(input));
> +
> + result = input[swizzle];
> +
> if (tgsi_type_is_64bit(type)) {
> ptr = result;
> - ptr2 = ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle + 1)];
> + ptr2 = input[swizzle + 1];
> return radeon_llvm_emit_fetch_64bit(bld_base, type, ptr, ptr2);
> }
> break;
> + }
>
> case TGSI_FILE_TEMPORARY:
> if (reg->Register.Index >= ctx->temps_count)
> return LLVMGetUndef(tgsi2llvmtype(bld_base, type));
> ptr = ctx->temps[reg->Register.Index * TGSI_NUM_CHANNELS + swizzle];
> if (tgsi_type_is_64bit(type)) {
> ptr2 = ctx->temps[reg->Register.Index * TGSI_NUM_CHANNELS + swizzle + 1];
> return radeon_llvm_emit_fetch_64bit(bld_base, type,
> LLVMBuildLoad(builder, ptr, ""),
> LLVMBuildLoad(builder, ptr2, ""));
> @@ -619,22 +634,25 @@ static void emit_declaration(struct lp_build_tgsi_context *bld_base,
> }
> ctx->temps[first * TGSI_NUM_CHANNELS + i] = ptr;
> }
> }
> break;
> }
> case TGSI_FILE_INPUT:
> {
> unsigned idx;
> for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) {
> - if (ctx->load_input)
> - ctx->load_input(ctx, idx, decl);
> + if (ctx->load_input) {
> + ctx->input_decls[idx] = *decl;
> + ctx->load_input(ctx, idx, decl,
> + &ctx->inputs[idx * 4]);
I think it makes sense to remove the call to load_input for fragment
shaders, since it just creates IR that will have to be dead-code
eliminated later.
Cheers
Nicolai
> + }
> }
> }
> break;
>
> case TGSI_FILE_SYSTEM_VALUE:
> {
> unsigned idx;
> for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) {
> ctx->load_system_value(ctx, idx, decl);
> }
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index faa5363..69cc503 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -433,21 +433,22 @@ static LLVMValueRef get_instance_index_for_fetch(
> result = LLVMBuildUDiv(gallivm->builder, result,
> lp_build_const_int32(gallivm, divisor), "");
>
> return LLVMBuildAdd(gallivm->builder, result,
> LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
> }
>
> static void declare_input_vs(
> struct radeon_llvm_context *radeon_bld,
> unsigned input_index,
> - const struct tgsi_full_declaration *decl)
> + const struct tgsi_full_declaration *decl,
> + LLVMValueRef out[4])
> {
> struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
> struct gallivm_state *gallivm = base->gallivm;
> struct si_shader_context *ctx =
> si_shader_context(&radeon_bld->soa.bld_base);
> unsigned divisor =
> ctx->shader->key.vs.prolog.instance_divisors[input_index];
>
> unsigned chan;
>
> @@ -491,25 +492,22 @@ static void declare_input_vs(
> args[0] = t_list;
> args[1] = attribute_offset;
> args[2] = buffer_index;
> input = lp_build_intrinsic(gallivm->builder,
> "llvm.SI.vs.load.input", ctx->v4f32, args, 3,
> LLVMReadNoneAttribute);
>
> /* Break up the vec4 into individual components */
> for (chan = 0; chan < 4; chan++) {
> LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
> - /* XXX: Use a helper function for this. There is one in
> - * tgsi_llvm.c. */
> - ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] =
> - LLVMBuildExtractElement(gallivm->builder,
> - input, llvm_chan, "");
> + out[chan] = LLVMBuildExtractElement(gallivm->builder,
> + input, llvm_chan, "");
> }
> }
>
> static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
> unsigned swizzle)
> {
> struct si_shader_context *ctx = si_shader_context(bld_base);
>
> if (swizzle > 0)
> return bld_base->uint_bld.zero;
> @@ -1456,47 +1454,44 @@ static LLVMValueRef get_interp_param(struct si_shader_context *ctx,
> }
>
> if (!param)
> param = LLVMGetParam(main_fn, interp_param_idx);
> return param;
> }
>
> static void declare_input_fs(
> struct radeon_llvm_context *radeon_bld,
> unsigned input_index,
> - const struct tgsi_full_declaration *decl)
> + const struct tgsi_full_declaration *decl,
> + LLVMValueRef out[4])
> {
> struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
> struct si_shader_context *ctx =
> si_shader_context(&radeon_bld->soa.bld_base);
> struct si_shader *shader = ctx->shader;
> LLVMValueRef main_fn = radeon_bld->main_fn;
> LLVMValueRef interp_param = NULL;
> int interp_param_idx;
>
> /* Get colors from input VGPRs (set by the prolog). */
> if (!ctx->is_monolithic &&
> decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
> unsigned i = decl->Semantic.Index;
> unsigned colors_read = shader->selector->info.colors_read;
> unsigned mask = colors_read >> (i * 4);
> unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
> (i ? util_bitcount(colors_read & 0xf) : 0);
>
> - radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
> - mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
> - radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
> - mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
> - radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
> - mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
> - radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
> - mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
> + out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
> + out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
> + out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
> + out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
> return;
> }
>
> interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
> decl->Interp.Location);
> if (interp_param_idx == -1)
> return;
> else if (interp_param_idx) {
> interp_param_idx = select_interp_param(ctx,
> interp_param_idx);
> @@ -1506,21 +1501,21 @@ static void declare_input_fs(
> if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
> decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
> ctx->shader->key.ps.prolog.flatshade_colors)
> interp_param = NULL; /* load the constant color */
>
> interp_fs_input(ctx, input_index, decl->Semantic.Name,
> decl->Semantic.Index, shader->selector->info.num_inputs,
> shader->selector->info.colors_read, interp_param,
> LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
> LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
> - &radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]);
> + &out[0]);
> }
>
> static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)
> {
> return unpack_param(si_shader_context(&radeon_bld->soa.bld_base),
> SI_PARAM_ANCILLARY, 8, 4);
> }
>
> /**
> * Set range metadata on an instruction. This can only be used on load and
>
More information about the mesa-dev
mailing list