[Mesa-dev] [PATCH 2/2] [rfc] radv: inline push constants where possible. (v2)
Dave Airlie
airlied at gmail.com
Fri Jan 12 02:43:14 UTC 2018
From: Dave Airlie <airlied at redhat.com>
Instead of putting the push constants into the upload buffer,
if we have spare user SGPRs we can load the per-stage
constants into the shaders directly.
This saves a few reads from memory in the meta shaders,
we should also be able to inline other objects like
descriptors.
v2: fixup 16->available_sgprs (Samuel)
fixup dynamic offsets. (Alex)
bump to 12.
handle push consts > 32 better, avoid F1 2017 crash
TODO: proper vega support (Samuel)
Signed-off-by: Dave Airlie <airlied at redhat.com>
---
src/amd/common/ac_nir_to_llvm.c | 102 +++++++++++++++++++++++++++++++++++----
src/amd/common/ac_nir_to_llvm.h | 5 ++
src/amd/common/ac_shader_info.c | 5 +-
src/amd/common/ac_shader_info.h | 1 +
src/amd/vulkan/radv_cmd_buffer.c | 75 +++++++++++++++++++++-------
5 files changed, 159 insertions(+), 29 deletions(-)
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 6c578de3aca..00ad76a82f7 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -92,6 +92,7 @@ struct nir_to_llvm_context {
LLVMValueRef descriptor_sets[AC_UD_MAX_SETS];
LLVMValueRef ring_offsets;
LLVMValueRef push_constants;
+ LLVMValueRef inline_push_consts[AC_UD_MAX_INLINE_PUSH_CONST];
LLVMValueRef view_index;
LLVMValueRef num_work_groups;
LLVMValueRef workgroup_ids[3];
@@ -243,7 +244,7 @@ static void set_llvm_calling_convention(LLVMValueRef func,
LLVMSetFunctionCallConv(func, calling_conv);
}
-#define MAX_ARGS 23
+#define MAX_ARGS 32
struct arg_info {
LLVMTypeRef types[MAX_ARGS];
LLVMValueRef *assign[MAX_ARGS];
@@ -538,6 +539,8 @@ struct user_sgpr_info {
bool need_ring_offsets;
uint8_t sgpr_count;
bool indirect_all_descriptor_sets;
+ uint8_t base_inline_push_consts;
+ uint8_t num_inline_push_consts;
};
static void allocate_user_sgprs(struct nir_to_llvm_context *ctx,
@@ -609,8 +612,49 @@ static void allocate_user_sgprs(struct nir_to_llvm_context *ctx,
} else {
user_sgpr_info->sgpr_count += util_bitcount(ctx->shader_info->info.desc_set_used_mask) * 2;
}
+
+ if (ctx->shader_info->info.loads_push_constants) {
+ uint32_t remaining_sgprs = available_sgprs - user_sgpr_info->sgpr_count;
+ if (!ctx->shader_info->info.has_indirect_push_constants &&
+ !ctx->shader_info->info.loads_dynamic_offsets)
+ remaining_sgprs += 2;
+
+ if (ctx->options->layout->push_constant_size) {
+ uint8_t num_32bit_push_consts = (ctx->shader_info->info.max_push_constant_used -
+ ctx->shader_info->info.min_push_constant_used) / 4;
+
+ if ((ctx->shader_info->info.min_push_constant_used / 4) <= 63 &&
+ (ctx->shader_info->info.max_push_constant_used / 4) <= 63) {
+ user_sgpr_info->base_inline_push_consts = ctx->shader_info->info.min_push_constant_used / 4;
+
+ if (num_32bit_push_consts < remaining_sgprs) {
+ user_sgpr_info->num_inline_push_consts = num_32bit_push_consts;
+ if (!ctx->shader_info->info.has_indirect_push_constants)
+ ctx->shader_info->info.loads_push_constants = false;
+ } else {
+ user_sgpr_info->num_inline_push_consts = remaining_sgprs;
+ }
+
+ if (user_sgpr_info->num_inline_push_consts > AC_UD_MAX_INLINE_PUSH_CONST)
+ user_sgpr_info->num_inline_push_consts = AC_UD_MAX_INLINE_PUSH_CONST;
+ }
+ }
+ }
}
+static void
+declare_inline_push_consts(struct nir_to_llvm_context *ctx,
+ gl_shader_stage stage,
+ const struct user_sgpr_info *user_sgpr_info,
+ struct arg_info *args)
+{
+ ctx->shader_info->inline_push_const_mask = (1ULL << user_sgpr_info->num_inline_push_consts) - 1;
+ ctx->shader_info->inline_push_const_base = user_sgpr_info->base_inline_push_consts;
+
+ for (unsigned i = 0; i < user_sgpr_info->num_inline_push_consts; i++)
+ add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->inline_push_consts[i]);
+
+}
static void
declare_global_input_sgprs(struct nir_to_llvm_context *ctx,
gl_shader_stage stage,
@@ -640,10 +684,14 @@ declare_global_input_sgprs(struct nir_to_llvm_context *ctx,
add_array_arg(args, const_array(type, 32), desc_sets);
}
- if (ctx->shader_info->info.loads_push_constants) {
+ if (ctx->shader_info->info.loads_push_constants ||
+ ctx->shader_info->info.loads_dynamic_offsets) {
/* 1 for push constants and dynamic descriptors */
add_array_arg(args, type, &ctx->push_constants);
}
+
+ if (!((stage == MESA_SHADER_VERTEX) || (has_previous_stage && previous_stage == MESA_SHADER_VERTEX)))
+ declare_inline_push_consts(ctx, stage, user_sgpr_info, args);
}
static void
@@ -651,6 +699,7 @@ declare_vs_specific_input_sgprs(struct nir_to_llvm_context *ctx,
gl_shader_stage stage,
bool has_previous_stage,
gl_shader_stage previous_stage,
+ const struct user_sgpr_info *user_sgpr_info,
struct arg_info *args)
{
if (!ctx->is_gs_copy_shader &&
@@ -660,6 +709,7 @@ declare_vs_specific_input_sgprs(struct nir_to_llvm_context *ctx,
add_arg(args, ARG_SGPR, const_array(ctx->ac.v4i32, 16),
&ctx->vertex_buffers);
}
+ declare_inline_push_consts(ctx, stage, user_sgpr_info, args);
add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.base_vertex);
add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.start_instance);
if (ctx->shader_info->info.vs.needs_draw_id) {
@@ -693,6 +743,16 @@ declare_tes_input_vgprs(struct nir_to_llvm_context *ctx, struct arg_info *args)
add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.tes_patch_id);
}
+static void
+set_inline_pushconst_locs(struct nir_to_llvm_context *ctx,
+ const struct user_sgpr_info *user_sgpr_info,
+ uint8_t *user_sgpr_idx)
+{
+ ctx->shader_info->user_sgprs_locs.push_const_base = user_sgpr_info->base_inline_push_consts;
+ for (unsigned i = 0; i < user_sgpr_info->num_inline_push_consts; i++)
+ set_loc(&ctx->shader_info->user_sgprs_locs.inline_push_consts[i], user_sgpr_idx, 1, 0);
+}
+
static void
set_global_input_locs(struct nir_to_llvm_context *ctx, gl_shader_stage stage,
bool has_previous_stage, gl_shader_stage previous_stage,
@@ -731,15 +791,21 @@ set_global_input_locs(struct nir_to_llvm_context *ctx, gl_shader_stage stage,
ctx->shader_info->need_indirect_descriptor_sets = true;
}
- if (ctx->shader_info->info.loads_push_constants) {
+ if (ctx->shader_info->info.loads_push_constants ||
+ ctx->shader_info->info.loads_dynamic_offsets) {
set_loc_shader(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx, 2);
}
+
+
+ if (!((stage == MESA_SHADER_VERTEX) || (has_previous_stage && previous_stage == MESA_SHADER_VERTEX)))
+ set_inline_pushconst_locs(ctx, user_sgpr_info, user_sgpr_idx);
}
static void
set_vs_specific_input_locs(struct nir_to_llvm_context *ctx,
gl_shader_stage stage, bool has_previous_stage,
gl_shader_stage previous_stage,
+ const struct user_sgpr_info *user_sgpr_info,
uint8_t *user_sgpr_idx)
{
if (!ctx->is_gs_copy_shader &&
@@ -750,6 +816,7 @@ set_vs_specific_input_locs(struct nir_to_llvm_context *ctx,
user_sgpr_idx, 2);
}
+ set_inline_pushconst_locs(ctx, user_sgpr_info, user_sgpr_idx);
unsigned vs_num = 2;
if (ctx->shader_info->info.vs.needs_draw_id)
vs_num++;
@@ -805,7 +872,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
previous_stage, &user_sgpr_info,
&args, &desc_sets);
declare_vs_specific_input_sgprs(ctx, stage, has_previous_stage,
- previous_stage, &args);
+ previous_stage, &user_sgpr_info, &args);
if (ctx->shader_info->info.needs_multiview_view_index || (!ctx->options->key.vs.as_es && !ctx->options->key.vs.as_ls && ctx->options->key.has_multiview_view_index))
add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->view_index);
@@ -838,7 +905,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
&desc_sets);
declare_vs_specific_input_sgprs(ctx, stage,
has_previous_stage,
- previous_stage, &args);
+ previous_stage, &user_sgpr_info, &args);
add_arg(&args, ARG_SGPR, ctx->ac.i32,
&ctx->ls_out_layout);
@@ -934,7 +1001,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
} else {
declare_vs_specific_input_sgprs(ctx, stage,
has_previous_stage,
- previous_stage,
+ previous_stage, &user_sgpr_info,
&args);
}
@@ -1076,7 +1143,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
break;
case MESA_SHADER_VERTEX:
set_vs_specific_input_locs(ctx, stage, has_previous_stage,
- previous_stage, &user_sgpr_idx);
+ previous_stage, &user_sgpr_info, &user_sgpr_idx);
if (ctx->view_index)
set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
if (ctx->options->key.vs.as_ls) {
@@ -1088,7 +1155,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
break;
case MESA_SHADER_TESS_CTRL:
set_vs_specific_input_locs(ctx, stage, has_previous_stage,
- previous_stage, &user_sgpr_idx);
+ previous_stage, &user_sgpr_info, &user_sgpr_idx);
if (has_previous_stage)
set_loc_shader(ctx, AC_UD_VS_LS_TCS_IN_LAYOUT,
&user_sgpr_idx, 1);
@@ -1108,6 +1175,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
set_vs_specific_input_locs(ctx, stage,
has_previous_stage,
previous_stage,
+ &user_sgpr_info,
&user_sgpr_idx);
else
set_loc_shader(ctx, AC_UD_TES_OFFCHIP_LAYOUT,
@@ -2371,9 +2439,23 @@ static LLVMValueRef visit_load_push_constant(struct nir_to_llvm_context *ctx,
nir_intrinsic_instr *instr)
{
LLVMValueRef ptr, addr;
+ LLVMValueRef src0 = get_src(ctx->nir, instr->src[0]);
+ unsigned index = nir_intrinsic_base(instr);
+
+ if (LLVMIsConstant(src0)) {
+ unsigned array_index = index;
+ array_index += LLVMConstIntGetZExtValue(src0);
+ array_index /= 4;
+
+ array_index -= ctx->shader_info->inline_push_const_base;
+ uint64_t bits = (((1ULL << instr->num_components) - 1) << array_index);
+ if ((bits & ctx->shader_info->inline_push_const_mask) == bits) {
+ return ac_build_gather_values(&ctx->ac, &ctx->inline_push_consts[array_index], instr->num_components);
+ }
+ }
- addr = LLVMConstInt(ctx->ac.i32, nir_intrinsic_base(instr), 0);
- addr = LLVMBuildAdd(ctx->builder, addr, get_src(ctx->nir, instr->src[0]), "");
+ addr = LLVMConstInt(ctx->ac.i32, index, 0);
+ addr = LLVMBuildAdd(ctx->builder, addr, src0, "");
ptr = ac_build_gep0(&ctx->ac, ctx->push_constants, addr);
ptr = cast_ptr(ctx, ptr, get_def_type(ctx->nir, &instr->dest.ssa));
diff --git a/src/amd/common/ac_nir_to_llvm.h b/src/amd/common/ac_nir_to_llvm.h
index b3ad0a09857..22330fdfbc4 100644
--- a/src/amd/common/ac_nir_to_llvm.h
+++ b/src/amd/common/ac_nir_to_llvm.h
@@ -127,10 +127,13 @@ enum ac_ud_index {
// Match MAX_SETS from radv_descriptor_set.h
#define AC_UD_MAX_SETS MAX_SETS
+#define AC_UD_MAX_INLINE_PUSH_CONST 12
struct ac_userdata_locations {
struct ac_userdata_info descriptor_sets[AC_UD_MAX_SETS];
struct ac_userdata_info shader_data[AC_UD_MAX_UD];
+ struct ac_userdata_info inline_push_consts[AC_UD_MAX_INLINE_PUSH_CONST];
+ uint8_t push_const_base;
};
struct ac_vs_output_info {
@@ -156,6 +159,8 @@ struct ac_shader_variant_info {
unsigned num_user_sgprs;
unsigned num_input_sgprs;
unsigned num_input_vgprs;
+ uint64_t inline_push_const_mask;
+ uint32_t inline_push_const_base;
bool need_indirect_descriptor_sets;
struct {
struct {
diff --git a/src/amd/common/ac_shader_info.c b/src/amd/common/ac_shader_info.c
index 18fa9e1c94c..fbb46684ae3 100644
--- a/src/amd/common/ac_shader_info.c
+++ b/src/amd/common/ac_shader_info.c
@@ -179,9 +179,10 @@ ac_nir_shader_info_pass(struct nir_shader *nir,
{
struct nir_function *func = (struct nir_function *)exec_list_get_head(&nir->functions);
-
- if (options->layout->dynamic_offset_count)
+ if (options->layout->dynamic_offset_count) {
info->loads_push_constants = true;
+ info->loads_dynamic_offsets = true;
+ }
nir_foreach_variable(variable, &nir->inputs)
gather_info_input_decl(nir, options, variable, info);
diff --git a/src/amd/common/ac_shader_info.h b/src/amd/common/ac_shader_info.h
index e35cde0ca97..e8ea33f2e3a 100644
--- a/src/amd/common/ac_shader_info.h
+++ b/src/amd/common/ac_shader_info.h
@@ -32,6 +32,7 @@ struct ac_shader_info {
uint8_t min_push_constant_used;
uint8_t max_push_constant_used;
bool has_indirect_push_constants;
+ bool loads_dynamic_offsets;
bool loads_push_constants;
bool needs_multiview_view_index;
bool uses_invocation_id;
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index d870a9bedb3..e1953a3a095 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -1807,6 +1807,27 @@ radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer,
assert(cmd_buffer->cs->cdw <= cdw_max);
}
+static struct ac_userdata_info *
+radv_lookup_push_const_sgpr(struct radv_shader_variant *shader,
+ int idx)
+{
+ idx -= shader->info.user_sgprs_locs.push_const_base;
+ return &shader->info.user_sgprs_locs.inline_push_consts[idx];
+}
+
+static void
+radv_emit_inline_pushconsts(struct radv_cmd_buffer *cmd_buffer,
+ struct radv_shader_variant *shader,
+ unsigned base_reg,
+ int idx, int count, uint32_t *values)
+{
+ struct ac_userdata_info *loc = radv_lookup_push_const_sgpr(shader, idx);
+ assert (loc->sgpr_idx == -1);
+ assert (!loc->indirect);
+ radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, count);
+ radeon_emit_array(cmd_buffer->cs, values, count);
+}
+
static void
radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
struct radv_pipeline *pipeline,
@@ -1816,36 +1837,56 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer,
unsigned offset;
void *ptr;
uint64_t va;
+ bool need_push_constants = false;
stages &= cmd_buffer->push_constant_stages;
if (!stages ||
(!layout->push_constant_size && !layout->dynamic_offset_count))
return;
- if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
- 16 * layout->dynamic_offset_count,
- 256, &offset, &ptr))
- return;
+ radv_foreach_stage(stage, stages) {
+ if (!pipeline->shaders[stage])
+ continue;
+
+ need_push_constants |= pipeline->shaders[stage]->info.info.loads_push_constants;
+ need_push_constants |= pipeline->shaders[stage]->info.info.loads_dynamic_offsets;
- memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
- if (layout->dynamic_offset_count) {
- memcpy((char*)ptr + layout->push_constant_size, cmd_buffer->dynamic_buffers,
- 16 * layout->dynamic_offset_count);
+ uint32_t mask = pipeline->shaders[stage]->info.inline_push_const_mask;
+ uint32_t base_reg = pipeline->user_data_0[stage];
+ while (mask) {
+ int start, count;
+ u_bit_scan_consecutive_range(&mask, &start, &count);
+ start += pipeline->shaders[stage]->info.inline_push_const_base;
+ radv_emit_inline_pushconsts(cmd_buffer, pipeline->shaders[stage], base_reg,
+ start, count, (uint32_t *)&cmd_buffer->push_constants[start * 4]);
+ }
}
- va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
- va += offset;
+ if (need_push_constants) {
+ if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size +
+ 16 * layout->dynamic_offset_count,
+ 256, &offset, &ptr))
+ return;
- MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
- cmd_buffer->cs, MESA_SHADER_STAGES * 4);
+ memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size);
+ if (layout->dynamic_offset_count) {
+ memcpy((char*)ptr + layout->push_constant_size, cmd_buffer->dynamic_buffers,
+ 16 * layout->dynamic_offset_count);
+ }
- radv_foreach_stage(stage, stages) {
- if (pipeline->shaders[stage]) {
- radv_emit_userdata_address(cmd_buffer, pipeline, stage,
- AC_UD_PUSH_CONSTANTS, va);
+ va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
+ va += offset;
+
+ MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
+ cmd_buffer->cs, MESA_SHADER_STAGES * 4);
+
+ radv_foreach_stage(stage, stages) {
+ if (pipeline->shaders[stage]) {
+ radv_emit_userdata_address(cmd_buffer, pipeline, stage,
+ AC_UD_PUSH_CONSTANTS, va);
+ }
}
}
-
cmd_buffer->push_constant_stages &= ~stages;
assert(cmd_buffer->cs->cdw <= cdw_max);
}
--
2.14.3
More information about the mesa-dev
mailing list