Mesa (main): radv: New shader args for NGG culling settings and viewport.
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Wed Jul 14 00:44:52 UTC 2021
Module: Mesa
Branch: main
Commit: 9a95f5487f5ab83fa44bea12afa30cf1a25fc9db
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=9a95f5487f5ab83fa44bea12afa30cf1a25fc9db
Author: Timur Kristóf <timur.kristof at gmail.com>
Date: Mon Jun 7 23:23:38 2021 +0200
radv: New shader args for NGG culling settings and viewport.
Add new shader arguments in RADV for:
- NGG culling settings
- Viewport transform
These will be used by NGG culling shaders.
Additionally, some tweaks are made to some config registers
in order to make culling shaders more efficient.
Signed-off-by: Timur Kristóf <timur.kristof at gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10525>
---
src/amd/vulkan/radv_cmd_buffer.c | 226 ++++++++++++++++++++++++++++++++++++++
src/amd/vulkan/radv_pipeline.c | 7 ++
src/amd/vulkan/radv_private.h | 7 ++
src/amd/vulkan/radv_shader.c | 2 +
src/amd/vulkan/radv_shader.h | 6 +-
src/amd/vulkan/radv_shader_args.c | 69 +++++++++++-
src/amd/vulkan/radv_shader_args.h | 3 +
7 files changed, 315 insertions(+), 5 deletions(-)
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index be9ccaa14da..84ce6131446 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -1327,6 +1327,19 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
+ if (pipeline->graphics.has_ngg_culling &&
+ pipeline->graphics.last_vgt_api_stage != MESA_SHADER_GEOMETRY &&
+ !cmd_buffer->state.last_nggc_settings) {
+ /* The already emitted RSRC2 contains the LDS required for NGG culling.
+ * Culling is currently disabled, so re-emit RSRC2 to reduce LDS usage.
+ * API GS always needs LDS, so this isn't useful there.
+ */
+ struct radv_shader_variant *v = pipeline->shaders[pipeline->graphics.last_vgt_api_stage];
+ radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
+ (v->config.rsrc2 & C_00B22C_LDS_SIZE) |
+ S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling));
+ }
+
if (!cmd_buffer->state.emitted_pipeline ||
cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != pipeline->ctx_cs.cdw ||
cmd_buffer->state.emitted_pipeline->ctx_cs_hash != pipeline->ctx_cs_hash ||
@@ -3839,6 +3852,8 @@ radv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBegi
cmd_buffer->state.last_sx_ps_downconvert = -1;
cmd_buffer->state.last_sx_blend_opt_epsilon = -1;
cmd_buffer->state.last_sx_blend_opt_control = -1;
+ cmd_buffer->state.last_nggc_settings = -1;
+ cmd_buffer->state.last_nggc_settings_sgpr_idx = -1;
cmd_buffer->usage_flags = pBeginInfo->flags;
if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
@@ -4961,6 +4976,10 @@ radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCou
if (secondary->state.last_index_type != -1) {
primary->state.last_index_type = secondary->state.last_index_type;
}
+
+ primary->state.last_nggc_settings = secondary->state.last_nggc_settings;
+ primary->state.last_nggc_settings_sgpr_idx = secondary->state.last_nggc_settings_sgpr_idx;
+ primary->state.last_nggc_skip = secondary->state.last_nggc_skip;
}
/* After executing commands from secondary buffers we have to dirty
@@ -5635,6 +5654,209 @@ radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
return false;
}
+/* Flag bits of the NGG culling settings user SGPR.
+ * Bits 24 and up are used separately for the small primitive precision exponent.
+ */
+enum {
+   ngg_cull_none = 0,                    /* Culling disabled. */
+   ngg_cull_front_face = 1 << 0,         /* Cull front-facing triangles. */
+   ngg_cull_back_face = 1 << 1,          /* Cull back-facing triangles. */
+   ngg_cull_face_is_ccw = 1 << 2,        /* Front face is counter-clockwise. */
+   ngg_cull_small_primitives = 1 << 3,   /* Small primitive culling enabled. */
+};
+
+/**
+ * Decide whether NGG culling should be skipped (disabled) for a draw.
+ *
+ * \param has_tess       Whether the last VGT API stage is a tessellation evaluation shader.
+ * \param vtx_cnt        Vertex count of the draw.
+ * \param indirect       Whether the draw is indirect (vertex count is then not meaningful).
+ * \param num_viewports  Number of viewports currently in use.
+ * \return true when culling should be turned off for this draw.
+ */
+ALWAYS_INLINE static bool
+radv_skip_ngg_culling(bool has_tess, const unsigned vtx_cnt,
+                      bool indirect, unsigned num_viewports)
+{
+   /* Only the transform of viewport 0 is uploaded for the culling shader,
+    * so culling is not reliable when several viewports are in use.
+    *
+    * TODO: Figure out how to do culling with multiple viewports efficiently.
+    */
+   if (num_viewports > 1)
+      return true;
+
+   /* If we have to draw only a few vertices, we get better latency if
+    * we disable NGG culling.
+    *
+    * When tessellation is used, what matters is the number of tessellated
+    * vertices, so let's always assume it's not a small draw.
+    */
+   return !has_tess && !indirect && vtx_cnt < 512;
+}
+
+/**
+ * Build the value of the NGG culling settings user SGPR for the bound pipeline.
+ *
+ * The low bits are a combination of the ngg_cull_* flags. When small primitive
+ * culling is enabled, log2 of the small primitive precision is stored from
+ * bit 24 upwards.
+ *
+ * \param cmd_buffer     Command buffer whose pipeline and dynamic state are inspected.
+ * \param vp_y_inverted  Whether the viewport is Y-inverted; this flips the winding order.
+ * \return The packed culling settings.
+ */
+ALWAYS_INLINE static uint32_t
+radv_get_ngg_culling_settings(struct radv_cmd_buffer *cmd_buffer, bool vp_y_inverted)
+{
+   const struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
+   const struct radv_dynamic_state *dyn = &cmd_buffer->state.dynamic;
+
+   /* Rasterizer discard: every triangle can be culled. */
+   if (dyn->rasterizer_discard_enable ||
+       G_028810_DX_RASTERIZATION_KILL(pipeline->graphics.pa_cl_clip_cntl))
+      return ngg_cull_front_face | ngg_cull_back_face;
+
+   const uint32_t mode_cntl = pipeline->graphics.pa_su_sc_mode_cntl;
+   const bool dyn_front_face =
+      (pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_FRONT_FACE) != 0;
+   const bool dyn_cull_mode =
+      (pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_CULL_MODE) != 0;
+
+   uint32_t settings = ngg_cull_none;
+
+   /* Winding order: taken from dynamic state when the pipeline declares it
+    * dynamic, otherwise from the pipeline's PA_SU_SC_MODE_CNTL value.
+    */
+   bool front_is_ccw;
+   if (dyn_front_face)
+      front_is_ccw = dyn->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE;
+   else
+      front_is_ccw = G_028814_FACE(mode_cntl) == 0;
+
+   /* An inverted viewport flips the winding seen by the culling code. */
+   if (front_is_ccw != vp_y_inverted)
+      settings |= ngg_cull_face_is_ccw;
+
+   /* Face culling: same dynamic-vs-static selection as above. */
+   bool cull_front, cull_back;
+   if (dyn_cull_mode) {
+      cull_front = (dyn->cull_mode & VK_CULL_MODE_FRONT_BIT) != 0;
+      cull_back = (dyn->cull_mode & VK_CULL_MODE_BACK_BIT) != 0;
+   } else {
+      cull_front = G_028814_CULL_FRONT(mode_cntl) != 0;
+      cull_back = G_028814_CULL_BACK(mode_cntl) != 0;
+   }
+
+   if (cull_front)
+      settings |= ngg_cull_front_face;
+   if (cull_back)
+      settings |= ngg_cull_back_face;
+
+   /* Small primitive culling is only valid when conservative overestimation is not used. */
+   if (pipeline->graphics.uses_conservative_overestimate)
+      return settings;
+
+   settings |= ngg_cull_small_primitives;
+
+   /* small_prim_precision = num_samples / subpixel_bits
+    * Only the exponent is passed on:
+    * log2(num_samples) - log2(subpixel_bits), stored as two's complement from bit 24.
+    */
+   unsigned subpixel_bits = 256;
+   int32_t precision_log2 =
+      util_logbase2(pipeline->graphics.ms.num_samples) - util_logbase2(subpixel_bits);
+   settings |= ((uint32_t) precision_log2 << 24u);
+
+   return settings;
+}
+
+/**
+ * Emit the NGG culling state needed by the upcoming draw.
+ *
+ * Writes the culling settings and viewport transform user SGPRs of the last
+ * VGT API stage and, when culling is switched on or off, also updates
+ * SPI_SHADER_PGM_RSRC2_GS (LDS size) and GE_PC_ALLOC (parameter cache
+ * oversubscription). The last emitted values are cached in cmd_buffer->state
+ * (last_nggc_settings, last_nggc_settings_sgpr_idx, last_nggc_skip) so that
+ * nothing is emitted when neither the relevant dirty bits nor the small-draw
+ * status changed.
+ *
+ * \param cmd_buffer  Command buffer being recorded; its bound graphics pipeline is used.
+ * \param draw_info   The draw being prepared; only count and indirect are read here.
+ */
+static void
+radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
+{
+ struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
+ const unsigned stage = pipeline->graphics.last_vgt_api_stage;
+ const bool nggc_supported = pipeline->graphics.has_ngg_culling;
+
+ /* last_nggc_settings is reset to -1 (non-zero) in radv_BeginCommandBuffer, so a
+ * freshly begun command buffer does not take this early-out.
+ */
+ if (!nggc_supported && !cmd_buffer->state.last_nggc_settings) {
+ /* Current shader doesn't support culling and culling was already disabled:
+ * No further steps needed, just remember the SGPR's location is not set.
+ */
+ cmd_buffer->state.last_nggc_settings_sgpr_idx = -1;
+ return;
+ }
+
+ /* Check dirty flags:
+ * - Dirty pipeline: SGPR index may have changed (we have to re-emit if changed).
+ * - Dirty dynamic flags: culling settings may have changed.
+ */
+ const bool dirty =
+ cmd_buffer->state.dirty &
+ (RADV_CMD_DIRTY_PIPELINE |
+ RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
+ RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT);
+
+ /* Check small draw status:
+ * For small draw calls, we disable culling by setting the SGPR to 0.
+ */
+ const bool skip =
+ radv_skip_ngg_culling(
+ stage == MESA_SHADER_TESS_EVAL, draw_info->count, draw_info->indirect,
+ cmd_buffer->state.dynamic.viewport.count);
+
+ /* See if anything changed. */
+ if (!dirty && skip == cmd_buffer->state.last_nggc_skip)
+ return;
+
+ /* Remember small draw state. */
+ cmd_buffer->state.last_nggc_skip = skip;
+ const struct radv_shader_variant *v = pipeline->shaders[stage];
+ assert(v->info.has_ngg_culling == nggc_supported);
+
+ /* Find the user SGPR. */
+ const uint32_t base_reg = pipeline->user_data_0[stage];
+ const int8_t nggc_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_CULLING_SETTINGS].sgpr_idx;
+ assert(!nggc_supported || nggc_sgpr_idx != -1);
+
+ /* Get viewport transform. */
+ float vp_scale[3], vp_translate[3];
+ radv_get_viewport_xform(&cmd_buffer->state.dynamic.viewport.viewports[0], vp_scale, vp_translate);
+ /* Only viewport 0 is considered. Y counts as inverted when
+ * (translate - scale) is greater than (translate + scale) on the Y axis.
+ */
+ bool vp_y_inverted = (-vp_scale[1] + vp_translate[1]) > (vp_scale[1] + vp_translate[1]);
+
+ /* Get current culling settings. */
+ uint32_t nggc_settings = nggc_supported && !skip
+ ? radv_get_ngg_culling_settings(cmd_buffer, vp_y_inverted)
+ : ngg_cull_none;
+
+ /* The viewport SGPRs are only written while culling is on, and only when
+ * the viewport changed, the SGPR slot moved, or culling was previously off.
+ */
+ bool emit_viewport = nggc_settings &&
+ (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_VIEWPORT ||
+ cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx ||
+ !cmd_buffer->state.last_nggc_settings);
+
+ if (emit_viewport) {
+ /* Correction for inverted Y */
+ if (vp_y_inverted) {
+ vp_scale[1] = -vp_scale[1];
+ vp_translate[1] = -vp_translate[1];
+ }
+
+ /* Correction for number of samples per pixel. */
+ for (unsigned i = 0; i < 2; ++i) {
+ vp_scale[i] *= (float) pipeline->graphics.ms.num_samples;
+ vp_translate[i] *= (float) pipeline->graphics.ms.num_samples;
+ }
+
+ /* Four consecutive SGPRs: scale X, scale Y, translate X, translate Y. */
+ uint32_t vp_reg_values[4] = {fui(vp_scale[0]), fui(vp_scale[1]), fui(vp_translate[0]), fui(vp_translate[1])};
+ const int8_t vp_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_VIEWPORT].sgpr_idx;
+ assert(vp_sgpr_idx != -1);
+ radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + vp_sgpr_idx * 4, 4);
+ radeon_emit_array(cmd_buffer->cs, vp_reg_values, 4);
+ }
+
+ bool emit_settings = nggc_supported &&
+ (cmd_buffer->state.last_nggc_settings != nggc_settings ||
+ cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx);
+
+ /* This needs to be emitted when culling is turned on
+ * and when it's already on but some settings change.
+ */
+ if (emit_settings) {
+ assert(nggc_sgpr_idx >= 0);
+ radeon_set_sh_reg(cmd_buffer->cs, base_reg + nggc_sgpr_idx * 4, nggc_settings);
+ }
+
+ /* These only need to be emitted when culling is turned on or off,
+ * but not when it stays on and just some settings change.
+ */
+ if (!!cmd_buffer->state.last_nggc_settings != !!nggc_settings) {
+ const struct radv_physical_device *physical_device = cmd_buffer->device->physical_device;
+ uint32_t rsrc2 = v->config.rsrc2;
+ /* Value used while culling is off: a quarter of the parameter cache lines. */
+ uint32_t oversub_pc_lines = physical_device->rad_info.pc_lines / 4;
+
+ if (nggc_settings) {
+ /* Tweak the parameter cache oversubscription.
+ * This allows the HW to launch more NGG workgroups than the pre-allocated parameter
+ * cache would normally allow, yielding better perf when culling is on.
+ */
+ oversub_pc_lines = physical_device->rad_info.pc_lines * 3 / 4;
+ } else {
+ /* Allocate less LDS when culling is disabled. (But GS always needs it.) */
+ if (stage != MESA_SHADER_GEOMETRY)
+ rsrc2 = (rsrc2 & C_00B22C_LDS_SIZE) | S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling);
+ }
+
+ /* When the pipeline is dirty, radv_emit_graphics_pipeline will write this register. */
+ if (!(cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)) {
+ radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
+ }
+
+ /* Update parameter cache oversubscription setting. */
+ radeon_set_uconfig_reg(cmd_buffer->cs, R_030980_GE_PC_ALLOC,
+ S_030980_OVERSUB_EN(physical_device->rad_info.use_late_alloc) |
+ S_030980_NUM_PC_LINES(oversub_pc_lines - 1));
+ }
+
+ /* Cache what was decided for this draw so the next call can detect changes. */
+ cmd_buffer->state.last_nggc_settings = nggc_settings;
+ cmd_buffer->state.last_nggc_settings_sgpr_idx = nggc_sgpr_idx;
+}
+
static void
radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
{
@@ -5644,6 +5866,10 @@ radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct r
cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline)
radv_emit_rbplus_state(cmd_buffer);
+ if ((cmd_buffer->device->instance->perftest_flags & RADV_PERFTEST_NGGC) &&
+ cmd_buffer->state.pipeline->graphics.is_ngg)
+ radv_emit_ngg_culling_state(cmd_buffer, info);
+
if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)
radv_emit_graphics_pipeline(cmd_buffer);
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 0d579d8c256..9b08c1ba1cf 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -1790,6 +1790,10 @@ radv_pipeline_init_raster_state(struct radv_pipeline *pipeline,
S_028810_ZCLIP_FAR_DISABLE(depth_clip_disable ? 1 : 0) |
S_028810_DX_RASTERIZATION_KILL(raster_info->rasterizerDiscardEnable ? 1 : 0) |
S_028810_DX_LINEAR_ATTR_CLIP_ENA(1);
+
+ pipeline->graphics.uses_conservative_overestimate =
+ radv_get_conservative_raster_mode(pCreateInfo->pRasterizationState) ==
+ VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT;
}
static void
@@ -5441,6 +5445,9 @@ radv_pipeline_init(struct radv_pipeline *pipeline, struct radv_device *device,
pipeline->streamout_shader = radv_pipeline_get_streamout_shader(pipeline);
pipeline->graphics.is_ngg = radv_pipeline_has_ngg(pipeline);
+ pipeline->graphics.has_ngg_culling =
+ pipeline->graphics.is_ngg &&
+ pipeline->shaders[pipeline->graphics.last_vgt_api_stage]->info.has_ngg_culling;
radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend);
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 06ef5928dc4..b99bea00dd1 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -1415,6 +1415,11 @@ struct radv_cmd_state {
bool pending_sqtt_barrier_end;
enum rgp_flush_bits sqtt_flush_bits;
+ /* NGG culling state. */
+ uint32_t last_nggc_settings;
+ int8_t last_nggc_settings_sgpr_idx;
+ bool last_nggc_skip;
+
uint8_t cb_mip[MAX_RTS];
/* Whether DRAW_{INDEX}_INDIRECT_MULTI is emitted. */
@@ -1762,6 +1767,7 @@ struct radv_pipeline {
unsigned pa_cl_clip_cntl;
unsigned cb_color_control;
bool uses_dynamic_stride;
+ bool uses_conservative_overestimate;
/* Used for rbplus */
uint32_t col_format;
@@ -1769,6 +1775,7 @@ struct radv_pipeline {
/* Whether the pipeline uses NGG (GFX10+). */
bool is_ngg;
+ bool has_ngg_culling;
/* Last pre-PS API stage */
gl_shader_stage last_vgt_api_stage;
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index f1cb00f27ca..5a59e7f251a 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -969,6 +969,8 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
key->vs_common_out.export_prim_id,
key->vs.provoking_vtx_last);
+ info->has_ngg_culling = out_conf.can_cull;
+ info->num_lds_blocks_when_not_culling = DIV_ROUND_UP(out_conf.lds_bytes_if_culling_off, device->physical_device->rad_info.lds_encode_granularity);
info->is_ngg_passthrough = out_conf.passthrough;
key->vs_common_out.as_ngg_passthrough = out_conf.passthrough;
} else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index b755c59094d..1ad54b93276 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -162,7 +162,9 @@ enum radv_ud_index {
AC_UD_VIEW_INDEX = 4,
AC_UD_STREAMOUT_BUFFERS = 5,
AC_UD_NGG_GS_STATE = 6,
- AC_UD_SHADER_START = 7,
+ AC_UD_NGG_CULLING_SETTINGS = 7,
+ AC_UD_NGG_VIEWPORT = 8,
+ AC_UD_SHADER_START = 9,
AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
AC_UD_VS_BASE_VERTEX_START_INSTANCE,
AC_UD_VS_MAX_UD,
@@ -261,6 +263,8 @@ struct radv_shader_info {
bool need_indirect_descriptor_sets;
bool is_ngg;
bool is_ngg_passthrough;
+ bool has_ngg_culling;
+ uint32_t num_lds_blocks_when_not_culling;
uint32_t num_tess_patches;
struct {
uint8_t input_usage_mask[RADV_VERT_ATTRIB_MAX];
diff --git a/src/amd/vulkan/radv_shader_args.c b/src/amd/vulkan/radv_shader_args.c
index d0fde542d45..773a2364565 100644
--- a/src/amd/vulkan/radv_shader_args.c
+++ b/src/amd/vulkan/radv_shader_args.c
@@ -117,6 +117,19 @@ count_vs_user_sgprs(struct radv_shader_args *args)
return count;
}
+/**
+ * Count the user SGPRs required by the NGG-specific shader arguments.
+ *
+ * \param args   Shader args; shader_info->has_ngg_culling is read.
+ * \param stage  Stage being compiled.
+ * \return Number of user SGPRs needed for NGG state.
+ */
+static unsigned
+count_ngg_sgprs(struct radv_shader_args *args, gl_shader_stage stage)
+{
+   /* ngg_gs_state */
+   const unsigned gs_state_sgprs = (stage == MESA_SHADER_GEOMETRY) ? 1 : 0;
+   /* ngg_culling_settings + 4x ngg_viewport_* */
+   const unsigned culling_sgprs = args->shader_info->has_ngg_culling ? 5 : 0;
+
+   return gs_state_sgprs + culling_sgprs;
+}
+
static void
allocate_inline_push_consts(struct radv_shader_args *args, struct user_sgpr_info *user_sgpr_info)
{
@@ -184,6 +197,8 @@ allocate_user_sgprs(struct radv_shader_args *args, gl_shader_stage stage, bool h
case MESA_SHADER_VERTEX:
if (!args->is_gs_copy_shader)
user_sgpr_count += count_vs_user_sgprs(args);
+ if (args->options->key.vs_common_out.as_ngg)
+ user_sgpr_count += count_ngg_sgprs(args, stage);
break;
case MESA_SHADER_TESS_CTRL:
if (has_previous_stage) {
@@ -192,11 +207,13 @@ allocate_user_sgprs(struct radv_shader_args *args, gl_shader_stage stage, bool h
}
break;
case MESA_SHADER_TESS_EVAL:
+ if (args->options->key.vs_common_out.as_ngg)
+ user_sgpr_count += count_ngg_sgprs(args, stage);
break;
case MESA_SHADER_GEOMETRY:
if (has_previous_stage) {
if (args->options->key.vs_common_out.as_ngg)
- user_sgpr_count++; /* NGG GS state */
+ user_sgpr_count += count_ngg_sgprs(args, stage);
if (previous_stage == MESA_SHADER_VERTEX) {
user_sgpr_count += count_vs_user_sgprs(args);
@@ -356,6 +373,22 @@ declare_tes_input_vgprs(struct radv_shader_args *args)
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_patch_id);
}
+/**
+ * Declare the NGG-specific SGPR arguments.
+ *
+ * Geometry shaders get the NGG GS state argument. Shaders with NGG culling
+ * additionally get the culling settings followed by the viewport scale (X, Y)
+ * and viewport translate (X, Y) arguments, in that order.
+ *
+ * \param args   Shader args to append to.
+ * \param stage  Stage being compiled.
+ */
+static void
+declare_ngg_sgprs(struct radv_shader_args *args, gl_shader_stage stage)
+{
+   if (stage == MESA_SHADER_GEOMETRY)
+      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_gs_state);
+
+   if (!args->shader_info->has_ngg_culling)
+      return;
+
+   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_culling_settings);
+
+   for (unsigned axis = 0; axis < 2; ++axis)
+      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_viewport_scale[axis]);
+
+   for (unsigned axis = 0; axis < 2; ++axis)
+      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_viewport_translate[axis]);
+}
+
static void
set_global_input_locs(struct radv_shader_args *args, const struct user_sgpr_info *user_sgpr_info,
uint8_t *user_sgpr_idx)
@@ -405,6 +438,24 @@ set_vs_specific_input_locs(struct radv_shader_args *args, gl_shader_stage stage,
}
}
+/**
+ * Record the user SGPR locations of the NGG-specific arguments.
+ *
+ * The order matches declare_ngg_sgprs(): NGG GS state (geometry shaders only),
+ * then one SGPR of culling settings and four SGPRs of viewport transform when
+ * the shader has NGG culling.
+ *
+ * \param args           Shader args whose NGG arguments were declared.
+ * \param stage          Stage being compiled.
+ * \param user_sgpr_idx  In/out running user SGPR index, passed on to set_loc_shader().
+ */
+static void
+set_ngg_sgprs_locs(struct radv_shader_args *args, gl_shader_stage stage, uint8_t *user_sgpr_idx)
+{
+   if (stage == MESA_SHADER_GEOMETRY) {
+      assert(args->ngg_gs_state.used);
+      set_loc_shader(args, AC_UD_NGG_GS_STATE, user_sgpr_idx, 1);
+   }
+
+   if (!args->shader_info->has_ngg_culling)
+      return;
+
+   assert(args->ngg_culling_settings.used &&
+          args->ngg_viewport_scale[0].used && args->ngg_viewport_scale[1].used &&
+          args->ngg_viewport_translate[0].used && args->ngg_viewport_translate[1].used);
+
+   set_loc_shader(args, AC_UD_NGG_CULLING_SETTINGS, user_sgpr_idx, 1);
+   set_loc_shader(args, AC_UD_NGG_VIEWPORT, user_sgpr_idx, 4);
+}
+
/* Returns whether the stage is a stage that can be directly before the GS */
static bool
is_pre_gs_stage(gl_shader_stage stage)
@@ -488,6 +539,9 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage,
if (args->options->explicit_scratch_args) {
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
}
+ if (args->options->key.vs_common_out.as_ngg) {
+ declare_ngg_sgprs(args, stage);
+ }
declare_vs_input_vgprs(args);
break;
@@ -547,6 +601,9 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage,
if (args->options->explicit_scratch_args) {
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
}
+ if (args->options->key.vs_common_out.as_ngg) {
+ declare_ngg_sgprs(args, stage);
+ }
declare_tes_input_vgprs(args);
break;
case MESA_SHADER_GEOMETRY:
@@ -576,7 +633,7 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage,
}
if (args->options->key.vs_common_out.as_ngg) {
- ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_gs_state);
+ declare_ngg_sgprs(args, stage);
}
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
@@ -669,6 +726,8 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage,
set_vs_specific_input_locs(args, stage, has_previous_stage, previous_stage, &user_sgpr_idx);
if (args->ac.view_index.used)
set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
+ if (args->options->key.vs_common_out.as_ngg)
+ set_ngg_sgprs_locs(args, stage, &user_sgpr_idx);
break;
case MESA_SHADER_TESS_CTRL:
set_vs_specific_input_locs(args, stage, has_previous_stage, previous_stage, &user_sgpr_idx);
@@ -678,6 +737,8 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage,
case MESA_SHADER_TESS_EVAL:
if (args->ac.view_index.used)
set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
+ if (args->options->key.vs_common_out.as_ngg)
+ set_ngg_sgprs_locs(args, stage, &user_sgpr_idx);
break;
case MESA_SHADER_GEOMETRY:
if (has_previous_stage) {
@@ -688,8 +749,8 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage,
if (args->ac.view_index.used)
set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
- if (args->ngg_gs_state.used)
- set_loc_shader(args, AC_UD_NGG_GS_STATE, &user_sgpr_idx, 1);
+ if (args->options->key.vs_common_out.as_ngg)
+ set_ngg_sgprs_locs(args, stage, &user_sgpr_idx);
break;
case MESA_SHADER_FRAGMENT:
break;
diff --git a/src/amd/vulkan/radv_shader_args.h b/src/amd/vulkan/radv_shader_args.h
index a6828cdf309..a7c13152fcb 100644
--- a/src/amd/vulkan/radv_shader_args.h
+++ b/src/amd/vulkan/radv_shader_args.h
@@ -41,6 +41,9 @@ struct radv_shader_args {
/* NGG GS */
struct ac_arg ngg_gs_state;
+ struct ac_arg ngg_culling_settings;
+ struct ac_arg ngg_viewport_scale[2];
+ struct ac_arg ngg_viewport_translate[2];
bool is_gs_copy_shader;
bool is_trap_handler_shader;
More information about the mesa-commit
mailing list