Mesa (main): radeonsi: add optimal multi draws and draw-level splitting for prim discard CS
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Mon Jun 28 13:39:00 UTC 2021
Module: Mesa
Branch: main
Commit: b141e50282752cd1ad6de274fb0e66a3f7e6d011
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=b141e50282752cd1ad6de274fb0e66a3f7e6d011
Author: Marek Olšák <marek.olsak at amd.com>
Date: Mon May 31 21:59:28 2021 -0400
radeonsi: add optimal multi draws and draw-level splitting for prim discard CS
This is a partial rewrite of several parts of the code.
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11510>
---
.../drivers/radeonsi/si_compute_prim_discard.c | 309 +++++++++++++--------
src/gallium/drivers/radeonsi/si_pipe.h | 7 +-
src/gallium/drivers/radeonsi/si_shader_llvm.c | 3 +-
src/gallium/drivers/radeonsi/si_state_draw.cpp | 22 +-
4 files changed, 205 insertions(+), 136 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
index ae043037ba0..3fd9560dacd 100644
--- a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
+++ b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
@@ -95,23 +95,24 @@
* - Bindless textures and images must not occur in the vertex shader.
*
* User data SGPR layout:
+ * VERTEX_COUNTER: address of "count" in the draw packet incremented atomically by the shader.
+ * START_OUT_INDEX: output index buffer offset / 12
+ * START_IN_INDEX: input index buffer offset / index_size
+ * VS.BASE_VERTEX: same value as VS
* INDEX_BUFFERS: pointer to constants
* 0..3: input index buffer - typed buffer view
* 4..7: output index buffer - typed buffer view
* 8..11: viewport state - scale.xy, translate.xy
- * VERTEX_COUNTER: address of "count" in the draw packet incremented
- * atomically by the shader.
* VS.VERTEX_BUFFERS: same value as VS
* VS.CONST_AND_SHADER_BUFFERS: same value as VS
* VS.SAMPLERS_AND_IMAGES: same value as VS
- * VS.BASE_VERTEX: same value as VS
* VS.START_INSTANCE: same value as VS
+ * SMALL_PRIM_CULLING_PRECISION: Scale the primitive bounding box by this number.
* NUM_PRIMS_UDIV_MULTIPLIER: For fast 31-bit division by the number of primitives
* per instance for instancing.
* NUM_PRIMS_UDIV_TERMS:
* - Bits [0:4]: "post_shift" for fast 31-bit division for instancing.
* - Bits [5:31]: The number of primitives per instance for computing the remainder.
- * SMALL_PRIM_CULLING_PRECISION: Scale the primitive bounding box by this number.
*
* How to test primitive restart (the most complicated part because it needs
* to get the primitive orientation right):
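A note on the NUM_PRIMS_UDIV_* terms in the layout above: they let the shader turn a flat primitive index into an instance index without an integer divide, using a multiply-high and a shift. Below is a minimal, hypothetical C sketch of that idea for numerators below 2^31; the driver's actual helper is si_compute_fast_udiv_info32 and may differ in details, and the constants here are only illustrative. The packing at the end mirrors the layout above (post_shift in bits [0:4], primitives per instance in bits [5:31]).

/* Hypothetical sketch (not the driver's code): replace
 * "prim_id / prims_per_instance" with a precomputed multiplier + post_shift,
 * valid for all numerators below 2^31. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct fast_udiv31 {
   uint32_t multiplier;
   uint32_t post_shift;
};

static struct fast_udiv31 compute_fast_udiv31(uint32_t divisor)
{
   /* Assumes 2 <= divisor < 2^27 for brevity (the dispatch code asserts
    * num_prims_per_instance < 1 << 27); the real helper also handles 1. */
   assert(divisor >= 2 && divisor < (1u << 27));

   struct fast_udiv31 info;
   uint32_t ceil_log2 = 1;
   while ((1u << ceil_log2) < divisor)
      ceil_log2++;

   /* Round-up multiplier: exact for every numerator below 2^31. */
   info.multiplier = (uint32_t)(((1ull << (31 + ceil_log2)) + divisor - 1) / divisor);
   info.post_shift = ceil_log2 - 1;
   return info;
}

static uint32_t fast_udiv31(uint32_t n, struct fast_udiv31 info)
{
   /* mul_hi(n, multiplier) >> post_shift, as a shader would evaluate it. */
   return (uint32_t)(((uint64_t)n * info.multiplier) >> 32) >> info.post_shift;
}

int main(void)
{
   const uint32_t prims_per_instance = 34;
   struct fast_udiv31 info = compute_fast_udiv31(prims_per_instance);

   for (uint32_t prim = 0; prim < 1000000; prim++)
      assert(fast_udiv31(prim, info) == prim / prims_per_instance);

   /* Packing used by the dispatch code: post_shift in bits [0:4],
    * primitives per instance in bits [5:31]. */
   uint32_t udiv_terms = info.post_shift | (prims_per_instance << 5);
   printf("multiplier=0x%08x terms=0x%08x\n", info.multiplier, udiv_terms);
   return 0;
}

Restricting the numerator to 31 bits is what lets a single 32-bit round-up multiplier work with no pre-shift or increment, which is why the layout calls this "fast 31-bit division".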
@@ -243,24 +244,26 @@ void si_build_prim_discard_compute_shader(struct si_shader_context *ctx)
memset(&ctx->args, 0, sizeof(ctx->args));
struct ac_arg param_index_buffers_and_constants, param_vertex_counter;
- struct ac_arg param_vb_desc, param_const_desc;
- struct ac_arg param_base_vertex, param_start_instance;
- struct ac_arg param_block_id, param_local_id;
- struct ac_arg param_smallprim_precision;
+ struct ac_arg param_vb_desc, param_const_desc, param_start_out_index;
+ struct ac_arg param_base_vertex, param_start_instance, param_start_in_index;
+ struct ac_arg param_block_id, param_local_id, param_smallprim_precision;
struct ac_arg param_num_prims_udiv_multiplier, param_num_prims_udiv_terms;
struct ac_arg param_sampler_desc;
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
- ¶m_index_buffers_and_constants);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_counter);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_start_out_index);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_start_in_index);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_base_vertex);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, ¶m_index_buffers_and_constants);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, ¶m_vb_desc);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type, ¶m_const_desc);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, ¶m_sampler_desc);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_base_vertex);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_start_instance);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_num_prims_udiv_multiplier);
- ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_num_prims_udiv_terms);
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, ¶m_smallprim_precision);
+ if (key->opt.cs_instancing) {
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_multiplier);
+ ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_num_prims_udiv_terms);
+ }
/* Block ID and thread ID inputs. */
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &param_block_id);
@@ -358,6 +361,7 @@ void si_build_prim_discard_compute_shader(struct si_shader_context *ctx)
/* Fetch indices. */
if (key->opt.cs_indexed) {
for (unsigned i = 0; i < 3; i++) {
+ index[i] = LLVMBuildAdd(builder, index[i], ac_get_arg(&ctx->ac, param_start_in_index), "");
index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, index[i], ctx->ac.i32_0,
1, 0, true, false, false);
index[i] = ac_to_integer(&ctx->ac, index[i]);
@@ -473,6 +477,7 @@ void si_build_prim_discard_compute_shader(struct si_shader_context *ctx)
/* Write indices for accepted primitives. */
LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, "");
+ vindex = LLVMBuildAdd(builder, vindex, ac_get_arg(&ctx->ac, param_start_out_index), "");
LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3);
if (!ac_has_vec3_support(ctx->ac.chip_class, true))
@@ -562,6 +567,8 @@ static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_s
sctx->index_ring_size_per_ib;
}
+#define COMPUTE_PREAMBLE_SIZE (8 + 39 + 11 + 7)
+
enum si_prim_discard_outcome
si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info,
unsigned drawid_offset,
@@ -680,12 +687,21 @@ si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe
return SI_PRIM_DISCARD_DISABLED;
}
- unsigned num_subdraws = DIV_ROUND_UP(num_prims, PRIMS_PER_BATCH) * num_draws;
- unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ +
- 24 * (num_subdraws - 1) + /* subdraws */
- 30; /* leave some space at the end */
- unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx, 0) +
- num_subdraws * 8; /* use REWIND(2) + DRAW(6) */
+ /* Compute how many CS dwords we need to reserve. */
+ unsigned need_compute_dw = COMPUTE_PREAMBLE_SIZE +
+ 11 /* shader */ +
+ 30; /* leave some space at the end */
+ unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx, 0);
+
+ for (unsigned i = 0; i < num_draws; i++) {
+ unsigned num_subdraws = DIV_ROUND_UP(draws[i].count, PRIMS_PER_BATCH);
+
+ need_compute_dw += 8 * num_subdraws + /* signal REWIND */
+ 14 /* user SGPRs */ +
+ 4 * (num_subdraws - 1) + /* user SGPRs after the first subdraw */
+ 11 * num_subdraws;
+ need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */
+ }
if (ring_full ||
!sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) {
@@ -708,6 +724,7 @@ si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe
ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false);
assert(compute_has_space);
assert(si_check_ring_space(sctx, out_indexbuf_size));
+ assert(cs->current.cdw + need_compute_dw <= cs->current.max_dw);
return SI_PRIM_DISCARD_ENABLED;
}
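The reservation logic above now sizes the compute IB per draw instead of using one worst case for the whole call. A small, hypothetical sketch of the same arithmetic; the per-packet dword costs are the ones in the hunk, while PRIMS_PER_BATCH is defined elsewhere in the file, so the value below is only a placeholder.

/* Hypothetical sketch of the command-stream budgeting above. */
#include <stdio.h>

#define DIV_ROUND_UP(a, b)    (((a) + (b) - 1) / (b))
#define COMPUTE_PREAMBLE_SIZE (8 + 39 + 11 + 7)

static unsigned need_compute_dwords(const unsigned *counts, unsigned num_draws,
                                    unsigned prims_per_batch)
{
   unsigned dw = COMPUTE_PREAMBLE_SIZE +
                 11 + /* shader state */
                 30;  /* slack at the end of the IB */

   for (unsigned i = 0; i < num_draws; i++) {
      /* counts[i] stands in for draws[i].count; assumed non-zero here. */
      unsigned num_subdraws = DIV_ROUND_UP(counts[i], prims_per_batch);

      dw += 8 * num_subdraws +       /* signal REWIND per subdraw */
            14 +                     /* user SGPRs (first subdraw of the draw) */
            4 * (num_subdraws - 1) + /* user SGPR updates for later subdraws */
            11 * num_subdraws;       /* grid registers + DISPATCH_DIRECT */
   }
   return dw;
}

int main(void)
{
   unsigned counts[] = {300000, 4500};
   printf("need_compute_dw = %u\n",
          need_compute_dwords(counts, 2, 128 * 1024 /* placeholder value */));
   return 0;
}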
@@ -745,22 +762,29 @@ void si_compute_signal_gfx(struct si_context *sctx)
/* Dispatch a primitive discard compute shader. */
void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
const struct pipe_draw_info *info,
- unsigned count, unsigned index_size,
- unsigned base_vertex, uint64_t input_indexbuf_va,
- unsigned input_indexbuf_num_elements)
+ const struct pipe_draw_start_count_bias *draws,
+ unsigned num_draws, unsigned index_size,
+ unsigned total_count, uint64_t input_indexbuf_va,
+ unsigned index_max_size)
{
struct radeon_cmdbuf *gfx_cs = &sctx->gfx_cs;
struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs;
- unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, count);
- if (!num_prims_per_instance)
- return;
-
- unsigned num_prims = num_prims_per_instance * info->instance_count;
+ unsigned num_total_prims;
unsigned vertices_per_prim, output_indexbuf_format, gfx10_output_indexbuf_format;
+ if (!info->instance_count)
+ return;
+
switch (info->mode) {
case PIPE_PRIM_TRIANGLES:
case PIPE_PRIM_TRIANGLE_STRIP:
+ if (info->mode == PIPE_PRIM_TRIANGLES)
+ num_total_prims = total_count / 3;
+ else if (total_count >= 2)
+ num_total_prims = total_count - 2; /* tri strip approximation ignoring multi draws */
+ else
+ num_total_prims = 0;
+
vertices_per_prim = 3;
output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32;
gfx10_output_indexbuf_format = V_008F0C_GFX10_FORMAT_32_32_32_UINT;
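For reference, the primitive counts derived above only cover the two topologies this path handles, and the strip case is an approximation that ignores draw boundaries, as the inline comment notes. A tiny hypothetical sketch of that arithmetic:

/* Minimal hypothetical sketch of the primitive-count arithmetic above. */
#include <stdio.h>

enum mode { MODE_TRIANGLES, MODE_TRIANGLE_STRIP };

static unsigned prims_for_vertices(enum mode m, unsigned vertex_count)
{
   if (m == MODE_TRIANGLES)
      return vertex_count / 3;
   /* Strip: count - 2, treating all draws as one strip (the approximation
    * noted in the comment above). */
   return vertex_count >= 2 ? vertex_count - 2 : 0;
}

int main(void)
{
   /* Three 100-vertex strips submitted as one multi draw: total_count = 300. */
   printf("strip approximation: %u prims\n",
          prims_for_vertices(MODE_TRIANGLE_STRIP, 300)); /* 298 */
   printf("triangle list:       %u prims\n",
          prims_for_vertices(MODE_TRIANGLES, 300));      /* 100 */
   return 0;
}

The 298 here slightly overestimates the exact 3 * 98 = 294, which is harmless because in this function the total is only used to size the output index buffer and its typed buffer view; the per-draw dispatch loop recomputes the count per draw.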
@@ -770,8 +794,13 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
return;
}
+ if (!num_total_prims)
+ return;
+
+ num_total_prims *= info->instance_count;
+
unsigned out_indexbuf_offset;
- uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4;
+ uint64_t output_indexbuf_size = num_total_prims * vertices_per_prim * 4;
/* Initialize the compute IB if it's empty. */
if (!sctx->prim_discard_compute_ib_initialized) {
@@ -789,7 +818,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
/* This needs to be done at the beginning of IBs due to possible
* TTM buffer moves in the kernel.
*/
- if (sctx->chip_class >= GFX10) {
+ if (sctx->chip_class >= GFX10) { /* 8 DW */
radeon_begin(cs);
radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
radeon_emit(cs, 0); /* CP_COHER_CNTL */
@@ -811,9 +840,9 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
S_0085F0_SH_KCACHE_ACTION_ENA(1));
}
- si_emit_initial_compute_regs(sctx, cs);
+ si_emit_initial_compute_regs(sctx, cs); /* 39 DW */
- radeon_begin(cs);
+ radeon_begin(cs); /* 11 DW */
radeon_set_sh_reg(
cs, R_00B860_COMPUTE_TMPRING_SIZE,
S_00B860_WAVES(sctx->scratch_waves) | S_00B860_WAVESIZE(0)); /* no scratch */
@@ -832,12 +861,13 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
assert(!sctx->last_ib_barrier_fence);
radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, RADEON_USAGE_READ,
RADEON_PRIO_FENCE);
- si_cp_wait_mem(sctx, cs,
+ si_cp_wait_mem(sctx, cs, /* 7 DW */
sctx->last_ib_barrier_buf->gpu_address + sctx->last_ib_barrier_buf_offset,
1, 1, WAIT_REG_MEM_EQUAL);
}
sctx->prim_discard_compute_ib_initialized = true;
+ assert(cs->current.cdw <= COMPUTE_PREAMBLE_SIZE);
}
/* Allocate the output index buffer. */
@@ -864,7 +894,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
/* Input index buffer. */
desc[0] = input_indexbuf_va;
desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | S_008F04_STRIDE(index_size);
- desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1);
+ desc[2] = index_max_size * (sctx->chip_class == GFX8 ? index_size : 1);
if (sctx->chip_class >= GFX10) {
desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
@@ -885,7 +915,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
desc[4] = out_indexbuf_va;
desc[5] =
S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | S_008F04_STRIDE(vertices_per_prim * 4);
- desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1);
+ desc[6] = num_total_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1);
if (sctx->chip_class >= GFX10) {
desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
@@ -911,7 +941,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
/* Set user data SGPRs. */
/* This can't be >= 16 if we want the fastest launch rate. */
- unsigned user_sgprs = 10;
+ unsigned user_sgprs = info->instance_count > 1 ? 12 : 10;
uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset;
unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX);
@@ -921,18 +951,9 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
uint64_t vb_desc_va = sctx->vb_descriptors_buffer
? sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset
: 0;
- struct si_fast_udiv_info32 num_prims_udiv = {};
-
- if (info->instance_count > 1)
- num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31);
-
- /* Limitations on how these two are packed in the user SGPR. */
- assert(num_prims_udiv.post_shift < 32);
- assert(num_prims_per_instance < 1 << 27);
-
si_resource_reference(&indexbuf_desc, NULL);
- /* Set shader registers. */
+ /* Set the compute shader. */
struct si_shader *shader = sctx->cs_prim_discard_state.current;
if (shader != sctx->compute_ib_last_shader) {
@@ -969,91 +990,141 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
STATIC_ASSERT(PRIMS_PER_BATCH % THREADGROUP_SIZE == 0);
- /* Big draw calls are split into smaller dispatches and draw packets. */
- for (unsigned start_prim = 0; start_prim < num_prims; start_prim = num_prims /* implement splitting */) {
- unsigned num_subdraw_prims;
+ struct si_fast_udiv_info32 num_prims_udiv = {};
+
+ for (unsigned i = 0; i < num_draws; i++) {
+ unsigned count = draws[i].count;
+ unsigned num_prims_per_instance, num_prims;
- if (start_prim + PRIMS_PER_BATCH < num_prims)
- num_subdraw_prims = PRIMS_PER_BATCH;
+ /* Determine the number of primitives per instance. */
+ if (info->mode == PIPE_PRIM_TRIANGLES)
+ num_prims_per_instance = count / 3;
+ else if (count >= 2)
+ num_prims_per_instance = count - 2;
else
- num_subdraw_prims = num_prims - start_prim;
+ num_prims_per_instance = 0;
- /* Small dispatches are executed back to back until a specific primitive
- * count is reached. Then, a CS_DONE is inserted to signal the gfx IB
- * to start drawing the batch. This batching adds latency to the gfx IB,
- * but CS_DONE and REWIND are too slow.
- */
- if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH)
- si_compute_signal_gfx(sctx);
+ if (!num_prims_per_instance)
+ continue;
- if (sctx->compute_num_prims_in_batch == 0) {
- assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi);
- sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4;
+ num_prims = num_prims_per_instance;
- radeon_begin(gfx_cs);
- radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
- radeon_emit(gfx_cs, 0);
- radeon_end();
+ if (info->instance_count > 1) {
+ num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31);
+ num_prims *= info->instance_count;
}
- sctx->compute_num_prims_in_batch += num_subdraw_prims;
+ /* Limitations on how these two are packed in the user SGPR. */
+ assert(num_prims_udiv.post_shift < 32);
+ assert(num_prims_per_instance < 1 << 27);
- uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4;
- uint64_t index_va = out_indexbuf_va + start_prim * 12;
+ /* Big draw calls are split into smaller dispatches and draw packets. */
+ for (unsigned start_prim = 0; start_prim < num_prims; start_prim += PRIMS_PER_BATCH) {
+ unsigned num_subdraw_prims;
- /* Emit the draw packet into the gfx IB. */
- radeon_begin(gfx_cs);
- radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0));
- radeon_emit(gfx_cs, num_prims * vertices_per_prim);
- radeon_emit(gfx_cs, index_va);
- radeon_emit(gfx_cs, index_va >> 32);
- radeon_emit(gfx_cs, 0);
- radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA);
- radeon_end();
+ if (start_prim + PRIMS_PER_BATCH < num_prims) {
+ num_subdraw_prims = PRIMS_PER_BATCH;
+ } else {
+ num_subdraw_prims = num_prims - start_prim;
+ }
- radeon_begin_again(cs);
-
- /* Continue with the compute IB. */
- if (start_prim == 0) {
- radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs);
- radeon_emit(cs, index_buffers_va);
- radeon_emit(cs, count_va);
- radeon_emit(cs, vb_desc_va);
- radeon_emit(cs, vs_const_desc_va);
- radeon_emit(cs, vs_sampler_desc_va);
- radeon_emit(cs, base_vertex);
- radeon_emit(cs, info->start_instance);
- radeon_emit(cs, num_prims_udiv.multiplier);
- radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5));
- /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
- radeon_emit(cs, fui(cull_info.small_prim_precision));
- } else {
-#if 0 /* TODO: draw splitting could be enabled */
- /* Only update the SGPRs that changed. */
- radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 1);
- radeon_emit(cs, count_va);
-#endif
- }
+ /* Small dispatches are executed back to back until a specific primitive
+ * count is reached. Then, a CS_DONE is inserted to signal the gfx IB
+ * to start drawing the batch. This batching adds latency to the gfx IB,
+ * but CS_DONE and REWIND are too slow.
+ */
+ if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH)
+ si_compute_signal_gfx(sctx);
+
+ if (sctx->compute_num_prims_in_batch == 0) {
+ assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi);
+ sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4;
+
+ radeon_begin(gfx_cs);
+ radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
+ radeon_emit(gfx_cs, 0);
+ radeon_end();
+ }
- /* Set grid dimensions. */
- unsigned start_block = start_prim / THREADGROUP_SIZE;
- unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
- unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;
-
- radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block);
- radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X,
- S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) |
- S_00B81C_NUM_THREAD_PARTIAL(partial_block_size));
-
- radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1));
- radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size);
- radeon_emit(cs, 1);
- radeon_emit(cs, 1);
- radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_PARTIAL_TG_EN(!!partial_block_size) |
- S_00B800_ORDER_MODE(0 /* launch in order */));
- radeon_end();
+ sctx->compute_num_prims_in_batch += num_subdraw_prims;
+
+ uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4;
+ uint64_t index_va = out_indexbuf_va + start_prim * 12;
+
+ /* Emit the draw packet into the gfx IB. */
+ radeon_begin(gfx_cs);
+ radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0));
+ radeon_emit(gfx_cs, num_subdraw_prims * vertices_per_prim);
+ radeon_emit(gfx_cs, index_va);
+ radeon_emit(gfx_cs, index_va >> 32);
+ radeon_emit(gfx_cs, 0);
+ radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA);
+ radeon_end();
- assert(cs->current.cdw <= cs->current.max_dw);
- assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw);
+ radeon_begin_again(cs);
+
+ /* Continue with the compute IB. */
+ if (start_prim == 0) {
+ if (i == 0) {
+ /* First draw. */
+ radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs);
+ radeon_emit(cs, count_va);
+ radeon_emit(cs, start_prim);
+ radeon_emit(cs, draws[i].start);
+ radeon_emit(cs, index_size ? draws[i].index_bias : draws[i].start);
+ radeon_emit(cs, index_buffers_va);
+ radeon_emit(cs, vb_desc_va);
+ radeon_emit(cs, vs_const_desc_va);
+ radeon_emit(cs, vs_sampler_desc_va);
+ radeon_emit(cs, info->start_instance);
+ /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */
+ radeon_emit(cs, fui(cull_info.small_prim_precision));
+
+ if (info->instance_count > 1) {
+ radeon_emit(cs, num_prims_udiv.multiplier);
+ radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5));
+ }
+ } else {
+ /* Subsequent draws. */
+ radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 4);
+ radeon_emit(cs, count_va);
+ radeon_emit(cs, 0);
+ radeon_emit(cs, draws[i].start);
+ radeon_emit(cs, index_size ? draws[i].index_bias : draws[i].start);
+
+ if (info->instance_count > 1) {
+ radeon_set_sh_reg_seq(cs, R_00B928_COMPUTE_USER_DATA_10, 2);
+ radeon_emit(cs, num_prims_udiv.multiplier);
+ radeon_emit(cs, num_prims_udiv.post_shift | (num_prims_per_instance << 5));
+ }
+ }
+ } else {
+ /* Draw split. Only update the SGPRs that changed. */
+ radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
+ radeon_emit(cs, count_va);
+ radeon_emit(cs, start_prim);
+ }
+
+ /* Set grid dimensions. */
+ unsigned start_block = start_prim / THREADGROUP_SIZE;
+ unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
+ unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;
+
+ radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block);
+ radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X,
+ S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) |
+ S_00B81C_NUM_THREAD_PARTIAL(partial_block_size));
+
+ radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1));
+ radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size);
+ radeon_emit(cs, 1);
+ radeon_emit(cs, 1);
+ radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_PARTIAL_TG_EN(!!partial_block_size) |
+ S_00B800_ORDER_MODE(0 /* launch in order */));
+ radeon_end();
+
+ assert(cs->current.cdw <= cs->current.max_dw);
+ assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw);
+ }
}
}
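The per-subdraw grid setup in the loop above maps one compute thread to one input primitive and dispatches full threadgroups plus an optional partial one. A standalone, hypothetical sketch of that splitting and grid math; THREADGROUP_SIZE and PRIMS_PER_BATCH are defined elsewhere in the file, and the values below are assumptions for illustration only.

/* Hypothetical sketch of the subdraw splitting and grid math used above. */
#include <stdio.h>

int main(void)
{
   const unsigned THREADGROUP_SIZE = 256;       /* assumption */
   const unsigned PRIMS_PER_BATCH = 128 * 1024; /* assumption */
   unsigned num_prims = 300000; /* primitives in one draw after instancing */

   for (unsigned start_prim = 0; start_prim < num_prims; start_prim += PRIMS_PER_BATCH) {
      unsigned num_subdraw_prims = (start_prim + PRIMS_PER_BATCH < num_prims)
                                      ? PRIMS_PER_BATCH
                                      : num_prims - start_prim;
      /* One thread per primitive: full blocks plus an optional partial block. */
      unsigned start_block = start_prim / THREADGROUP_SIZE;
      unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE;
      unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE;

      printf("subdraw at prim %u: start_block=%u dispatch_x=%u partial=%u\n",
             start_prim, start_block,
             start_block + num_full_blocks + !!partial_block_size,
             partial_block_size);
   }
   return 0;
}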
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index a58f26bb018..3d38cf05fc4 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1518,9 +1518,10 @@ si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe
void si_compute_signal_gfx(struct si_context *sctx);
void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
const struct pipe_draw_info *info,
- unsigned count, unsigned index_size,
- unsigned base_vertex, uint64_t input_indexbuf_va,
- unsigned input_indexbuf_max_elements);
+ const struct pipe_draw_start_count_bias *draws,
+ unsigned num_draws, unsigned index_size,
+ unsigned total_count, uint64_t input_indexbuf_va,
+ unsigned index_max_size);
void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context,
unsigned *prim_discard_vertex_count_threshold,
unsigned *index_ring_size_per_ib);
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c
index 574d8b351b2..2d643c58cf3 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c
@@ -1292,7 +1292,8 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
}
/* Make sure the input is a pointer and not integer followed by inttoptr. */
- assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind);
+ if (!shader->key.opt.vs_as_prim_discard_cs)
+ assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind);
/* Compile to bytecode. */
if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, &ctx.ac, debug,
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index 1a525f8d8fa..a3823220f96 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -978,7 +978,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
unsigned drawid_base,
const struct pipe_draw_indirect_info *indirect,
const struct pipe_draw_start_count_bias *draws,
- unsigned num_draws,
+ unsigned num_draws, unsigned total_count,
struct pipe_resource *indexbuf, unsigned index_size,
unsigned index_offset, unsigned instance_count,
bool dispatch_prim_discard_cs, unsigned original_index_size)
@@ -1194,13 +1194,9 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
if (ALLOW_PRIM_DISCARD_CS && dispatch_prim_discard_cs) {
radeon_end();
- for (unsigned i = 0; i < num_draws; i++) {
- uint64_t va = index_va + draws[i].start * original_index_size;
-
- si_dispatch_prim_discard_cs_and_draw(sctx, info, draws[i].count,
- original_index_size, base_vertex,
- va, MIN2(index_max_size, draws[i].count));
- }
+ si_dispatch_prim_discard_cs_and_draw(sctx, info, draws, num_draws,
+ original_index_size, total_count, index_va,
+ index_max_size);
EMIT_SQTT_END_DRAW;
return;
}
@@ -2172,8 +2168,8 @@ static void si_draw_vbo(struct pipe_context *ctx,
assert(sctx->dirty_atoms == 0);
si_emit_draw_packets<GFX_VERSION, NGG, ALLOW_PRIM_DISCARD_CS>
- (sctx, info, drawid_offset, indirect, draws, num_draws, indexbuf, index_size,
- index_offset, instance_count, dispatch_prim_discard_cs,
+ (sctx, info, drawid_offset, indirect, draws, num_draws, total_direct_count, indexbuf,
+ index_size, index_offset, instance_count, dispatch_prim_discard_cs,
original_index_size);
/* <-- CUs are busy here. */
@@ -2211,9 +2207,9 @@ static void si_draw_vbo(struct pipe_context *ctx,
assert(sctx->dirty_atoms == 0);
si_emit_draw_packets<GFX_VERSION, NGG, ALLOW_PRIM_DISCARD_CS>
- (sctx, info, drawid_offset, indirect, draws, num_draws, indexbuf, index_size,
- index_offset, instance_count,
- dispatch_prim_discard_cs, original_index_size);
+ (sctx, info, drawid_offset, indirect, draws, num_draws, total_direct_count, indexbuf,
+ index_size, index_offset, instance_count, dispatch_prim_discard_cs,
+ original_index_size);
/* Prefetch the remaining shaders after the draw has been
* started. */