[Mesa-dev] [PATCH 11/11] radeonsi: use optimal packet order when doing a pipeline sync
Marek Olšák
maraeo at gmail.com
Sun Aug 6 22:20:26 UTC 2017
From: Marek Olšák <marek.olsak at amd.com>
Process most new SET packets in parallel with previous draw calls, then
flush caches and wait, start the draw, and do L2 prefetches last.
This decreases the [CP busy / SPI busy] ratio (verified with GRBM perf
counters). In other words, the time window when shaders are idle (between
the wait and the draw) is much shorter now.
---
src/gallium/drivers/radeonsi/si_state_draw.c | 117 +++++++++++++++++++--------
1 file changed, 83 insertions(+), 34 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 9df5b7a..22b7f56 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -1155,28 +1155,63 @@ void si_ce_pre_draw_synchronization(struct si_context *sctx)
void si_ce_post_draw_synchronization(struct si_context *sctx)
{
if (sctx->ce_need_synchronization) {
radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_INCREMENT_DE_COUNTER, 0, 0));
radeon_emit(sctx->b.gfx.cs, 0); /* unused */
sctx->ce_need_synchronization = false;
}
}
+static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,
+ unsigned skip_atom_mask)
+{
+ /* Emit state atoms. */
+ unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
+ while (mask) {
+ struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];
+
+ atom->emit(&sctx->b, atom);
+ }
+ sctx->dirty_atoms &= skip_atom_mask;
+
+ /* Emit states. */
+ mask = sctx->dirty_states;
+ while (mask) {
+ unsigned i = u_bit_scan(&mask);
+ struct si_pm4_state *state = sctx->queued.array[i];
+
+ if (!state || sctx->emitted.array[i] == state)
+ continue;
+
+ si_pm4_emit(sctx, state);
+ sctx->emitted.array[i] = state;
+ }
+ sctx->dirty_states = 0;
+
+ /* Emit draw states. */
+ unsigned num_patches = 0;
+
+ si_emit_rasterizer_prim_state(sctx);
+ if (sctx->tes_shader.cso)
+ si_emit_derived_tess_state(sctx, info, &num_patches);
+ si_emit_vs_state(sctx, info);
+ si_emit_draw_registers(sctx, info, num_patches);
+}
+
void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
{
struct si_context *sctx = (struct si_context *)ctx;
struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
struct pipe_resource *indexbuf = info->index.resource;
- unsigned mask, dirty_tex_counter;
+ unsigned dirty_tex_counter;
enum pipe_prim_type rast_prim;
- unsigned num_patches = 0;
unsigned index_size = info->index_size;
unsigned index_offset = info->indirect ? info->start * index_size : 0;
if (likely(!info->indirect)) {
/* SI-CI treat instance_count==0 as instance_count==1. There is
* no workaround for indirect draws, but we can at least skip
* direct draws.
*/
if (unlikely(!info->instance_count))
return;
@@ -1244,23 +1279,20 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) {
sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix;
sctx->do_update_shaders = true;
}
}
if (sctx->do_update_shaders && !si_update_shaders(sctx))
return;
- if (!si_upload_graphics_shader_descriptors(sctx))
- return;
-
if (index_size) {
/* Translate or upload, if needed. */
/* 8-bit indices are supported on VI. */
if (sctx->b.chip_class <= CIK && index_size == 1) {
unsigned start, count, start_offset, size, offset;
void *ptr;
si_get_draw_start_count(sctx, info, &start, &count);
start_offset = start * 2;
size = count * 2;
@@ -1335,58 +1367,75 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
*/
if (!si_upload_vertex_buffer_descriptors(sctx))
return;
/* GFX9 scissor bug workaround. There is also a more efficient but
* more involved alternative workaround. */
if (sctx->b.chip_class == GFX9 &&
si_is_atom_dirty(sctx, &sctx->b.scissors.atom))
sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
- /* Flush caches before the first state atom, which does L2 prefetches. */
- if (sctx->b.flags)
+ /* Use optimal packet order based on whether we need to sync the pipeline. */
+ if (unlikely(sctx->b.flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
+ SI_CONTEXT_FLUSH_AND_INV_DB |
+ SI_CONTEXT_PS_PARTIAL_FLUSH |
+ SI_CONTEXT_CS_PARTIAL_FLUSH))) {
+ /* If we have to wait for idle, set all states first, so that all
+ * SET packets are processed in parallel with previous draw calls.
+ * Then upload descriptors, set shader pointers, and draw, and
+ * prefetch at the end. This ensures that the time the CUs
+ * are idle is very short. (there are only CE dumps and SET_SH
+ * packets between the wait and the draw)
+ */
+ struct r600_atom *shader_pointers = &sctx->shader_pointers.atom;
+
+ /* Emit all states except shader pointers. */
+ si_emit_all_states(sctx, info, 1 << shader_pointers->id);
si_emit_cache_flush(sctx);
- if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
- cik_emit_prefetch_L2(sctx);
+ /* <-- CUs are idle here. */
+ if (!si_upload_graphics_shader_descriptors(sctx))
+ return;
- /* Emit state atoms. */
- mask = sctx->dirty_atoms;
- while (mask) {
- struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];
+ /* Set shader pointers after descriptors are uploaded. */
+ if (si_is_atom_dirty(sctx, shader_pointers)) {
+ shader_pointers->emit(&sctx->b, NULL);
+ sctx->dirty_atoms = 0;
+ }
- atom->emit(&sctx->b, atom);
- }
- sctx->dirty_atoms = 0;
+ si_ce_pre_draw_synchronization(sctx);
+ si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
+ /* <-- CUs are busy here. */
- /* Emit states. */
- mask = sctx->dirty_states;
- while (mask) {
- unsigned i = u_bit_scan(&mask);
- struct si_pm4_state *state = sctx->queued.array[i];
+ /* Start prefetches after the draw has been started. Both will run
+ * in parallel, but starting the draw first is more important.
+ */
+ if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
+ cik_emit_prefetch_L2(sctx);
+ } else {
+ /* If we don't wait for idle, start prefetches first, then set
+ * states, and draw at the end.
+ */
+ if (sctx->b.flags)
+ si_emit_cache_flush(sctx);
- if (!state || sctx->emitted.array[i] == state)
- continue;
+ if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
+ cik_emit_prefetch_L2(sctx);
- si_pm4_emit(sctx, state);
- sctx->emitted.array[i] = state;
- }
- sctx->dirty_states = 0;
+ if (!si_upload_graphics_shader_descriptors(sctx))
+ return;
- si_emit_rasterizer_prim_state(sctx);
- if (sctx->tes_shader.cso)
- si_emit_derived_tess_state(sctx, info, &num_patches);
- si_emit_vs_state(sctx, info);
- si_emit_draw_registers(sctx, info, num_patches);
+ si_emit_all_states(sctx, info, 0);
+ si_ce_pre_draw_synchronization(sctx);
+ si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
+ }
- si_ce_pre_draw_synchronization(sctx);
- si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
si_ce_post_draw_synchronization(sctx);
if (sctx->trace_buf)
si_trace_emit(sctx);
/* Workaround for a VGT hang when streamout is enabled.
* It must be done after drawing. */
if ((sctx->b.family == CHIP_HAWAII ||
sctx->b.family == CHIP_TONGA ||
sctx->b.family == CHIP_FIJI) &&
--
2.7.4
More information about the mesa-dev
mailing list