[Mesa-dev] [PATCH 07/17] radeonsi: atomize L2 prefetches
Marek Olšák
maraeo at gmail.com
Thu Jan 26 16:04:23 UTC 2017
From: Marek Olšák <marek.olsak at amd.com>
Turn the CIK+ L2 prefetches into a state atom in order to move the big conditional statement out of si_draw_vbo.
---
src/gallium/drivers/radeonsi/si_cp_dma.c | 39 +++++++++++++++++++++++++
src/gallium/drivers/radeonsi/si_descriptors.c | 2 ++
src/gallium/drivers/radeonsi/si_hw_context.c | 3 ++
src/gallium/drivers/radeonsi/si_pipe.h | 1 +
src/gallium/drivers/radeonsi/si_state.h | 1 +
src/gallium/drivers/radeonsi/si_state_draw.c | 37 +----------------------
src/gallium/drivers/radeonsi/si_state_shaders.c | 3 ++
7 files changed, 50 insertions(+), 36 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 582e599..b398256 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -399,14 +399,53 @@ void si_copy_buffer(struct si_context *sctx,
}
void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
uint64_t offset, unsigned size)
{
assert(sctx->b.chip_class >= CIK);
si_copy_buffer(sctx, buf, buf, offset, offset, size, SI_CPDMA_SKIP_ALL);
}
+static void cik_prefetch_shader_async(struct si_context *sctx,
+ struct si_pm4_state *state)
+{
+ if (state) {
+ struct pipe_resource *bo = &state->bo[0]->b.b;
+ assert(state->nbo == 1);
+
+ cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0);
+ }
+}
+
+static void cik_emit_prefetch_L2(struct si_context *sctx, struct r600_atom *atom)
+{
+ /* Prefetch shaders and VBO descriptors to TC L2. */
+ if (si_pm4_state_changed(sctx, ls))
+ cik_prefetch_shader_async(sctx, sctx->queued.named.ls);
+ if (si_pm4_state_changed(sctx, hs))
+ cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
+ if (si_pm4_state_changed(sctx, es))
+ cik_prefetch_shader_async(sctx, sctx->queued.named.es);
+ if (si_pm4_state_changed(sctx, gs))
+ cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
+ if (si_pm4_state_changed(sctx, vs))
+ cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
+
+ /* Vertex buffer descriptors are uploaded uncached, so prefetch
+ * them right after the VS binary. */
+ if (sctx->vertex_buffer_pointer_dirty) {
+ cik_prefetch_TC_L2_async(sctx, &sctx->vertex_buffers.buffer->b.b,
+ sctx->vertex_buffers.buffer_offset,
+ sctx->vertex_elements->count * 16);
+ }
+ if (si_pm4_state_changed(sctx, ps))
+ cik_prefetch_shader_async(sctx, sctx->queued.named.ps);
+}
+
void si_init_cp_dma_functions(struct si_context *sctx)
{
sctx->b.clear_buffer = si_clear_buffer;
+
+ si_init_atom(sctx, &sctx->prefetch_L2, &sctx->atoms.s.prefetch_L2,
+ cik_emit_prefetch_L2);
}
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 4a9fcd0..4c1120a 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -1031,20 +1031,22 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
(struct r600_resource*)vb->buffer,
RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
}
}
/* Don't flush the const cache. It would have a very negative effect
* on performance (confirmed by testing). New descriptors are always
* uploaded to a fresh new buffer, so I don't think flushing the const
* cache is needed. */
si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
+ if (sctx->b.chip_class >= CIK)
+ si_mark_atom_dirty(sctx, &sctx->prefetch_L2);
sctx->vertex_buffers_dirty = false;
sctx->vertex_buffer_pointer_dirty = true;
return true;
}
/* CONSTANT BUFFERS */
static unsigned
si_const_buffer_descriptors_idx(unsigned shader)
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index 57eaac9..d862e26 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -200,20 +200,23 @@ void si_begin_new_cs(struct si_context *ctx)
si_pm4_emit(ctx, ctx->init_config_gs_rings);
if (ctx->ce_preamble_ib)
si_ce_enable_loads(ctx->ce_preamble_ib);
else if (ctx->ce_ib)
si_ce_enable_loads(ctx->ce_ib);
if (ctx->ce_preamble_ib)
si_ce_reinitialize_all_descriptors(ctx);
+ if (ctx->b.chip_class >= CIK)
+ si_mark_atom_dirty(ctx, &ctx->prefetch_L2);
+
ctx->framebuffer.dirty_cbufs = (1 << 8) - 1;
ctx->framebuffer.dirty_zsbuf = true;
si_mark_atom_dirty(ctx, &ctx->framebuffer.atom);
si_mark_atom_dirty(ctx, &ctx->clip_regs);
si_mark_atom_dirty(ctx, &ctx->clip_state.atom);
ctx->msaa_sample_locs.nr_samples = 0;
si_mark_atom_dirty(ctx, &ctx->msaa_sample_locs.atom);
si_mark_atom_dirty(ctx, &ctx->msaa_config);
si_mark_atom_dirty(ctx, &ctx->sample_mask.atom);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 6558474..b6474e6 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -225,20 +225,21 @@ struct si_context {
bool compute_is_busy;
/* Atoms (direct states). */
union si_state_atoms atoms;
unsigned dirty_atoms; /* mask */
/* PM4 states (precomputed immutable states) */
union si_state queued;
union si_state emitted;
/* Atom declarations. */
+ struct r600_atom prefetch_L2;
struct si_framebuffer framebuffer;
struct si_sample_locs msaa_sample_locs;
struct r600_atom db_render_state;
struct r600_atom msaa_config;
struct si_sample_mask sample_mask;
struct r600_atom cb_render_state;
struct si_blend_color blend_color;
struct r600_atom clip_regs;
struct si_clip_state clip_state;
struct si_shader_data shader_userdata;
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 03e5011..915a8eb 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -125,20 +125,21 @@ union si_state {
struct si_pm4_state *vgt_shader_config;
struct si_pm4_state *vs;
struct si_pm4_state *ps;
} named;
struct si_pm4_state *array[0];
};
union si_state_atoms {
struct {
/* The order matters. */
+ struct r600_atom *prefetch_L2;
struct r600_atom *render_cond;
struct r600_atom *streamout_begin;
struct r600_atom *streamout_enable; /* must be after streamout_begin */
struct r600_atom *framebuffer;
struct r600_atom *msaa_sample_locs;
struct r600_atom *db_render_state;
struct r600_atom *msaa_config;
struct r600_atom *sample_mask;
struct r600_atom *cb_render_state;
struct r600_atom *blend_color;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index db671c9..0374841 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -933,31 +933,20 @@ void si_ce_pre_draw_synchronization(struct si_context *sctx)
void si_ce_post_draw_synchronization(struct si_context *sctx)
{
if (sctx->ce_need_synchronization) {
radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_INCREMENT_DE_COUNTER, 0, 0));
radeon_emit(sctx->b.gfx.cs, 0);
sctx->ce_need_synchronization = false;
}
}
-static void cik_prefetch_shader_async(struct si_context *sctx,
- struct si_pm4_state *state)
-{
- if (state) {
- struct pipe_resource *bo = &state->bo[0]->b.b;
- assert(state->nbo == 1);
-
- cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0);
- }
-}
-
void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
{
struct si_context *sctx = (struct si_context *)ctx;
struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
struct pipe_index_buffer ib = {};
unsigned mask, dirty_fb_counter, dirty_tex_counter, rast_prim;
if (likely(!info->indirect)) {
/* SI-CI treat instance_count==0 as instance_count==1. There is
* no workaround for indirect draws, but we can at least skip
@@ -1122,48 +1111,24 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
si_need_cs_space(sctx);
/* Since we've called r600_context_add_resource_size for vertex buffers,
* this must be called after si_need_cs_space, because we must let
* need_cs_space flush before we add buffers to the buffer list.
*/
if (!si_upload_vertex_buffer_descriptors(sctx))
return;
- /* Flushed caches prior to prefetching shaders. */
+ /* Flush caches before the first state atom, which does L2 prefetches. */
if (sctx->b.flags)
si_emit_cache_flush(sctx);
- /* Prefetch shaders and VBO descriptors to TC L2. */
- if (sctx->b.chip_class >= CIK) {
- if (si_pm4_state_changed(sctx, ls))
- cik_prefetch_shader_async(sctx, sctx->queued.named.ls);
- if (si_pm4_state_changed(sctx, hs))
- cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
- if (si_pm4_state_changed(sctx, es))
- cik_prefetch_shader_async(sctx, sctx->queued.named.es);
- if (si_pm4_state_changed(sctx, gs))
- cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
- if (si_pm4_state_changed(sctx, vs))
- cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
-
- /* Vertex buffer descriptors are uploaded uncached, so prefetch
- * them right after the VS binary. */
- if (sctx->vertex_buffer_pointer_dirty) {
- cik_prefetch_TC_L2_async(sctx, &sctx->vertex_buffers.buffer->b.b,
- sctx->vertex_buffers.buffer_offset,
- sctx->vertex_elements->count * 16);
- }
- if (si_pm4_state_changed(sctx, ps))
- cik_prefetch_shader_async(sctx, sctx->queued.named.ps);
- }
-
/* Emit states. */
mask = sctx->dirty_atoms;
while (mask) {
struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];
atom->emit(&sctx->b, atom);
}
sctx->dirty_atoms = 0;
si_pm4_emit_dirty(sctx);
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index b3616dc..02f8d6c 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -2518,20 +2518,23 @@ bool si_update_shaders(struct si_context *sctx)
if (si_pm4_state_changed(sctx, ls) ||
si_pm4_state_changed(sctx, hs) ||
si_pm4_state_changed(sctx, es) ||
si_pm4_state_changed(sctx, gs) ||
si_pm4_state_changed(sctx, vs) ||
si_pm4_state_changed(sctx, ps)) {
if (!si_update_spi_tmpring_size(sctx))
return false;
}
+ if (sctx->b.chip_class >= CIK)
+ si_mark_atom_dirty(sctx, &sctx->prefetch_L2);
+
sctx->do_update_shaders = false;
return true;
}
void si_init_shader_functions(struct si_context *sctx)
{
si_init_atom(sctx, &sctx->spi_map, &sctx->atoms.s.spi_map, si_emit_spi_map);
sctx->b.b.create_vs_state = si_create_shader_selector;
sctx->b.b.create_tcs_state = si_create_shader_selector;
--
2.7.4
More information about the mesa-dev mailing list