[Mesa-dev] [PATCH 6/6] radeonsi: simplify and improve flushing

Marek Olšák maraeo at gmail.com
Wed Aug 28 10:17:29 PDT 2013


This mimics r600g. The R600_CONTEXT_xxx flags are added to rctx->b.flags
and si_emit_cache_flush emits the packets. That's it. The shared radeon code
tells us when the streamout cache should be flushed, so we have to check
the flags anyway.

There is a new atom "cache_flush", because caches must be flushed *after*
resource descriptors are changed in memory.

Functional changes:

* Write caches are flushed at the end of CS and read caches are flushed
  at its beginning.

* Sampler view states are removed from si_state, they only held the flush
  flags.

* Every time a shader is changed, the I cache is flushed. Is this needed?
  Due to a hw bug, this also flushes the K cache.

* The WRITE_DATA packet is changed to use TC, which fixes a rendering issue
  in openarena. I'm not sure how TC interacts with CP DMA, but for now it
  seems to work better than any other solution I tried. (BTW CIK allows us
  to use TC for CP DMA.)

* Flush the K cache instead of the texture cache when updating resource
  descriptors (due to a hw bug, this also flushes the I cache).
  I think the K cache flush is correct here, but I'm not sure if the texture
  cache should be flushed too (probably not, considering that we use TC
  for WRITE_DATA, although we don't use TC for CP DMA).

* The number of resource contexts is decreased from 256 to 16. With all of these cache
  changes, 4 doesn't work, but 8 works, which suggests I'm actually doing
  the right thing here and the pipeline isn't drained during flushes.
---
 src/gallium/drivers/radeon/r600_pipe_common.h  |   1 +
 src/gallium/drivers/radeonsi/r600.h            |   3 -
 src/gallium/drivers/radeonsi/r600_hw_context.c |  45 +++-------
 src/gallium/drivers/radeonsi/radeonsi_pipe.c   |   4 +
 src/gallium/drivers/radeonsi/radeonsi_pipe.h   |   8 +-
 src/gallium/drivers/radeonsi/radeonsi_pm4.c    |  11 ---
 src/gallium/drivers/radeonsi/radeonsi_pm4.h    |   2 -
 src/gallium/drivers/radeonsi/si_commands.c     |   9 --
 src/gallium/drivers/radeonsi/si_descriptors.c  |  16 ++--
 src/gallium/drivers/radeonsi/si_state.c        |  46 +++++-----
 src/gallium/drivers/radeonsi/si_state.h        |   9 +-
 src/gallium/drivers/radeonsi/si_state_draw.c   | 111 ++++++++++++++++---------
 12 files changed, 125 insertions(+), 140 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 4b993ee..bd13488 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -42,6 +42,7 @@
 #define R600_CONTEXT_INV_VERTEX_CACHE		(1 << 0)
 #define R600_CONTEXT_INV_TEX_CACHE		(1 << 1)
 #define R600_CONTEXT_INV_CONST_CACHE		(1 << 2)
+#define R600_CONTEXT_INV_SHADER_CACHE		(1 << 3)
 /* read-write caches */
 #define R600_CONTEXT_STREAMOUT_FLUSH		(1 << 8)
 #define R600_CONTEXT_FLUSH_AND_INV		(1 << 9)
diff --git a/src/gallium/drivers/radeonsi/r600.h b/src/gallium/drivers/radeonsi/r600.h
index ebadd97..46cfb14 100644
--- a/src/gallium/drivers/radeonsi/r600.h
+++ b/src/gallium/drivers/radeonsi/r600.h
@@ -69,9 +69,6 @@ struct r600_query {
 	struct list_head			list;
 };
 
-#define R600_CONTEXT_DST_CACHES_DIRTY	(1 << 1)
-#define R600_CONTEXT_CHECK_EVENT_FLUSH	(1 << 2)
-
 struct r600_context;
 struct r600_screen;
 
diff --git a/src/gallium/drivers/radeonsi/r600_hw_context.c b/src/gallium/drivers/radeonsi/r600_hw_context.c
index 5631bdb..5826349 100644
--- a/src/gallium/drivers/radeonsi/r600_hw_context.c
+++ b/src/gallium/drivers/radeonsi/r600_hw_context.c
@@ -150,7 +150,7 @@ void si_need_cs_space(struct r600_context *ctx, unsigned num_dw,
 	}
 
 	/* Count in framebuffer cache flushes at the end of CS. */
-	num_dw += 7; /* one SURFACE_SYNC and CACHE_FLUSH_AND_INV (r6xx-only) */
+	num_dw += ctx->atoms.cache_flush->num_dw;
 
 	/* Save 16 dwords for the fence mechanism. */
 	num_dw += 16;
@@ -167,37 +167,6 @@ void si_need_cs_space(struct r600_context *ctx, unsigned num_dw,
 	}
 }
 
-static void r600_flush_framebuffer(struct r600_context *ctx)
-{
-	struct si_pm4_state *pm4;
-
-	if (!(ctx->flags & R600_CONTEXT_DST_CACHES_DIRTY))
-		return;
-
-	pm4 = si_pm4_alloc_state(ctx);
-
-	if (pm4 == NULL)
-		return;
-
-	si_cmd_surface_sync(pm4, S_0085F0_CB0_DEST_BASE_ENA(1) |
-				S_0085F0_CB1_DEST_BASE_ENA(1) |
-				S_0085F0_CB2_DEST_BASE_ENA(1) |
-				S_0085F0_CB3_DEST_BASE_ENA(1) |
-				S_0085F0_CB4_DEST_BASE_ENA(1) |
-				S_0085F0_CB5_DEST_BASE_ENA(1) |
-				S_0085F0_CB6_DEST_BASE_ENA(1) |
-				S_0085F0_CB7_DEST_BASE_ENA(1) |
-				S_0085F0_DB_ACTION_ENA(1) |
-				S_0085F0_DB_DEST_BASE_ENA(1));
-	si_cmd_flush_and_inv_cb_meta(pm4);
-
-	si_pm4_emit(ctx, pm4);
-	si_pm4_free_state(ctx, pm4, ~0);
-
-	ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY;
-	ctx->flush_and_inv_cb_meta = false;
-}
-
 void si_context_flush(struct r600_context *ctx, unsigned flags)
 {
 	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
@@ -223,7 +192,11 @@ void si_context_flush(struct r600_context *ctx, unsigned flags)
 	}
 #endif
 
-	r600_flush_framebuffer(ctx);
+	ctx->b.flags |= R600_CONTEXT_FLUSH_AND_INV_CB |
+			R600_CONTEXT_FLUSH_AND_INV_CB_META |
+			R600_CONTEXT_FLUSH_AND_INV_DB |
+			R600_CONTEXT_INV_TEX_CACHE;
+	si_emit_cache_flush(&ctx->b, NULL);
 
 	/* partial flush is needed to avoid lockups on some chips with user fences */
 	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
@@ -268,7 +241,11 @@ void si_context_flush(struct r600_context *ctx, unsigned flags)
 #endif
 
 	ctx->pm4_dirty_cdwords = 0;
-	ctx->flags = 0;
+
+	/* Flush read caches at the beginning of CS. */
+	ctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE |
+			R600_CONTEXT_INV_CONST_CACHE |
+			R600_CONTEXT_INV_SHADER_CACHE;
 
 	/* set all valid group as dirty so they get reemited on
 	 * next draw command
diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.c b/src/gallium/drivers/radeonsi/radeonsi_pipe.c
index 0d864a0..3771215 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_pipe.c
+++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.c
@@ -244,6 +244,10 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen, void
 
 	si_init_all_descriptors(rctx);
 
+	/* Initialize cache_flush. */
+	rctx->cache_flush = si_atom_cache_flush;
+	rctx->atoms.cache_flush = &rctx->cache_flush;
+
 	switch (rctx->b.chip_class) {
 	case SI:
 	case CIK:
diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.h b/src/gallium/drivers/radeonsi/radeonsi_pipe.h
index e6e99c7..14c02df 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_pipe.h
+++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.h
@@ -131,8 +131,12 @@ struct r600_context {
 
 	union {
 		struct {
+			/* The order matters. */
 			struct r600_atom *const_buffers[SI_NUM_SHADERS];
 			struct r600_atom *sampler_views[SI_NUM_SHADERS];
+			/* Caches must be flushed after resource descriptors are
+			 * updated in memory. */
+			struct r600_atom *cache_flush;
 		};
 		struct r600_atom *array[0];
 	} atoms;
@@ -179,7 +183,6 @@ struct r600_context {
 
 	unsigned		backend_mask;
 	unsigned                max_db; /* for OQ */
-	unsigned		flags;
 	boolean                 predicate_drawing;
 
 	unsigned		num_so_targets;
@@ -198,12 +201,11 @@ struct r600_context {
 	/* With rasterizer discard, there doesn't have to be a pixel shader.
 	 * In that case, we bind this one: */
 	struct si_pipe_shader	*dummy_pixel_shader;
+	struct r600_atom	cache_flush;
 
 	/* SI state handling */
 	union si_state	queued;
 	union si_state	emitted;
-
-	bool flush_and_inv_cb_meta;
 };
 
 /* r600_blit.c */
diff --git a/src/gallium/drivers/radeonsi/radeonsi_pm4.c b/src/gallium/drivers/radeonsi/radeonsi_pm4.c
index 9d0a7c0..37a199d 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_pm4.c
+++ b/src/gallium/drivers/radeonsi/radeonsi_pm4.c
@@ -145,17 +145,6 @@ void si_pm4_inval_texture_cache(struct si_pm4_state *state)
 	state->cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);
 }
 
-void si_pm4_inval_fb_cache(struct si_pm4_state *state, unsigned nr_cbufs)
-{
-	state->cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1);
-	state->cp_coher_cntl |= ((1 << nr_cbufs) - 1) << S_0085F0_CB0_DEST_BASE_ENA_SHIFT;
-}
-
-void si_pm4_inval_zsbuf_cache(struct si_pm4_state *state)
-{
-	state->cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1);
-}
-
 void si_pm4_free_state(struct r600_context *rctx,
 		       struct si_pm4_state *state,
 		       unsigned idx)
diff --git a/src/gallium/drivers/radeonsi/radeonsi_pm4.h b/src/gallium/drivers/radeonsi/radeonsi_pm4.h
index b74db08..2e32a19 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_pm4.h
+++ b/src/gallium/drivers/radeonsi/radeonsi_pm4.h
@@ -80,8 +80,6 @@ void si_pm4_sh_data_end(struct si_pm4_state *state, unsigned base, unsigned idx)
 
 void si_pm4_inval_shader_cache(struct si_pm4_state *state);
 void si_pm4_inval_texture_cache(struct si_pm4_state *state);
-void si_pm4_inval_fb_cache(struct si_pm4_state *state, unsigned nr_cbufs);
-void si_pm4_inval_zsbuf_cache(struct si_pm4_state *state);
 
 void si_pm4_free_state(struct r600_context *rctx,
 		       struct si_pm4_state *state,
diff --git a/src/gallium/drivers/radeonsi/si_commands.c b/src/gallium/drivers/radeonsi/si_commands.c
index e498bd2..bf95924 100644
--- a/src/gallium/drivers/radeonsi/si_commands.c
+++ b/src/gallium/drivers/radeonsi/si_commands.c
@@ -78,12 +78,3 @@ void si_cmd_surface_sync(struct si_pm4_state *pm4, uint32_t cp_coher_cntl)
 		si_pm4_cmd_end(pm4, false);
 	}
 }
-
-void si_cmd_flush_and_inv_cb_meta(struct si_pm4_state *pm4)
-{
-	si_pm4_cmd_begin(pm4, PKT3_EVENT_WRITE);
-	si_pm4_cmd_add(pm4,
-		       EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) |
-		       EVENT_INDEX(0));
-	si_pm4_cmd_end(pm4, false);
-}
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 2983d75..5d85448 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -30,7 +30,7 @@
 
 #include "util/u_memory.h"
 
-#define SI_NUM_CONTEXTS 256
+#define SI_NUM_CONTEXTS 16
 
 static uint32_t null_desc[8]; /* zeros */
 
@@ -142,7 +142,8 @@ static void si_release_descriptors(struct si_descriptors *desc)
 	pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL);
 }
 
-static void si_update_descriptors(struct si_descriptors *desc)
+static void si_update_descriptors(struct r600_context *rctx,
+				  struct si_descriptors *desc)
 {
 	if (desc->dirty_mask) {
 		desc->atom.num_dw =
@@ -150,6 +151,8 @@ static void si_update_descriptors(struct si_descriptors *desc)
 			(4 + desc->element_dw_size) * util_bitcount(desc->dirty_mask) + /* update */
 			4; /* pointer update */
 		desc->atom.dirty = true;
+		/* The descriptors are read with the K cache. */
+		rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE;
 	} else {
 		desc->atom.dirty = false;
 	}
@@ -185,6 +188,7 @@ static void si_emit_descriptors(struct r600_context *rctx,
 	va_base = r600_resource_va(rctx->b.b.screen, &desc->buffer->b.b);
 
 	/* Copy the descriptors to a new context slot. */
+	/* XXX Consider using TC or L2 for this copy on CIK. */
 	si_emit_cp_dma_copy_buffer(rctx,
 				   va_base + new_context_id * desc->context_size,
 				   va_base + desc->current_context_id * desc->context_size,
@@ -215,7 +219,7 @@ static void si_emit_descriptors(struct r600_context *rctx,
 			packet_size = 2 + desc->element_dw_size;
 
 			radeon_emit(cs, PKT3(PKT3_WRITE_DATA, packet_size, 0));
-			radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(PKT3_WRITE_DATA_DST_SEL_MEM_SYNC) |
+			radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(PKT3_WRITE_DATA_DST_SEL_TC_OR_L2) |
 					     PKT3_WRITE_DATA_WR_CONFIRM |
 					     PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME));
 			radeon_emit(cs, va & 0xFFFFFFFFUL);
@@ -322,7 +326,7 @@ void si_set_sampler_view(struct r600_context *rctx, unsigned shader,
 	}
 
 	views->desc.dirty_mask |= 1 << slot;
-	si_update_descriptors(&views->desc);
+	si_update_descriptors(rctx, &views->desc);
 }
 
 /* BUFFER RESOURCES */
@@ -405,8 +409,6 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
 	if (shader >= SI_NUM_SHADERS)
 		return;
 
-	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE;
-
 	assert(slot < buffers->num_buffers);
 	pipe_resource_reference(&buffers->buffers[slot], NULL);
 
@@ -451,7 +453,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
 	}
 
 	buffers->desc.dirty_mask |= 1 << slot;
-	si_update_descriptors(&buffers->desc);
+	si_update_descriptors(rctx, &buffers->desc);
 }
 
 /* INIT/DEINIT */
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 5ac55f2..3c4197c 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2238,11 +2238,13 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 	if (pm4 == NULL)
 		return;
 
-	si_pm4_inval_fb_cache(pm4, state->nr_cbufs);
-	rctx->flush_and_inv_cb_meta = true;
-
-	if (state->zsbuf)
-		si_pm4_inval_zsbuf_cache(pm4);
+	if (rctx->framebuffer.nr_cbufs) {
+		rctx->b.flags |= R600_CONTEXT_FLUSH_AND_INV_CB |
+				 R600_CONTEXT_FLUSH_AND_INV_CB_META;
+	}
+	if (rctx->framebuffer.zsbuf) {
+		rctx->b.flags |= R600_CONTEXT_FLUSH_AND_INV_DB;
+	}
 
 	util_copy_framebuffer_state(&rctx->framebuffer, state);
 
@@ -2468,6 +2470,8 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
 		si_pm4_bind_state(rctx, vs, sel->current->pm4);
 	else
 		si_pm4_bind_state(rctx, vs, rctx->dummy_pixel_shader->pm4);
+
+	rctx->b.flags |= R600_CONTEXT_INV_SHADER_CACHE;
 }
 
 static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
@@ -2484,6 +2488,8 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
 		si_pm4_bind_state(rctx, ps, sel->current->pm4);
 	else
 		si_pm4_bind_state(rctx, ps, rctx->dummy_pixel_shader->pm4);
+
+	rctx->b.flags |= R600_CONTEXT_INV_SHADER_CACHE;
 }
 
 static void si_delete_shader_selector(struct pipe_context *ctx,
@@ -2826,17 +2832,14 @@ static void *si_create_sampler_state(struct pipe_context *ctx,
 
 /* XXX consider moving this function to si_descriptors.c for gcc to inline
  *     the si_set_sampler_view calls. LTO might help too. */
-static struct si_pm4_state *si_set_sampler_views(struct r600_context *rctx,
-						 unsigned shader, unsigned count,
-						 struct pipe_sampler_view **views)
+static void si_set_sampler_views(struct r600_context *rctx,
+				 unsigned shader, unsigned count,
+				 struct pipe_sampler_view **views)
 {
 	struct r600_textures_info *samplers = &rctx->samplers[shader];
 	struct si_pipe_sampler_view **rviews = (struct si_pipe_sampler_view **)views;
-	struct si_pm4_state *pm4 = si_pm4_alloc_state(rctx);
 	int i;
 
-	si_pm4_inval_texture_cache(pm4);
-
 	for (i = 0; i < count; i++) {
 		if (views[i]) {
 			struct r600_texture *rtex =
@@ -2879,27 +2882,23 @@ static struct si_pm4_state *si_set_sampler_views(struct r600_context *rctx,
 	}
 
 	samplers->n_views = count;
-	return pm4;
+	rctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE;
 }
 
 static void si_set_vs_sampler_views(struct pipe_context *ctx, unsigned count,
 				    struct pipe_sampler_view **views)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
-	struct si_pm4_state *pm4;
 
-	pm4 = si_set_sampler_views(rctx, PIPE_SHADER_VERTEX, count, views);
-	si_pm4_set_state(rctx, vs_sampler_views, pm4);
+	si_set_sampler_views(rctx, PIPE_SHADER_VERTEX, count, views);
 }
 
 static void si_set_ps_sampler_views(struct pipe_context *ctx, unsigned count,
 				    struct pipe_sampler_view **views)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
-	struct si_pm4_state *pm4;
 
-	pm4 = si_set_sampler_views(rctx, PIPE_SHADER_FRAGMENT, count, views);
-	si_pm4_set_state(rctx, ps_sampler_views, pm4);
+	si_set_sampler_views(rctx, PIPE_SHADER_FRAGMENT, count, views);
 }
 
 static struct si_pm4_state *si_bind_sampler_states(struct r600_context *rctx, unsigned count,
@@ -2915,7 +2914,7 @@ static struct si_pm4_state *si_bind_sampler_states(struct r600_context *rctx, un
 	if (!count)
 		goto out;
 
-	si_pm4_inval_texture_cache(pm4);
+	rctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE;
 
 	si_pm4_sh_data_begin(pm4);
 	for (i = 0; i < count; i++) {
@@ -3128,14 +3127,9 @@ static void si_set_polygon_stipple(struct pipe_context *ctx,
 static void si_texture_barrier(struct pipe_context *ctx)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
-	struct si_pm4_state *pm4 = si_pm4_alloc_state(rctx);
-
-	if (pm4 == NULL)
-		return;
 
-	si_pm4_inval_texture_cache(pm4);
-	si_pm4_inval_fb_cache(pm4, rctx->framebuffer.nr_cbufs);
-	si_pm4_set_state(rctx, texture_barrier, pm4);
+	rctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE |
+			 R600_CONTEXT_FLUSH_AND_INV_CB;
 }
 
 static void *si_create_blend_custom(struct r600_context *rctx, unsigned mode)
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 82fac4a..94a1521 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -28,6 +28,7 @@
 #define SI_STATE_H
 
 #include "radeonsi_pm4.h"
+#include "../radeon/r600_pipe_common.h"
 
 struct si_state_blend {
 	struct si_pm4_state	pm4;
@@ -74,8 +75,6 @@ struct si_vertex_element
 
 union si_state {
 	struct {
-		struct si_pm4_state		*sync;
-		struct si_pm4_state		*flush_and_inv_cb_meta;
 		struct si_pm4_state		*init;
 		struct si_state_blend		*blend;
 		struct si_pm4_state		*blend_color;
@@ -90,14 +89,11 @@ union si_state {
 		struct si_pm4_state		*fb_blend;
 		struct si_pm4_state		*dsa_stencil_ref;
 		struct si_pm4_state		*vs;
-		struct si_pm4_state		*vs_sampler_views;
 		struct si_pm4_state		*vs_sampler;
 		struct si_pm4_state		*ps;
-		struct si_pm4_state		*ps_sampler_views;
 		struct si_pm4_state		*ps_sampler;
 		struct si_pm4_state		*spi;
 		struct si_pm4_state		*vertex_buffers;
-		struct si_pm4_state		*texture_barrier;
 		struct si_pm4_state		*draw_info;
 		struct si_pm4_state		*draw;
 	} named;
@@ -214,6 +210,8 @@ void si_init_state_functions(struct r600_context *rctx);
 void si_init_config(struct r600_context *rctx);
 
 /* si_state_draw.c */
+extern const struct r600_atom si_atom_cache_flush;
+void si_emit_cache_flush(struct r600_common_context *rctx, struct r600_atom *atom);
 void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo);
 
 /* si_commands.c */
@@ -224,6 +222,5 @@ void si_cmd_draw_index_2(struct si_pm4_state *pm4, uint32_t max_size,
 void si_cmd_draw_index_auto(struct si_pm4_state *pm4, uint32_t count,
 			    uint32_t initiator, bool predicate);
 void si_cmd_surface_sync(struct si_pm4_state *pm4, uint32_t cp_coher_cntl);
-void si_cmd_flush_and_inv_cb_meta(struct si_pm4_state *pm4);
 
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 1e555de..1d7da1f 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -52,8 +52,6 @@ static void si_pipe_shader_vs(struct pipe_context *ctx, struct si_pipe_shader *s
 	if (pm4 == NULL)
 		return;
 
-	si_pm4_inval_shader_cache(pm4);
-
 	/* Certain attributes (position, psize, etc.) don't count as params.
 	 * VS is required to export at least one param and r600_shader_from_tgsi()
 	 * takes care of adding a dummy export.
@@ -116,6 +114,7 @@ static void si_pipe_shader_vs(struct pipe_context *ctx, struct si_pipe_shader *s
 	}
 
 	si_pm4_bind_state(rctx, vs, shader->pm4);
+	rctx->b.flags |= R600_CONTEXT_INV_SHADER_CACHE;
 }
 
 static void si_pipe_shader_ps(struct pipe_context *ctx, struct si_pipe_shader *shader)
@@ -133,8 +132,6 @@ static void si_pipe_shader_ps(struct pipe_context *ctx, struct si_pipe_shader *s
 	if (pm4 == NULL)
 		return;
 
-	si_pm4_inval_shader_cache(pm4);
-
 	db_shader_control = S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z) |
 			    S_02880C_ALPHA_TO_MASK_DISABLE(rctx->fb_cb0_is_integer);
 
@@ -244,6 +241,7 @@ static void si_pipe_shader_ps(struct pipe_context *ctx, struct si_pipe_shader *s
 	shader->cb0_is_integer = rctx->fb_cb0_is_integer;
 	shader->sprite_coord_enable = rctx->sprite_coord_enable;
 	si_pm4_bind_state(rctx, ps, shader->pm4);
+	rctx->b.flags |= R600_CONTEXT_INV_SHADER_CACHE;
 }
 
 /*
@@ -461,9 +459,8 @@ static void si_vertex_buffer_update(struct r600_context *rctx)
 	unsigned i, count;
 	uint64_t va;
 
-	si_pm4_inval_texture_cache(pm4);
+	rctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE;
 
-	/* bind vertex buffer once */
 	count = rctx->vertex_elements->count;
 	assert(count <= 256 / 4);
 
@@ -568,11 +565,74 @@ static void si_state_draw(struct r600_context *rctx,
 	si_pm4_set_state(rctx, draw, pm4);
 }
 
+void si_emit_cache_flush(struct r600_common_context *rctx, struct r600_atom *atom)
+{
+	struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
+	uint32_t cp_coher_cntl = 0;
+
+	/* XXX SI flushes both ICACHE and KCACHE if either flag is set.
+	 * XXX CIK shouldn't have this issue. Test CIK before separating the flags
+	 * XXX to ensure there is no regression. Also find out if there is another
+	 * XXX way to flush either ICACHE or KCACHE but not both for SI. */
+	if (rctx->flags & (R600_CONTEXT_INV_SHADER_CACHE |
+			   R600_CONTEXT_INV_CONST_CACHE)) {
+		cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1) |
+				 S_0085F0_SH_KCACHE_ACTION_ENA(1);
+	}
+	if (rctx->flags & (R600_CONTEXT_INV_TEX_CACHE |
+			   R600_CONTEXT_STREAMOUT_FLUSH)) {
+		cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1) |
+				 S_0085F0_TCL1_ACTION_ENA(1);
+	}
+	if (rctx->flags & R600_CONTEXT_FLUSH_AND_INV_CB) {
+		cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
+				 S_0085F0_CB0_DEST_BASE_ENA(1) |
+			         S_0085F0_CB1_DEST_BASE_ENA(1) |
+			         S_0085F0_CB2_DEST_BASE_ENA(1) |
+			         S_0085F0_CB3_DEST_BASE_ENA(1) |
+			         S_0085F0_CB4_DEST_BASE_ENA(1) |
+			         S_0085F0_CB5_DEST_BASE_ENA(1) |
+			         S_0085F0_CB6_DEST_BASE_ENA(1) |
+			         S_0085F0_CB7_DEST_BASE_ENA(1);
+	}
+	if (rctx->flags & R600_CONTEXT_FLUSH_AND_INV_DB) {
+		cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
+				 S_0085F0_DB_DEST_BASE_ENA(1);
+	}
+
+	if (cp_coher_cntl) {
+		if (rctx->chip_class >= CIK) {
+			radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0));
+			radeon_emit(cs, cp_coher_cntl);   /* CP_COHER_CNTL */
+			radeon_emit(cs, 0xffffffff);      /* CP_COHER_SIZE */
+			radeon_emit(cs, 0xff);            /* CP_COHER_SIZE_HI */
+			radeon_emit(cs, 0);               /* CP_COHER_BASE */
+			radeon_emit(cs, 0);               /* CP_COHER_BASE_HI */
+			radeon_emit(cs, 0x0000000A);      /* POLL_INTERVAL */
+		} else {
+			radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
+			radeon_emit(cs, cp_coher_cntl);   /* CP_COHER_CNTL */
+			radeon_emit(cs, 0xffffffff);      /* CP_COHER_SIZE */
+			radeon_emit(cs, 0);               /* CP_COHER_BASE */
+			radeon_emit(cs, 0x0000000A);      /* POLL_INTERVAL */
+		}
+	}
+
+	if (rctx->flags & R600_CONTEXT_FLUSH_AND_INV_CB_META) {
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
+	}
+
+	rctx->flags = 0;
+}
+
+const struct r600_atom si_atom_cache_flush = { si_emit_cache_flush, 9 }; /* number of CS dwords */
+
 void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct pipe_index_buffer ib = {};
-	uint32_t cp_coher_cntl, i;
+	uint32_t i;
 
 	if (!info->count && (info->indexed || !info->count_from_stream_output))
 		return;
@@ -605,40 +665,15 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 
 	si_state_draw(rctx, info, &ib);
 
-	/* Cache flushing via CP_COHER_CNTL. */
-	cp_coher_cntl = si_pm4_sync_flags(rctx);
-
-	if (rctx->b.flags & R600_CONTEXT_INV_CONST_CACHE) {
-		cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1) |
-				 S_0085F0_SH_KCACHE_ACTION_ENA(1);
-	}
-
-	if (cp_coher_cntl) {
-		struct si_pm4_state *pm4 = si_pm4_alloc_state(rctx);
-
-		if (pm4 == NULL)
-			return;
-
-		si_cmd_surface_sync(pm4, cp_coher_cntl);
-		si_pm4_set_state(rctx, sync, pm4);
-	}
-
-	if (rctx->flush_and_inv_cb_meta) {
-		struct si_pm4_state *pm4 = si_pm4_alloc_state(rctx);
-
-		if (pm4 == NULL)
-			return;
-
-		si_cmd_flush_and_inv_cb_meta(pm4);
-		si_pm4_set_state(rctx, flush_and_inv_cb_meta, pm4);
-		rctx->flush_and_inv_cb_meta = false;
-	}
-
-	/* Emit states. */
 	rctx->pm4_dirty_cdwords += si_pm4_dirty_dw(rctx);
 
+	/* Check flush flags. */
+	if (rctx->b.flags)
+		rctx->atoms.cache_flush->dirty = true;
+
 	si_need_cs_space(rctx, 0, TRUE);
 
+	/* Emit states. */
 	for (i = 0; i < SI_NUM_ATOMS(rctx); i++) {
 		if (rctx->atoms.array[i]->dirty) {
 			rctx->atoms.array[i]->emit(&rctx->b, rctx->atoms.array[i]);
@@ -663,8 +698,6 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	}
 #endif
 
-	rctx->flags |= R600_CONTEXT_DST_CACHES_DIRTY;
-
 	/* Set the depth buffer as dirty. */
 	if (rctx->framebuffer.zsbuf) {
 		struct pipe_surface *surf = rctx->framebuffer.zsbuf;
-- 
1.8.1.2



More information about the mesa-dev mailing list