[Mesa-dev] [PATCH 16/17] radeonsi: atomize the scratch buffer state

Mon Jan 30 11:51:09 UTC 2017

From: Marek Olšák <marek.olsak at amd.com>

The update frequency is very low.

Difference: Only account for the scratch buffer size when allocating a new
            buffer and when starting a new IB. (v2)
---
 src/gallium/drivers/radeonsi/si_cp_dma.c        |  3 ++-
 src/gallium/drivers/radeonsi/si_hw_context.c    |  5 ++++-
 src/gallium/drivers/radeonsi/si_pipe.h          |  2 +-
 src/gallium/drivers/radeonsi/si_state.h         |  1 +
 src/gallium/drivers/radeonsi/si_state_draw.c    | 24 ------------------------
 src/gallium/drivers/radeonsi/si_state_shaders.c | 24 ++++++++++++++++++++++--
 6 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index b398256..e198765 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -276,21 +276,22 @@ static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size,
 	 * idle at this point.
 	 */
 	if (!sctx->scratch_buffer ||
 	    sctx->scratch_buffer->b.b.width0 < scratch_size) {
 		r600_resource_reference(&sctx->scratch_buffer, NULL);
 		sctx->scratch_buffer = (struct r600_resource*)
 			pipe_buffer_create(&sctx->screen->b.b, 0,
 					   PIPE_USAGE_DEFAULT, scratch_size);
 		if (!sctx->scratch_buffer)
 			return;
-		sctx->emit_scratch_reloc = true;
+
+		si_mark_atom_dirty(sctx, &sctx->scratch_state);
 	}
 
 	si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b,
 			  &sctx->scratch_buffer->b.b, size, size, user_flags,
 			  is_first, &dma_flags);
 
 	va = sctx->scratch_buffer->gpu_address;
 	si_emit_cp_dma(sctx, va, va + CP_DMA_ALIGNMENT, size, dma_flags,
 		       R600_COHERENCY_SHADER);
 }
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index e5da730..c504ac5 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -228,34 +228,37 @@ void si_begin_new_cs(struct si_context *ctx)
 	si_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom);
 	si_mark_atom_dirty(ctx, &ctx->b.render_cond_atom);
 	si_all_descriptors_begin_new_cs(ctx);
 
 	ctx->b.scissors.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
 	ctx->b.viewports.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
 	ctx->b.viewports.depth_range_dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
 	si_mark_atom_dirty(ctx, &ctx->b.scissors.atom);
 	si_mark_atom_dirty(ctx, &ctx->b.viewports.atom);
 
+	si_mark_atom_dirty(ctx, &ctx->scratch_state);
+	if (ctx->scratch_buffer) /* may still be NULL; don't form a member address from it */
+		r600_context_add_resource_size(&ctx->b.b,
+					       &ctx->scratch_buffer->b.b);
 	r600_postflush_resume_features(&ctx->b);
 
 	assert(!ctx->b.gfx.cs->prev_dw);
 	ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs->current.cdw;
 
 	/* Invalidate various draw states so that they are emitted before
 	 * the first draw call. */
 	si_invalidate_draw_sh_constants(ctx);
 	ctx->last_index_size = -1;
 	ctx->last_primitive_restart_en = -1;
 	ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN;
 	ctx->last_gs_out_prim = -1;
 	ctx->last_prim = -1;
 	ctx->last_multi_vgt_param = -1;
 	ctx->last_rast_prim = -1;
 	ctx->last_sc_line_stipple = ~0;
-	ctx->emit_scratch_reloc = true;
 	ctx->last_ls = NULL;
 	ctx->last_tcs = NULL;
 	ctx->last_tes_sh_base = -1;
 	ctx->last_num_tcs_input_cp = -1;
 
 	ctx->cs_shader_state.initialized = false;
 }
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 724d89e..d17d55a 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -345,22 +345,22 @@ struct si_context {
 	int			last_restart_index;
 	int			last_gs_out_prim;
 	int			last_prim;
 	int			last_multi_vgt_param;
 	int			last_rast_prim;
 	unsigned		last_sc_line_stipple;
 	int			current_rast_prim; /* primitive type after TES, GS */
 	bool			gs_tri_strip_adj_fix;
 
 	/* Scratch buffer */
+	struct r600_atom	scratch_state;
 	struct r600_resource	*scratch_buffer;
-	bool			emit_scratch_reloc;
 	unsigned		scratch_waves;
 	unsigned		spi_tmpring_size;
 
 	struct r600_resource	*compute_scratch_buffer;
 
 	/* Emitted derived tessellation state. */
 	struct si_shader	*last_ls; /* local shader (VS) */
 	struct si_shader_selector *last_tcs;
 	int			last_num_tcs_input_cp;
 	int			last_tes_sh_base;
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 19880c5..6bb0673 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -145,20 +145,21 @@ union si_state_atoms {
 		struct r600_atom *sample_mask;
 		struct r600_atom *cb_render_state;
 		struct r600_atom *blend_color;
 		struct r600_atom *clip_regs;
 		struct r600_atom *clip_state;
 		struct r600_atom *shader_userdata;
 		struct r600_atom *scissors;
 		struct r600_atom *viewports;
 		struct r600_atom *stencil_ref;
 		struct r600_atom *spi_map;
+		struct r600_atom *scratch_state;
 	} s;
 	struct r600_atom *array[0];
 };
 
 #define SI_NUM_ATOMS (sizeof(union si_state_atoms)/sizeof(struct r600_atom*))
 
 struct si_shader_data {
 	struct r600_atom	atom;
 	uint32_t		sh_base[SI_NUM_SHADERS];
 };
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 1ce8ac8..e4a6b7f 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -452,39 +452,20 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 		    G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) &&
 		    (info->indirect ||
 		     (info->instance_count > 1 &&
 		      si_num_prims_for_vertices(info) <= 1)))
 			sctx->b.flags |= SI_CONTEXT_VGT_FLUSH;
 	}
 
 	return ia_multi_vgt_param;
 }
 
-static void si_emit_scratch_reloc(struct si_context *sctx)
-{
-	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
-
-	if (!sctx->emit_scratch_reloc)
-		return;
-
-	radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE,
-			       sctx->spi_tmpring_size);
-
-	if (sctx->scratch_buffer) {
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
-				      sctx->scratch_buffer, RADEON_USAGE_READWRITE,
-				      RADEON_PRIO_SCRATCH_BUFFER);
-
-	}
-	sctx->emit_scratch_reloc = false;
-}
-
 /* rast_prim is the primitive type after GS. */
 static void si_emit_rasterizer_prim_state(struct si_context *sctx)
 {
 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	unsigned rast_prim = sctx->current_rast_prim;
 	struct si_state_rasterizer *rs = sctx->emitted.named.rasterizer;
 
 	/* Skip this if not rendering lines. */
 	if (rast_prim != PIPE_PRIM_LINES &&
 	    rast_prim != PIPE_PRIM_LINE_LOOP &&
@@ -1120,22 +1101,20 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 		r600_resource(info->indirect)->TC_L2_dirty = false;
 	}
 
 	if (info->indirect_params &&
 	    r600_resource(info->indirect_params)->TC_L2_dirty) {
 		sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
 		r600_resource(info->indirect_params)->TC_L2_dirty = false;
 	}
 
 	/* Add buffer sizes for memory checking in need_cs_space. */
-	if (sctx->emit_scratch_reloc && sctx->scratch_buffer)
-		r600_context_add_resource_size(ctx, &sctx->scratch_buffer->b.b);
 	if (info->indirect)
 		r600_context_add_resource_size(ctx, info->indirect);
 
 	si_need_cs_space(sctx);
 
 	/* Since we've called r600_context_add_resource_size for vertex buffers,
 	 * this must be called after si_need_cs_space, because we must let
 	 * need_cs_space flush before we add buffers to the buffer list.
 	 */
 	if (!si_upload_vertex_buffer_descriptors(sctx))
@@ -1161,28 +1140,25 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 		struct si_pm4_state *state = sctx->queued.array[i];
 
 		if (!state || sctx->emitted.array[i] == state)
 			continue;
 
 		si_pm4_emit(sctx, state);
 		sctx->emitted.array[i] = state;
 	}
 	sctx->dirty_states = 0;
 
-	si_emit_scratch_reloc(sctx);
 	si_emit_rasterizer_prim_state(sctx);
 	si_emit_draw_registers(sctx, info);
 
 	si_ce_pre_draw_synchronization(sctx);
-
 	si_emit_draw_packets(sctx, info, &ib);
-
 	si_ce_post_draw_synchronization(sctx);
 
 	if (sctx->trace_buf)
 		si_trace_emit(sctx);
 
 	/* Workaround for a VGT hang when streamout is enabled.
 	 * It must be done after drawing. */
 	if ((sctx->b.family == CHIP_HAWAII ||
 	     sctx->b.family == CHIP_TONGA ||
 	     sctx->b.family == CHIP_FIJI) &&
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index ebd2435..bde02f5 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -2193,21 +2193,24 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx)
 	if (scratch_needed_size > 0) {
 		if (scratch_needed_size > current_scratch_buffer_size) {
 			/* Create a bigger scratch buffer */
 			r600_resource_reference(&sctx->scratch_buffer, NULL);
 
 			sctx->scratch_buffer = (struct r600_resource*)
 					pipe_buffer_create(&sctx->screen->b.b, 0,
 	                                PIPE_USAGE_DEFAULT, scratch_needed_size);
 			if (!sctx->scratch_buffer)
 				return false;
-			sctx->emit_scratch_reloc = true;
+
+			si_mark_atom_dirty(sctx, &sctx->scratch_state);
+			r600_context_add_resource_size(&sctx->b.b,
+						       &sctx->scratch_buffer->b.b);
 		}
 
 		/* Update the shaders, so they are using the latest scratch.  The
 		 * scratch buffer may have been changed since these shaders were
 		 * last used, so we still need to try to update them, even if
 		 * they require scratch buffers smaller than the current size.
 		 */
 		r = si_update_scratch_buffer(sctx, sctx->ps_shader.current);
 		if (r < 0)
 			return false;
@@ -2252,21 +2255,21 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx)
 	}
 
 	/* The LLVM shader backend should be reporting aligned scratch_sizes. */
 	assert((scratch_needed_size & ~0x3FF) == scratch_needed_size &&
 		"scratch size should already be aligned correctly.");
 
 	spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) |
 			   S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10);
 	if (spi_tmpring_size != sctx->spi_tmpring_size) {
 		sctx->spi_tmpring_size = spi_tmpring_size;
-		sctx->emit_scratch_reloc = true;
+		si_mark_atom_dirty(sctx, &sctx->scratch_state);
 	}
 	return true;
 }
 
 static void si_init_tess_factor_ring(struct si_context *sctx)
 {
 	bool double_offchip_buffers = sctx->b.chip_class >= CIK;
 	unsigned max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64;
 	unsigned max_offchip_buffers = max_offchip_buffers_per_se *
 				       sctx->screen->b.info.max_se;
@@ -2581,23 +2584,40 @@ bool si_update_shaders(struct si_context *sctx)
 			return false;
 	}
 
 	if (sctx->b.chip_class >= CIK)
 		si_mark_atom_dirty(sctx, &sctx->prefetch_L2);
 
 	sctx->do_update_shaders = false;
 	return true;
 }
 
+static void si_emit_scratch_state(struct si_context *sctx,
+				  struct r600_atom *atom)
+{
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+	/* Emit callback of the scratch_state atom ("atom" itself is unused): write SPI_TMPRING_SIZE. */
+	radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE,
+			       sctx->spi_tmpring_size);
+	/* scratch_buffer may be NULL; add it to the gfx buffer list only when it exists. */
+	if (sctx->scratch_buffer) {
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+				      sctx->scratch_buffer, RADEON_USAGE_READWRITE,
+				      RADEON_PRIO_SCRATCH_BUFFER);
+	}
+}
+
 void si_init_shader_functions(struct si_context *sctx)
 {
 	si_init_atom(sctx, &sctx->spi_map, &sctx->atoms.s.spi_map, si_emit_spi_map);
+	si_init_atom(sctx, &sctx->scratch_state, &sctx->atoms.s.scratch_state,
+		     si_emit_scratch_state);
 
 	sctx->b.b.create_vs_state = si_create_shader_selector;
 	sctx->b.b.create_tcs_state = si_create_shader_selector;
 	sctx->b.b.create_tes_state = si_create_shader_selector;
 	sctx->b.b.create_gs_state = si_create_shader_selector;
 	sctx->b.b.create_fs_state = si_create_shader_selector;
 
 	sctx->b.b.bind_vs_state = si_bind_vs_shader;
 	sctx->b.b.bind_tcs_state = si_bind_tcs_shader;
 	sctx->b.b.bind_tes_state = si_bind_tes_shader;
-- 
2.7.4