[Mesa-dev] [PATCH 16/17] radeonsi: atomize the scratch buffer state

Mon Jan 30 14:24:00 UTC 2017

Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>

On 30.01.2017 12:54, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> The update frequency is very low.
>
> Difference: Only account for the scratch buffer size when allocating a new
>             scratch buffer and when starting a new IB, and check for NULL. (v3)
> ---
>  src/gallium/drivers/radeonsi/si_cp_dma.c        |  3 ++-
>  src/gallium/drivers/radeonsi/si_hw_context.c    |  7 ++++++-
>  src/gallium/drivers/radeonsi/si_pipe.h          |  2 +-
>  src/gallium/drivers/radeonsi/si_state.h         |  1 +
>  src/gallium/drivers/radeonsi/si_state_draw.c    | 24 ------------------------
>  src/gallium/drivers/radeonsi/si_state_shaders.c | 24 ++++++++++++++++++++++--
>  6 files changed, 32 insertions(+), 29 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
> index b398256..e198765 100644
> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
> @@ -276,21 +276,22 @@ static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size,
>  	 * idle at this point.
>  	 */
>  	if (!sctx->scratch_buffer ||
>  	    sctx->scratch_buffer->b.b.width0 < scratch_size) {
>  		r600_resource_reference(&sctx->scratch_buffer, NULL);
>  		sctx->scratch_buffer = (struct r600_resource*)
>  			pipe_buffer_create(&sctx->screen->b.b, 0,
>  					   PIPE_USAGE_DEFAULT, scratch_size);
>  		if (!sctx->scratch_buffer)
>  			return;
> -		sctx->emit_scratch_reloc = true;
> +
> +		si_mark_atom_dirty(sctx, &sctx->scratch_state);
>  	}
>
>  	si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b,
>  			  &sctx->scratch_buffer->b.b, size, size, user_flags,
>  			  is_first, &dma_flags);
>
>  	va = sctx->scratch_buffer->gpu_address;
>  	si_emit_cp_dma(sctx, va, va + CP_DMA_ALIGNMENT, size, dma_flags,
>  		       R600_COHERENCY_SHADER);
>  }
> diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
> index e5da730..c80b884 100644
> --- a/src/gallium/drivers/radeonsi/si_hw_context.c
> +++ b/src/gallium/drivers/radeonsi/si_hw_context.c
> @@ -228,34 +228,39 @@ void si_begin_new_cs(struct si_context *ctx)
>  	si_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom);
>  	si_mark_atom_dirty(ctx, &ctx->b.render_cond_atom);
>  	si_all_descriptors_begin_new_cs(ctx);
>
>  	ctx->b.scissors.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
>  	ctx->b.viewports.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
>  	ctx->b.viewports.depth_range_dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
>  	si_mark_atom_dirty(ctx, &ctx->b.scissors.atom);
>  	si_mark_atom_dirty(ctx, &ctx->b.viewports.atom);
>
> +	si_mark_atom_dirty(ctx, &ctx->scratch_state);
> +	if (ctx->scratch_buffer) {
> +		r600_context_add_resource_size(&ctx->b.b,
> +					       &ctx->scratch_buffer->b.b);
> +	}
> +
>  	r600_postflush_resume_features(&ctx->b);
>
>  	assert(!ctx->b.gfx.cs->prev_dw);
>  	ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs->current.cdw;
>
>  	/* Invalidate various draw states so that they are emitted before
>  	 * the first draw call. */
>  	si_invalidate_draw_sh_constants(ctx);
>  	ctx->last_index_size = -1;
>  	ctx->last_primitive_restart_en = -1;
>  	ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN;
>  	ctx->last_gs_out_prim = -1;
>  	ctx->last_prim = -1;
>  	ctx->last_multi_vgt_param = -1;
>  	ctx->last_rast_prim = -1;
>  	ctx->last_sc_line_stipple = ~0;
> -	ctx->emit_scratch_reloc = true;
>  	ctx->last_ls = NULL;
>  	ctx->last_tcs = NULL;
>  	ctx->last_tes_sh_base = -1;
>  	ctx->last_num_tcs_input_cp = -1;
>
>  	ctx->cs_shader_state.initialized = false;
>  }
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index 724d89e..d17d55a 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -345,22 +345,22 @@ struct si_context {
>  	int			last_restart_index;
>  	int			last_gs_out_prim;
>  	int			last_prim;
>  	int			last_multi_vgt_param;
>  	int			last_rast_prim;
>  	unsigned		last_sc_line_stipple;
>  	int			current_rast_prim; /* primitive type after TES, GS */
>  	bool			gs_tri_strip_adj_fix;
>
>  	/* Scratch buffer */
> +	struct r600_atom	scratch_state;
>  	struct r600_resource	*scratch_buffer;
> -	bool			emit_scratch_reloc;
>  	unsigned		scratch_waves;
>  	unsigned		spi_tmpring_size;
>
>  	struct r600_resource	*compute_scratch_buffer;
>
>  	/* Emitted derived tessellation state. */
>  	struct si_shader	*last_ls; /* local shader (VS) */
>  	struct si_shader_selector *last_tcs;
>  	int			last_num_tcs_input_cp;
>  	int			last_tes_sh_base;
> diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
> index 19880c5..6bb0673 100644
> --- a/src/gallium/drivers/radeonsi/si_state.h
> +++ b/src/gallium/drivers/radeonsi/si_state.h
> @@ -145,20 +145,21 @@ union si_state_atoms {
>  		struct r600_atom *sample_mask;
>  		struct r600_atom *cb_render_state;
>  		struct r600_atom *blend_color;
>  		struct r600_atom *clip_regs;
>  		struct r600_atom *clip_state;
>  		struct r600_atom *shader_userdata;
>  		struct r600_atom *scissors;
>  		struct r600_atom *viewports;
>  		struct r600_atom *stencil_ref;
>  		struct r600_atom *spi_map;
> +		struct r600_atom *scratch_state;
>  	} s;
>  	struct r600_atom *array[0];
>  };
>
>  #define SI_NUM_ATOMS (sizeof(union si_state_atoms)/sizeof(struct r600_atom*))
>
>  struct si_shader_data {
>  	struct r600_atom	atom;
>  	uint32_t		sh_base[SI_NUM_SHADERS];
>  };
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index 1ce8ac8..e4a6b7f 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -452,39 +452,20 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
>  		    G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) &&
>  		    (info->indirect ||
>  		     (info->instance_count > 1 &&
>  		      si_num_prims_for_vertices(info) <= 1)))
>  			sctx->b.flags |= SI_CONTEXT_VGT_FLUSH;
>  	}
>
>  	return ia_multi_vgt_param;
>  }
>
> -static void si_emit_scratch_reloc(struct si_context *sctx)
> -{
> -	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
> -
> -	if (!sctx->emit_scratch_reloc)
> -		return;
> -
> -	radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE,
> -			       sctx->spi_tmpring_size);
> -
> -	if (sctx->scratch_buffer) {
> -		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
> -				      sctx->scratch_buffer, RADEON_USAGE_READWRITE,
> -				      RADEON_PRIO_SCRATCH_BUFFER);
> -
> -	}
> -	sctx->emit_scratch_reloc = false;
> -}
> -
>  /* rast_prim is the primitive type after GS. */
>  static void si_emit_rasterizer_prim_state(struct si_context *sctx)
>  {
>  	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>  	unsigned rast_prim = sctx->current_rast_prim;
>  	struct si_state_rasterizer *rs = sctx->emitted.named.rasterizer;
>
>  	/* Skip this if not rendering lines. */
>  	if (rast_prim != PIPE_PRIM_LINES &&
>  	    rast_prim != PIPE_PRIM_LINE_LOOP &&
> @@ -1120,22 +1101,20 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>  		r600_resource(info->indirect)->TC_L2_dirty = false;
>  	}
>
>  	if (info->indirect_params &&
>  	    r600_resource(info->indirect_params)->TC_L2_dirty) {
>  		sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
>  		r600_resource(info->indirect_params)->TC_L2_dirty = false;
>  	}
>
>  	/* Add buffer sizes for memory checking in need_cs_space. */
> -	if (sctx->emit_scratch_reloc && sctx->scratch_buffer)
> -		r600_context_add_resource_size(ctx, &sctx->scratch_buffer->b.b);
>  	if (info->indirect)
>  		r600_context_add_resource_size(ctx, info->indirect);
>
>  	si_need_cs_space(sctx);
>
>  	/* Since we've called r600_context_add_resource_size for vertex buffers,
>  	 * this must be called after si_need_cs_space, because we must let
>  	 * need_cs_space flush before we add buffers to the buffer list.
>  	 */
>  	if (!si_upload_vertex_buffer_descriptors(sctx))
> @@ -1161,28 +1140,25 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>  		struct si_pm4_state *state = sctx->queued.array[i];
>
>  		if (!state || sctx->emitted.array[i] == state)
>  			continue;
>
>  		si_pm4_emit(sctx, state);
>  		sctx->emitted.array[i] = state;
>  	}
>  	sctx->dirty_states = 0;
>
> -	si_emit_scratch_reloc(sctx);
>  	si_emit_rasterizer_prim_state(sctx);
>  	si_emit_draw_registers(sctx, info);
>
>  	si_ce_pre_draw_synchronization(sctx);
> -
>  	si_emit_draw_packets(sctx, info, &ib);
> -
>  	si_ce_post_draw_synchronization(sctx);
>
>  	if (sctx->trace_buf)
>  		si_trace_emit(sctx);
>
>  	/* Workaround for a VGT hang when streamout is enabled.
>  	 * It must be done after drawing. */
>  	if ((sctx->b.family == CHIP_HAWAII ||
>  	     sctx->b.family == CHIP_TONGA ||
>  	     sctx->b.family == CHIP_FIJI) &&
> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
> index ebd2435..bde02f5 100644
> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
> @@ -2193,21 +2193,24 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx)
>  	if (scratch_needed_size > 0) {
>  		if (scratch_needed_size > current_scratch_buffer_size) {
>  			/* Create a bigger scratch buffer */
>  			r600_resource_reference(&sctx->scratch_buffer, NULL);
>
>  			sctx->scratch_buffer = (struct r600_resource*)
>  					pipe_buffer_create(&sctx->screen->b.b, 0,
>  	                                PIPE_USAGE_DEFAULT, scratch_needed_size);
>  			if (!sctx->scratch_buffer)
>  				return false;
> -			sctx->emit_scratch_reloc = true;
> +
> +			si_mark_atom_dirty(sctx, &sctx->scratch_state);
> +			r600_context_add_resource_size(&sctx->b.b,
> +						       &sctx->scratch_buffer->b.b);
>  		}
>
>  		/* Update the shaders, so they are using the latest scratch.  The
>  		 * scratch buffer may have been changed since these shaders were
>  		 * last used, so we still need to try to update them, even if
>  		 * they require scratch buffers smaller than the current size.
>  		 */
>  		r = si_update_scratch_buffer(sctx, sctx->ps_shader.current);
>  		if (r < 0)
>  			return false;
> @@ -2252,21 +2255,21 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx)
>  	}
>
>  	/* The LLVM shader backend should be reporting aligned scratch_sizes. */
>  	assert((scratch_needed_size & ~0x3FF) == scratch_needed_size &&
>  		"scratch size should already be aligned correctly.");
>
>  	spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) |
>  			   S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10);
>  	if (spi_tmpring_size != sctx->spi_tmpring_size) {
>  		sctx->spi_tmpring_size = spi_tmpring_size;
> -		sctx->emit_scratch_reloc = true;
> +		si_mark_atom_dirty(sctx, &sctx->scratch_state);
>  	}
>  	return true;
>  }
>
>  static void si_init_tess_factor_ring(struct si_context *sctx)
>  {
>  	bool double_offchip_buffers = sctx->b.chip_class >= CIK;
>  	unsigned max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64;
>  	unsigned max_offchip_buffers = max_offchip_buffers_per_se *
>  				       sctx->screen->b.info.max_se;
> @@ -2581,23 +2584,40 @@ bool si_update_shaders(struct si_context *sctx)
>  			return false;
>  	}
>
>  	if (sctx->b.chip_class >= CIK)
>  		si_mark_atom_dirty(sctx, &sctx->prefetch_L2);
>
>  	sctx->do_update_shaders = false;
>  	return true;
>  }
>
> +static void si_emit_scratch_state(struct si_context *sctx,
> +				  struct r600_atom *atom)
> +{
> +	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
> +
> +	radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE,
> +			       sctx->spi_tmpring_size);
> +
> +	if (sctx->scratch_buffer) {
> +		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
> +				      sctx->scratch_buffer, RADEON_USAGE_READWRITE,
> +				      RADEON_PRIO_SCRATCH_BUFFER);
> +	}
> +}
> +
>  void si_init_shader_functions(struct si_context *sctx)
>  {
>  	si_init_atom(sctx, &sctx->spi_map, &sctx->atoms.s.spi_map, si_emit_spi_map);
> +	si_init_atom(sctx, &sctx->scratch_state, &sctx->atoms.s.scratch_state,
> +		     si_emit_scratch_state);
>
>  	sctx->b.b.create_vs_state = si_create_shader_selector;
>  	sctx->b.b.create_tcs_state = si_create_shader_selector;
>  	sctx->b.b.create_tes_state = si_create_shader_selector;
>  	sctx->b.b.create_gs_state = si_create_shader_selector;
>  	sctx->b.b.create_fs_state = si_create_shader_selector;
>
>  	sctx->b.b.bind_vs_state = si_bind_vs_shader;
>  	sctx->b.b.bind_tcs_state = si_bind_tcs_shader;
>  	sctx->b.b.bind_tes_state = si_bind_tes_shader;
>