[Mesa-dev] [PATCH 16/17] radeonsi: atomize the scratch buffer state
Nicolai Hähnle
nhaehnle at gmail.com
Mon Jan 30 14:24:00 UTC 2017
Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>
On 30.01.2017 12:54, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> The update frequency is very low.
>
> Difference: Only account for the scratch buffer size when allocating a new
> scratch buffer and when starting a new IB, and check for NULL. (v3)
> ---
> src/gallium/drivers/radeonsi/si_cp_dma.c | 3 ++-
> src/gallium/drivers/radeonsi/si_hw_context.c | 7 ++++++-
> src/gallium/drivers/radeonsi/si_pipe.h | 2 +-
> src/gallium/drivers/radeonsi/si_state.h | 1 +
> src/gallium/drivers/radeonsi/si_state_draw.c | 24 ------------------------
> src/gallium/drivers/radeonsi/si_state_shaders.c | 24 ++++++++++++++++++++++--
> 6 files changed, 32 insertions(+), 29 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
> index b398256..e198765 100644
> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
> @@ -276,21 +276,22 @@ static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size,
> * idle at this point.
> */
> if (!sctx->scratch_buffer ||
> sctx->scratch_buffer->b.b.width0 < scratch_size) {
> r600_resource_reference(&sctx->scratch_buffer, NULL);
> sctx->scratch_buffer = (struct r600_resource*)
> pipe_buffer_create(&sctx->screen->b.b, 0,
> PIPE_USAGE_DEFAULT, scratch_size);
> if (!sctx->scratch_buffer)
> return;
> - sctx->emit_scratch_reloc = true;
> +
> + si_mark_atom_dirty(sctx, &sctx->scratch_state);
> }
>
> si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b,
> &sctx->scratch_buffer->b.b, size, size, user_flags,
> is_first, &dma_flags);
>
> va = sctx->scratch_buffer->gpu_address;
> si_emit_cp_dma(sctx, va, va + CP_DMA_ALIGNMENT, size, dma_flags,
> R600_COHERENCY_SHADER);
> }
> diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
> index e5da730..c80b884 100644
> --- a/src/gallium/drivers/radeonsi/si_hw_context.c
> +++ b/src/gallium/drivers/radeonsi/si_hw_context.c
> @@ -228,34 +228,39 @@ void si_begin_new_cs(struct si_context *ctx)
> si_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom);
> si_mark_atom_dirty(ctx, &ctx->b.render_cond_atom);
> si_all_descriptors_begin_new_cs(ctx);
>
> ctx->b.scissors.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
> ctx->b.viewports.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
> ctx->b.viewports.depth_range_dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
> si_mark_atom_dirty(ctx, &ctx->b.scissors.atom);
> si_mark_atom_dirty(ctx, &ctx->b.viewports.atom);
>
> + si_mark_atom_dirty(ctx, &ctx->scratch_state);
> + if (ctx->scratch_buffer) {
> + r600_context_add_resource_size(&ctx->b.b,
> + &ctx->scratch_buffer->b.b);
> + }
> +
> r600_postflush_resume_features(&ctx->b);
>
> assert(!ctx->b.gfx.cs->prev_dw);
> ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs->current.cdw;
>
> /* Invalidate various draw states so that they are emitted before
> * the first draw call. */
> si_invalidate_draw_sh_constants(ctx);
> ctx->last_index_size = -1;
> ctx->last_primitive_restart_en = -1;
> ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN;
> ctx->last_gs_out_prim = -1;
> ctx->last_prim = -1;
> ctx->last_multi_vgt_param = -1;
> ctx->last_rast_prim = -1;
> ctx->last_sc_line_stipple = ~0;
> - ctx->emit_scratch_reloc = true;
> ctx->last_ls = NULL;
> ctx->last_tcs = NULL;
> ctx->last_tes_sh_base = -1;
> ctx->last_num_tcs_input_cp = -1;
>
> ctx->cs_shader_state.initialized = false;
> }
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index 724d89e..d17d55a 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -345,22 +345,22 @@ struct si_context {
> int last_restart_index;
> int last_gs_out_prim;
> int last_prim;
> int last_multi_vgt_param;
> int last_rast_prim;
> unsigned last_sc_line_stipple;
> int current_rast_prim; /* primitive type after TES, GS */
> bool gs_tri_strip_adj_fix;
>
> /* Scratch buffer */
> + struct r600_atom scratch_state;
> struct r600_resource *scratch_buffer;
> - bool emit_scratch_reloc;
> unsigned scratch_waves;
> unsigned spi_tmpring_size;
>
> struct r600_resource *compute_scratch_buffer;
>
> /* Emitted derived tessellation state. */
> struct si_shader *last_ls; /* local shader (VS) */
> struct si_shader_selector *last_tcs;
> int last_num_tcs_input_cp;
> int last_tes_sh_base;
> diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
> index 19880c5..6bb0673 100644
> --- a/src/gallium/drivers/radeonsi/si_state.h
> +++ b/src/gallium/drivers/radeonsi/si_state.h
> @@ -145,20 +145,21 @@ union si_state_atoms {
> struct r600_atom *sample_mask;
> struct r600_atom *cb_render_state;
> struct r600_atom *blend_color;
> struct r600_atom *clip_regs;
> struct r600_atom *clip_state;
> struct r600_atom *shader_userdata;
> struct r600_atom *scissors;
> struct r600_atom *viewports;
> struct r600_atom *stencil_ref;
> struct r600_atom *spi_map;
> + struct r600_atom *scratch_state;
> } s;
> struct r600_atom *array[0];
> };
>
> #define SI_NUM_ATOMS (sizeof(union si_state_atoms)/sizeof(struct r600_atom*))
>
> struct si_shader_data {
> struct r600_atom atom;
> uint32_t sh_base[SI_NUM_SHADERS];
> };
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index 1ce8ac8..e4a6b7f 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -452,39 +452,20 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
> G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) &&
> (info->indirect ||
> (info->instance_count > 1 &&
> si_num_prims_for_vertices(info) <= 1)))
> sctx->b.flags |= SI_CONTEXT_VGT_FLUSH;
> }
>
> return ia_multi_vgt_param;
> }
>
> -static void si_emit_scratch_reloc(struct si_context *sctx)
> -{
> - struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
> -
> - if (!sctx->emit_scratch_reloc)
> - return;
> -
> - radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE,
> - sctx->spi_tmpring_size);
> -
> - if (sctx->scratch_buffer) {
> - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
> - sctx->scratch_buffer, RADEON_USAGE_READWRITE,
> - RADEON_PRIO_SCRATCH_BUFFER);
> -
> - }
> - sctx->emit_scratch_reloc = false;
> -}
> -
> /* rast_prim is the primitive type after GS. */
> static void si_emit_rasterizer_prim_state(struct si_context *sctx)
> {
> struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
> unsigned rast_prim = sctx->current_rast_prim;
> struct si_state_rasterizer *rs = sctx->emitted.named.rasterizer;
>
> /* Skip this if not rendering lines. */
> if (rast_prim != PIPE_PRIM_LINES &&
> rast_prim != PIPE_PRIM_LINE_LOOP &&
> @@ -1120,22 +1101,20 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
> r600_resource(info->indirect)->TC_L2_dirty = false;
> }
>
> if (info->indirect_params &&
> r600_resource(info->indirect_params)->TC_L2_dirty) {
> sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
> r600_resource(info->indirect_params)->TC_L2_dirty = false;
> }
>
> /* Add buffer sizes for memory checking in need_cs_space. */
> - if (sctx->emit_scratch_reloc && sctx->scratch_buffer)
> - r600_context_add_resource_size(ctx, &sctx->scratch_buffer->b.b);
> if (info->indirect)
> r600_context_add_resource_size(ctx, info->indirect);
>
> si_need_cs_space(sctx);
>
> /* Since we've called r600_context_add_resource_size for vertex buffers,
> * this must be called after si_need_cs_space, because we must let
> * need_cs_space flush before we add buffers to the buffer list.
> */
> if (!si_upload_vertex_buffer_descriptors(sctx))
> @@ -1161,28 +1140,25 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
> struct si_pm4_state *state = sctx->queued.array[i];
>
> if (!state || sctx->emitted.array[i] == state)
> continue;
>
> si_pm4_emit(sctx, state);
> sctx->emitted.array[i] = state;
> }
> sctx->dirty_states = 0;
>
> - si_emit_scratch_reloc(sctx);
> si_emit_rasterizer_prim_state(sctx);
> si_emit_draw_registers(sctx, info);
>
> si_ce_pre_draw_synchronization(sctx);
> -
> si_emit_draw_packets(sctx, info, &ib);
> -
> si_ce_post_draw_synchronization(sctx);
>
> if (sctx->trace_buf)
> si_trace_emit(sctx);
>
> /* Workaround for a VGT hang when streamout is enabled.
> * It must be done after drawing. */
> if ((sctx->b.family == CHIP_HAWAII ||
> sctx->b.family == CHIP_TONGA ||
> sctx->b.family == CHIP_FIJI) &&
> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
> index ebd2435..bde02f5 100644
> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
> @@ -2193,21 +2193,24 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx)
> if (scratch_needed_size > 0) {
> if (scratch_needed_size > current_scratch_buffer_size) {
> /* Create a bigger scratch buffer */
> r600_resource_reference(&sctx->scratch_buffer, NULL);
>
> sctx->scratch_buffer = (struct r600_resource*)
> pipe_buffer_create(&sctx->screen->b.b, 0,
> PIPE_USAGE_DEFAULT, scratch_needed_size);
> if (!sctx->scratch_buffer)
> return false;
> - sctx->emit_scratch_reloc = true;
> +
> + si_mark_atom_dirty(sctx, &sctx->scratch_state);
> + r600_context_add_resource_size(&sctx->b.b,
> + &sctx->scratch_buffer->b.b);
> }
>
> /* Update the shaders, so they are using the latest scratch. The
> * scratch buffer may have been changed since these shaders were
> * last used, so we still need to try to update them, even if
> * they require scratch buffers smaller than the current size.
> */
> r = si_update_scratch_buffer(sctx, sctx->ps_shader.current);
> if (r < 0)
> return false;
> @@ -2252,21 +2255,21 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx)
> }
>
> /* The LLVM shader backend should be reporting aligned scratch_sizes. */
> assert((scratch_needed_size & ~0x3FF) == scratch_needed_size &&
> "scratch size should already be aligned correctly.");
>
> spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) |
> S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10);
> if (spi_tmpring_size != sctx->spi_tmpring_size) {
> sctx->spi_tmpring_size = spi_tmpring_size;
> - sctx->emit_scratch_reloc = true;
> + si_mark_atom_dirty(sctx, &sctx->scratch_state);
> }
> return true;
> }
>
> static void si_init_tess_factor_ring(struct si_context *sctx)
> {
> bool double_offchip_buffers = sctx->b.chip_class >= CIK;
> unsigned max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64;
> unsigned max_offchip_buffers = max_offchip_buffers_per_se *
> sctx->screen->b.info.max_se;
> @@ -2581,23 +2584,40 @@ bool si_update_shaders(struct si_context *sctx)
> return false;
> }
>
> if (sctx->b.chip_class >= CIK)
> si_mark_atom_dirty(sctx, &sctx->prefetch_L2);
>
> sctx->do_update_shaders = false;
> return true;
> }
>
> +static void si_emit_scratch_state(struct si_context *sctx,
> + struct r600_atom *atom)
> +{
> + struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
> +
> + radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE,
> + sctx->spi_tmpring_size);
> +
> + if (sctx->scratch_buffer) {
> + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
> + sctx->scratch_buffer, RADEON_USAGE_READWRITE,
> + RADEON_PRIO_SCRATCH_BUFFER);
> + }
> +}
> +
> void si_init_shader_functions(struct si_context *sctx)
> {
> si_init_atom(sctx, &sctx->spi_map, &sctx->atoms.s.spi_map, si_emit_spi_map);
> + si_init_atom(sctx, &sctx->scratch_state, &sctx->atoms.s.scratch_state,
> + si_emit_scratch_state);
>
> sctx->b.b.create_vs_state = si_create_shader_selector;
> sctx->b.b.create_tcs_state = si_create_shader_selector;
> sctx->b.b.create_tes_state = si_create_shader_selector;
> sctx->b.b.create_gs_state = si_create_shader_selector;
> sctx->b.b.create_fs_state = si_create_shader_selector;
>
> sctx->b.b.bind_vs_state = si_bind_vs_shader;
> sctx->b.b.bind_tcs_state = si_bind_tcs_shader;
> sctx->b.b.bind_tes_state = si_bind_tes_shader;
>
More information about the mesa-dev mailing list