[Mesa-dev] [PATCH 8/8] radeonsi: don't update dependent states if it has no effect
Nicolai Hähnle
nhaehnle at gmail.com
Wed Jun 7 12:34:49 UTC 2017
On 06.06.2017 21:16, Samuel Pitoiset wrote:
> I really like the idea. :-)
Seconded!
> Though, I have two general comments:
>
> 1) I think it would be better to introduce some sort of compare helper
> functions for the different state changes. Also, for correctness it
> might be safer to do the opposite checks (if someone introduce a new
> field and forget to update the relevant part).
What do you mean by opposite checks, and how would it help?
I've also been occasionally thinking about how we track the various
dirtiness bits. It would certainly be nice if we could have some way of
getting more confidence that no dependencies are missing, to make
re-juggling these kinds of optimizations easier. The test coverage in
typical test suites for these things is probably quite bad.
It would be cool if we could either generate some of the dependency
tracking automatically, or at least have a static verifier.
Anyway, it's still definitely worth making the changes of this patch.
> 2) I would suggest to introduce uses_instance_divisors in a separate patch.
Yeah, it wouldn't hurt to split this up a bit.
Two more comments below.
>
> Btw, I'm not going to review patch 4 because I'm not familiar enough
> with this part.
>
> On 06/05/2017 06:51 PM, Marek Olšák wrote:
>> From: Marek Olšák <marek.olsak at amd.com>
>>
>> This and the previous commit decrease IB sizes and the number of
>> si_update_shaders invocations as follows:
>>
>> IB size si_update_shader calls
>> Borderlands 2 -10% -27%
>> Deus Ex: MD -5% -11%
>> Talos Principle -8% -30%
>> ---
>> src/gallium/drivers/radeonsi/si_state.c | 68
>> ++++++++++++++++++++++---
>> src/gallium/drivers/radeonsi/si_state.h | 1 +
>> src/gallium/drivers/radeonsi/si_state_shaders.c | 24 +++++++--
>> 3 files changed, 80 insertions(+), 13 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_state.c
>> b/src/gallium/drivers/radeonsi/si_state.c
>> index 5e5f564..323ec5e 100644
>> --- a/src/gallium/drivers/radeonsi/si_state.c
>> +++ b/src/gallium/drivers/radeonsi/si_state.c
>> @@ -596,23 +596,41 @@ static void *si_create_blend_state_mode(struct
>> pipe_context *ctx,
>> static void *si_create_blend_state(struct pipe_context *ctx,
>> const struct pipe_blend_state *state)
>> {
>> return si_create_blend_state_mode(ctx, state, V_028808_CB_NORMAL);
>> }
>> static void si_bind_blend_state(struct pipe_context *ctx, void *state)
>> {
>> struct si_context *sctx = (struct si_context *)ctx;
>> - si_pm4_bind_state(sctx, blend, (struct si_state_blend *)state);
>> - si_mark_atom_dirty(sctx, &sctx->cb_render_state);
>> - sctx->do_update_shaders = true;
>> + struct si_state_blend *old_blend = sctx->queued.named.blend;
>> + struct si_state_blend *blend = (struct si_state_blend *)state;
>> +
>> + if (!state)
>> + return;
Does this ever happen? If it does, shouldn't we still bind the NULL state?
>> +
>> + if (!old_blend ||
>> + old_blend->cb_target_mask != blend->cb_target_mask ||
>> + old_blend->dual_src_blend != blend->dual_src_blend)
>> + si_mark_atom_dirty(sctx, &sctx->cb_render_state);
>> +
>> + si_pm4_bind_state(sctx, blend, state);
>> +
>> + if (!old_blend ||
>> + old_blend->cb_target_mask != blend->cb_target_mask ||
>> + old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
>> + old_blend->alpha_to_one != blend->alpha_to_one ||
>> + old_blend->dual_src_blend != blend->dual_src_blend ||
>> + old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
>> + old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit)
>> + sctx->do_update_shaders = true;
>> }
>> static void si_delete_blend_state(struct pipe_context *ctx, void
>> *state)
>> {
>> struct si_context *sctx = (struct si_context *)ctx;
>> si_pm4_delete_state(sctx, blend, (struct si_state_blend *)state);
>> }
>> static void si_set_blend_color(struct pipe_context *ctx,
>> const struct pipe_blend_color *state)
>> @@ -914,24 +932,41 @@ static void si_bind_rs_state(struct pipe_context
>> *ctx, void *state)
>> }
>> sctx->current_vs_state &= C_VS_STATE_CLAMP_VERTEX_COLOR;
>> sctx->current_vs_state |=
>> S_VS_STATE_CLAMP_VERTEX_COLOR(rs->clamp_vertex_color);
>> r600_viewport_set_rast_deps(&sctx->b, rs->scissor_enable,
>> rs->clip_halfz);
>> si_pm4_bind_state(sctx, rasterizer, rs);
>> si_update_poly_offset_state(sctx);
>> - si_mark_atom_dirty(sctx, &sctx->clip_regs);
>> + if (!old_rs ||
>> + old_rs->clip_plane_enable != rs->clip_plane_enable ||
>> + old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl)
>> + si_mark_atom_dirty(sctx, &sctx->clip_regs);
>> +
>> sctx->ia_multi_vgt_param_key.u.line_stipple_enabled =
>> rs->line_stipple_enable;
>> - sctx->do_update_shaders = true;
>> +
>> + if (!old_rs ||
>> + old_rs->clip_plane_enable != rs->clip_plane_enable ||
>> + old_rs->rasterizer_discard != rs->rasterizer_discard ||
>> + old_rs->sprite_coord_enable != rs->sprite_coord_enable ||
>> + old_rs->flatshade != rs->flatshade ||
>> + old_rs->two_side != rs->two_side ||
>> + old_rs->multisample_enable != rs->multisample_enable ||
>> + old_rs->poly_stipple_enable != rs->poly_stipple_enable ||
>> + old_rs->poly_smooth != rs->poly_smooth ||
>> + old_rs->line_smooth != rs->line_smooth ||
>> + old_rs->clamp_fragment_color != rs->clamp_fragment_color ||
>> + old_rs->force_persample_interp != rs->force_persample_interp)
>> + sctx->do_update_shaders = true;
>> }
>> static void si_delete_rs_state(struct pipe_context *ctx, void *state)
>> {
>> struct si_context *sctx = (struct si_context *)ctx;
>> if (sctx->queued.named.rasterizer == state)
>> si_pm4_bind_state(sctx, poly_offset, NULL);
>> si_pm4_delete_state(sctx, rasterizer, (struct
>> si_state_rasterizer *)state);
>> }
>> @@ -1055,33 +1090,36 @@ static void *si_create_dsa_state(struct
>> pipe_context *ctx,
>> si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN,
>> fui(state->depth.bounds_min));
>> si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX,
>> fui(state->depth.bounds_max));
>> }
>> return dsa;
>> }
>> static void si_bind_dsa_state(struct pipe_context *ctx, void *state)
>> {
>> struct si_context *sctx = (struct si_context *)ctx;
>> + struct si_state_dsa *old_dsa = sctx->queued.named.dsa;
>> struct si_state_dsa *dsa = state;
>> if (!state)
>> return;
>> si_pm4_bind_state(sctx, dsa, dsa);
>> if (memcmp(&dsa->stencil_ref, &sctx->stencil_ref.dsa_part,
>> sizeof(struct si_dsa_stencil_ref_part)) != 0) {
>> sctx->stencil_ref.dsa_part = dsa->stencil_ref;
>> si_mark_atom_dirty(sctx, &sctx->stencil_ref.atom);
>> }
>> - sctx->do_update_shaders = true;
>> +
>> + if (!old_dsa || old_dsa->alpha_func != dsa->alpha_func)
>> + sctx->do_update_shaders = true;
>> }
>> static void si_delete_dsa_state(struct pipe_context *ctx, void *state)
>> {
>> struct si_context *sctx = (struct si_context *)ctx;
>> si_pm4_delete_state(sctx, dsa, (struct si_state_dsa *)state);
>> }
>> static void *si_create_db_flush_dsa(struct si_context *sctx)
>> {
>> @@ -2437,20 +2475,21 @@ static void si_framebuffer_cache_flush(struct
>> si_context *sctx)
>> static void si_set_framebuffer_state(struct pipe_context *ctx,
>> const struct pipe_framebuffer_state *state)
>> {
>> struct si_context *sctx = (struct si_context *)ctx;
>> struct pipe_constant_buffer constbuf = {0};
>> struct r600_surface *surf = NULL;
>> struct r600_texture *rtex;
>> bool old_any_dst_linear = sctx->framebuffer.any_dst_linear;
>> unsigned old_nr_samples = sctx->framebuffer.nr_samples;
>> + unsigned old_colorbuf_enabled_4bit =
>> sctx->framebuffer.colorbuf_enabled_4bit;
>> bool unbound = false;
>> int i;
>> /* Fast path for linear <-> sRGB changes and no-op changes. */
>> if (state->nr_cbufs &&
>> state->nr_cbufs == sctx->framebuffer.state.nr_cbufs &&
>> state->zsbuf == sctx->framebuffer.state.zsbuf) {
>> struct pipe_surface **cbufs = sctx->framebuffer.state.cbufs;
>> /* See if the surfaces are equivalent. */
>> @@ -2593,23 +2632,25 @@ static void si_set_framebuffer_state(struct
>> pipe_context *ctx,
>> surf = (struct r600_surface*)state->zsbuf;
>> rtex = (struct r600_texture*)surf->base.texture;
>> if (!surf->depth_initialized) {
>> si_init_depth_surface(sctx, surf);
>> }
>> r600_context_add_resource_size(ctx, surf->base.texture);
>> }
>> si_update_poly_offset_state(sctx);
>> - si_mark_atom_dirty(sctx, &sctx->cb_render_state);
>> si_mark_atom_dirty(sctx, &sctx->framebuffer.atom);
>> + if (old_colorbuf_enabled_4bit !=
>> sctx->framebuffer.colorbuf_enabled_4bit)
>> + si_mark_atom_dirty(sctx, &sctx->cb_render_state);
si_emit_cb_render_state uses more info from sctx->framebuffer when RB+
is used.
Cheers,
Nicolai
>> +
>> if (sctx->framebuffer.any_dst_linear != old_any_dst_linear)
>> si_mark_atom_dirty(sctx, &sctx->msaa_config);
>> if (sctx->framebuffer.nr_samples != old_nr_samples) {
>> si_mark_atom_dirty(sctx, &sctx->msaa_config);
>> si_mark_atom_dirty(sctx, &sctx->db_render_state);
>> /* Set sample locations as fragment shader constants. */
>> switch (sctx->framebuffer.nr_samples) {
>> case 1:
>> @@ -3727,20 +3768,23 @@ static void *si_create_vertex_elements(struct
>> pipe_context *ctx,
>> unsigned data_format, num_format;
>> int first_non_void;
>> unsigned vbo_index = elements[i].vertex_buffer_index;
>> unsigned char swizzle[4];
>> if (vbo_index >= SI_NUM_VERTEX_BUFFERS) {
>> FREE(v);
>> return NULL;
>> }
>> + if (elements[i].instance_divisor)
>> + v->uses_instance_divisors = true;
>> +
>> if (!used[vbo_index]) {
>> v->first_vb_use_mask |= 1 << i;
>> used[vbo_index] = true;
>> }
>> desc = util_format_description(elements[i].src_format);
>> first_non_void =
>> util_format_get_first_non_void_channel(elements[i].src_format);
>> data_format = si_translate_buffer_dataformat(ctx->screen,
>> desc, first_non_void);
>> num_format = si_translate_buffer_numformat(ctx->screen,
>> desc, first_non_void);
>> channel = first_non_void >= 0 ?
>> &desc->channel[first_non_void] : NULL;
>> @@ -3840,25 +3884,33 @@ static void *si_create_vertex_elements(struct
>> pipe_context *ctx,
>> S_008F0C_DATA_FORMAT(data_format);
>> }
>> memcpy(v->elements, elements, sizeof(struct pipe_vertex_element)
>> * count);
>> return v;
>> }
>> static void si_bind_vertex_elements(struct pipe_context *ctx, void
>> *state)
>> {
>> struct si_context *sctx = (struct si_context *)ctx;
>> + struct si_vertex_element *old = sctx->vertex_elements;
>> struct si_vertex_element *v = (struct si_vertex_element*)state;
>> sctx->vertex_elements = v;
>> sctx->vertex_buffers_dirty = true;
>> - sctx->do_update_shaders = true;
>> +
>> + if (v &&
>> + (!old ||
>> + old->count != v->count ||
>> + old->uses_instance_divisors != v->uses_instance_divisors ||
>> + v->uses_instance_divisors || /* we don't check which
>> divisors changed */
>> + memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0])
>> * v->count)))
>> + sctx->do_update_shaders = true;
>> }
>> static void si_delete_vertex_element(struct pipe_context *ctx, void
>> *state)
>> {
>> struct si_context *sctx = (struct si_context *)ctx;
>> if (sctx->vertex_elements == state)
>> sctx->vertex_elements = NULL;
>> FREE(state);
>> }
>> diff --git a/src/gallium/drivers/radeonsi/si_state.h
>> b/src/gallium/drivers/radeonsi/si_state.h
>> index 275f830..4da51be 100644
>> --- a/src/gallium/drivers/radeonsi/si_state.h
>> +++ b/src/gallium/drivers/radeonsi/si_state.h
>> @@ -102,20 +102,21 @@ struct si_vertex_element
>> {
>> unsigned count;
>> unsigned first_vb_use_mask;
>> /* Vertex buffer descriptor list size aligned for optimal
>> prefetch. */
>> unsigned desc_list_byte_size;
>> uint8_t fix_fetch[SI_MAX_ATTRIBS];
>> uint32_t rsrc_word3[SI_MAX_ATTRIBS];
>> uint32_t format_size[SI_MAX_ATTRIBS];
>> struct pipe_vertex_element elements[SI_MAX_ATTRIBS];
>> + bool uses_instance_divisors;
>> };
>> union si_state {
>> struct {
>> struct si_state_blend *blend;
>> struct si_state_rasterizer *rasterizer;
>> struct si_state_dsa *dsa;
>> struct si_pm4_state *poly_offset;
>> struct si_pm4_state *ls;
>> struct si_pm4_state *hs;
>> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c
>> b/src/gallium/drivers/radeonsi/si_state_shaders.c
>> index c21f855..677a6de 100644
>> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
>> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
>> @@ -2307,32 +2307,39 @@ static void si_bind_tes_shader(struct
>> pipe_context *ctx, void *state)
>> r600_update_vs_writes_viewport_index(&sctx->b,
>> si_get_vs_info(sctx));
>> si_set_active_descriptors_for_shader(sctx, sel);
>> si_update_streamout_state(sctx);
>> si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant,
>> si_get_vs(sctx)->cso, si_get_vs_state(sctx));
>> }
>> static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
>> {
>> struct si_context *sctx = (struct si_context *)ctx;
>> + struct si_shader_selector *old_sel = sctx->ps_shader.cso;
>> struct si_shader_selector *sel = state;
>> /* skip if supplied shader is one already in use */
>> - if (sctx->ps_shader.cso == sel)
>> + if (old_sel == sel)
>> return;
>> sctx->ps_shader.cso = sel;
>> sctx->ps_shader.current = sel ? sel->first_variant : NULL;
>> sctx->do_update_shaders = true;
>> - if (sel && sctx->ia_multi_vgt_param_key.u.uses_tess)
>> - si_update_tess_uses_prim_id(sctx);
>> - si_mark_atom_dirty(sctx, &sctx->cb_render_state);
>> +
>> + if (sel) {
>> + if (sctx->ia_multi_vgt_param_key.u.uses_tess)
>> + si_update_tess_uses_prim_id(sctx);
>> +
>> + if (!old_sel ||
>> + old_sel->info.colors_written != sel->info.colors_written)
>> + si_mark_atom_dirty(sctx, &sctx->cb_render_state);
>> + }
>> si_set_active_descriptors_for_shader(sctx, sel);
>> }
>> static void si_delete_shader(struct si_context *sctx, struct
>> si_shader *shader)
>> {
>> if (shader->is_optimized) {
>>
>> util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority,
>> &shader->optimized_ready);
>> util_queue_fence_destroy(&shader->optimized_ready);
>> }
>> @@ -3081,20 +3088,23 @@ static void si_update_vgt_shader_config(struct
>> si_context *sctx)
>> si_pm4_bind_state(sctx, vgt_shader_config, *pm4);
>> }
>> bool si_update_shaders(struct si_context *sctx)
>> {
>> struct pipe_context *ctx = (struct pipe_context*)sctx;
>> struct si_compiler_ctx_state compiler_state;
>> struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
>> struct si_shader *old_vs = si_get_vs_state(sctx);
>> bool old_clip_disable = old_vs ?
>> old_vs->key.opt.hw_vs.clip_disable : false;
>> + struct si_shader *old_ps = sctx->ps_shader.current;
>> + unsigned old_spi_shader_col_format =
>> + old_ps ? old_ps->key.part.ps.epilog.spi_shader_col_format : 0;
>> int r;
>> compiler_state.tm = sctx->tm;
>> compiler_state.debug = sctx->b.debug;
>> compiler_state.is_debug_context = sctx->is_debug;
>> /* Update stages before GS. */
>> if (sctx->tes_shader.cso) {
>> if (!sctx->tf_ring) {
>> si_init_tess_factor_ring(sctx);
>> @@ -3205,21 +3215,25 @@ bool si_update_shaders(struct si_context *sctx)
>> S_02880C_KILL_ENABLE(si_get_alpha_test_func(sctx) !=
>> PIPE_FUNC_ALWAYS);
>> if (si_pm4_state_changed(sctx, ps) ||
>> si_pm4_state_changed(sctx, vs) ||
>> sctx->sprite_coord_enable != rs->sprite_coord_enable ||
>> sctx->flatshade != rs->flatshade) {
>> sctx->sprite_coord_enable = rs->sprite_coord_enable;
>> sctx->flatshade = rs->flatshade;
>> si_mark_atom_dirty(sctx, &sctx->spi_map);
>> }
>> - if (sctx->screen->b.rbplus_allowed &&
>> si_pm4_state_changed(sctx, ps))
>> + if (sctx->screen->b.rbplus_allowed &&
>> + si_pm4_state_changed(sctx, ps) &&
>> + (!old_ps ||
>> + old_spi_shader_col_format !=
>> +
>> sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format))
>> si_mark_atom_dirty(sctx, &sctx->cb_render_state);
>> if (sctx->ps_db_shader_control != db_shader_control) {
>> sctx->ps_db_shader_control = db_shader_control;
>> si_mark_atom_dirty(sctx, &sctx->db_render_state);
>> }
>> if (sctx->smoothing_enabled !=
>> sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing) {
>> sctx->smoothing_enabled =
>> sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing;
>> si_mark_atom_dirty(sctx, &sctx->msaa_config);
>>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
More information about the mesa-dev
mailing list