[Mesa-dev] [PATCH 08/10] radeonsi: only upload (dump to L2) those descriptors that are used by shaders
Nicolai Hähnle
nhaehnle at gmail.com
Thu May 18 09:31:13 UTC 2017
On 17.05.2017 21:38, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> This decreases the size of CE RAM dumps to L2, or the size of descriptor
> uploads without CE.
> ---
> src/gallium/drivers/radeonsi/si_compute.c | 28 ++++++--
> src/gallium/drivers/radeonsi/si_descriptors.c | 85 ++++++++++++++++++++-----
> src/gallium/drivers/radeonsi/si_state.h | 18 +++++-
> src/gallium/drivers/radeonsi/si_state_shaders.c | 6 ++
> 4 files changed, 113 insertions(+), 24 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
> index 22ef111..4c98066 100644
> --- a/src/gallium/drivers/radeonsi/si_compute.c
> +++ b/src/gallium/drivers/radeonsi/si_compute.c
> @@ -201,21 +201,38 @@ static void *si_create_compute_state(
> return NULL;
> }
> }
>
> return program;
> }
>
> static void si_bind_compute_state(struct pipe_context *ctx, void *state)
> {
> struct si_context *sctx = (struct si_context*)ctx;
> - sctx->cs_shader_state.program = (struct si_compute*)state;
> + struct si_compute *program = (struct si_compute*)state;
> +
> + sctx->cs_shader_state.program = program;
> + if (!program)
> + return;
> +
> + /* Wait because we need active slot usage masks. */
> + if (program->ir_type == PIPE_SHADER_IR_TGSI)
> + util_queue_fence_wait(&program->ready);
> +
> + si_set_active_descriptors(sctx,
> + SI_DESCS_FIRST_COMPUTE +
> + SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
> + program->active_const_and_shader_buffers);
> + si_set_active_descriptors(sctx,
> + SI_DESCS_FIRST_COMPUTE +
> + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
> + program->active_samplers_and_images);
> }
>
> static void si_set_global_binding(
> struct pipe_context *ctx, unsigned first, unsigned n,
> struct pipe_resource **resources,
> uint32_t **handles)
> {
> unsigned i;
> struct si_context *sctx = (struct si_context*)ctx;
> struct si_compute *program = sctx->cs_shader_state.program;
> @@ -749,26 +766,23 @@ static void si_launch_grid(
> bool cs_regalloc_hang =
> (sctx->b.chip_class == SI ||
> sctx->b.family == CHIP_BONAIRE ||
> sctx->b.family == CHIP_KABINI) &&
> info->block[0] * info->block[1] * info->block[2] > 256;
>
> if (cs_regalloc_hang)
> sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
> SI_CONTEXT_CS_PARTIAL_FLUSH;
>
> - if (program->ir_type == PIPE_SHADER_IR_TGSI) {
> - util_queue_fence_wait(&program->ready);
> -
> - if (program->shader.compilation_failed)
> - return;
> - }
> + if (program->ir_type == PIPE_SHADER_IR_TGSI &&
> + program->shader.compilation_failed)
> + return;
>
> si_decompress_compute_textures(sctx);
>
> /* Add buffer sizes for memory checking in need_cs_space. */
> r600_context_add_resource_size(ctx, &program->shader.bo->b.b);
> /* TODO: add the scratch buffer */
>
> if (info->indirect) {
> r600_context_add_resource_size(ctx, info->indirect);
>
> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
> index 38e4ae1..a2f40a8 100644
> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
> @@ -118,26 +118,28 @@ static void si_init_descriptors(struct si_descriptors *desc,
> }
> }
>
> static void si_release_descriptors(struct si_descriptors *desc)
> {
> r600_resource_reference(&desc->buffer, NULL);
> FREE(desc->list);
> }
>
> static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size,
> - unsigned *out_offset, struct r600_resource **out_buf) {
> + unsigned *out_offset, struct r600_resource **out_buf)
> +{
> uint64_t va;
>
> u_suballocator_alloc(sctx->ce_suballocator, size,
> - sctx->screen->b.info.tcc_cache_line_size,
> - out_offset, (struct pipe_resource**)out_buf);
> + si_optimal_tcc_alignment(sctx, size),
> + (unsigned*)out_offset,
The extra cast of out_offset is unnecessary — the parameter is already declared as `unsigned *`.
> + (struct pipe_resource**)out_buf);
> if (!out_buf)
> return false;
>
> va = (*out_buf)->gpu_address + *out_offset;
>
> radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
> radeon_emit(sctx->ce_ib, ce_offset);
> radeon_emit(sctx->ce_ib, size / 4);
> radeon_emit(sctx->ce_ib, va);
> radeon_emit(sctx->ce_ib, va >> 32);
> @@ -186,58 +188,70 @@ void si_ce_enable_loads(struct radeon_winsys_cs *ib)
> radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
> radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) |
> CONTEXT_CONTROL_LOAD_CE_RAM(1));
> radeon_emit(ib, CONTEXT_CONTROL_SHADOW_ENABLE(1));
> }
>
> static bool si_upload_descriptors(struct si_context *sctx,
> struct si_descriptors *desc,
> struct r600_atom * atom)
> {
> - unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
> + unsigned slot_size = desc->element_dw_size * 4;
> + unsigned first_slot_offset = desc->first_active_slot * slot_size;
> + unsigned upload_size = desc->num_active_slots * slot_size;
> +
> + if (!upload_size)
> + return true;
The early-out here means that desc->num_active_slots *does* control what
is written to CE RAM, contrary to what its descriptive comment says
("It doesn't skip uploads to CE RAM"): when num_active_slots is 0,
upload_size is 0 and the early return also skips the WRITE_CONST_RAM
packets that flush dirty slots into CE RAM. It needs to be moved further
down, after the dirty-mask loop.
Cheers,
Nicolai
>
> if (sctx->ce_ib && desc->uses_ce) {
> uint32_t const* list = (uint32_t const*)desc->list;
>
> while(desc->dirty_mask) {
> int begin, count;
> u_bit_scan_consecutive_range64(&desc->dirty_mask, &begin,
> &count);
>
> begin *= desc->element_dw_size;
> count *= desc->element_dw_size;
>
> radeon_emit(sctx->ce_ib,
> PKT3(PKT3_WRITE_CONST_RAM, count, 0));
> radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4);
> radeon_emit_array(sctx->ce_ib, list + begin, count);
> }
>
> - if (!si_ce_upload(sctx, desc->ce_offset, list_size,
> - &desc->buffer_offset, &desc->buffer))
> + if (!si_ce_upload(sctx, desc->ce_offset + first_slot_offset,
> + upload_size, (unsigned*)&desc->buffer_offset,
> + &desc->buffer))
> return false;
> } else {
> - void *ptr;
> + uint32_t *ptr;
>
> - u_upload_alloc(sctx->b.b.const_uploader, 0, list_size,
> - sctx->screen->b.info.tcc_cache_line_size,
> - &desc->buffer_offset,
> - (struct pipe_resource**)&desc->buffer, &ptr);
> + u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size,
> + si_optimal_tcc_alignment(sctx, upload_size),
> + (unsigned*)&desc->buffer_offset,
> + (struct pipe_resource**)&desc->buffer,
> + (void**)&ptr);
> if (!desc->buffer)
> return false; /* skip the draw call */
>
> - util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
> - desc->gpu_list = ptr;
> + util_memcpy_cpu_to_le32(ptr, (char*)desc->list + first_slot_offset,
> + upload_size);
> + desc->gpu_list = ptr - first_slot_offset / 4;
>
> radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
> RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
> }
> +
> + /* The shader pointer should point to slot 0. */
> + desc->buffer_offset -= first_slot_offset;
> +
> desc->dirty_mask = 0;
>
> if (atom)
> si_mark_atom_dirty(sctx, atom);
>
> return true;
> }
>
> static void
> si_descriptors_begin_new_cs(struct si_context *sctx, struct si_descriptors *desc)
> @@ -1023,21 +1037,21 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
> desc_list_byte_size = velems->desc_list_byte_size;
> first_vb_use_mask = velems->first_vb_use_mask;
>
> /* Vertex buffer descriptors are the only ones which are uploaded
> * directly through a staging buffer and don't go through
> * the fine-grained upload path.
> */
> u_upload_alloc(sctx->b.b.const_uploader, 0,
> desc_list_byte_size,
> si_optimal_tcc_alignment(sctx, desc_list_byte_size),
> - &desc->buffer_offset,
> + (unsigned*)&desc->buffer_offset,
> (struct pipe_resource**)&desc->buffer, (void**)&ptr);
> if (!desc->buffer)
> return false;
>
> radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
> desc->buffer, RADEON_USAGE_READ,
> RADEON_PRIO_DESCRIPTORS);
>
> assert(count <= SI_MAX_ATTRIBS);
>
> @@ -1883,21 +1897,22 @@ void si_shader_change_notify(struct si_context *sctx)
> }
> }
>
> static void si_emit_shader_pointer(struct si_context *sctx,
> struct si_descriptors *desc,
> unsigned sh_base)
> {
> struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
> uint64_t va;
>
> - assert(desc->buffer);
> + if (!desc->buffer)
> + return; /* the pointer is not used by current shaders */
>
> va = desc->buffer->gpu_address +
> desc->buffer_offset;
>
> radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
> radeon_emit(cs, (sh_base + desc->shader_userdata_offset - SI_SH_REG_OFFSET) >> 2);
> radeon_emit(cs, va);
> radeon_emit(cs, va >> 32);
> }
>
> @@ -2026,20 +2041,22 @@ void si_init_all_descriptors(struct si_context *sctx)
> }
>
> si_init_buffer_resources(&sctx->rw_buffers,
> &sctx->descriptors[SI_DESCS_RW_BUFFERS],
> SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
> /* The second set of usage/priority is used by
> * const buffers in RW buffer slots. */
> RADEON_USAGE_READWRITE, RADEON_USAGE_READ,
> RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER,
> &ce_offset);
> + sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS;
> +
> si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
> 4, SI_NUM_VERTEX_BUFFERS, NULL);
>
> sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
> sctx->total_ce_ram_allocated = ce_offset;
>
> if (sctx->b.chip_class >= GFX9)
> assert(ce_offset <= 4096);
> else
> assert(ce_offset <= 32768);
> @@ -2148,10 +2165,48 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx)
> si_image_views_begin_new_cs(sctx, &sctx->images[i]);
> }
> si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers);
> si_vertex_buffers_begin_new_cs(sctx);
>
> for (i = 0; i < SI_NUM_DESCS; ++i)
> si_descriptors_begin_new_cs(sctx, &sctx->descriptors[i]);
>
> si_shader_userdata_begin_new_cs(sctx);
> }
> +
> +void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
> + uint64_t new_active_mask)
> +{
> + struct si_descriptors *desc = &sctx->descriptors[desc_idx];
> +
> + /* Ignore no-op updates and updates that disable all slots. */
> + if (!new_active_mask ||
> + new_active_mask == u_bit_consecutive64(desc->first_active_slot,
> + desc->num_active_slots))
> + return;
> +
> + int first, count;
> + u_bit_scan_consecutive_range64(&new_active_mask, &first, &count);
> + assert(new_active_mask == 0);
> +
> + /* Upload/dump descriptors if slots are being enabled. */
> + if (first < desc->first_active_slot ||
> + first + count > desc->first_active_slot + desc->num_active_slots)
> + sctx->descriptors_dirty |= 1u << desc_idx;
> +
> + desc->first_active_slot = first;
> + desc->num_active_slots = count;
> +}
> +
> +void si_set_active_descriptors_for_shader(struct si_context *sctx,
> + struct si_shader_selector *sel)
> +{
> + if (!sel)
> + return;
> +
> + si_set_active_descriptors(sctx,
> + si_const_and_shader_buffer_descriptors_idx(sel->type),
> + sel->active_const_and_shader_buffers);
> + si_set_active_descriptors(sctx,
> + si_sampler_and_image_descriptors_idx(sel->type),
> + sel->active_samplers_and_images);
> +}
> diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
> index f2003a5..dfabaa3 100644
> --- a/src/gallium/drivers/radeonsi/si_state.h
> +++ b/src/gallium/drivers/radeonsi/si_state.h
> @@ -35,20 +35,21 @@
>
> #define SI_MAX_ATTRIBS 16
> #define SI_NUM_VERTEX_BUFFERS SI_MAX_ATTRIBS
> #define SI_NUM_SAMPLERS 32 /* OpenGL textures units per shader */
> #define SI_NUM_CONST_BUFFERS 16
> #define SI_NUM_IMAGES 16
> #define SI_NUM_SHADER_BUFFERS 16
>
> struct si_screen;
> struct si_shader;
> +struct si_shader_selector;
>
> struct si_state_blend {
> struct si_pm4_state pm4;
> uint32_t cb_target_mask;
> bool alpha_to_coverage;
> bool alpha_to_one;
> bool dual_src_blend;
> /* Set 0xf or 0x0 (4 bits) per render target if the following is
> * true. ANDed with spi_shader_col_format.
> */
> @@ -215,26 +216,34 @@ struct si_descriptors {
> uint32_t *list;
> /* The list in mapped GPU memory. */
> uint32_t *gpu_list;
> /* The size of one descriptor. */
> unsigned element_dw_size;
> /* The maximum number of descriptors. */
> unsigned num_elements;
>
> /* The buffer where the descriptors have been uploaded. */
> struct r600_resource *buffer;
> - unsigned buffer_offset;
> + int buffer_offset; /* can be negative if not using lower slots */
>
> /* Offset in CE RAM */
> unsigned ce_offset;
>
> - /* elements of the list that are changed and need to be uploaded */
> + /* Slots that are used by currently-bound shaders.
> + * With CE: It determines which slots are dumped to L2.
> + * It doesn't skip uploads to CE RAM.
> + * Without CE: It determines which slots are uploaded.
> + */
> + unsigned first_active_slot;
> + unsigned num_active_slots;
> +
> + /* Slots that have been changed and need to be uploaded. */
> uint64_t dirty_mask;
>
> /* Whether CE is used to upload this descriptor array. */
> bool uses_ce;
>
> /* The shader userdata offset within a shader where the 64-bit pointer to the descriptor
> * array will be stored. */
> unsigned shader_userdata_offset;
> };
>
> @@ -308,20 +317,25 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx);
> void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
> const uint8_t *ptr, unsigned size, uint32_t *const_offset);
> void si_update_all_texture_descriptors(struct si_context *sctx);
> void si_shader_change_notify(struct si_context *sctx);
> void si_update_compressed_colortex_masks(struct si_context *sctx);
> void si_emit_graphics_shader_userdata(struct si_context *sctx,
> struct r600_atom *atom);
> void si_emit_compute_shader_userdata(struct si_context *sctx);
> void si_set_rw_buffer(struct si_context *sctx,
> uint slot, const struct pipe_constant_buffer *input);
> +void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
> + uint64_t new_active_mask);
> +void si_set_active_descriptors_for_shader(struct si_context *sctx,
> + struct si_shader_selector *sel);
> +
> /* si_state.c */
> struct si_shader_selector;
>
> void si_init_atom(struct si_context *sctx, struct r600_atom *atom,
> struct r600_atom **list_elem,
> void (*emit_func)(struct si_context *ctx, struct r600_atom *state));
> void si_init_state_functions(struct si_context *sctx);
> void si_init_screen_state_functions(struct si_screen *sscreen);
> void
> si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf,
> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
> index 45d996b..8ac4309 100644
> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
> @@ -2144,20 +2144,21 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
> struct si_shader_selector *sel = state;
>
> if (sctx->vs_shader.cso == sel)
> return;
>
> sctx->vs_shader.cso = sel;
> sctx->vs_shader.current = sel ? sel->first_variant : NULL;
> sctx->do_update_shaders = true;
> si_mark_atom_dirty(sctx, &sctx->clip_regs);
> r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
> + si_set_active_descriptors_for_shader(sctx, sel);
> }
>
> static void si_update_tess_uses_prim_id(struct si_context *sctx)
> {
> sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id =
> (sctx->tes_shader.cso &&
> sctx->tes_shader.cso->info.uses_primid) ||
> (sctx->tcs_shader.cso &&
> sctx->tcs_shader.cso->info.uses_primid) ||
> (sctx->gs_shader.cso &&
> @@ -2181,38 +2182,41 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
> sctx->do_update_shaders = true;
> si_mark_atom_dirty(sctx, &sctx->clip_regs);
> sctx->last_rast_prim = -1; /* reset this so that it gets updated */
>
> if (enable_changed) {
> si_shader_change_notify(sctx);
> if (sctx->ia_multi_vgt_param_key.u.uses_tess)
> si_update_tess_uses_prim_id(sctx);
> }
> r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
> + si_set_active_descriptors_for_shader(sctx, sel);
> }
>
> static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
> {
> struct si_context *sctx = (struct si_context *)ctx;
> struct si_shader_selector *sel = state;
> bool enable_changed = !!sctx->tcs_shader.cso != !!sel;
>
> if (sctx->tcs_shader.cso == sel)
> return;
>
> sctx->tcs_shader.cso = sel;
> sctx->tcs_shader.current = sel ? sel->first_variant : NULL;
> si_update_tess_uses_prim_id(sctx);
> sctx->do_update_shaders = true;
>
> if (enable_changed)
> sctx->last_tcs = NULL; /* invalidate derived tess state */
> +
> + si_set_active_descriptors_for_shader(sctx, sel);
> }
>
> static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
> {
> struct si_context *sctx = (struct si_context *)ctx;
> struct si_shader_selector *sel = state;
> bool enable_changed = !!sctx->tes_shader.cso != !!sel;
>
> if (sctx->tes_shader.cso == sel)
> return;
> @@ -2223,37 +2227,39 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
> si_update_tess_uses_prim_id(sctx);
> sctx->do_update_shaders = true;
> si_mark_atom_dirty(sctx, &sctx->clip_regs);
> sctx->last_rast_prim = -1; /* reset this so that it gets updated */
>
> if (enable_changed) {
> si_shader_change_notify(sctx);
> sctx->last_tes_sh_base = -1; /* invalidate derived tess state */
> }
> r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
> + si_set_active_descriptors_for_shader(sctx, sel);
> }
>
> static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
> {
> struct si_context *sctx = (struct si_context *)ctx;
> struct si_shader_selector *sel = state;
>
> /* skip if supplied shader is one already in use */
> if (sctx->ps_shader.cso == sel)
> return;
>
> sctx->ps_shader.cso = sel;
> sctx->ps_shader.current = sel ? sel->first_variant : NULL;
> sctx->do_update_shaders = true;
> if (sel && sctx->ia_multi_vgt_param_key.u.uses_tess)
> si_update_tess_uses_prim_id(sctx);
> si_mark_atom_dirty(sctx, &sctx->cb_render_state);
> + si_set_active_descriptors_for_shader(sctx, sel);
> }
>
> static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
> {
> if (shader->is_optimized) {
> util_queue_fence_wait(&shader->optimized_ready);
> util_queue_fence_destroy(&shader->optimized_ready);
> }
>
> if (shader->pm4) {
>
--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
More information about the mesa-dev
mailing list