[Mesa-dev] [PATCH 08/10] radeonsi: only upload (dump to L2) those descriptors that are used by shaders
Marek Olšák
maraeo at gmail.com
Wed May 17 19:38:50 UTC 2017
From: Marek Olšák <marek.olsak at amd.com>
This decreases the size of CE RAM dumps to L2 and, when CE is not used,
the size of plain descriptor uploads.
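The idea as a standalone sketch: collapse the 64-bit active-slot mask into
one contiguous [first, count) range and upload only that byte range instead
of the whole descriptor list. The slot size and mask below are made up, and
u_bit_scan_consecutive_range64() is open-coded with __builtin_ctzll()
(GCC/Clang) so the sketch compiles on its own; the assert mirrors the one in
si_set_active_descriptors() and assumes the mask is a single run of bits:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const unsigned slot_size = 16;	/* bytes per slot (illustrative) */
	uint64_t active_mask = 0x3cull;	/* shaders use slots 2..5 only */

	/* Equivalent of one u_bit_scan_consecutive_range64() call. */
	int first = __builtin_ctzll(active_mask);
	int count = 0;
	while (first + count < 64 && ((active_mask >> (first + count)) & 1))
		count++;
	active_mask &= ~((((uint64_t)1 << count) - 1) << first);
	assert(active_mask == 0); /* the run must cover all active slots */

	unsigned first_slot_offset = first * slot_size;
	unsigned upload_size = count * slot_size;
	printf("upload %u bytes at offset %u instead of the full list\n",
	       upload_size, first_slot_offset);
	return 0;
}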
---
src/gallium/drivers/radeonsi/si_compute.c | 28 ++++++--
src/gallium/drivers/radeonsi/si_descriptors.c | 85 ++++++++++++++++++++-----
src/gallium/drivers/radeonsi/si_state.h | 18 +++++-
src/gallium/drivers/radeonsi/si_state_shaders.c | 6 ++
4 files changed, 113 insertions(+), 24 deletions(-)
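A note on the buffer_offset bias in si_upload_descriptors() below: only the
slots from first_active_slot onward are copied into the upload buffer, but
the SH-register pointer handed to shaders must still correspond to slot 0,
so buffer_offset is rewound by first_slot_offset (and can therefore go
negative relative to the allocation, hence the unsigned -> int change in
si_state.h). A self-contained sketch of that arithmetic with made-up values
standing in for desc->buffer->gpu_address and the suballocator result:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const unsigned slot_size = 16;		/* bytes per slot */
	unsigned first_active_slot = 2;
	unsigned alloc_offset = 4096;		/* where the upload landed */
	uint64_t va = 0x100000000ull;		/* illustrative GPU address */

	/* Bias the offset back so "slot 0" lines up even though the
	 * lower slots were never uploaded. */
	int buffer_offset = (int)alloc_offset -
			    (int)(first_active_slot * slot_size);

	uint64_t slot0_va = va + buffer_offset;
	uint64_t first_used_va = slot0_va + first_active_slot * slot_size;
	printf("slot 0 VA 0x%llx, first uploaded slot VA 0x%llx\n",
	       (unsigned long long)slot0_va,
	       (unsigned long long)first_used_va);
	return 0;
}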
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 22ef111..4c98066 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -201,21 +201,38 @@ static void *si_create_compute_state(
return NULL;
}
}
return program;
}
static void si_bind_compute_state(struct pipe_context *ctx, void *state)
{
struct si_context *sctx = (struct si_context*)ctx;
- sctx->cs_shader_state.program = (struct si_compute*)state;
+ struct si_compute *program = (struct si_compute*)state;
+
+ sctx->cs_shader_state.program = program;
+ if (!program)
+ return;
+
+ /* Wait because we need active slot usage masks. */
+ if (program->ir_type == PIPE_SHADER_IR_TGSI)
+ util_queue_fence_wait(&program->ready);
+
+ si_set_active_descriptors(sctx,
+ SI_DESCS_FIRST_COMPUTE +
+ SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
+ program->active_const_and_shader_buffers);
+ si_set_active_descriptors(sctx,
+ SI_DESCS_FIRST_COMPUTE +
+ SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
+ program->active_samplers_and_images);
}
static void si_set_global_binding(
struct pipe_context *ctx, unsigned first, unsigned n,
struct pipe_resource **resources,
uint32_t **handles)
{
unsigned i;
struct si_context *sctx = (struct si_context*)ctx;
struct si_compute *program = sctx->cs_shader_state.program;
@@ -749,26 +766,23 @@ static void si_launch_grid(
bool cs_regalloc_hang =
(sctx->b.chip_class == SI ||
sctx->b.family == CHIP_BONAIRE ||
sctx->b.family == CHIP_KABINI) &&
info->block[0] * info->block[1] * info->block[2] > 256;
if (cs_regalloc_hang)
sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
SI_CONTEXT_CS_PARTIAL_FLUSH;
- if (program->ir_type == PIPE_SHADER_IR_TGSI) {
- util_queue_fence_wait(&program->ready);
-
- if (program->shader.compilation_failed)
- return;
- }
+ if (program->ir_type == PIPE_SHADER_IR_TGSI &&
+ program->shader.compilation_failed)
+ return;
si_decompress_compute_textures(sctx);
/* Add buffer sizes for memory checking in need_cs_space. */
r600_context_add_resource_size(ctx, &program->shader.bo->b.b);
/* TODO: add the scratch buffer */
if (info->indirect) {
r600_context_add_resource_size(ctx, info->indirect);
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 38e4ae1..a2f40a8 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -118,26 +118,28 @@ static void si_init_descriptors(struct si_descriptors *desc,
}
}
static void si_release_descriptors(struct si_descriptors *desc)
{
r600_resource_reference(&desc->buffer, NULL);
FREE(desc->list);
}
static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size,
- unsigned *out_offset, struct r600_resource **out_buf) {
+ unsigned *out_offset, struct r600_resource **out_buf)
+{
uint64_t va;
u_suballocator_alloc(sctx->ce_suballocator, size,
- sctx->screen->b.info.tcc_cache_line_size,
- out_offset, (struct pipe_resource**)out_buf);
+ si_optimal_tcc_alignment(sctx, size),
+ (unsigned*)out_offset,
+ (struct pipe_resource**)out_buf);
if (!out_buf)
return false;
va = (*out_buf)->gpu_address + *out_offset;
radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
radeon_emit(sctx->ce_ib, ce_offset);
radeon_emit(sctx->ce_ib, size / 4);
radeon_emit(sctx->ce_ib, va);
radeon_emit(sctx->ce_ib, va >> 32);
@@ -186,58 +188,70 @@ void si_ce_enable_loads(struct radeon_winsys_cs *ib)
radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) |
CONTEXT_CONTROL_LOAD_CE_RAM(1));
radeon_emit(ib, CONTEXT_CONTROL_SHADOW_ENABLE(1));
}
static bool si_upload_descriptors(struct si_context *sctx,
struct si_descriptors *desc,
struct r600_atom * atom)
{
- unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
+ unsigned slot_size = desc->element_dw_size * 4;
+ unsigned first_slot_offset = desc->first_active_slot * slot_size;
+ unsigned upload_size = desc->num_active_slots * slot_size;
+
+ if (!upload_size)
+ return true;
if (sctx->ce_ib && desc->uses_ce) {
uint32_t const* list = (uint32_t const*)desc->list;
while(desc->dirty_mask) {
int begin, count;
u_bit_scan_consecutive_range64(&desc->dirty_mask, &begin,
&count);
begin *= desc->element_dw_size;
count *= desc->element_dw_size;
radeon_emit(sctx->ce_ib,
PKT3(PKT3_WRITE_CONST_RAM, count, 0));
radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4);
radeon_emit_array(sctx->ce_ib, list + begin, count);
}
- if (!si_ce_upload(sctx, desc->ce_offset, list_size,
- &desc->buffer_offset, &desc->buffer))
+ if (!si_ce_upload(sctx, desc->ce_offset + first_slot_offset,
+ upload_size, (unsigned*)&desc->buffer_offset,
+ &desc->buffer))
return false;
} else {
- void *ptr;
+ uint32_t *ptr;
- u_upload_alloc(sctx->b.b.const_uploader, 0, list_size,
- sctx->screen->b.info.tcc_cache_line_size,
- &desc->buffer_offset,
- (struct pipe_resource**)&desc->buffer, &ptr);
+ u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size,
+ si_optimal_tcc_alignment(sctx, upload_size),
+ (unsigned*)&desc->buffer_offset,
+ (struct pipe_resource**)&desc->buffer,
+ (void**)&ptr);
if (!desc->buffer)
return false; /* skip the draw call */
- util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
- desc->gpu_list = ptr;
+ util_memcpy_cpu_to_le32(ptr, (char*)desc->list + first_slot_offset,
+ upload_size);
+ desc->gpu_list = ptr - first_slot_offset / 4;
radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
}
+
+ /* The shader pointer should point to slot 0. */
+ desc->buffer_offset -= first_slot_offset;
+
desc->dirty_mask = 0;
if (atom)
si_mark_atom_dirty(sctx, atom);
return true;
}
static void
si_descriptors_begin_new_cs(struct si_context *sctx, struct si_descriptors *desc)
@@ -1023,21 +1037,21 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
desc_list_byte_size = velems->desc_list_byte_size;
first_vb_use_mask = velems->first_vb_use_mask;
/* Vertex buffer descriptors are the only ones which are uploaded
* directly through a staging buffer and don't go through
* the fine-grained upload path.
*/
u_upload_alloc(sctx->b.b.const_uploader, 0,
desc_list_byte_size,
si_optimal_tcc_alignment(sctx, desc_list_byte_size),
- &desc->buffer_offset,
+ (unsigned*)&desc->buffer_offset,
(struct pipe_resource**)&desc->buffer, (void**)&ptr);
if (!desc->buffer)
return false;
radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
desc->buffer, RADEON_USAGE_READ,
RADEON_PRIO_DESCRIPTORS);
assert(count <= SI_MAX_ATTRIBS);
@@ -1883,21 +1897,22 @@ void si_shader_change_notify(struct si_context *sctx)
}
}
static void si_emit_shader_pointer(struct si_context *sctx,
struct si_descriptors *desc,
unsigned sh_base)
{
struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
uint64_t va;
- assert(desc->buffer);
+ if (!desc->buffer)
+ return; /* the pointer is not used by current shaders */
va = desc->buffer->gpu_address +
desc->buffer_offset;
radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
radeon_emit(cs, (sh_base + desc->shader_userdata_offset - SI_SH_REG_OFFSET) >> 2);
radeon_emit(cs, va);
radeon_emit(cs, va >> 32);
}
@@ -2026,20 +2041,22 @@ void si_init_all_descriptors(struct si_context *sctx)
}
si_init_buffer_resources(&sctx->rw_buffers,
&sctx->descriptors[SI_DESCS_RW_BUFFERS],
SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
/* The second set of usage/priority is used by
* const buffers in RW buffer slots. */
RADEON_USAGE_READWRITE, RADEON_USAGE_READ,
RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER,
&ce_offset);
+ sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS;
+
si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
4, SI_NUM_VERTEX_BUFFERS, NULL);
sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
sctx->total_ce_ram_allocated = ce_offset;
if (sctx->b.chip_class >= GFX9)
assert(ce_offset <= 4096);
else
assert(ce_offset <= 32768);
@@ -2148,10 +2165,48 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx)
si_image_views_begin_new_cs(sctx, &sctx->images[i]);
}
si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers);
si_vertex_buffers_begin_new_cs(sctx);
for (i = 0; i < SI_NUM_DESCS; ++i)
si_descriptors_begin_new_cs(sctx, &sctx->descriptors[i]);
si_shader_userdata_begin_new_cs(sctx);
}
+
+void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
+ uint64_t new_active_mask)
+{
+ struct si_descriptors *desc = &sctx->descriptors[desc_idx];
+
+ /* Ignore no-op updates and updates that disable all slots. */
+ if (!new_active_mask ||
+ new_active_mask == u_bit_consecutive64(desc->first_active_slot,
+ desc->num_active_slots))
+ return;
+
+ int first, count;
+ u_bit_scan_consecutive_range64(&new_active_mask, &first, &count);
+ assert(new_active_mask == 0);
+
+ /* Upload/dump descriptors if slots are being enabled. */
+ if (first < desc->first_active_slot ||
+ first + count > desc->first_active_slot + desc->num_active_slots)
+ sctx->descriptors_dirty |= 1u << desc_idx;
+
+ desc->first_active_slot = first;
+ desc->num_active_slots = count;
+}
+
+void si_set_active_descriptors_for_shader(struct si_context *sctx,
+ struct si_shader_selector *sel)
+{
+ if (!sel)
+ return;
+
+ si_set_active_descriptors(sctx,
+ si_const_and_shader_buffer_descriptors_idx(sel->type),
+ sel->active_const_and_shader_buffers);
+ si_set_active_descriptors(sctx,
+ si_sampler_and_image_descriptors_idx(sel->type),
+ sel->active_samplers_and_images);
+}
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index f2003a5..dfabaa3 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -35,20 +35,21 @@
#define SI_MAX_ATTRIBS 16
#define SI_NUM_VERTEX_BUFFERS SI_MAX_ATTRIBS
#define SI_NUM_SAMPLERS 32 /* OpenGL textures units per shader */
#define SI_NUM_CONST_BUFFERS 16
#define SI_NUM_IMAGES 16
#define SI_NUM_SHADER_BUFFERS 16
struct si_screen;
struct si_shader;
+struct si_shader_selector;
struct si_state_blend {
struct si_pm4_state pm4;
uint32_t cb_target_mask;
bool alpha_to_coverage;
bool alpha_to_one;
bool dual_src_blend;
/* Set 0xf or 0x0 (4 bits) per render target if the following is
* true. ANDed with spi_shader_col_format.
*/
@@ -215,26 +216,34 @@ struct si_descriptors {
uint32_t *list;
/* The list in mapped GPU memory. */
uint32_t *gpu_list;
/* The size of one descriptor. */
unsigned element_dw_size;
/* The maximum number of descriptors. */
unsigned num_elements;
/* The buffer where the descriptors have been uploaded. */
struct r600_resource *buffer;
- unsigned buffer_offset;
+ int buffer_offset; /* can be negative if not using lower slots */
/* Offset in CE RAM */
unsigned ce_offset;
- /* elements of the list that are changed and need to be uploaded */
+ /* Slots that are used by currently-bound shaders.
+ * With CE: It determines which slots are dumped to L2.
+ * It doesn't skip uploads to CE RAM.
+ * Without CE: It determines which slots are uploaded.
+ */
+ unsigned first_active_slot;
+ unsigned num_active_slots;
+
+ /* Slots that have been changed and need to be uploaded. */
uint64_t dirty_mask;
/* Whether CE is used to upload this descriptor array. */
bool uses_ce;
/* The shader userdata offset within a shader where the 64-bit pointer to the descriptor
* array will be stored. */
unsigned shader_userdata_offset;
};
@@ -308,20 +317,25 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx);
void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
const uint8_t *ptr, unsigned size, uint32_t *const_offset);
void si_update_all_texture_descriptors(struct si_context *sctx);
void si_shader_change_notify(struct si_context *sctx);
void si_update_compressed_colortex_masks(struct si_context *sctx);
void si_emit_graphics_shader_userdata(struct si_context *sctx,
struct r600_atom *atom);
void si_emit_compute_shader_userdata(struct si_context *sctx);
void si_set_rw_buffer(struct si_context *sctx,
uint slot, const struct pipe_constant_buffer *input);
+void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
+ uint64_t new_active_mask);
+void si_set_active_descriptors_for_shader(struct si_context *sctx,
+ struct si_shader_selector *sel);
+
/* si_state.c */
struct si_shader_selector;
void si_init_atom(struct si_context *sctx, struct r600_atom *atom,
struct r600_atom **list_elem,
void (*emit_func)(struct si_context *ctx, struct r600_atom *state));
void si_init_state_functions(struct si_context *sctx);
void si_init_screen_state_functions(struct si_screen *sscreen);
void
si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf,
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 45d996b..8ac4309 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -2144,20 +2144,21 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
struct si_shader_selector *sel = state;
if (sctx->vs_shader.cso == sel)
return;
sctx->vs_shader.cso = sel;
sctx->vs_shader.current = sel ? sel->first_variant : NULL;
sctx->do_update_shaders = true;
si_mark_atom_dirty(sctx, &sctx->clip_regs);
r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
+ si_set_active_descriptors_for_shader(sctx, sel);
}
static void si_update_tess_uses_prim_id(struct si_context *sctx)
{
sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id =
(sctx->tes_shader.cso &&
sctx->tes_shader.cso->info.uses_primid) ||
(sctx->tcs_shader.cso &&
sctx->tcs_shader.cso->info.uses_primid) ||
(sctx->gs_shader.cso &&
@@ -2181,38 +2182,41 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
sctx->do_update_shaders = true;
si_mark_atom_dirty(sctx, &sctx->clip_regs);
sctx->last_rast_prim = -1; /* reset this so that it gets updated */
if (enable_changed) {
si_shader_change_notify(sctx);
if (sctx->ia_multi_vgt_param_key.u.uses_tess)
si_update_tess_uses_prim_id(sctx);
}
r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
+ si_set_active_descriptors_for_shader(sctx, sel);
}
static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
{
struct si_context *sctx = (struct si_context *)ctx;
struct si_shader_selector *sel = state;
bool enable_changed = !!sctx->tcs_shader.cso != !!sel;
if (sctx->tcs_shader.cso == sel)
return;
sctx->tcs_shader.cso = sel;
sctx->tcs_shader.current = sel ? sel->first_variant : NULL;
si_update_tess_uses_prim_id(sctx);
sctx->do_update_shaders = true;
if (enable_changed)
sctx->last_tcs = NULL; /* invalidate derived tess state */
+
+ si_set_active_descriptors_for_shader(sctx, sel);
}
static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
{
struct si_context *sctx = (struct si_context *)ctx;
struct si_shader_selector *sel = state;
bool enable_changed = !!sctx->tes_shader.cso != !!sel;
if (sctx->tes_shader.cso == sel)
return;
@@ -2223,37 +2227,39 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
si_update_tess_uses_prim_id(sctx);
sctx->do_update_shaders = true;
si_mark_atom_dirty(sctx, &sctx->clip_regs);
sctx->last_rast_prim = -1; /* reset this so that it gets updated */
if (enable_changed) {
si_shader_change_notify(sctx);
sctx->last_tes_sh_base = -1; /* invalidate derived tess state */
}
r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
+ si_set_active_descriptors_for_shader(sctx, sel);
}
static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
{
struct si_context *sctx = (struct si_context *)ctx;
struct si_shader_selector *sel = state;
/* skip if supplied shader is one already in use */
if (sctx->ps_shader.cso == sel)
return;
sctx->ps_shader.cso = sel;
sctx->ps_shader.current = sel ? sel->first_variant : NULL;
sctx->do_update_shaders = true;
if (sel && sctx->ia_multi_vgt_param_key.u.uses_tess)
si_update_tess_uses_prim_id(sctx);
si_mark_atom_dirty(sctx, &sctx->cb_render_state);
+ si_set_active_descriptors_for_shader(sctx, sel);
}
static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
{
if (shader->is_optimized) {
util_queue_fence_wait(&shader->optimized_ready);
util_queue_fence_destroy(&shader->optimized_ready);
}
if (shader->pm4) {
--
2.7.4