[Mesa-dev] [PATCH 08/10] radeonsi: only upload (dump to L2) those descriptors that are used by shaders
Marek Olšák
maraeo at gmail.com
Thu May 18 10:41:21 UTC 2017
On Thu, May 18, 2017 at 11:31 AM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> On 17.05.2017 21:38, Marek Olšák wrote:
>>
>> From: Marek Olšák <marek.olsak at amd.com>
>>
>> This decreases the size of CE RAM dumps to L2, or the size of descriptor
>> uploads without CE.
>> ---
>> src/gallium/drivers/radeonsi/si_compute.c | 28 ++++++--
>> src/gallium/drivers/radeonsi/si_descriptors.c | 85 ++++++++++++++++++++-----
>> src/gallium/drivers/radeonsi/si_state.h | 18 +++++-
>> src/gallium/drivers/radeonsi/si_state_shaders.c | 6 ++
>> 4 files changed, 113 insertions(+), 24 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c
>> b/src/gallium/drivers/radeonsi/si_compute.c
>> index 22ef111..4c98066 100644
>> --- a/src/gallium/drivers/radeonsi/si_compute.c
>> +++ b/src/gallium/drivers/radeonsi/si_compute.c
>> @@ -201,21 +201,38 @@ static void *si_create_compute_state(
>> return NULL;
>> }
>> }
>>
>> return program;
>> }
>>
>> static void si_bind_compute_state(struct pipe_context *ctx, void *state)
>> {
>> struct si_context *sctx = (struct si_context*)ctx;
>> - sctx->cs_shader_state.program = (struct si_compute*)state;
>> + struct si_compute *program = (struct si_compute*)state;
>> +
>> + sctx->cs_shader_state.program = program;
>> + if (!program)
>> + return;
>> +
>> + /* Wait because we need active slot usage masks. */
>> + if (program->ir_type == PIPE_SHADER_IR_TGSI)
>> + util_queue_fence_wait(&program->ready);
>> +
>> + si_set_active_descriptors(sctx,
>> + SI_DESCS_FIRST_COMPUTE +
>> + SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
>> + program->active_const_and_shader_buffers);
>> + si_set_active_descriptors(sctx,
>> + SI_DESCS_FIRST_COMPUTE +
>> + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
>> + program->active_samplers_and_images);
>> }
>>
>> static void si_set_global_binding(
>> struct pipe_context *ctx, unsigned first, unsigned n,
>> struct pipe_resource **resources,
>> uint32_t **handles)
>> {
>> unsigned i;
>> struct si_context *sctx = (struct si_context*)ctx;
>> struct si_compute *program = sctx->cs_shader_state.program;
>> @@ -749,26 +766,23 @@ static void si_launch_grid(
>> bool cs_regalloc_hang =
>> (sctx->b.chip_class == SI ||
>> sctx->b.family == CHIP_BONAIRE ||
>> sctx->b.family == CHIP_KABINI) &&
>> info->block[0] * info->block[1] * info->block[2] > 256;
>>
>> if (cs_regalloc_hang)
>> sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>> SI_CONTEXT_CS_PARTIAL_FLUSH;
>>
>> - if (program->ir_type == PIPE_SHADER_IR_TGSI) {
>> - util_queue_fence_wait(&program->ready);
>> -
>> - if (program->shader.compilation_failed)
>> - return;
>> - }
>> + if (program->ir_type == PIPE_SHADER_IR_TGSI &&
>> + program->shader.compilation_failed)
>> + return;
>>
>> si_decompress_compute_textures(sctx);
>>
>> /* Add buffer sizes for memory checking in need_cs_space. */
>> r600_context_add_resource_size(ctx, &program->shader.bo->b.b);
>> /* TODO: add the scratch buffer */
>>
>> if (info->indirect) {
>> r600_context_add_resource_size(ctx, info->indirect);
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c
>> b/src/gallium/drivers/radeonsi/si_descriptors.c
>> index 38e4ae1..a2f40a8 100644
>> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
>> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
>> @@ -118,26 +118,28 @@ static void si_init_descriptors(struct
>> si_descriptors *desc,
>> }
>> }
>>
>> static void si_release_descriptors(struct si_descriptors *desc)
>> {
>> r600_resource_reference(&desc->buffer, NULL);
>> FREE(desc->list);
>> }
>>
>> static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset,
>> unsigned size,
>> - unsigned *out_offset, struct r600_resource
>> **out_buf) {
>> + unsigned *out_offset, struct r600_resource
>> **out_buf)
>> +{
>> uint64_t va;
>>
>> u_suballocator_alloc(sctx->ce_suballocator, size,
>> - sctx->screen->b.info.tcc_cache_line_size,
>> - out_offset, (struct pipe_resource**)out_buf);
>> + si_optimal_tcc_alignment(sctx, size),
>> + (unsigned*)out_offset,
>
>
> The extra cast of out_offset is unnecessary.
>
>
>> + (struct pipe_resource**)out_buf);
>> if (!out_buf)
>> return false;
>>
>> va = (*out_buf)->gpu_address + *out_offset;
>>
>> radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
>> radeon_emit(sctx->ce_ib, ce_offset);
>> radeon_emit(sctx->ce_ib, size / 4);
>> radeon_emit(sctx->ce_ib, va);
>> radeon_emit(sctx->ce_ib, va >> 32);
>> @@ -186,58 +188,70 @@ void si_ce_enable_loads(struct radeon_winsys_cs *ib)
>> radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
>> radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) |
>> CONTEXT_CONTROL_LOAD_CE_RAM(1));
>> radeon_emit(ib, CONTEXT_CONTROL_SHADOW_ENABLE(1));
>> }
>>
>> static bool si_upload_descriptors(struct si_context *sctx,
>> struct si_descriptors *desc,
>> struct r600_atom * atom)
>> {
>> - unsigned list_size = desc->num_elements * desc->element_dw_size *
>> 4;
>> + unsigned slot_size = desc->element_dw_size * 4;
>> + unsigned first_slot_offset = desc->first_active_slot * slot_size;
>> + unsigned upload_size = desc->num_active_slots * slot_size;
>> +
>> + if (!upload_size)
>> + return true;
>
>
> The early-out here means that desc->num_active_slots *does* control what is
> written to CE RAM, contrary to what its descriptive comment says. It needs
> to be moved further down.
True, but I think it doesn't matter, because the corresponding bits in
dirty_mask stay set, so the skipped descriptors will still be uploaded
once a shader that uses them is bound.
Marek
More information about the mesa-dev mailing list