[Mesa-dev] [PATCH 08/10] radeonsi: only upload (dump to L2) those descriptors that are used by shaders
Nicolai Hähnle
nhaehnle at gmail.com
Thu May 18 11:28:32 UTC 2017
On 18.05.2017 12:43, Marek Olšák wrote:
> On Thu, May 18, 2017 at 12:41 PM, Marek Olšák <maraeo at gmail.com> wrote:
>> On Thu, May 18, 2017 at 11:31 AM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
>>> On 17.05.2017 21:38, Marek Olšák wrote:
>>>>
>>>> From: Marek Olšák <marek.olsak at amd.com>
>>>>
>>>> This decreases the size of CE RAM dumps to L2, or the size of descriptor
>>>> uploads without CE.
>>>> ---
>>>> src/gallium/drivers/radeonsi/si_compute.c | 28 ++++++--
>>>> src/gallium/drivers/radeonsi/si_descriptors.c | 85
>>>> ++++++++++++++++++++-----
>>>> src/gallium/drivers/radeonsi/si_state.h | 18 +++++-
>>>> src/gallium/drivers/radeonsi/si_state_shaders.c | 6 ++
>>>> 4 files changed, 113 insertions(+), 24 deletions(-)
>>>>
>>>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c
>>>> b/src/gallium/drivers/radeonsi/si_compute.c
>>>> index 22ef111..4c98066 100644
>>>> --- a/src/gallium/drivers/radeonsi/si_compute.c
>>>> +++ b/src/gallium/drivers/radeonsi/si_compute.c
>>>> @@ -201,21 +201,38 @@ static void *si_create_compute_state(
>>>> return NULL;
>>>> }
>>>> }
>>>>
>>>> return program;
>>>> }
>>>>
>>>> static void si_bind_compute_state(struct pipe_context *ctx, void *state)
>>>> {
>>>> struct si_context *sctx = (struct si_context*)ctx;
>>>> - sctx->cs_shader_state.program = (struct si_compute*)state;
>>>> + struct si_compute *program = (struct si_compute*)state;
>>>> +
>>>> + sctx->cs_shader_state.program = program;
>>>> + if (!program)
>>>> + return;
>>>> +
>>>> + /* Wait because we need active slot usage masks. */
>>>> + if (program->ir_type == PIPE_SHADER_IR_TGSI)
>>>> + util_queue_fence_wait(&program->ready);
>>>> +
>>>> + si_set_active_descriptors(sctx,
>>>> + SI_DESCS_FIRST_COMPUTE +
>>>> +
>>>> SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
>>>> +
>>>> program->active_const_and_shader_buffers);
>>>> + si_set_active_descriptors(sctx,
>>>> + SI_DESCS_FIRST_COMPUTE +
>>>> + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
>>>> + program->active_samplers_and_images);
>>>> }
>>>>
>>>> static void si_set_global_binding(
>>>> struct pipe_context *ctx, unsigned first, unsigned n,
>>>> struct pipe_resource **resources,
>>>> uint32_t **handles)
>>>> {
>>>> unsigned i;
>>>> struct si_context *sctx = (struct si_context*)ctx;
>>>> struct si_compute *program = sctx->cs_shader_state.program;
>>>> @@ -749,26 +766,23 @@ static void si_launch_grid(
>>>> bool cs_regalloc_hang =
>>>> (sctx->b.chip_class == SI ||
>>>> sctx->b.family == CHIP_BONAIRE ||
>>>> sctx->b.family == CHIP_KABINI) &&
>>>> info->block[0] * info->block[1] * info->block[2] > 256;
>>>>
>>>> if (cs_regalloc_hang)
>>>> sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>>>> SI_CONTEXT_CS_PARTIAL_FLUSH;
>>>>
>>>> - if (program->ir_type == PIPE_SHADER_IR_TGSI) {
>>>> - util_queue_fence_wait(&program->ready);
>>>> -
>>>> - if (program->shader.compilation_failed)
>>>> - return;
>>>> - }
>>>> + if (program->ir_type == PIPE_SHADER_IR_TGSI &&
>>>> + program->shader.compilation_failed)
>>>> + return;
>>>>
>>>> si_decompress_compute_textures(sctx);
>>>>
>>>> /* Add buffer sizes for memory checking in need_cs_space. */
>>>> r600_context_add_resource_size(ctx, &program->shader.bo->b.b);
>>>> /* TODO: add the scratch buffer */
>>>>
>>>> if (info->indirect) {
>>>> r600_context_add_resource_size(ctx, info->indirect);
>>>>
>>>> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c
>>>> b/src/gallium/drivers/radeonsi/si_descriptors.c
>>>> index 38e4ae1..a2f40a8 100644
>>>> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
>>>> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
>>>> @@ -118,26 +118,28 @@ static void si_init_descriptors(struct
>>>> si_descriptors *desc,
>>>> }
>>>> }
>>>>
>>>> static void si_release_descriptors(struct si_descriptors *desc)
>>>> {
>>>> r600_resource_reference(&desc->buffer, NULL);
>>>> FREE(desc->list);
>>>> }
>>>>
>>>> static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset,
>>>> unsigned size,
>>>> - unsigned *out_offset, struct r600_resource
>>>> **out_buf) {
>>>> + unsigned *out_offset, struct r600_resource
>>>> **out_buf)
>>>> +{
>>>> uint64_t va;
>>>>
>>>> u_suballocator_alloc(sctx->ce_suballocator, size,
>>>> - sctx->screen->b.info.tcc_cache_line_size,
>>>> - out_offset, (struct pipe_resource**)out_buf);
>>>> + si_optimal_tcc_alignment(sctx, size),
>>>> + (unsigned*)out_offset,
>>>
>>>
>>> The extra cast of out_offset is unnecessary.
>>>
>>>
>>>> + (struct pipe_resource**)out_buf);
>>>> if (!out_buf)
>>>> return false;
>>>>
>>>> va = (*out_buf)->gpu_address + *out_offset;
>>>>
>>>> radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
>>>> radeon_emit(sctx->ce_ib, ce_offset);
>>>> radeon_emit(sctx->ce_ib, size / 4);
>>>> radeon_emit(sctx->ce_ib, va);
>>>> radeon_emit(sctx->ce_ib, va >> 32);
>>>> @@ -186,58 +188,70 @@ void si_ce_enable_loads(struct radeon_winsys_cs *ib)
>>>> radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
>>>> radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) |
>>>> CONTEXT_CONTROL_LOAD_CE_RAM(1));
>>>> radeon_emit(ib, CONTEXT_CONTROL_SHADOW_ENABLE(1));
>>>> }
>>>>
>>>> static bool si_upload_descriptors(struct si_context *sctx,
>>>> struct si_descriptors *desc,
>>>> struct r600_atom * atom)
>>>> {
>>>> - unsigned list_size = desc->num_elements * desc->element_dw_size *
>>>> 4;
>>>> + unsigned slot_size = desc->element_dw_size * 4;
>>>> + unsigned first_slot_offset = desc->first_active_slot * slot_size;
>>>> + unsigned upload_size = desc->num_active_slots * slot_size;
>>>> +
>>>> + if (!upload_size)
>>>> + return true;
>>>
>>>
>>> The early-out here means that desc->num_active_slots *does* control what is
>>> written to CE RAM, contrary to what its descriptive comment says. It needs
>>> to be moved further down.
>>
>> True, but I think it doesn't matter, because dirty_mask stays dirty
>> and the descriptors will be uploaded when there is a shader using
>> them.
>
> Is this enough?
Yes, makes sense to me.
Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>
>
> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c
> b/src/gallium/drivers/radeonsi/si_descriptors.c
> index 58d2723..89588c3 100644
> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
> @@ -136,7 +136,7 @@ static bool si_ce_upload(struct si_context *sctx,
> unsigned ce_offset, unsigned s
>
> u_suballocator_alloc(sctx->ce_suballocator, size,
> si_optimal_tcc_alignment(sctx, size),
> - (unsigned*)out_offset,
> + out_offset,
> (struct pipe_resource**)out_buf);
> if (!out_buf)
> return false;
> @@ -204,6 +204,10 @@ static bool si_upload_descriptors(struct si_context *sctx,
> unsigned first_slot_offset = desc->first_active_slot * slot_size;
> unsigned upload_size = desc->num_active_slots * slot_size;
>
> + /* Skip the upload if no shader is using the descriptors. dirty_mask
> + * will stay dirty and the descriptors will be uploaded when there is
> + * a shader using them.
> + */
> if (!upload_size)
> return true;
>
> Marek
>
--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
More information about the mesa-dev
mailing list