[Mesa-dev] [PATCH 08/10] radeonsi: only upload (dump to L2) those descriptors that are used by shaders

Thu May 18 11:28:32 UTC 2017

On 18.05.2017 12:43, Marek Olšák wrote:
> On Thu, May 18, 2017 at 12:41 PM, Marek Olšák <maraeo at gmail.com> wrote:
>> On Thu, May 18, 2017 at 11:31 AM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
>>> On 17.05.2017 21:38, Marek Olšák wrote:
>>>>
>>>> From: Marek Olšák <marek.olsak at amd.com>
>>>>
>>>> This decreases the size of CE RAM dumps to L2, or the size of descriptor
>>>> uploads without CE.
>>>> ---
>>>>  src/gallium/drivers/radeonsi/si_compute.c       | 28 ++++++--
>>>>  src/gallium/drivers/radeonsi/si_descriptors.c   | 85
>>>> ++++++++++++++++++++-----
>>>>  src/gallium/drivers/radeonsi/si_state.h         | 18 +++++-
>>>>  src/gallium/drivers/radeonsi/si_state_shaders.c |  6 ++
>>>>  4 files changed, 113 insertions(+), 24 deletions(-)
>>>>
>>>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c
>>>> b/src/gallium/drivers/radeonsi/si_compute.c
>>>> index 22ef111..4c98066 100644
>>>> --- a/src/gallium/drivers/radeonsi/si_compute.c
>>>> +++ b/src/gallium/drivers/radeonsi/si_compute.c
>>>> @@ -201,21 +201,38 @@ static void *si_create_compute_state(
>>>>                         return NULL;
>>>>                 }
>>>>         }
>>>>
>>>>         return program;
>>>>  }
>>>>
>>>>  static void si_bind_compute_state(struct pipe_context *ctx, void *state)
>>>>  {
>>>>         struct si_context *sctx = (struct si_context*)ctx;
>>>> -       sctx->cs_shader_state.program = (struct si_compute*)state;
>>>> +       struct si_compute *program = (struct si_compute*)state;
>>>> +
>>>> +       sctx->cs_shader_state.program = program;
>>>> +       if (!program)
>>>> +               return;
>>>> +
>>>> +       /* Wait because we need active slot usage masks. */
>>>> +       if (program->ir_type == PIPE_SHADER_IR_TGSI)
>>>> +               util_queue_fence_wait(&program->ready);
>>>> +
>>>> +       si_set_active_descriptors(sctx,
>>>> +                                 SI_DESCS_FIRST_COMPUTE +
>>>> +
>>>> SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
>>>> +
>>>> program->active_const_and_shader_buffers);
>>>> +       si_set_active_descriptors(sctx,
>>>> +                                 SI_DESCS_FIRST_COMPUTE +
>>>> +                                 SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
>>>> +                                 program->active_samplers_and_images);
>>>>  }
>>>>
>>>>  static void si_set_global_binding(
>>>>         struct pipe_context *ctx, unsigned first, unsigned n,
>>>>         struct pipe_resource **resources,
>>>>         uint32_t **handles)
>>>>  {
>>>>         unsigned i;
>>>>         struct si_context *sctx = (struct si_context*)ctx;
>>>>         struct si_compute *program = sctx->cs_shader_state.program;
>>>> @@ -749,26 +766,23 @@ static void si_launch_grid(
>>>>         bool cs_regalloc_hang =
>>>>                 (sctx->b.chip_class == SI ||
>>>>                  sctx->b.family == CHIP_BONAIRE ||
>>>>                  sctx->b.family == CHIP_KABINI) &&
>>>>                 info->block[0] * info->block[1] * info->block[2] > 256;
>>>>
>>>>         if (cs_regalloc_hang)
>>>>                 sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>>>>                                  SI_CONTEXT_CS_PARTIAL_FLUSH;
>>>>
>>>> -       if (program->ir_type == PIPE_SHADER_IR_TGSI) {
>>>> -               util_queue_fence_wait(&program->ready);
>>>> -
>>>> -               if (program->shader.compilation_failed)
>>>> -                       return;
>>>> -       }
>>>> +       if (program->ir_type == PIPE_SHADER_IR_TGSI &&
>>>> +           program->shader.compilation_failed)
>>>> +               return;
>>>>
>>>>         si_decompress_compute_textures(sctx);
>>>>
>>>>         /* Add buffer sizes for memory checking in need_cs_space. */
>>>>         r600_context_add_resource_size(ctx, &program->shader.bo->b.b);
>>>>         /* TODO: add the scratch buffer */
>>>>
>>>>         if (info->indirect) {
>>>>                 r600_context_add_resource_size(ctx, info->indirect);
>>>>
>>>> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c
>>>> b/src/gallium/drivers/radeonsi/si_descriptors.c
>>>> index 38e4ae1..a2f40a8 100644
>>>> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
>>>> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
>>>> @@ -118,26 +118,28 @@ static void si_init_descriptors(struct
>>>> si_descriptors *desc,
>>>>         }
>>>>  }
>>>>
>>>>  static void si_release_descriptors(struct si_descriptors *desc)
>>>>  {
>>>>         r600_resource_reference(&desc->buffer, NULL);
>>>>         FREE(desc->list);
>>>>  }
>>>>
>>>>  static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset,
>>>> unsigned size,
>>>> -                        unsigned *out_offset, struct r600_resource
>>>> **out_buf) {
>>>> +                        unsigned *out_offset, struct r600_resource
>>>> **out_buf)
>>>> +{
>>>>         uint64_t va;
>>>>
>>>>         u_suballocator_alloc(sctx->ce_suballocator, size,
>>>> -                            sctx->screen->b.info.tcc_cache_line_size,
>>>> -                            out_offset, (struct pipe_resource**)out_buf);
>>>> +                            si_optimal_tcc_alignment(sctx, size),
>>>> +                            (unsigned*)out_offset,
>>>
>>>
>>> The extra cast of out_offset is unnecessary.
>>>
>>>
>>>> +                            (struct pipe_resource**)out_buf);
>>>>         if (!out_buf)
>>>>                         return false;
>>>>
>>>>         va = (*out_buf)->gpu_address + *out_offset;
>>>>
>>>>         radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
>>>>         radeon_emit(sctx->ce_ib, ce_offset);
>>>>         radeon_emit(sctx->ce_ib, size / 4);
>>>>         radeon_emit(sctx->ce_ib, va);
>>>>         radeon_emit(sctx->ce_ib, va >> 32);
>>>> @@ -186,58 +188,70 @@ void si_ce_enable_loads(struct radeon_winsys_cs *ib)
>>>>         radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
>>>>         radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) |
>>>>                         CONTEXT_CONTROL_LOAD_CE_RAM(1));
>>>>         radeon_emit(ib, CONTEXT_CONTROL_SHADOW_ENABLE(1));
>>>>  }
>>>>
>>>>  static bool si_upload_descriptors(struct si_context *sctx,
>>>>                                   struct si_descriptors *desc,
>>>>                                   struct r600_atom * atom)
>>>>  {
>>>> -       unsigned list_size = desc->num_elements * desc->element_dw_size *
>>>> 4;
>>>> +       unsigned slot_size = desc->element_dw_size * 4;
>>>> +       unsigned first_slot_offset = desc->first_active_slot * slot_size;
>>>> +       unsigned upload_size = desc->num_active_slots * slot_size;
>>>> +
>>>> +       if (!upload_size)
>>>> +               return true;
>>>
>>>
>>> The early-out here means that desc->num_active_slots *does* control what is
>>> written to CE RAM, contrary to what its descriptive comment says. It needs
>>> to be moved further down.
>>
>> True, but I think it doesn't matter, because dirty_mask stays dirty
>> and the descriptors will be uploaded when there is a shader using
>> them.
>
> Is this enough?

Yes, makes sense to me.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>

>
> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c
> b/src/gallium/drivers/radeonsi/si_descriptors.c
> index 58d2723..89588c3 100644
> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
> @@ -136,7 +136,7 @@ static bool si_ce_upload(struct si_context *sctx,
> unsigned ce_offset, unsigned s
>
>         u_suballocator_alloc(sctx->ce_suballocator, size,
>                              si_optimal_tcc_alignment(sctx, size),
> -                            (unsigned*)out_offset,
> +                            out_offset,
>                              (struct pipe_resource**)out_buf);
>         if (!out_buf)
>                         return false;
> @@ -204,6 +204,10 @@ static bool si_upload_descriptors(struct si_context *sctx,
>         unsigned first_slot_offset = desc->first_active_slot * slot_size;
>         unsigned upload_size = desc->num_active_slots * slot_size;
>
> +       /* Skip the upload if no shader is using the descriptors. dirty_mask
> +        * will stay dirty and the descriptors will be uploaded when there is
> +        * a shader using them.
> +        */
>         if (!upload_size)
>                 return true;
>
> Marek
>

-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.