[Mesa-dev] [PATCH] radeonsi: Save and restore entire CE RAM.

Bas Nieuwenhuizen bas at basnieuwenhuizen.nl
Mon Jun 6 15:46:10 UTC 2016


On Mon, Jun 6, 2016 at 5:14 PM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> On 06.06.2016 00:28, Bas Nieuwenhuizen wrote:
>>
>> This fixes a problem with the CE preamble and restoring only stuff in the
>> preamble when needed.
>>
>> To illustrate suppose we have two graphics IB's 1 and 2, which  are
>> submitted in
>> that order. Furthermore suppose IB 1 does not use CE ram, but IB 2 does,
>> and we
>> have a context switch at the start of IB 1, but not between IB 1 and IB 2.
>>
>> The old code put the CE RAM loads in the preamble of IB 2. As the preamble
>> of
>> IB 1 does not have the loads and the preamble of IB 2 does not get
>> executed, the
>> old values are not load into CE RAM.
>>
>> Fix this by always restoring the entire CE RAM.
>
>
> Nice catch!
>
> Have you considered restoring from the desc->buffers instead? The
> double-dump seems a bit redundant. But maybe it's easier this way...
>
> Also, do we really need to dump the entire CE RAM? After initializing the
> descriptor sets, we should know exactly how much we need...

We can do both.

However, wrt using the desc->buffers, note that this results in 21
loads (and adding potentially multiple buffers to the CS bo list)
while the current approach is 1 load
and 1 store. Furthermore  the CE is <= 15% busy in pretty much all
games I tried, as measured by polling CP_STAT.CE_BUSY. As such I
haven't really considered optimizing this GPU side and I'm not really
sure if it is worth it in this case to do a bit more work on the CPU
to avoid some work on the GPU.

Though I'm probably overthinking this given how small an impact the
whole CE stuff had on performance and we're now talking about a per IB
cost instead of per draw.

- Bas


> Nicolai
>
>
>>
>> Signed-off-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
>> Cc: "12.0" <mesa-stable at lists.freedesktop.org>
>> ---
>>   src/gallium/drivers/radeonsi/si_descriptors.c | 56
>> +++++++++++++--------------
>>   src/gallium/drivers/radeonsi/si_hw_context.c  |  4 ++
>>   src/gallium/drivers/radeonsi/si_pipe.c        |  7 ++++
>>   src/gallium/drivers/radeonsi/si_pipe.h        |  1 +
>>   src/gallium/drivers/radeonsi/si_state.h       |  7 ++--
>>   5 files changed, 43 insertions(+), 32 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c
>> b/src/gallium/drivers/radeonsi/si_descriptors.c
>> index baddc5f..5a1a344 100644
>> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
>> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
>> @@ -160,30 +160,39 @@ static bool si_ce_upload(struct si_context *sctx,
>> unsigned ce_offset, unsigned s
>>         return true;
>>   }
>>
>> -static void si_reinitialize_ce_ram(struct si_context *sctx,
>> -                            struct si_descriptors *desc)
>> +void si_ce_save_ram(struct si_context *sctx)
>>   {
>> -       if (desc->buffer) {
>> -               struct r600_resource *buffer = (struct
>> r600_resource*)desc->buffer;
>> -               unsigned list_size = desc->num_elements *
>> desc->element_dw_size * 4;
>> -               uint64_t va = buffer->gpu_address + desc->buffer_offset;
>> -               struct radeon_winsys_cs *ib = sctx->ce_preamble_ib;
>> +       if (!sctx->ce_ib)
>> +               return;
>>
>> -               if (!ib)
>> -                       ib = sctx->ce_ib;
>> +       radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
>> +       radeon_emit(sctx->ce_ib, 0);
>> +       radeon_emit(sctx->ce_ib, SI_CE_RAM_SIZE / 4);
>> +       radeon_emit(sctx->ce_ib, sctx->ce_ram_data->gpu_address);
>> +       radeon_emit(sctx->ce_ib, sctx->ce_ram_data->gpu_address >> 32);
>>
>> -               list_size = align(list_size, 32);
>> +       radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
>> sctx->ce_ram_data,
>> +                              RADEON_USAGE_WRITE,
>> RADEON_PRIO_DESCRIPTORS);
>> +}
>>
>> -               radeon_emit(ib, PKT3(PKT3_LOAD_CONST_RAM, 3, 0));
>> -               radeon_emit(ib, va);
>> -               radeon_emit(ib, va >> 32);
>> -               radeon_emit(ib, list_size / 4);
>> -               radeon_emit(ib, desc->ce_offset);
>> +void si_ce_restore_ram(struct si_context *sctx)
>> +{
>> +       struct radeon_winsys_cs *ib = sctx->ce_preamble_ib;
>>
>> -               radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
>> desc->buffer,
>> -                                   RADEON_USAGE_READ,
>> RADEON_PRIO_DESCRIPTORS);
>> -       }
>> -       desc->ce_ram_dirty = false;
>> +       if (!ib)
>> +               ib = sctx->ce_ib;
>> +
>> +       if (!ib)
>> +               return;
>> +
>> +       radeon_emit(ib, PKT3(PKT3_LOAD_CONST_RAM, 3, 0));
>> +       radeon_emit(ib, sctx->ce_ram_data->gpu_address);
>> +       radeon_emit(ib, sctx->ce_ram_data->gpu_address >> 32);
>> +       radeon_emit(ib, SI_CE_RAM_SIZE / 4);
>> +       radeon_emit(ib, 0);
>> +
>> +       radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
>> sctx->ce_ram_data,
>> +                           RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
>>   }
>>
>>   void si_ce_enable_loads(struct radeon_winsys_cs *ib)
>> @@ -206,9 +215,6 @@ static bool si_upload_descriptors(struct si_context
>> *sctx,
>>         if (sctx->ce_ib) {
>>                 uint32_t const* list = (uint32_t const*)desc->list;
>>
>> -               if (desc->ce_ram_dirty)
>> -                       si_reinitialize_ce_ram(sctx, desc);
>> -
>>                 while(desc->dirty_mask) {
>>                         int begin, count;
>>                         u_bit_scan_consecutive_range(&desc->dirty_mask,
>> &begin,
>> @@ -287,8 +293,6 @@ static void si_sampler_views_begin_new_cs(struct
>> si_context *sctx,
>>                                            RADEON_USAGE_READ);
>>         }
>>
>> -       views->desc.ce_ram_dirty = true;
>> -
>>         if (!views->desc.buffer)
>>                 return;
>>         radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
>> views->desc.buffer,
>> @@ -489,8 +493,6 @@ si_image_views_begin_new_cs(struct si_context *sctx,
>> struct si_images_info *imag
>>                                            RADEON_USAGE_READWRITE);
>>         }
>>
>> -       images->desc.ce_ram_dirty = true;
>> -
>>         if (images->desc.buffer) {
>>                 radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
>>                                           images->desc.buffer,
>> @@ -737,8 +739,6 @@ static void si_buffer_resources_begin_new_cs(struct
>> si_context *sctx,
>>                                       buffers->shader_usage,
>> buffers->priority);
>>         }
>>
>> -       buffers->desc.ce_ram_dirty = true;
>> -
>>         if (!buffers->desc.buffer)
>>                 return;
>>         radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
>> diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c
>> b/src/gallium/drivers/radeonsi/si_hw_context.c
>> index fa6a2cb..1088c21 100644
>> --- a/src/gallium/drivers/radeonsi/si_hw_context.c
>> +++ b/src/gallium/drivers/radeonsi/si_hw_context.c
>> @@ -121,6 +121,8 @@ void si_context_gfx_flush(void *context, unsigned
>> flags,
>>                 ctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2 |
>>                                 SI_CONTEXT_INV_VMEM_L1;
>>
>> +       si_ce_save_ram(ctx);
>> +
>>         si_emit_cache_flush(ctx, NULL);
>>
>>         /* force to keep tiling flags */
>> @@ -213,6 +215,8 @@ void si_begin_new_cs(struct si_context *ctx)
>>         else if (ctx->ce_ib)
>>                 si_ce_enable_loads(ctx->ce_ib);
>>
>> +       si_ce_restore_ram(ctx);
>> +
>>         ctx->framebuffer.dirty_cbufs = (1 << 8) - 1;
>>         ctx->framebuffer.dirty_zsbuf = true;
>>         si_mark_atom_dirty(ctx, &ctx->framebuffer.atom);
>> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c
>> b/src/gallium/drivers/radeonsi/si_pipe.c
>> index 7bb3dc2..a204b9a 100644
>> --- a/src/gallium/drivers/radeonsi/si_pipe.c
>> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
>> @@ -47,6 +47,7 @@ static void si_destroy_context(struct pipe_context
>> *context)
>>         if (sctx->ce_suballocator)
>>                 u_suballocator_destroy(sctx->ce_suballocator);
>>
>> +       r600_resource_reference(&sctx->ce_ram_data, NULL);
>>         pipe_resource_reference(&sctx->esgs_ring, NULL);
>>         pipe_resource_reference(&sctx->gsvs_ring, NULL);
>>         pipe_resource_reference(&sctx->tf_ring, NULL);
>> @@ -168,6 +169,12 @@ static struct pipe_context *si_create_context(struct
>> pipe_screen *screen,
>>                                                       PIPE_USAGE_DEFAULT,
>> FALSE);
>>                 if (!sctx->ce_suballocator)
>>                         goto fail;
>> +
>> +               sctx->ce_ram_data = (struct r600_resource*)
>> +                                   pipe_buffer_create(screen,
>> PIPE_BIND_CUSTOM,
>> +                                   PIPE_USAGE_DEFAULT, SI_CE_RAM_SIZE);
>> +               if (!sctx->ce_ram_data)
>> +                       goto fail;
>>         }
>>
>>         sctx->b.gfx.flush = si_context_gfx_flush;
>> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h
>> b/src/gallium/drivers/radeonsi/si_pipe.h
>> index b3fe656..a62d558 100644
>> --- a/src/gallium/drivers/radeonsi/si_pipe.h
>> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
>> @@ -199,6 +199,7 @@ struct si_context {
>>         struct radeon_winsys_cs         *ce_preamble_ib;
>>         bool                            ce_need_synchronization;
>>         struct u_suballocator           *ce_suballocator;
>> +       struct r600_resource            *ce_ram_data;
>>
>>         struct pipe_fence_handle        *last_gfx_fence;
>>         struct si_shader_ctx_state      fixed_func_tcs_shader;
>> diff --git a/src/gallium/drivers/radeonsi/si_state.h
>> b/src/gallium/drivers/radeonsi/si_state.h
>> index e5795eb..2dd166f 100644
>> --- a/src/gallium/drivers/radeonsi/si_state.h
>> +++ b/src/gallium/drivers/radeonsi/si_state.h
>> @@ -41,6 +41,7 @@
>>   #define SI_NUM_SHADER_BUFFERS         16
>>
>>   #define SI_TESS_OFFCHIP_BLOCK_SIZE    (8192 * 4)
>> +#define SI_CE_RAM_SIZE                 32768
>>
>>   struct si_screen;
>>   struct si_shader;
>> @@ -205,10 +206,6 @@ struct si_descriptors {
>>         /* elements of the list that are changed and need to be uploaded
>> */
>>         unsigned dirty_mask;
>>
>> -       /* Whether the CE ram is dirty and needs to be reinitialized
>> entirely
>> -        * before we can do partial updates. */
>> -       bool ce_ram_dirty;
>> -
>>         /* The shader userdata offset within a shader where the 64-bit
>> pointer to the descriptor
>>          * array will be stored. */
>>         unsigned shader_userdata_offset;
>> @@ -250,6 +247,8 @@ struct si_buffer_resources {
>>         } while(0)
>>
>>   /* si_descriptors.c */
>> +void si_ce_save_ram(struct si_context *sctx);
>> +void si_ce_restore_ram(struct si_context *sctx);
>>   void si_ce_enable_loads(struct radeon_winsys_cs *ib);
>>   void si_set_mutable_tex_desc_fields(struct r600_texture *tex,
>>                                     const struct radeon_surf_level
>> *base_level_info,
>>
>


More information about the mesa-dev mailing list