[Mesa-dev] [PATCH 5/6] radeonsi: use slot indexes for bindless handles
Nicolai Hähnle
nhaehnle at gmail.com
Mon Jul 17 20:01:15 UTC 2017
Hi Samuel,
On 07.07.2017 03:45, Samuel Pitoiset wrote:
> On 07/05/2017 01:42 PM, Nicolai Hähnle wrote:
>> On 04.07.2017 15:05, Samuel Pitoiset wrote:
>>> Using VRAM address as bindless handles is not a good idea because
>>> we have to use LLVMBuildIntToPtr and the LLVM CSE pass can't optimize
>>> because it has no information about the pointer.
>>>
>>> Instead, use slot indexes like the existing descriptors.
>>>
>>> This improves performance with DOW3 by +7%.
>>
>> Wow.
>>
>> The thing is, burning a pair of user SGPRs for this seems a bit
>> overkill, especially since it also hurts apps that don't use bindless
>> at all.
>>
>> Do you have some examples of how LLVM fails here? Could we perhaps
>> avoid most of the performance issues by casting 0 to an appropriate
>> pointer type once, and then using the bindless handle as an index
>> relative to that pointer?
>
> Here are two shaders: 1) is with master, 2) is with this series:
>
> 1) https://hastebin.com/uvamarelig
> 2) https://hastebin.com/voguqihilu
>
> The first shader contains a bunch of s_buffer_load_dword that the second
> one doesn't need because CSE does its job there. This is because of
> IntToPtr but if we use noalias, the pass is able to eliminate the
> redundant descriptor load operations.
So I looked into your example again in more detail, compiling both
shaders with
llc -march=amdgcn -mcpu=tonga
and also just extracting the assembly, and I think your analysis is
actually flawed. It's true that the shader in (2) has fewer buffer
loads, but the only buffer loads that are actually removed rather than
just shuffled around are ones that load .y components. So basically, the
reason you get fewer buffer loads is that you're effectively using 32-bit
pointers. It's a bit shocking that that gives you a 7% performance
improvement...
If there are really examples where it makes a difference for CSE, then
perhaps the same result could be achieved with alias/noalias metadata on
the load/store instructions.
In the meantime, perhaps a good comparison would be to use the original,
inttoptr-based code, but only load the lower 32 bits for a handle and
use bit shifts to get a full 64-bit pointer. That way, at least it
should be possible to more easily find shaders where there is a genuine
difference.
Cheers,
Nicolai
> About the number of SGPRs loaded by SPI for non bindless applications,
> it should be possible to avoid that by scanning the shader.
>
>>
>> Cheers,
>> Nicolai
>>
>>
>>>
>>> Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
>>> ---
>>> src/gallium/drivers/radeonsi/si_descriptors.c | 342
>>> +++++++++++-----------
>>> src/gallium/drivers/radeonsi/si_pipe.c | 12 -
>>> src/gallium/drivers/radeonsi/si_pipe.h | 23 +-
>>> src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c | 32 +-
>>> 4 files changed, 185 insertions(+), 224 deletions(-)
>>>
>>> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c
>>> b/src/gallium/drivers/radeonsi/si_descriptors.c
>>> index 06a171ff9e..601b18069d 100644
>>> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
>>> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
>>> @@ -1852,16 +1852,20 @@ static void si_rebind_buffer(struct
>>> pipe_context *ctx, struct pipe_resource *buf
>>> /* Bindless texture handles */
>>> if (rbuffer->texture_handle_allocated) {
>>> + struct si_descriptors *descs = &sctx->bindless_descriptors;
>>> +
>>> util_dynarray_foreach(&sctx->resident_tex_handles,
>>> struct si_texture_handle *, tex_handle) {
>>> struct pipe_sampler_view *view = (*tex_handle)->view;
>>> - struct si_bindless_descriptor *desc = (*tex_handle)->desc;
>>> + unsigned desc_slot = (*tex_handle)->desc_slot;
>>> if (view->texture == buf) {
>>> si_set_buf_desc_address(rbuffer,
>>> view->u.buf.offset,
>>> - &desc->desc_list[4]);
>>> - desc->dirty = true;
>>> + descs->list +
>>> + desc_slot * 16 + 4);
>>> +
>>> + (*tex_handle)->desc_dirty = true;
>>> sctx->bindless_descriptors_dirty = true;
>>> radeon_add_to_buffer_list_check_mem(
>>> @@ -1874,10 +1878,12 @@ static void si_rebind_buffer(struct
>>> pipe_context *ctx, struct pipe_resource *buf
>>> /* Bindless image handles */
>>> if (rbuffer->image_handle_allocated) {
>>> + struct si_descriptors *descs = &sctx->bindless_descriptors;
>>> +
>>> util_dynarray_foreach(&sctx->resident_img_handles,
>>> struct si_image_handle *, img_handle) {
>>> struct pipe_image_view *view = &(*img_handle)->view;
>>> - struct si_bindless_descriptor *desc = (*img_handle)->desc;
>>> + unsigned desc_slot = (*img_handle)->desc_slot;
>>> if (view->resource == buf) {
>>> if (view->access & PIPE_IMAGE_ACCESS_WRITE)
>>> @@ -1885,8 +1891,10 @@ static void si_rebind_buffer(struct
>>> pipe_context *ctx, struct pipe_resource *buf
>>> si_set_buf_desc_address(rbuffer,
>>> view->u.buf.offset,
>>> - &desc->desc_list[4]);
>>> - desc->dirty = true;
>>> + descs->list +
>>> + desc_slot * 8 + 4);
>>> +
>>> + (*img_handle)->desc_dirty = true;
>>> sctx->bindless_descriptors_dirty = true;
>>> radeon_add_to_buffer_list_check_mem(
>>> @@ -1918,11 +1926,18 @@ static void si_invalidate_buffer(struct
>>> pipe_context *ctx, struct pipe_resource
>>> }
>>> static void si_upload_bindless_descriptor(struct si_context *sctx,
>>> - struct si_bindless_descriptor *desc)
>>> + unsigned desc_slot,
>>> + unsigned num_dwords)
>>> {
>>> + struct si_descriptors *desc = &sctx->bindless_descriptors;
>>> struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>>> - uint64_t va = desc->buffer->gpu_address + desc->offset;
>>> - unsigned num_dwords = sizeof(desc->desc_list) / 4;
>>> + uint32_t *data;
>>> + uint64_t va;
>>> +
>>> + data = desc->list + desc_slot * num_dwords;
>>> +
>>> + va = desc->buffer->gpu_address + desc->buffer_offset +
>>> + desc_slot * num_dwords * 4;
>>> radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + num_dwords, 0));
>>> radeon_emit(cs, S_370_DST_SEL(V_370_TC_L2) |
>>> @@ -1930,7 +1945,7 @@ static void
>>> si_upload_bindless_descriptor(struct si_context *sctx,
>>> S_370_ENGINE_SEL(V_370_ME));
>>> radeon_emit(cs, va);
>>> radeon_emit(cs, va >> 32);
>>> - radeon_emit_array(cs, desc->desc_list, num_dwords);
>>> + radeon_emit_array(cs, data, num_dwords);
>>> }
>>> static void si_upload_bindless_descriptors(struct si_context *sctx)
>>> @@ -1947,24 +1962,24 @@ static void
>>> si_upload_bindless_descriptors(struct si_context *sctx)
>>> util_dynarray_foreach(&sctx->resident_tex_handles,
>>> struct si_texture_handle *, tex_handle) {
>>> - struct si_bindless_descriptor *desc = (*tex_handle)->desc;
>>> + unsigned desc_slot = (*tex_handle)->desc_slot;
>>> - if (!desc->dirty)
>>> + if (!(*tex_handle)->desc_dirty)
>>> continue;
>>> - si_upload_bindless_descriptor(sctx, desc);
>>> - desc->dirty = false;
>>> + si_upload_bindless_descriptor(sctx, desc_slot, 16);
>>> + (*tex_handle)->desc_dirty = false;
>>> }
>>> util_dynarray_foreach(&sctx->resident_img_handles,
>>> struct si_image_handle *, img_handle) {
>>> - struct si_bindless_descriptor *desc = (*img_handle)->desc;
>>> + unsigned desc_slot = (*img_handle)->desc_slot;
>>> - if (!desc->dirty)
>>> + if (!(*img_handle)->desc_dirty)
>>> continue;
>>> - si_upload_bindless_descriptor(sctx, desc);
>>> - desc->dirty = false;
>>> + si_upload_bindless_descriptor(sctx, desc_slot, 8);
>>> + (*img_handle)->desc_dirty = false;
>>> }
>>> /* Invalidate L1 because it doesn't know that L2 changed. */
>>> @@ -1977,9 +1992,11 @@ static void
>>> si_upload_bindless_descriptors(struct si_context *sctx)
>>> /* Update mutable image descriptor fields of all resident textures. */
>>> static void si_update_all_resident_texture_descriptors(struct
>>> si_context *sctx)
>>> {
>>> + struct si_descriptors *desc = &sctx->bindless_descriptors;
>>> +
>>> util_dynarray_foreach(&sctx->resident_tex_handles,
>>> struct si_texture_handle *, tex_handle) {
>>> - struct si_bindless_descriptor *desc = (*tex_handle)->desc;
>>> + unsigned desc_slot_offset = (*tex_handle)->desc_slot * 16;
>>> struct si_sampler_view *sview =
>>> (struct si_sampler_view *)(*tex_handle)->view;
>>> uint32_t desc_list[16];
>>> @@ -1987,31 +2004,43 @@ static void
>>> si_update_all_resident_texture_descriptors(struct si_context *sctx)
>>> if (sview->base.texture->target == PIPE_BUFFER)
>>> continue;
>>> - memcpy(desc_list, desc->desc_list, sizeof(desc_list));
>>> + /* Store the previous descriptor to only mark it dirty if it
>>> + * has been changed.
>>> + */
>>> + memcpy(desc_list, desc->list + desc_slot_offset,
>>> + sizeof(desc_list));
>>> +
>>> si_set_sampler_view_desc(sctx, sview, &(*tex_handle)->sstate,
>>> - &desc->desc_list[0]);
>>> + desc->list + desc_slot_offset);
>>> - if (memcmp(desc_list, desc->desc_list, sizeof(desc_list))) {
>>> - desc->dirty = true;
>>> + if (memcmp(desc_list, desc->list + desc_slot_offset,
>>> + sizeof(desc_list))) {
>>> + (*tex_handle)->desc_dirty = true;
>>> sctx->bindless_descriptors_dirty = true;
>>> }
>>> }
>>> util_dynarray_foreach(&sctx->resident_img_handles,
>>> struct si_image_handle *, img_handle) {
>>> - struct si_bindless_descriptor *desc = (*img_handle)->desc;
>>> + unsigned desc_slot_offset = (*img_handle)->desc_slot * 8;
>>> struct pipe_image_view *view = &(*img_handle)->view;
>>> - uint32_t desc_list[16];
>>> + uint32_t desc_list[8];
>>> if (view->resource->target == PIPE_BUFFER)
>>> continue;
>>> - memcpy(desc_list, desc->desc_list, sizeof(desc_list));
>>> + /* Store the previous descriptor to only mark it dirty if it
>>> + * has been changed.
>>> + */
>>> + memcpy(desc_list, desc->list + desc_slot_offset,
>>> + sizeof(desc_list));
>>> +
>>> si_set_shader_image_desc(sctx, view, true,
>>> - &desc->desc_list[0]);
>>> + desc->list + desc_slot_offset);
>>> - if (memcmp(desc_list, desc->desc_list, sizeof(desc_list))) {
>>> - desc->dirty = true;
>>> + if (memcmp(desc_list, desc->list + desc_slot_offset,
>>> + sizeof(desc_list))) {
>>> + (*img_handle)->desc_dirty = true;
>>> sctx->bindless_descriptors_dirty = true;
>>> }
>>> }
>>> @@ -2082,6 +2111,8 @@ static void
>>> si_shader_userdata_begin_new_cs(struct si_context *sctx)
>>> {
>>> sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
>>> sctx->vertex_buffer_pointer_dirty = sctx->vertex_buffers.buffer
>>> != NULL;
>>> + sctx->graphics_bindless_pointer_dirty =
>>> sctx->bindless_descriptors.buffer != NULL;
>>> + sctx->compute_bindless_pointer_dirty =
>>> sctx->bindless_descriptors.buffer != NULL;
>>> si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
>>> }
>>> @@ -2223,6 +2254,12 @@ void si_emit_graphics_shader_userdata(struct
>>> si_context *sctx,
>>> sh_base[PIPE_SHADER_VERTEX]);
>>> sctx->vertex_buffer_pointer_dirty = false;
>>> }
>>> +
>>> + if (sctx->graphics_bindless_pointer_dirty) {
>>> + si_emit_graphics_shader_pointers(sctx,
>>> + &sctx->bindless_descriptors);
>>> + sctx->graphics_bindless_pointer_dirty = false;
>>> + }
>>> }
>>> void si_emit_compute_shader_userdata(struct si_context *sctx)
>>> @@ -2239,135 +2276,101 @@ void si_emit_compute_shader_userdata(struct
>>> si_context *sctx)
>>> si_emit_shader_pointer(sctx, descs + i, base);
>>> }
>>> sctx->shader_pointers_dirty &= ~compute_mask;
>>> +
>>> + if (sctx->compute_bindless_pointer_dirty) {
>>> + si_emit_shader_pointer(sctx, &sctx->bindless_descriptors,
>>> base);
>>> + sctx->compute_bindless_pointer_dirty = false;
>>> + }
>>> }
>>> /* BINDLESS */
>>> -struct si_bindless_descriptor_slab
>>> +static void si_init_bindless_descriptors(struct si_context *sctx,
>>> + struct si_descriptors *desc,
>>> + unsigned shader_userdata_index,
>>> + unsigned num_elements)
>>> {
>>> - struct pb_slab base;
>>> - struct r600_resource *buffer;
>>> - struct si_bindless_descriptor *entries;
>>> -};
>>> + si_init_descriptors(sctx, desc, shader_userdata_index, 16,
>>> num_elements,
>>> + 0, 0, NULL);
>>> + sctx->bindless_descriptors.num_active_slots = num_elements;
>>> -bool si_bindless_descriptor_can_reclaim_slab(void *priv,
>>> - struct pb_slab_entry *entry)
>>> -{
>>> - /* Do not allow to reclaim any bindless descriptors for now
>>> because the
>>> - * GPU might be using them. This should be improved later on.
>>> + /* The first bindless descriptor is stored at slot 1, because 0
>>> is not
>>> + * considered to be a valid handle.
>>> */
>>> - return false;
>>> + sctx->num_bindless_descriptors = 1;
>>> }
>>> -struct pb_slab *si_bindless_descriptor_slab_alloc(void *priv,
>>> unsigned heap,
>>> - unsigned entry_size,
>>> - unsigned group_index)
>>> +static void si_release_bindless_descriptors(struct si_context *sctx)
>>> {
>>> - struct si_context *sctx = priv;
>>> - struct si_screen *sscreen = sctx->screen;
>>> - struct si_bindless_descriptor_slab *slab;
>>> -
>>> - slab = CALLOC_STRUCT(si_bindless_descriptor_slab);
>>> - if (!slab)
>>> - return NULL;
>>> -
>>> - /* Create a buffer in VRAM for 1024 bindless descriptors. */
>>> - slab->buffer = (struct r600_resource *)
>>> - pipe_buffer_create(&sscreen->b.b, 0,
>>> - PIPE_USAGE_DEFAULT, 64 * 1024);
>>> - if (!slab->buffer)
>>> - goto fail;
>>> -
>>> - slab->base.num_entries = slab->buffer->bo_size / entry_size;
>>> - slab->base.num_free = slab->base.num_entries;
>>> - slab->entries = CALLOC(slab->base.num_entries,
>>> sizeof(*slab->entries));
>>> - if (!slab->entries)
>>> - goto fail_buffer;
>>> -
>>> - LIST_INITHEAD(&slab->base.free);
>>> -
>>> - for (unsigned i = 0; i < slab->base.num_entries; ++i) {
>>> - struct si_bindless_descriptor *desc = &slab->entries[i];
>>> -
>>> - desc->entry.slab = &slab->base;
>>> - desc->entry.group_index = group_index;
>>> - desc->buffer = slab->buffer;
>>> - desc->offset = i * entry_size;
>>> -
>>> - LIST_ADDTAIL(&desc->entry.head, &slab->base.free);
>>> - }
>>> -
>>> - /* Add the descriptor to the per-context list. */
>>> - util_dynarray_append(&sctx->bindless_descriptors,
>>> - struct r600_resource *, slab->buffer);
>>> -
>>> - return &slab->base;
>>> -
>>> -fail_buffer:
>>> - r600_resource_reference(&slab->buffer, NULL);
>>> -fail:
>>> - FREE(slab);
>>> - return NULL;
>>> + si_release_descriptors(&sctx->bindless_descriptors);
>>> }
>>> -void si_bindless_descriptor_slab_free(void *priv, struct pb_slab
>>> *pslab)
>>> -{
>>> - struct si_context *sctx = priv;
>>> - struct si_bindless_descriptor_slab *slab =
>>> - (struct si_bindless_descriptor_slab *)pslab;
>>> -
>>> - /* Remove the descriptor from the per-context list. */
>>> - util_dynarray_delete_unordered(&sctx->bindless_descriptors,
>>> - struct r600_resource *, slab->buffer);
>>> -
>>> - r600_resource_reference(&slab->buffer, NULL);
>>> - FREE(slab->entries);
>>> - FREE(slab);
>>> -}
>>> -
>>> -static struct si_bindless_descriptor *
>>> +static unsigned
>>> si_create_bindless_descriptor(struct si_context *sctx, uint32_t
>>> *desc_list,
>>> unsigned size)
>>> {
>>> struct si_screen *sscreen = sctx->screen;
>>> - struct si_bindless_descriptor *desc;
>>> - struct pb_slab_entry *entry;
>>> - void *ptr;
>>> + struct si_descriptors *desc = &sctx->bindless_descriptors;
>>> + bool resized = false;
>>> + unsigned desc_slot;
>>> - /* Sub-allocate the bindless descriptor from a slab to avoid
>>> dealing
>>> - * with a ton of buffers and for reducing the winsys overhead.
>>> - */
>>> - entry = pb_slab_alloc(&sctx->bindless_descriptor_slabs, 64, 0);
>>> - if (!entry)
>>> - return NULL;
>>> + /* Reserve a new slot for this bindless descriptor. */
>>> + desc_slot = sctx->num_bindless_descriptors++;
>>> - desc = NULL;
>>> - desc = container_of(entry, desc, entry);
>>> + if (desc_slot >= desc->num_elements) {
>>> + /* The array of bindless descriptors is full, resize it. */
>>> + unsigned slot_size = desc->element_dw_size * 4;
>>> + unsigned new_num_elements = desc->num_elements * 2;
>>> - /* Upload the descriptor directly in VRAM. Because the slabs are
>>> - * currently never reclaimed, we don't need to synchronize the
>>> - * operation.
>>> - */
>>> - ptr = sscreen->b.ws->buffer_map(desc->buffer->buf, NULL,
>>> - PIPE_TRANSFER_WRITE |
>>> - PIPE_TRANSFER_UNSYNCHRONIZED);
>>> - util_memcpy_cpu_to_le32(ptr + desc->offset, desc_list, size);
>>> + desc->list = REALLOC(desc->list, desc->num_elements *
>>> slot_size,
>>> + new_num_elements * slot_size);
>>> + desc->num_elements = new_num_elements;
>>> + desc->num_active_slots = new_num_elements;
>>> + resized = true;
>>> + }
>>> - /* Keep track of the initial descriptor especially for buffers
>>> - * invalidation because we might need to know the previous address.
>>> - */
>>> - memcpy(desc->desc_list, desc_list, sizeof(desc->desc_list));
>>> + /* Copy the descriptor into the array. */
>>> + memcpy(desc->list + desc_slot * size / 4, desc_list, size);
>>> +
>>> + if (!desc->buffer || resized) {
>>> + /* Upload the whole array of bindless descriptors if it's new,
>>> + * or re-upload it if the array has been resized.
>>> + */
>>> + if (!si_upload_descriptors(sctx, desc,
>>> + &sctx->shader_userdata.atom))
>>> + return 0;
>>> +
>>> + /* Make sure to re-emit the shader pointers for all stages. */
>>> + sctx->graphics_bindless_pointer_dirty = true;
>>> + sctx->compute_bindless_pointer_dirty = true;
>>> + } else {
>>> + void *ptr;
>>> +
>>> + /* Upload the descriptor directly in VRAM. Because the slot
>>> + * index is new, we don't need to synchronize the operation.
>>> + */
>>> + ptr = sscreen->b.ws->buffer_map(desc->buffer->buf, NULL,
>>> + PIPE_TRANSFER_WRITE |
>>> + PIPE_TRANSFER_UNSYNCHRONIZED);
>>> + ptr += desc->buffer_offset;
>>> +
>>> + util_memcpy_cpu_to_le32(ptr + desc_slot * size,
>>> + desc->list + desc_slot * size / 4,
>>> + size);
>>> + }
>>> - return desc;
>>> + return desc_slot;
>>> }
>>> static void si_invalidate_bindless_buf_desc(struct si_context *sctx,
>>> - struct si_bindless_descriptor *desc,
>>> + unsigned desc_slot_offset,
>>> struct pipe_resource *resource,
>>> - uint64_t offset)
>>> + uint64_t offset,
>>> + bool *desc_dirty)
>>> {
>>> + struct si_descriptors *desc = &sctx->bindless_descriptors;
>>> struct r600_resource *buf = r600_resource(resource);
>>> - uint32_t *desc_list = desc->desc_list + 4;
>>> + uint32_t *desc_list = desc->list + desc_slot_offset + 4;
>>> uint64_t old_desc_va;
>>> assert(resource->target == PIPE_BUFFER);
>>> @@ -2382,7 +2385,7 @@ static void
>>> si_invalidate_bindless_buf_desc(struct si_context *sctx,
>>> */
>>> si_set_buf_desc_address(buf, offset, &desc_list[0]);
>>> - desc->dirty = true;
>>> + *desc_dirty = true;
>>> sctx->bindless_descriptors_dirty = true;
>>> }
>>> }
>>> @@ -2415,20 +2418,17 @@ static uint64_t
>>> si_create_texture_handle(struct pipe_context *ctx,
>>> memcpy(&tex_handle->sstate, sstate, sizeof(*sstate));
>>> ctx->delete_sampler_state(ctx, sstate);
>>> - tex_handle->desc = si_create_bindless_descriptor(sctx, desc_list,
>>> - sizeof(desc_list));
>>> - if (!tex_handle->desc) {
>>> + tex_handle->desc_slot = si_create_bindless_descriptor(sctx,
>>> desc_list,
>>> + sizeof(desc_list));
>>> + if (!tex_handle->desc_slot) {
>>> FREE(tex_handle);
>>> return 0;
>>> }
>>> - handle = tex_handle->desc->buffer->gpu_address +
>>> - tex_handle->desc->offset;
>>> + handle = tex_handle->desc_slot;
>>> if (!_mesa_hash_table_insert(sctx->tex_handles, (void *)handle,
>>> tex_handle)) {
>>> - pb_slab_free(&sctx->bindless_descriptor_slabs,
>>> - &tex_handle->desc->entry);
>>> FREE(tex_handle);
>>> return 0;
>>> }
>>> @@ -2454,8 +2454,6 @@ static void si_delete_texture_handle(struct
>>> pipe_context *ctx, uint64_t handle)
>>> pipe_sampler_view_reference(&tex_handle->view, NULL);
>>> _mesa_hash_table_remove(sctx->tex_handles, entry);
>>> - pb_slab_free(&sctx->bindless_descriptor_slabs,
>>> - &tex_handle->desc->entry);
>>> FREE(tex_handle);
>>> }
>>> @@ -2497,9 +2495,12 @@ static void
>>> si_make_texture_handle_resident(struct pipe_context *ctx,
>>> p_atomic_read(&rtex->framebuffers_bound))
>>> sctx->need_check_render_feedback = true;
>>> } else {
>>> - si_invalidate_bindless_buf_desc(sctx, tex_handle->desc,
>>> + unsigned desc_slot_offset = tex_handle->desc_slot * 16;
>>> +
>>> + si_invalidate_bindless_buf_desc(sctx, desc_slot_offset,
>>> sview->base.texture,
>>> - sview->base.u.buf.offset);
>>> + sview->base.u.buf.offset,
>>> + &tex_handle->desc_dirty);
>>> }
>>> /* Add the texture handle to the per-context list. */
>>> @@ -2509,11 +2510,6 @@ static void
>>> si_make_texture_handle_resident(struct pipe_context *ctx,
>>> /* Add the buffers to the current CS in case si_begin_new_cs()
>>> * is not going to be called.
>>> */
>>> - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
>>> - tex_handle->desc->buffer,
>>> - RADEON_USAGE_READWRITE,
>>> - RADEON_PRIO_DESCRIPTORS);
>>> -
>>> si_sampler_view_add_buffer(sctx, sview->base.texture,
>>> RADEON_USAGE_READ,
>>> sview->is_stencil_sampler, false);
>>> @@ -2540,7 +2536,7 @@ static uint64_t si_create_image_handle(struct
>>> pipe_context *ctx,
>>> {
>>> struct si_context *sctx = (struct si_context *)ctx;
>>> struct si_image_handle *img_handle;
>>> - uint32_t desc_list[16];
>>> + uint32_t desc_list[8];
>>> uint64_t handle;
>>> if (!view || !view->resource)
>>> @@ -2555,20 +2551,17 @@ static uint64_t si_create_image_handle(struct
>>> pipe_context *ctx,
>>> si_set_shader_image_desc(sctx, view, false, &desc_list[0]);
>>> - img_handle->desc = si_create_bindless_descriptor(sctx, desc_list,
>>> - sizeof(desc_list));
>>> - if (!img_handle->desc) {
>>> + img_handle->desc_slot = si_create_bindless_descriptor(sctx,
>>> desc_list,
>>> + sizeof(desc_list));
>>> + if (!img_handle->desc_slot) {
>>> FREE(img_handle);
>>> return 0;
>>> }
>>> - handle = img_handle->desc->buffer->gpu_address +
>>> - img_handle->desc->offset;
>>> + handle = img_handle->desc_slot;
>>> if (!_mesa_hash_table_insert(sctx->img_handles, (void *)handle,
>>> img_handle)) {
>>> - pb_slab_free(&sctx->bindless_descriptor_slabs,
>>> - &img_handle->desc->entry);
>>> FREE(img_handle);
>>> return 0;
>>> }
>>> @@ -2594,8 +2587,6 @@ static void si_delete_image_handle(struct
>>> pipe_context *ctx, uint64_t handle)
>>> util_copy_image_view(&img_handle->view, NULL);
>>> _mesa_hash_table_remove(sctx->img_handles, entry);
>>> - pb_slab_free(&sctx->bindless_descriptor_slabs,
>>> - &img_handle->desc->entry);
>>> FREE(img_handle);
>>> }
>>> @@ -2633,9 +2624,12 @@ static void
>>> si_make_image_handle_resident(struct pipe_context *ctx,
>>> p_atomic_read(&rtex->framebuffers_bound))
>>> sctx->need_check_render_feedback = true;
>>> } else {
>>> - si_invalidate_bindless_buf_desc(sctx, img_handle->desc,
>>> + unsigned desc_slot_offset = img_handle->desc_slot * 8;
>>> +
>>> + si_invalidate_bindless_buf_desc(sctx, desc_slot_offset,
>>> view->resource,
>>> - view->u.buf.offset);
>>> + view->u.buf.offset,
>>> + &img_handle->desc_dirty);
>>> }
>>> /* Add the image handle to the per-context list. */
>>> @@ -2645,11 +2639,6 @@ static void
>>> si_make_image_handle_resident(struct pipe_context *ctx,
>>> /* Add the buffers to the current CS in case si_begin_new_cs()
>>> * is not going to be called.
>>> */
>>> - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
>>> - img_handle->desc->buffer,
>>> - RADEON_USAGE_READWRITE,
>>> - RADEON_PRIO_DESCRIPTORS);
>>> -
>>> si_sampler_view_add_buffer(sctx, view->resource,
>>> (access & PIPE_IMAGE_ACCESS_WRITE) ?
>>> RADEON_USAGE_READWRITE :
>>> @@ -2679,20 +2668,6 @@ void
>>> si_all_resident_buffers_begin_new_cs(struct si_context *sctx)
>>> num_resident_img_handles = sctx->resident_img_handles.size /
>>> sizeof(struct si_image_handle *);
>>> - /* Skip adding the bindless descriptors when no handles are
>>> resident.
>>> - */
>>> - if (!num_resident_tex_handles && !num_resident_img_handles)
>>> - return;
>>> -
>>> - /* Add all bindless descriptors. */
>>> - util_dynarray_foreach(&sctx->bindless_descriptors,
>>> - struct r600_resource *, desc) {
>>> -
>>> - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, *desc,
>>> - RADEON_USAGE_READWRITE,
>>> - RADEON_PRIO_DESCRIPTORS);
>>> - }
>>> -
>>> /* Add all resident texture handles. */
>>> util_dynarray_foreach(&sctx->resident_tex_handles,
>>> struct si_texture_handle *, tex_handle) {
>>> @@ -2841,6 +2816,13 @@ void si_init_all_descriptors(struct si_context
>>> *sctx)
>>> si_init_descriptors(sctx, &sctx->vertex_buffers,
>>> SI_SGPR_VERTEX_BUFFERS,
>>> 4, SI_NUM_VERTEX_BUFFERS, 0, 0, NULL);
>>> + /* Initialize an array of 1024 bindless descriptors, when the
>>> limit is
>>> + * reached, just make it larger and re-upload the whole array.
>>> + */
>>> + si_init_bindless_descriptors(sctx, &sctx->bindless_descriptors,
>>> + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES,
>>> + 1024);
>>> +
>>> sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
>>> sctx->total_ce_ram_allocated = ce_offset;
>>> @@ -2954,6 +2936,7 @@ void si_release_all_descriptors(struct
>>> si_context *sctx)
>>> for (i = 0; i < SI_NUM_DESCS; ++i)
>>> si_release_descriptors(&sctx->descriptors[i]);
>>> si_release_descriptors(&sctx->vertex_buffers);
>>> + si_release_bindless_descriptors(sctx);
>>> }
>>> void si_all_descriptors_begin_new_cs(struct si_context *sctx)
>>> @@ -2970,6 +2953,7 @@ void si_all_descriptors_begin_new_cs(struct
>>> si_context *sctx)
>>> for (i = 0; i < SI_NUM_DESCS; ++i)
>>> si_descriptors_begin_new_cs(sctx, &sctx->descriptors[i]);
>>> + si_descriptors_begin_new_cs(sctx, &sctx->bindless_descriptors);
>>> si_shader_userdata_begin_new_cs(sctx);
>>> }
>>> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c
>>> b/src/gallium/drivers/radeonsi/si_pipe.c
>>> index 5f3b7e112c..8b26be8f2f 100644
>>> --- a/src/gallium/drivers/radeonsi/si_pipe.c
>>> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
>>> @@ -96,9 +96,6 @@ static void si_destroy_context(struct pipe_context
>>> *context)
>>> r600_resource_reference(&sctx->last_trace_buf, NULL);
>>> radeon_clear_saved_cs(&sctx->last_gfx);
>>> - pb_slabs_deinit(&sctx->bindless_descriptor_slabs);
>>> - util_dynarray_fini(&sctx->bindless_descriptors);
>>> -
>>> _mesa_hash_table_destroy(sctx->tex_handles, NULL);
>>> _mesa_hash_table_destroy(sctx->img_handles, NULL);
>>> @@ -345,15 +342,6 @@ static struct pipe_context
>>> *si_create_context(struct pipe_screen *screen,
>>> sctx->tm = si_create_llvm_target_machine(sscreen);
>>> - /* Create a slab allocator for all bindless descriptors. */
>>> - if (!pb_slabs_init(&sctx->bindless_descriptor_slabs, 6, 6, 1, sctx,
>>> - si_bindless_descriptor_can_reclaim_slab,
>>> - si_bindless_descriptor_slab_alloc,
>>> - si_bindless_descriptor_slab_free))
>>> - goto fail;
>>> -
>>> - util_dynarray_init(&sctx->bindless_descriptors, NULL);
>>> -
>>> /* Bindless handles. */
>>> sctx->tex_handles = _mesa_hash_table_create(NULL,
>>> _mesa_hash_pointer,
>>> _mesa_key_pointer_equal);
>>> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h
>>> b/src/gallium/drivers/radeonsi/si_pipe.h
>>> index bd724e80a0..819a44d226 100644
>>> --- a/src/gallium/drivers/radeonsi/si_pipe.h
>>> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
>>> @@ -230,25 +230,18 @@ union si_vgt_param_key {
>>> uint32_t index;
>>> };
>>> -struct si_bindless_descriptor
>>> -{
>>> - struct pb_slab_entry entry;
>>> - struct r600_resource *buffer;
>>> - unsigned offset;
>>> - uint32_t desc_list[16];
>>> - bool dirty;
>>> -};
>>> -
>>> struct si_texture_handle
>>> {
>>> - struct si_bindless_descriptor *desc;
>>> + unsigned desc_slot;
>>> + bool desc_dirty;
>>> struct pipe_sampler_view *view;
>>> struct si_sampler_state sstate;
>>> };
>>> struct si_image_handle
>>> {
>>> - struct si_bindless_descriptor *desc;
>>> + unsigned desc_slot;
>>> + bool desc_dirty;
>>> struct pipe_image_view view;
>>> };
>>> @@ -418,12 +411,12 @@ struct si_context {
>>> union si_vgt_param_key ia_multi_vgt_param_key;
>>> unsigned ia_multi_vgt_param[SI_NUM_VGT_PARAM_STATES];
>>> - /* Slab allocator for bindless descriptors. */
>>> - struct pb_slabs bindless_descriptor_slabs;
>>> -
>>> /* Bindless descriptors. */
>>> - struct util_dynarray bindless_descriptors;
>>> + struct si_descriptors bindless_descriptors;
>>> + unsigned num_bindless_descriptors;
>>> bool bindless_descriptors_dirty;
>>> + bool graphics_bindless_pointer_dirty;
>>> + bool compute_bindless_pointer_dirty;
>>> /* Allocated bindless handles */
>>> struct hash_table *tex_handles;
>>> diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
>>> b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
>>> index 3b50ca5341..3ca446aaa8 100644
>>> --- a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
>>> +++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
>>> @@ -213,15 +213,13 @@ image_fetch_rsrc(
>>> }
>>> if (image->Register.File != TGSI_FILE_IMAGE) {
>>> - struct gallivm_state *gallivm = &ctx->gallivm;
>>> - LLVMBuilderRef builder = gallivm->builder;
>>> -
>>> - LLVMValueRef ptr =
>>> - lp_build_emit_fetch_src(bld_base, image,
>>> - TGSI_TYPE_UNSIGNED64, 0);
>>> - rsrc_ptr = LLVMBuildIntToPtr(builder, ptr,
>>> - si_const_array(ctx->v8i32, 0), "");
>>> - index = LLVMConstInt(ctx->i32, 0, 0);
>>> + /* Bindless descriptors are accessible from a different pair of
>>> + * user SGPR indices.
>>> + */
>>> + rsrc_ptr = LLVMGetParam(ctx->main_fn,
>>> + ctx->param_bindless_samplers_and_images);
>>> + index = lp_build_emit_fetch_src(bld_base, image,
>>> + TGSI_TYPE_UNSIGNED, 0);
>>> }
>>> *rsrc = load_image_desc(ctx, rsrc_ptr, index, target);
>>> @@ -1221,15 +1219,13 @@ static void tex_fetch_ptrs(
>>> }
>>> if (reg->Register.File != TGSI_FILE_SAMPLER) {
>>> - struct gallivm_state *gallivm = &ctx->gallivm;
>>> - LLVMBuilderRef builder = gallivm->builder;
>>> -
>>> - LLVMValueRef ptr =
>>> - lp_build_emit_fetch_src(bld_base, reg,
>>> - TGSI_TYPE_UNSIGNED64, 0);
>>> - list = LLVMBuildIntToPtr(builder, ptr,
>>> - si_const_array(ctx->v8i32, 0), "");
>>> - index = LLVMConstInt(ctx->i32, 0, 0);
>>> + /* Bindless descriptors are accessible from a different pair of
>>> + * user SGPR indices.
>>> + */
>>> + list = LLVMGetParam(ctx->main_fn,
>>> + ctx->param_bindless_samplers_and_images);
>>> + index = lp_build_emit_fetch_src(bld_base, reg,
>>> + TGSI_TYPE_UNSIGNED, 0);
>>> }
>>> if (target == TGSI_TEXTURE_BUFFER)
>>>
>>
>>
--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
More information about the mesa-dev
mailing list