[Mesa-dev] [PATCH 02/10] radv: Keep a global BO list for VkMemory.
Samuel Pitoiset
samuel.pitoiset at gmail.com
Fri Apr 13 18:12:31 UTC 2018
Reviewed-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
We should remove some radv_cs_add_buffer() calls and re-enable local BOs
to reduce overhead, but that can be done later. Also make sure to check
the sysmem submission path.
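
For readers skimming the diff, the core of the patch is a device-global,
mutex-guarded growable array of BOs that is handed to the kernel at submit
time. A minimal standalone sketch of that pattern (type and function names
here are illustrative, not the radv ones):

#include <pthread.h>
#include <stdlib.h>

struct bo_list {
	void **bos;            /* tracked buffer objects */
	unsigned count;        /* valid entries */
	unsigned capacity;     /* allocated entries */
	pthread_mutex_t mutex; /* guards all of the above */
};

static int bo_list_add(struct bo_list *l, void *bo)
{
	int ret = 0;
	pthread_mutex_lock(&l->mutex);
	if (l->count == l->capacity) {
		/* doubling growth, starting at 4 entries */
		unsigned cap = l->capacity ? l->capacity * 2 : 4;
		void **data = realloc(l->bos, cap * sizeof(*data));
		if (!data) {
			ret = -1; /* out of host memory */
			goto out;
		}
		l->bos = data;
		l->capacity = cap;
	}
	l->bos[l->count++] = bo;
out:
	pthread_mutex_unlock(&l->mutex);
	return ret;
}

static void bo_list_remove(struct bo_list *l, void *bo)
{
	pthread_mutex_lock(&l->mutex);
	for (unsigned i = 0; i < l->count; ++i) {
		if (l->bos[i] == bo) {
			/* order doesn't matter, so swap-with-last makes
			 * deletion O(1) after the O(n) search */
			l->bos[i] = l->bos[--l->count];
			break;
		}
	}
	pthread_mutex_unlock(&l->mutex);
}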
On 04/12/2018 01:44 AM, Bas Nieuwenhuizen wrote:
> With update-after-bind we can no longer attach BOs to the command
> buffer from the descriptor set, so we have to keep a global BO
> list.
>
> I am somewhat surprised this works so well: the WSI relies on
> implicit synchronization derived from the BO list associations, and
> with the new behavior every command buffer becomes associated with
> every swapchain image. Still, I could not find any slowdowns in
> games because of it.
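
The update-after-bind interaction is the crux here: once a binding is
created with VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT_EXT (from
VK_EXT_descriptor_indexing), the application may legally rewrite the
descriptor after the command buffer has been recorded, so record-time BO
tracking can never see the final BO. A hedged illustration in application
terms, with all handles and the late VkWriteDescriptorSet assumed to be
set up elsewhere:

#include <vulkan/vulkan.h>

/* Hypothetical helper, for illustration only. */
static void
record_then_update(VkDevice device, VkQueue queue, VkCommandBuffer cmd,
                   VkPipelineLayout layout, VkDescriptorSet set,
                   const VkWriteDescriptorSet *late_write)
{
	const VkCommandBufferBeginInfo begin_info = {
		.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
	};
	vkBeginCommandBuffer(cmd, &begin_info);
	vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS,
	                        layout, 0, 1, &set, 0, NULL);
	/* ... draws referencing 'set' would be recorded here ... */
	vkEndCommandBuffer(cmd);

	/* Legal with update-after-bind: the descriptor now points at a
	 * BO the recorded command buffer never saw. */
	vkUpdateDescriptorSets(device, 1, late_write, 0, NULL);

	const VkSubmitInfo submit_info = {
		.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
		.commandBufferCount = 1,
		.pCommandBuffers = &cmd,
	};
	vkQueueSubmit(queue, 1, &submit_info, VK_NULL_HANDLE);
}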
> ---
> src/amd/vulkan/radv_device.c | 125 +++++++++++++-----
> src/amd/vulkan/radv_private.h | 8 ++
> src/amd/vulkan/radv_radeon_winsys.h | 6 +
> src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c | 46 ++++++-
> 4 files changed, 146 insertions(+), 39 deletions(-)
>
> diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
> index 22e8f1e7a78..c81b69fef5c 100644
> --- a/src/amd/vulkan/radv_device.c
> +++ b/src/amd/vulkan/radv_device.c
> @@ -1208,6 +1208,55 @@ radv_queue_finish(struct radv_queue *queue)
> queue->device->ws->buffer_destroy(queue->compute_scratch_bo);
> }
>
> +static void
> +radv_bo_list_init(struct radv_bo_list *bo_list)
> +{
> + pthread_mutex_init(&bo_list->mutex, NULL);
> + bo_list->list.count = bo_list->capacity = 0;
> + bo_list->list.bos = NULL;
> +}
> +
> +static void
> +radv_bo_list_finish(struct radv_bo_list *bo_list)
> +{
> + free(bo_list->list.bos);
> + pthread_mutex_destroy(&bo_list->mutex);
> +}
> +
> +static VkResult radv_bo_list_add(struct radv_bo_list *bo_list, struct radeon_winsys_bo *bo)
> +{
> + pthread_mutex_lock(&bo_list->mutex);
> + if (bo_list->list.count == bo_list->capacity) {
> + unsigned capacity = MAX2(4, bo_list->capacity * 2);
> + void *data = realloc(bo_list->list.bos, capacity * sizeof(struct radeon_winsys_bo*));
> +
> + if (!data) {
> + pthread_mutex_unlock(&bo_list->mutex);
> + return VK_ERROR_OUT_OF_HOST_MEMORY;
> + }
> +
> + bo_list->list.bos = (struct radeon_winsys_bo**)data;
> + bo_list->capacity = capacity;
> + }
> +
> + bo_list->list.bos[bo_list->list.count++] = bo;
> + pthread_mutex_unlock(&bo_list->mutex);
> + return VK_SUCCESS;
> +}
> +
> +static void radv_bo_list_remove(struct radv_bo_list *bo_list, struct radeon_winsys_bo *bo)
> +{
> + pthread_mutex_lock(&bo_list->mutex);
> + for(unsigned i = 0; i < bo_list->list.count; ++i) {
> + if (bo_list->list.bos[i] == bo) {
> + bo_list->list.bos[i] = bo_list->list.bos[bo_list->list.count - 1];
> + --bo_list->list.count;
> + break;
> + }
> + }
> + pthread_mutex_unlock(&bo_list->mutex);
> +}
> +
> static void
> radv_device_init_gs_info(struct radv_device *device)
> {
> @@ -1308,6 +1357,8 @@ VkResult radv_CreateDevice(
> mtx_init(&device->shader_slab_mutex, mtx_plain);
> list_inithead(&device->shader_slabs);
>
> + radv_bo_list_init(&device->bo_list);
> +
> for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
> const VkDeviceQueueCreateInfo *queue_create = &pCreateInfo->pQueueCreateInfos[i];
> uint32_t qfi = queue_create->queueFamilyIndex;
> @@ -1440,6 +1491,8 @@ VkResult radv_CreateDevice(
> fail_meta:
> radv_device_finish_meta(device);
> fail:
> + radv_bo_list_finish(&device->bo_list);
> +
> if (device->trace_bo)
> device->ws->buffer_destroy(device->trace_bo);
>
> @@ -1487,6 +1540,7 @@ void radv_DestroyDevice(
>
> radv_destroy_shader_slabs(device);
>
> + radv_bo_list_finish(&device->bo_list);
> vk_free(&device->alloc, device);
> }
>
> @@ -2257,7 +2311,7 @@ static VkResult radv_signal_fence(struct radv_queue *queue,
>
> ret = queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx,
> &queue->device->empty_cs[queue->queue_family_index],
> - 1, NULL, NULL, &sem_info,
> + 1, NULL, NULL, &sem_info, NULL,
> false, fence->fence);
> radv_free_sem_info(&sem_info);
>
> @@ -2334,7 +2388,7 @@ VkResult radv_QueueSubmit(
> ret = queue->device->ws->cs_submit(ctx, queue->queue_idx,
> &queue->device->empty_cs[queue->queue_family_index],
> 1, NULL, NULL,
> - &sem_info,
> + &sem_info, NULL,
> false, base_fence);
> if (ret) {
> radv_loge("failed to submit CS %d\n", i);
> @@ -2372,11 +2426,15 @@ VkResult radv_QueueSubmit(
> sem_info.cs_emit_wait = j == 0;
> sem_info.cs_emit_signal = j + advance == pSubmits[i].commandBufferCount;
>
> + pthread_mutex_lock(&queue->device->bo_list.mutex);
> +
> ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j,
> advance, initial_preamble, continue_preamble_cs,
> - &sem_info,
> + &sem_info, &queue->device->bo_list.list,
> can_patch, base_fence);
>
> + pthread_mutex_unlock(&queue->device->bo_list.mutex);
> +
> if (ret) {
> radv_loge("failed to submit CS %d\n", i);
> abort();
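
Note that bo_list.mutex stays held across the cs_submit call here, so
vkAllocateMemory/vkFreeMemory on other threads will block behind the
kernel submission. If that ever shows up in profiles, copying the list
under the lock and submitting from the snapshot would be one option; a
rough sketch of that alternative (illustrative only, not what this patch
does):

	struct radv_winsys_bo_list snapshot = {0};

	pthread_mutex_lock(&queue->device->bo_list.mutex);
	snapshot.count = queue->device->bo_list.list.count;
	snapshot.bos = malloc(snapshot.count * sizeof(*snapshot.bos));
	if (snapshot.bos)
		memcpy(snapshot.bos, queue->device->bo_list.list.bos,
		       snapshot.count * sizeof(*snapshot.bos));
	pthread_mutex_unlock(&queue->device->bo_list.mutex);

	/* on malloc failure, fall back to holding the lock as above;
	 * otherwise pass &snapshot to cs_submit without the lock held */
	free(snapshot.bos);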
> @@ -2582,11 +2640,8 @@ static VkResult radv_alloc_memory(struct radv_device *device,
> goto fail;
> } else {
> close(import_info->fd);
> - goto out_success;
> }
> - }
> -
> - if (host_ptr_info) {
> + } else if (host_ptr_info) {
> assert(host_ptr_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT);
> assert(mem_type_index == RADV_MEM_TYPE_GTT_CACHED);
> mem->bo = device->ws->buffer_from_ptr(device->ws, host_ptr_info->pHostPointer,
> @@ -2596,41 +2651,46 @@ static VkResult radv_alloc_memory(struct radv_device *device,
> goto fail;
> } else {
> mem->user_ptr = host_ptr_info->pHostPointer;
> - goto out_success;
> }
> - }
> -
> - uint64_t alloc_size = align_u64(pAllocateInfo->allocationSize, 4096);
> - if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE ||
> - mem_type_index == RADV_MEM_TYPE_GTT_CACHED)
> - domain = RADEON_DOMAIN_GTT;
> - else
> - domain = RADEON_DOMAIN_VRAM;
> + } else {
> + uint64_t alloc_size = align_u64(pAllocateInfo->allocationSize, 4096);
> + if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE ||
> + mem_type_index == RADV_MEM_TYPE_GTT_CACHED)
> + domain = RADEON_DOMAIN_GTT;
> + else
> + domain = RADEON_DOMAIN_VRAM;
>
> - if (mem_type_index == RADV_MEM_TYPE_VRAM)
> - flags |= RADEON_FLAG_NO_CPU_ACCESS;
> - else
> - flags |= RADEON_FLAG_CPU_ACCESS;
> + if (mem_type_index == RADV_MEM_TYPE_VRAM)
> + flags |= RADEON_FLAG_NO_CPU_ACCESS;
> + else
> + flags |= RADEON_FLAG_CPU_ACCESS;
>
> - if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE)
> - flags |= RADEON_FLAG_GTT_WC;
> + if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE)
> + flags |= RADEON_FLAG_GTT_WC;
>
> - if (!dedicate_info && !import_info && (!export_info || !export_info->handleTypes))
> - flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
> + if (!dedicate_info && !import_info && (!export_info || !export_info->handleTypes))
> + flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
>
> - mem->bo = device->ws->buffer_create(device->ws, alloc_size, device->physical_device->rad_info.max_alignment,
> - domain, flags);
> + mem->bo = device->ws->buffer_create(device->ws, alloc_size, device->physical_device->rad_info.max_alignment,
> + domain, flags);
>
> - if (!mem->bo) {
> - result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
> - goto fail;
> + if (!mem->bo) {
> + result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
> + goto fail;
> + }
> + mem->type_index = mem_type_index;
> }
> - mem->type_index = mem_type_index;
> -out_success:
> +
> + result = radv_bo_list_add(&device->bo_list, mem->bo);
> + if (result != VK_SUCCESS)
> + goto fail_bo;
> +
> *pMem = radv_device_memory_to_handle(mem);
>
> return VK_SUCCESS;
>
> +fail_bo:
> + device->ws->buffer_destroy(mem->bo);
> fail:
> vk_free2(&device->alloc, pAllocator, mem);
>
> @@ -2658,6 +2718,7 @@ void radv_FreeMemory(
> if (mem == NULL)
> return;
>
> + radv_bo_list_remove(&device->bo_list, mem->bo);
> device->ws->buffer_destroy(mem->bo);
> mem->bo = NULL;
>
> @@ -2977,7 +3038,7 @@ radv_sparse_image_opaque_bind_memory(struct radv_device *device,
> queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx,
> &queue->device->empty_cs[queue->queue_family_index],
> 1, NULL, NULL,
> - &sem_info,
> + &sem_info, NULL,
> false, base_fence);
> fence_emitted = true;
> if (fence)
> diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
> index 1bcc3a906ec..9bed7ba07c2 100644
> --- a/src/amd/vulkan/radv_private.h
> +++ b/src/amd/vulkan/radv_private.h
> @@ -596,6 +596,12 @@ struct radv_queue {
> struct radeon_winsys_cs *continue_preamble_cs;
> };
>
> +struct radv_bo_list {
> + struct radv_winsys_bo_list list;
> + unsigned capacity;
> + pthread_mutex_t mutex;
> +};
> +
> struct radv_device {
> VK_LOADER_DATA _loader_data;
>
> @@ -658,6 +664,8 @@ struct radv_device {
> uint64_t dmesg_timestamp;
>
> struct radv_device_extension_table enabled_extensions;
> +
> + struct radv_bo_list bo_list;
> };
>
> struct radv_device_memory {
> diff --git a/src/amd/vulkan/radv_radeon_winsys.h b/src/amd/vulkan/radv_radeon_winsys.h
> index 270b3bceaba..cfde19d1ae1 100644
> --- a/src/amd/vulkan/radv_radeon_winsys.h
> +++ b/src/amd/vulkan/radv_radeon_winsys.h
> @@ -177,6 +177,11 @@ struct radv_winsys_sem_info {
> struct radv_winsys_sem_counts signal;
> };
>
> +struct radv_winsys_bo_list {
> + struct radeon_winsys_bo **bos;
> + unsigned count;
> +};
> +
> struct radeon_winsys {
> void (*destroy)(struct radeon_winsys *ws);
>
> @@ -245,6 +250,7 @@ struct radeon_winsys {
> struct radeon_winsys_cs *initial_preamble_cs,
> struct radeon_winsys_cs *continue_preamble_cs,
> struct radv_winsys_sem_info *sem_info,
> + const struct radv_winsys_bo_list *bo_list, /* optional */
> bool can_patch,
> struct radeon_winsys_fence *fence);
>
> diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
> index cd7ab384e73..a975f0e4969 100644
> --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
> +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
> @@ -550,6 +550,7 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
> unsigned count,
> struct radv_amdgpu_winsys_bo *extra_bo,
> struct radeon_winsys_cs *extra_cs,
> + const struct radv_winsys_bo_list *radv_bo_list,
> amdgpu_bo_list_handle *bo_list)
> {
> int r = 0;
> @@ -577,7 +578,7 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
> bo_list);
> free(handles);
> pthread_mutex_unlock(&ws->global_bo_list_lock);
> - } else if (count == 1 && !extra_bo && !extra_cs &&
> + } else if (count == 1 && !extra_bo && !extra_cs && !radv_bo_list &&
> !radv_amdgpu_cs(cs_array[0])->num_virtual_buffers) {
> struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[0];
> if (cs->num_buffers == 0) {
> @@ -599,6 +600,11 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
> if (extra_cs) {
> total_buffer_count += ((struct radv_amdgpu_cs*)extra_cs)->num_buffers;
> }
> +
> + if (radv_bo_list) {
> + total_buffer_count += radv_bo_list->count;
> + }
> +
> if (total_buffer_count == 0) {
> *bo_list = 0;
> return 0;
> @@ -672,6 +678,27 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
> }
> }
>
> + if (radv_bo_list) {
> + unsigned unique_bo_so_far = unique_bo_count;
> + const unsigned default_bo_priority = 7;
> + for (unsigned i = 0; i < radv_bo_list->count; ++i) {
> + struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(radv_bo_list->bos[i]);
> + bool found = false;
> + for (unsigned j = 0; j < unique_bo_so_far; ++j) {
> + if (bo->bo == handles[j]) {
> + found = true;
> + priorities[j] = MAX2(priorities[j], default_bo_priority);
> + break;
> + }
> + }
> + if (!found) {
> + handles[unique_bo_count] = bo->bo;
> + priorities[unique_bo_count] = default_bo_priority;
> + ++unique_bo_count;
> + }
> + }
> + }
> +
> if (unique_bo_count > 0) {
> r = amdgpu_bo_list_create(ws->dev, unique_bo_count, handles,
> priorities, bo_list);
> @@ -707,6 +734,7 @@ static void radv_assign_last_submit(struct radv_amdgpu_ctx *ctx,
> static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx,
> int queue_idx,
> struct radv_winsys_sem_info *sem_info,
> + const struct radv_winsys_bo_list *radv_bo_list,
> struct radeon_winsys_cs **cs_array,
> unsigned cs_count,
> struct radeon_winsys_cs *initial_preamble_cs,
> @@ -743,7 +771,8 @@ static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx,
> }
> }
>
> - r = radv_amdgpu_create_bo_list(cs0->ws, cs_array, cs_count, NULL, initial_preamble_cs, &bo_list);
> + r = radv_amdgpu_create_bo_list(cs0->ws, cs_array, cs_count, NULL, initial_preamble_cs,
> + radv_bo_list, &bo_list);
> if (r) {
> fprintf(stderr, "amdgpu: buffer list creation failed for the "
> "chained submission(%d)\n", r);
> @@ -787,6 +816,7 @@ static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx,
> static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx,
> int queue_idx,
> struct radv_winsys_sem_info *sem_info,
> + const struct radv_winsys_bo_list *radv_bo_list,
> struct radeon_winsys_cs **cs_array,
> unsigned cs_count,
> struct radeon_winsys_cs *initial_preamble_cs,
> @@ -811,7 +841,7 @@ static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx,
> memset(&request, 0, sizeof(request));
>
> r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt, NULL,
> - preamble_cs, &bo_list);
> + preamble_cs, radv_bo_list, &bo_list);
> if (r) {
> fprintf(stderr, "amdgpu: buffer list creation failed "
> "for the fallback submission (%d)\n", r);
> @@ -868,6 +898,7 @@ static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx,
> static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx,
> int queue_idx,
> struct radv_winsys_sem_info *sem_info,
> + const struct radv_winsys_bo_list *radv_bo_list,
> struct radeon_winsys_cs **cs_array,
> unsigned cs_count,
> struct radeon_winsys_cs *initial_preamble_cs,
> @@ -937,7 +968,7 @@ static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx,
>
> r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt,
> (struct radv_amdgpu_winsys_bo*)bo,
> - preamble_cs, &bo_list);
> + preamble_cs, radv_bo_list, &bo_list);
> if (r) {
> fprintf(stderr, "amdgpu: buffer list creation failed "
> "for the sysmem submission (%d)\n", r);
> @@ -988,6 +1019,7 @@ static int radv_amdgpu_winsys_cs_submit(struct radeon_winsys_ctx *_ctx,
> struct radeon_winsys_cs *initial_preamble_cs,
> struct radeon_winsys_cs *continue_preamble_cs,
> struct radv_winsys_sem_info *sem_info,
> + const struct radv_winsys_bo_list *bo_list,
> bool can_patch,
> struct radeon_winsys_fence *_fence)
> {
> @@ -997,13 +1029,13 @@ static int radv_amdgpu_winsys_cs_submit(struct radeon_winsys_ctx *_ctx,
>
> assert(sem_info);
> if (!cs->ws->use_ib_bos) {
> - ret = radv_amdgpu_winsys_cs_submit_sysmem(_ctx, queue_idx, sem_info, cs_array,
> + ret = radv_amdgpu_winsys_cs_submit_sysmem(_ctx, queue_idx, sem_info, bo_list, cs_array,
> cs_count, initial_preamble_cs, continue_preamble_cs, _fence);
> } else if (can_patch && cs_count > AMDGPU_CS_MAX_IBS_PER_SUBMIT && cs->ws->batchchain) {
> - ret = radv_amdgpu_winsys_cs_submit_chained(_ctx, queue_idx, sem_info, cs_array,
> + ret = radv_amdgpu_winsys_cs_submit_chained(_ctx, queue_idx, sem_info, bo_list, cs_array,
> cs_count, initial_preamble_cs, continue_preamble_cs, _fence);
> } else {
> - ret = radv_amdgpu_winsys_cs_submit_fallback(_ctx, queue_idx, sem_info, cs_array,
> + ret = radv_amdgpu_winsys_cs_submit_fallback(_ctx, queue_idx, sem_info, bo_list, cs_array,
> cs_count, initial_preamble_cs, continue_preamble_cs, _fence);
> }
>
>