[PATCH] drm/amdgpu: Fix error handling in amdgpu_amdkfd_gpuvm_free_memory_of_gpu
Felix Kuehling
felix.kuehling at amd.com
Fri Jun 10 20:54:02 UTC 2022
Am 2022-06-10 um 00:04 schrieb Ramesh Errabolu:
> Following error conditions are fixed:
> Unpin MMIO and DOORBELL BOs only after map count goes to zero
> Remove BO from validate list of a KFD process in a safe manner
> Print a warning message if unreserving GPUVMs encounters an error
>
> Signed-off-by: Ramesh Errabolu <Ramesh.Errabolu at amd.com>
> ---
> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 42 +++++++++++--------
> 1 file changed, 25 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index a1de900ba677..ee48e6591f99 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -1013,14 +1013,22 @@ static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem,
> mutex_unlock(&process_info->lock);
> }
>
> -static void remove_kgd_mem_from_kfd_bo_list(struct kgd_mem *mem,
> +/**
> + * remove_kgd_mem_from_validate_list() - Remove BO from process's validate list,
> + * in an idempotent manner, so that restore worker can't access it anymore
> + * @mem: BO's membership handle in validate list
> + * @process_info: KFD process handle to which BO belongs
> + *
> + * Return: void
I don't think you need to state a void return explicitly. [+David],
since you were looking into KFD documentation and kernel-doc comments
lately, do you have any feedback on the kernel-doc syntax?
Other than that, this patch is
Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
> + */
> +static void remove_kgd_mem_from_validate_list(struct kgd_mem *mem,
> struct amdkfd_process_info *process_info)
> {
> struct ttm_validate_buffer *bo_list_entry;
>
> bo_list_entry = &mem->validate_list;
> mutex_lock(&process_info->lock);
> - list_del(&bo_list_entry->head);
> + list_del_init(&bo_list_entry->head);
> mutex_unlock(&process_info->lock);
> }
>
> @@ -1796,7 +1804,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
>
> allocate_init_user_pages_failed:
> err_pin_bo:
> - remove_kgd_mem_from_kfd_bo_list(*mem, avm->process_info);
> + remove_kgd_mem_from_validate_list(*mem, avm->process_info);
> drm_vma_node_revoke(&gobj->vma_node, drm_priv);
> err_node_allow:
> /* Don't unreserve system mem limit twice */
> @@ -1825,20 +1833,12 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
> unsigned long bo_size = mem->bo->tbo.base.size;
> struct kfd_mem_attachment *entry, *tmp;
> struct bo_vm_reservation_context ctx;
> - struct ttm_validate_buffer *bo_list_entry;
> unsigned int mapped_to_gpu_memory;
> int ret;
> bool is_imported = false;
>
> mutex_lock(&mem->lock);
>
> - /* Unpin MMIO/DOORBELL BO's that were pinned during allocation */
> - if (mem->alloc_flags &
> - (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
> - KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {
> - amdgpu_amdkfd_gpuvm_unpin_bo(mem->bo);
> - }
> -
> mapped_to_gpu_memory = mem->mapped_to_gpu_memory;
> is_imported = mem->is_imported;
> mutex_unlock(&mem->lock);
> @@ -1853,10 +1853,7 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
> }
>
> /* Make sure restore workers don't access the BO any more */
> - bo_list_entry = &mem->validate_list;
> - mutex_lock(&process_info->lock);
> - list_del(&bo_list_entry->head);
> - mutex_unlock(&process_info->lock);
> + remove_kgd_mem_from_validate_list(mem, process_info);
>
> /* No more MMU notifiers */
> amdgpu_mn_unregister(mem->bo);
> @@ -1878,7 +1875,18 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
> list_for_each_entry_safe(entry, tmp, &mem->attachments, list)
> kfd_mem_detach(entry);
>
> + /* Return success even in case of error */
> ret = unreserve_bo_and_vms(&ctx, false, false);
> + if (unlikely(ret)) {
> + WARN_ONCE(ret, "Error in unreserving BO and associated VMs");
> + ret = 0;
> + }
> +
> + /* Unpin MMIO/DOORBELL BO's that were pinned during allocation */
> + if (mem->alloc_flags &
> + (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
> + KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))
> + amdgpu_amdkfd_gpuvm_unpin_bo(mem->bo);
>
> /* Free the sync object */
> amdgpu_sync_free(&mem->sync);
> @@ -2814,7 +2822,7 @@ int amdgpu_amdkfd_add_gws_to_process(void *info, void *gws, struct kgd_mem **mem
> bo_reservation_failure:
> mutex_unlock(&(*mem)->process_info->lock);
> amdgpu_sync_free(&(*mem)->sync);
> - remove_kgd_mem_from_kfd_bo_list(*mem, process_info);
> + remove_kgd_mem_from_validate_list(*mem, process_info);
> amdgpu_bo_unref(&gws_bo);
> mutex_destroy(&(*mem)->lock);
> kfree(*mem);
> @@ -2832,7 +2840,7 @@ int amdgpu_amdkfd_remove_gws_from_process(void *info, void *mem)
> /* Remove BO from process's validate list so restore worker won't touch
> * it anymore
> */
> - remove_kgd_mem_from_kfd_bo_list(kgd_mem, process_info);
> + remove_kgd_mem_from_validate_list(kgd_mem, process_info);
>
> ret = amdgpu_bo_reserve(gws_bo, false);
> if (unlikely(ret)) {
More information about the amd-gfx
mailing list