[PATCH 5/9] drm/amdkfd: Ensure user queue buffers residency

Wed Jul 17 20:26:25 UTC 2024

On 2024-07-15 08:34, Philip Yang wrote:
> Add atomic queue_refcount to struct bo_va, return -EBUSY to fail unmap
> BO from the GPU if the bo_va queue_refcount is not zero.
>
> Create queue to increase the bo_va queue_refcount, destroy queue to
> decrease the bo_va queue_refcount, to ensure the queue buffers mapped on
> the GPU when queue is active.
>
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
> ---
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 14 ++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h    |  6 ++++
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  3 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  1 +
>   drivers/gpu/drm/amd/amdkfd/kfd_queue.c        | 34 ++++++++++++++++---
>   5 files changed, 49 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 0ab37e7aec26..6d5fd371d5ce 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -1252,7 +1252,7 @@ static int unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx,
>   	return ret;
>   }
>   
> -static void unmap_bo_from_gpuvm(struct kgd_mem *mem,
> +static int unmap_bo_from_gpuvm(struct kgd_mem *mem,
>   				struct kfd_mem_attachment *entry,
>   				struct amdgpu_sync *sync)
>   {
> @@ -1260,11 +1260,18 @@ static void unmap_bo_from_gpuvm(struct kgd_mem *mem,
>   	struct amdgpu_device *adev = entry->adev;
>   	struct amdgpu_vm *vm = bo_va->base.vm;
>   
> +	if (bo_va->queue_refcount) {
> +		pr_debug("bo_va->queue_refcount %d\n", bo_va->queue_refcount);
> +		return -EBUSY;
> +	}
> +
>   	amdgpu_vm_bo_unmap(adev, bo_va, entry->va);
>   
>   	amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update);
>   
>   	amdgpu_sync_fence(sync, bo_va->last_pt_update);
> +
> +	return 0;
>   }
>   
>   static int update_gpuvm_pte(struct kgd_mem *mem,
> @@ -2191,7 +2198,10 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
>   		pr_debug("\t unmap VA 0x%llx - 0x%llx from entry %p\n",
>   			 entry->va, entry->va + bo_size, entry);
>   
> -		unmap_bo_from_gpuvm(mem, entry, ctx.sync);
> +		ret = unmap_bo_from_gpuvm(mem, entry, ctx.sync);
> +		if (ret)
> +			goto unreserve_out;
> +
>   		entry->is_mapped = false;
>   
>   		mem->mapped_to_gpu_memory--;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> index bc42ccbde659..d7e27957013f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> @@ -90,6 +90,12 @@ struct amdgpu_bo_va {
>   	bool				cleared;
>   
>   	bool				is_xgmi;
> +
> +	/*
> +	 * protected by vm reservation lock
> +	 * if non-zero, cannot unmap from GPU because user queues may still access it
> +	 */
> +	unsigned int			queue_refcount;
>   };
>   
>   struct amdgpu_bo {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 202f24ee4bd7..65a37ac5a0f0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1384,8 +1384,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
>   		err = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
>   			peer_pdd->dev->adev, (struct kgd_mem *)mem, peer_pdd->drm_priv);
>   		if (err) {
> -			pr_err("Failed to unmap from gpu %d/%d\n",
> -			       i, args->n_devices);
> +			pr_debug("Failed to unmap from gpu %d/%d\n", i, args->n_devices);
>   			goto unmap_memory_from_gpu_failed;
>   		}
>   		args->n_success = i+1;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index d0dca20849d9..95fbdb12beb1 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1291,6 +1291,7 @@ void print_queue_properties(struct queue_properties *q);
>   void print_queue(struct queue *q);
>   int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_bo **pbo,
>   			 u64 expected_size);
> +void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo);
>   int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties);
>   int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties);
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
> index 0e661160c295..3fd386dcb011 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
> @@ -106,6 +106,7 @@ int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_
>   	}
>   
>   	*pbo = amdgpu_bo_ref(mapping->bo_va->base.bo);
> +	mapping->bo_va->queue_refcount++;
>   	return 0;
>   
>   out_err:
> @@ -113,6 +114,19 @@ int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_
>   	return -EINVAL;
>   }
>   
> +void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo)
> +{
> +	if (*bo) {
> +		struct amdgpu_bo_va *bo_va;
> +
> +		bo_va = amdgpu_vm_bo_find(vm, *bo);
> +		if (bo_va)
> +			bo_va->queue_refcount--;
> +	}
> +
> +	amdgpu_bo_unref(bo);
> +}
> +
>   int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties)
>   {
>   	struct amdgpu_vm *vm;
> @@ -166,10 +180,20 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope
>   
>   int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties)
>   {
> -	amdgpu_bo_unref(&properties->wptr_bo);
> -	amdgpu_bo_unref(&properties->rptr_bo);
> -	amdgpu_bo_unref(&properties->ring_bo);
> -	amdgpu_bo_unref(&properties->eop_buf_bo);
> -	amdgpu_bo_unref(&properties->cwsr_bo);
> +	struct amdgpu_vm *vm;
> +	int err;
> +
> +	vm = drm_priv_to_vm(pdd->drm_priv);
> +	err = amdgpu_bo_reserve(vm->root.bo, false);
> +	if (err)
> +		return err;

The caller never handles these errors. See my comments on patch 2.

Regards,
   Felix

> +
> +	kfd_queue_buffer_put(vm, &properties->wptr_bo);
> +	kfd_queue_buffer_put(vm, &properties->rptr_bo);
> +	kfd_queue_buffer_put(vm, &properties->ring_bo);
> +	kfd_queue_buffer_put(vm, &properties->eop_buf_bo);
> +	kfd_queue_buffer_put(vm, &properties->cwsr_bo);
> +
> +	amdgpu_bo_unreserve(vm->root.bo);
>   	return 0;
>   }