[PATCH 03/11] drm/amdkfd: Improve amdgpu_vm_handle_moved

Felix Kuehling felix.kuehling at amd.com
Wed Nov 1 17:09:58 UTC 2023


On 2023-10-17 17:13, Felix Kuehling wrote:
> Let amdgpu_vm_handle_moved update all BO VA mappings of BOs reserved by
> the caller. This will be useful for handling extra BO VA mappings in
> KFD VMs that are managed through the render node API.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
> Reviewed-by: Christian König <christian.koenig at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c      | 22 +--------------------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c |  2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      | 19 +++++++++++++-----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h      |  3 ++-
>   4 files changed, 18 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 74769afaa33d..c8f2907ebd6f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -1113,7 +1113,6 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>   	struct amdgpu_vm *vm = &fpriv->vm;
>   	struct amdgpu_bo_list_entry *e;
>   	struct amdgpu_bo_va *bo_va;
> -	struct amdgpu_bo *bo;
>   	unsigned int i;
>   	int r;
>   
> @@ -1141,26 +1140,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>   			return r;
>   	}
>   
> -	amdgpu_bo_list_for_each_entry(e, p->bo_list) {
> -		/* ignore duplicates */
> -		bo = ttm_to_amdgpu_bo(e->tv.bo);
> -		if (!bo)
> -			continue;
> -
> -		bo_va = e->bo_va;
> -		if (bo_va == NULL)
> -			continue;
> -
> -		r = amdgpu_vm_bo_update(adev, bo_va, false);
> -		if (r)
> -			return r;
> -
> -		r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update);
> -		if (r)
> -			return r;
> -	}

FYI, removing this loop seemed to cause PSDB failures, mostly in RADV 
tests. It may have been a glitch in the infrastructure, but the failures 
were consistent on the three subsequent patches, too. Restoring this 
loop seems to make the tests pass, so I'll do that for now. I don't have 
time to debug CS with RADV, and this change is not needed for my ROCm work.

Regards,
   Felix


> -
> -	r = amdgpu_vm_handle_moved(adev, vm);
> +	r = amdgpu_vm_handle_moved(adev, vm, &p->ticket);
>   	if (r)
>   		return r;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> index b5e28fa3f414..e7e87a3b2601 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> @@ -409,7 +409,7 @@ amdgpu_dma_buf_move_notify(struct dma_buf_attachment *attach)
>   		if (!r)
>   			r = amdgpu_vm_clear_freed(adev, vm, NULL);
>   		if (!r)
> -			r = amdgpu_vm_handle_moved(adev, vm);
> +			r = amdgpu_vm_handle_moved(adev, vm, ticket);
>   
>   		if (r && r != -EBUSY)
>   			DRM_ERROR("Failed to invalidate VM page tables (%d))\n",
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index d72daf15662f..c586d0e93d75 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -1285,6 +1285,7 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
>    *
>    * @adev: amdgpu_device pointer
>    * @vm: requested vm
> + * @ticket: optional reservation ticket used to reserve the VM
>    *
>    * Make sure all BOs which are moved are updated in the PTs.
>    *
> @@ -1294,11 +1295,12 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
>    * PTs have to be reserved!
>    */
>   int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
> -			   struct amdgpu_vm *vm)
> +			   struct amdgpu_vm *vm,
> +			   struct ww_acquire_ctx *ticket)
>   {
>   	struct amdgpu_bo_va *bo_va;
>   	struct dma_resv *resv;
> -	bool clear;
> +	bool clear, unlock;
>   	int r;
>   
>   	spin_lock(&vm->status_lock);
> @@ -1321,17 +1323,24 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
>   		spin_unlock(&vm->status_lock);
>   
>   		/* Try to reserve the BO to avoid clearing its ptes */
> -		if (!adev->debug_vm && dma_resv_trylock(resv))
> +		if (!adev->debug_vm && dma_resv_trylock(resv)) {
>   			clear = false;
> +			unlock = true;
> +		/* The caller is already holding the reservation lock */
> +		} else if (ticket && dma_resv_locking_ctx(resv) == ticket) {
> +			clear = false;
> +			unlock = false;
>   		/* Somebody else is using the BO right now */
> -		else
> +		} else {
>   			clear = true;
> +			unlock = false;
> +		}
>   
>   		r = amdgpu_vm_bo_update(adev, bo_va, clear);
>   		if (r)
>   			return r;
>   
> -		if (!clear)
> +		if (unlock)
>   			dma_resv_unlock(resv);
>   		spin_lock(&vm->status_lock);
>   	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 6e71978db13f..ebcc75132b74 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -432,7 +432,8 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
>   			  struct amdgpu_vm *vm,
>   			  struct dma_fence **fence);
>   int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
> -			   struct amdgpu_vm *vm);
> +			   struct amdgpu_vm *vm,
> +			   struct ww_acquire_ctx *ticket);
>   void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
>   			    struct amdgpu_vm *vm, struct amdgpu_bo *bo);
>   int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,


More information about the amd-gfx mailing list