[PATCH v2 3/3] drm/amdgpu: sync page table freeing with tlb flush

Christian König christian.koenig at amd.com
Thu Feb 1 13:48:44 UTC 2024



Am 31.01.24 um 18:14 schrieb Shashank Sharma:
> This patch:
> - Attaches the TLB flush fence to the PT objects being freed
> - Adds a new ptr in VM to save this last TLB flush fence
> - Adds a new lock in VM to prevent out-of-context update of saved
>    TLB flush fence
> - Adds a new ptr in tlb_flush structure to save vm
>
> The idea is to delay the freeing of page table objects until the
> respective TLB entries have been flushed.
>
> V2: rebase
>
> Cc: Christian König <Christian.Koenig at amd.com>
> Cc: Alex Deucher <alexander.deucher at amd.com>
> Cc: Felix Kuehling <felix.kuehling at amd.com>
> Cc: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
> Signed-off-by: Shashank Sharma <shashank.sharma at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        |  3 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h        |  4 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c     | 27 +++++++++++++++++++
>   .../gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c  | 13 +++++++--
>   4 files changed, 45 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 67c690044b97..b0e81c249e3a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2245,6 +2245,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   	vm->generation = 0;
>   
>   	mutex_init(&vm->eviction_lock);
> +	mutex_init(&vm->tlb_flush_lock);
>   	vm->evicting = false;
>   	vm->tlb_fence_context = dma_fence_context_alloc(1);
>   
> @@ -2360,7 +2361,9 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>   	}
>   
>   	dma_fence_put(vm->last_update);
> +	dma_fence_put(vm->tlb_fence_last);
>   	vm->last_update = dma_fence_get_stub();
> +	vm->tlb_fence_last = dma_fence_get_stub();
>   	vm->is_compute_context = true;
>   
>   	/* Free the shadow bo for compute VM */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 8e6fd25d07b7..b05bc586237f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -334,6 +334,10 @@ struct amdgpu_vm {
>   	uint64_t		*tlb_seq_cpu_addr;
>   	uint64_t		tlb_fence_context;
>   
> +	/* Ptr and lock to maintain tlb flush sync */
> +	struct mutex		tlb_flush_lock;
> +	struct dma_fence        *tlb_fence_last;
> +
>   	atomic64_t		kfd_last_flushed_seq;
>   
>   	/* How many times we had to re-generate the page tables */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> index 95dc0afdaffb..f1c4418c4d63 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> @@ -631,6 +631,18 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev,
>   	return r;
>   }
>   
> +static inline
> +void amdgpu_vm_attach_tlb_fence(struct amdgpu_bo *bo, struct dma_fence *fence)
> +{
> +	if (!bo || !fence)
> +		return;
> +
> +	if (!dma_resv_reserve_fences(bo->tbo.base.resv, 1)) {
> +		dma_resv_add_fence(bo->tbo.base.resv, fence,
> +				   DMA_RESV_USAGE_BOOKKEEP);
> +	}
> +}
> +
>   /**
>    * amdgpu_vm_pt_free - free one PD/PT
>    *
> @@ -638,6 +650,7 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev,
>    */
>   static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry)
>   {
> +	struct amdgpu_vm *vm;
>   	struct amdgpu_bo *shadow;
>   
>   	if (!entry->bo)
> @@ -646,9 +659,23 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry)
>   	entry->bo->vm_bo = NULL;
>   	shadow = amdgpu_bo_shadowed(entry->bo);
>   	if (shadow) {
> +		vm = shadow->vm_bo->vm;
> +
> +		mutex_lock(&vm->tlb_flush_lock);
> +		if (vm->tlb_fence_last)
> +			amdgpu_vm_attach_tlb_fence(shadow, vm->tlb_fence_last);
> +		mutex_unlock(&vm->tlb_flush_lock);
> +
>   		ttm_bo_set_bulk_move(&shadow->tbo, NULL);
>   		amdgpu_bo_unref(&shadow);
>   	}
> +
> +	vm = entry->vm;
> +	mutex_lock(&vm->tlb_flush_lock);
> +	if (vm->tlb_fence_last)
> +		amdgpu_vm_attach_tlb_fence(entry->bo, vm->tlb_fence_last);
> +	mutex_unlock(&vm->tlb_flush_lock);
> +

That approach doesn't make sense. Instead add the freed PT/PDs to a 
linked list in the parameters structure and only really free them after 
adding the fence to the root PD.


>   	ttm_bo_set_bulk_move(&entry->bo->tbo, NULL);
>   
>   	spin_lock(&entry->vm->status_lock);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
> index 569681badd7c..54ec81d30034 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
> @@ -31,6 +31,7 @@
>   struct amdgpu_tlb_fence {
>   	struct dma_fence	base;
>   	struct amdgpu_device	*adev;
> +	struct amdgpu_vm	*vm;

Big NAK to that. The VM might not live long enough to see the end of the 
TLB flush.

Regards,
Christian.

>   	struct dma_fence	*dependency;
>   	struct work_struct	work;
>   	spinlock_t		lock;
> @@ -51,6 +52,7 @@ static const char *amdgpu_tlb_fence_get_timeline_name(struct dma_fence *f)
>   static void amdgpu_tlb_fence_work(struct work_struct *work)
>   {
>   	struct amdgpu_tlb_fence *f = container_of(work, typeof(*f), work);
> +	struct amdgpu_vm *vm = f->vm;
>   	int r;
>   
>   	r = amdgpu_gmc_flush_gpu_tlb_pasid(f->adev, f->pasid, 2, true, 0);
> @@ -62,6 +64,10 @@ static void amdgpu_tlb_fence_work(struct work_struct *work)
>   
>   	dma_fence_signal(&f->base);
>   	dma_fence_put(&f->base);
> +
> +	mutex_lock(&vm->tlb_flush_lock);
> +	vm->tlb_fence_last = NULL;
> +	mutex_unlock(&vm->tlb_flush_lock);
>   }
>   
>   static const struct dma_fence_ops amdgpu_tlb_fence_ops = {
> @@ -92,6 +98,7 @@ void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, struct amdgpu_vm *vm
>   	f->adev = adev;
>   	f->dependency = *fence;
>   	f->pasid = vm->pasid;
> +	f->vm = vm;
>   	INIT_WORK(&f->work, amdgpu_tlb_fence_work);
>   	spin_lock_init(&f->lock);
>   
> @@ -99,8 +106,10 @@ void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, struct amdgpu_vm *vm
>   		       vm->tlb_fence_context, atomic64_read(&vm->tlb_seq));
>   
>   	/* TODO: We probably need a separate wq here */
> -	dma_fence_get(&f->base);
> -	schedule_work(&f->work);
> +	mutex_lock(&vm->tlb_flush_lock);
> +	vm->tlb_fence_last = dma_fence_get(&f->base);
> +	mutex_unlock(&vm->tlb_flush_lock);
>   
> +	schedule_work(&f->work);
>   	*fence = &f->base;
>   }



More information about the amd-gfx mailing list