[PATCH v2 3/3] drm/amdgpu: sync page table freeing with tlb flush

Tue Feb 6 13:52:17 UTC 2024

Hey Christian,

On 01/02/2024 14:48, Christian König wrote:
>
>
> Am 31.01.24 um 18:14 schrieb Shashank Sharma:
>> This patch:
>> - Attaches the TLB flush fence to the PT objects being freed
>> - Adds a new ptr in VM to save this last TLB flush fence
>> - Adds a new lock in VM to prevent out-of-context update of saved
>>    TLB flush fence
>> - Adds a new ptr in tlb_flush structure to save vm
>>
>> The idea is to delay freeing of page table objects until we have the
>> respective TLB entries flushed.
>>
>> V2: rebase
>>
>> Cc: Christian König <Christian.Koenig at amd.com>
>> Cc: Alex Deucher <alexander.deucher at amd.com>
>> Cc: Felix Kuehling <felix.kuehling at amd.com>
>> Cc: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
>> Signed-off-by: Shashank Sharma <shashank.sharma at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        |  3 +++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h        |  4 +++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c     | 27 +++++++++++++++++++
>>   .../gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c  | 13 +++++++--
>>   4 files changed, 45 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index 67c690044b97..b0e81c249e3a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -2245,6 +2245,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, 
>> struct amdgpu_vm *vm,
>>       vm->generation = 0;
>>         mutex_init(&vm->eviction_lock);
>> +    mutex_init(&vm->tlb_flush_lock);
>>       vm->evicting = false;
>>       vm->tlb_fence_context = dma_fence_context_alloc(1);
>>   @@ -2360,7 +2361,9 @@ int amdgpu_vm_make_compute(struct 
>> amdgpu_device *adev, struct amdgpu_vm *vm)
>>       }
>>         dma_fence_put(vm->last_update);
>> +    dma_fence_put(vm->tlb_fence_last);
>>       vm->last_update = dma_fence_get_stub();
>> +    vm->tlb_fence_last = dma_fence_get_stub();
>>       vm->is_compute_context = true;
>>         /* Free the shadow bo for compute VM */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> index 8e6fd25d07b7..b05bc586237f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> @@ -334,6 +334,10 @@ struct amdgpu_vm {
>>       uint64_t        *tlb_seq_cpu_addr;
>>       uint64_t        tlb_fence_context;
>>   +    /* Ptr and lock to maintain tlb flush sync */
>> +    struct mutex        tlb_flush_lock;
>> +    struct dma_fence        *tlb_fence_last;
>> +
>>       atomic64_t        kfd_last_flushed_seq;
>>         /* How many times we had to re-generate the page tables */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>> index 95dc0afdaffb..f1c4418c4d63 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>> @@ -631,6 +631,18 @@ static int amdgpu_vm_pt_alloc(struct 
>> amdgpu_device *adev,
>>       return r;
>>   }
>>   +static inline
>> +void amdgpu_vm_attach_tlb_fence(struct amdgpu_bo *bo, struct 
>> dma_fence *fence)
>> +{
>> +    if (!bo || !fence)
>> +        return;
>> +
>> +    if (!dma_resv_reserve_fences(bo->tbo.base.resv, 1)) {
>> +        dma_resv_add_fence(bo->tbo.base.resv, fence,
>> +                   DMA_RESV_USAGE_BOOKKEEP);
>> +    }
>> +}
>> +
>>   /**
>>    * amdgpu_vm_pt_free - free one PD/PT
>>    *
>> @@ -638,6 +650,7 @@ static int amdgpu_vm_pt_alloc(struct 
>> amdgpu_device *adev,
>>    */
>>   static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry)
>>   {
>> +    struct amdgpu_vm *vm;
>>       struct amdgpu_bo *shadow;
>>         if (!entry->bo)
>> @@ -646,9 +659,23 @@ static void amdgpu_vm_pt_free(struct 
>> amdgpu_vm_bo_base *entry)
>>       entry->bo->vm_bo = NULL;
>>       shadow = amdgpu_bo_shadowed(entry->bo);
>>       if (shadow) {
>> +        vm = shadow->vm_bo->vm;
>> +
>> +        mutex_lock(&vm->tlb_flush_lock);
>> +        if (vm->tlb_fence_last)
>> +            amdgpu_vm_attach_tlb_fence(shadow, vm->tlb_fence_last);
>> +        mutex_unlock(&vm->tlb_flush_lock);
>> +
>>           ttm_bo_set_bulk_move(&shadow->tbo, NULL);
>>           amdgpu_bo_unref(&shadow);
>>       }
>> +
>> +    vm = entry->vm;
>> +    mutex_lock(&vm->tlb_flush_lock);
>> +    if (vm->tlb_fence_last)
>> +        amdgpu_vm_attach_tlb_fence(entry->bo, vm->tlb_fence_last);
>> +    mutex_unlock(&vm->tlb_flush_lock);
>> +
>
> That approach doesn't make sense. Instead add the freed PT/PDs to a 
> linked list in the parameters structure and only really free them 
> after adding the fence to the root PD.

Sure, I will do those changes.

Just for the curiosity, why wouldn't this approach work ? Wouldn't this 
delay the actual freeing of buffers TTM until the fence signal ?

- Shashank

>
>
>> ttm_bo_set_bulk_move(&entry->bo->tbo, NULL);
>>         spin_lock(&entry->vm->status_lock);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
>> index 569681badd7c..54ec81d30034 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
>> @@ -31,6 +31,7 @@
>>   struct amdgpu_tlb_fence {
>>       struct dma_fence    base;
>>       struct amdgpu_device    *adev;
>> +    struct amdgpu_vm    *vm;
>
> Big NAK to that. The VM might not live long enough to see the end of 
> the TLB flush.
>
> Regards,
> Christian.
>
>>       struct dma_fence    *dependency;
>>       struct work_struct    work;
>>       spinlock_t        lock;
>> @@ -51,6 +52,7 @@ static const char 
>> *amdgpu_tlb_fence_get_timeline_name(struct dma_fence *f)
>>   static void amdgpu_tlb_fence_work(struct work_struct *work)
>>   {
>>       struct amdgpu_tlb_fence *f = container_of(work, typeof(*f), work);
>> +    struct amdgpu_vm *vm = f->vm;
>>       int r;
>>         r = amdgpu_gmc_flush_gpu_tlb_pasid(f->adev, f->pasid, 2, 
>> true, 0);
>> @@ -62,6 +64,10 @@ static void amdgpu_tlb_fence_work(struct 
>> work_struct *work)
>>         dma_fence_signal(&f->base);
>>       dma_fence_put(&f->base);
>> +
>> +    mutex_lock(&vm->tlb_flush_lock);
>> +    vm->tlb_fence_last = NULL;
>> +    mutex_unlock(&vm->tlb_flush_lock);
>>   }
>>     static const struct dma_fence_ops amdgpu_tlb_fence_ops = {
>> @@ -92,6 +98,7 @@ void amdgpu_vm_tlb_fence_create(struct 
>> amdgpu_device *adev, struct amdgpu_vm *vm
>>       f->adev = adev;
>>       f->dependency = *fence;
>>       f->pasid = vm->pasid;
>> +    f->vm = vm;
>>       INIT_WORK(&f->work, amdgpu_tlb_fence_work);
>>       spin_lock_init(&f->lock);
>>   @@ -99,8 +106,10 @@ void amdgpu_vm_tlb_fence_create(struct 
>> amdgpu_device *adev, struct amdgpu_vm *vm
>>                  vm->tlb_fence_context, atomic64_read(&vm->tlb_seq));
>>         /* TODO: We probably need a separate wq here */
>> -    dma_fence_get(&f->base);
>> -    schedule_work(&f->work);
>> +    mutex_lock(&vm->tlb_flush_lock);
>> +    vm->tlb_fence_last = dma_fence_get(&f->base);
>> +    mutex_unlock(&vm->tlb_flush_lock);
>>   +    schedule_work(&f->work);
>>       *fence = &f->base;
>>   }
>