[PATCH v7 1/2] drm/amdgpu: implement TLB flush fence

Mon Mar 18 15:19:40 UTC 2024

On 18/03/2024 15:58, Christian König wrote:
> Am 18.03.24 um 13:08 schrieb Shashank Sharma:
>> From: Christian Koenig <christian.koenig at amd.com>
>>
>> The problem is that when (for example) 4k pages are replaced
>> with a single 2M page we need to wait for change to be flushed
>> out by invalidating the TLB before the PT can be freed.
>>
>> Solve this by moving the TLB flush into a DMA-fence object which
>> can be used to delay the freeing of the PT BOs until it is signaled.
>>
>> V2: (Shashank)
>>      - rebase
>>      - set dma_fence_error only in case of error
>>      - add tlb_flush fence only when PT/PD BO is locked (Felix)
>>      - use vm->pasid when f is NULL (Mukul)
>>
>> V4: - add a wait for (f->dependency) in tlb_fence_work (Christian)
>>      - move the misplaced fence_create call to the end (Philip)
>>
>> V5: - free the f->dependency properly
>>
>> V6: (Shashank)
>>      - light code movement, moved all the clean-up in previous patch
>>      - introduce params.needs_flush and its usage in this patch
>>      - rebase without TLB HW sequence patch
>>
>> V7:
>>     - Keep the vm->last_update_fence and tlb_cb code until
>>       we can fix the HW sequencing (Christian)
>>     - Move all the tlb_fence related code in a separate function so that
>>       its easier to read and review
>>
>> Cc: Christian Koenig <christian.koenig at amd.com>
>> Cc: Felix Kuehling <Felix.Kuehling at amd.com>
>> Cc: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
>> Cc: Alex Deucher <alexander.deucher at amd.com>
>> Acked-by: Felix Kuehling <Felix.Kuehling at amd.com>
>> Acked-by: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
>> Tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
>> Reviewed-by: Shashank Sharma <shashank.sharma at amd.com>
>> Signed-off-by: Christian Koenig <christian.koenig at amd.com>
>> Signed-off-by: Shashank Sharma <shashank.sharma at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/Makefile           |   3 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        |  68 +++++++----
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h        |   8 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c    |   4 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c     |   2 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c   |   4 +
>>   .../gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c  | 112 ++++++++++++++++++
>>   7 files changed, 171 insertions(+), 30 deletions(-)
>>   create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
>> b/drivers/gpu/drm/amd/amdgpu/Makefile
>> index 4536c8ad0e11..f24f11ac3e92 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
>> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
>> @@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o 
>> amdgpu_kms.o \
>>       amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \
>>       atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \
>>       atombios_encoders.o amdgpu_sa.o atombios_i2c.o \
>> -    amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o 
>> amdgpu_pll.o \
>> +    amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_vm_tlb_fence.o \
>> +    amdgpu_ib.o amdgpu_pll.o \
>>       amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
>>       amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o 
>> amdgpu_virt.o \
>>       amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index 81fb3465e197..26f1c3359642 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -885,6 +885,40 @@ static void amdgpu_vm_tlb_seq_cb(struct 
>> dma_fence *fence,
>>       kfree(tlb_cb);
>>   }
>>   +static int
>> +amdgpu_vm_tlb_flush(struct amdgpu_vm_update_params *params, struct 
>> dma_fence **fence)
>> +{
>> +    struct amdgpu_vm_tlb_seq_struct *tlb_cb;
>> +    struct amdgpu_vm *vm = params->vm;
>> +
>> +    if (!fence || !*fence)
>> +        return 0;
>> +
>> +    tlb_cb = kmalloc(sizeof(*tlb_cb), GFP_KERNEL);
>> +    if (!tlb_cb)
>> +        return -ENOMEM;
>
> That won't work like this. The page tables are already updated, so you 
> need to have the callback no matter what.
>
> That's why the code previously allocated the tlb_cb structure before 
> updating the page tables.

I could see from the previous code that there was also handling of cb 
being OOM, but I think your point is 'do not start the PT update until 
we have memory for callback', so I will accommodate that.

- Shashank

>
> Regards,
> Christian.
>
>> +
>> +    tlb_cb->vm = vm;
>> +    if (!dma_fence_add_callback(*fence, &tlb_cb->cb,
>> +                    amdgpu_vm_tlb_seq_cb)) {
>> +        dma_fence_put(vm->last_tlb_flush);
>> +        vm->last_tlb_flush = dma_fence_get(*fence);
>> +    } else {
>> +        amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
>> +    }
>> +
>> +    /* Prepare a TLB flush fence to be attached to PTs */
>> +    if (!params->unlocked && vm->is_compute_context) {
>> +        amdgpu_vm_tlb_fence_create(params->adev, vm, fence);
>> +
>> +        /* Makes sure no PD/PT is freed before the flush */
>> +        dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence,
>> +                   DMA_RESV_USAGE_BOOKKEEP);
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>   /**
>>    * amdgpu_vm_update_range - update a range in the vm page table
>>    *
>> @@ -917,7 +951,6 @@ int amdgpu_vm_update_range(struct amdgpu_device 
>> *adev, struct amdgpu_vm *vm,
>>                  struct dma_fence **fence)
>>   {
>>       struct amdgpu_vm_update_params params;
>> -    struct amdgpu_vm_tlb_seq_struct *tlb_cb;
>>       struct amdgpu_res_cursor cursor;
>>       enum amdgpu_sync_mode sync_mode;
>>       int r, idx;
>> @@ -925,12 +958,6 @@ int amdgpu_vm_update_range(struct amdgpu_device 
>> *adev, struct amdgpu_vm *vm,
>>       if (!drm_dev_enter(adev_to_drm(adev), &idx))
>>           return -ENODEV;
>>   -    tlb_cb = kmalloc(sizeof(*tlb_cb), GFP_KERNEL);
>> -    if (!tlb_cb) {
>> -        r = -ENOMEM;
>> -        goto error_unlock;
>> -    }
>> -
>>       /* Vega20+XGMI where PTEs get inadvertently cached in L2 
>> texture cache,
>>        * heavy-weight flush TLB unconditionally.
>>        */
>> @@ -948,6 +975,7 @@ int amdgpu_vm_update_range(struct amdgpu_device 
>> *adev, struct amdgpu_vm *vm,
>>       params.immediate = immediate;
>>       params.pages_addr = pages_addr;
>>       params.unlocked = unlocked;
>> +    params.needs_flush = flush_tlb;
>>       params.allow_override = allow_override;
>>         /* Implicitly sync to command submissions in the same VM before
>> @@ -961,7 +989,7 @@ int amdgpu_vm_update_range(struct amdgpu_device 
>> *adev, struct amdgpu_vm *vm,
>>       amdgpu_vm_eviction_lock(vm);
>>       if (vm->evicting) {
>>           r = -EBUSY;
>> -        goto error_free;
>> +        goto error_unlock;
>>       }
>>         if (!unlocked && !dma_fence_is_signaled(vm->last_unlocked)) {
>> @@ -974,7 +1002,7 @@ int amdgpu_vm_update_range(struct amdgpu_device 
>> *adev, struct amdgpu_vm *vm,
>>         r = vm->update_funcs->prepare(&params, resv, sync_mode);
>>       if (r)
>> -        goto error_free;
>> +        goto error_unlock;
>>         amdgpu_res_first(pages_addr ? NULL : res, offset,
>>                (last - start + 1) * AMDGPU_GPU_PAGE_SIZE, &cursor);
>> @@ -1024,29 +1052,18 @@ int amdgpu_vm_update_range(struct 
>> amdgpu_device *adev, struct amdgpu_vm *vm,
>>           tmp = start + num_entries;
>>           r = amdgpu_vm_ptes_update(&params, start, tmp, addr, flags);
>>           if (r)
>> -            goto error_free;
>> +            goto error_unlock;
>>             amdgpu_res_next(&cursor, num_entries * 
>> AMDGPU_GPU_PAGE_SIZE);
>>           start = tmp;
>>       }
>>         r = vm->update_funcs->commit(&params, fence);
>> +    if (r)
>> +        goto error_unlock;
>>   -    if (flush_tlb || params.table_freed) {
>> -        tlb_cb->vm = vm;
>> -        if (fence && *fence &&
>> -            !dma_fence_add_callback(*fence, &tlb_cb->cb,
>> -                       amdgpu_vm_tlb_seq_cb)) {
>> -            dma_fence_put(vm->last_tlb_flush);
>> -            vm->last_tlb_flush = dma_fence_get(*fence);
>> -        } else {
>> -            amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
>> -        }
>> -        tlb_cb = NULL;
>> -    }
>> -
>> -error_free:
>> -    kfree(tlb_cb);
>> +    if (params.needs_flush)
>> +        r = amdgpu_vm_tlb_flush(&params, fence);
>>     error_unlock:
>>       amdgpu_vm_eviction_unlock(vm);
>> @@ -2391,6 +2408,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, 
>> struct amdgpu_vm *vm,
>>         mutex_init(&vm->eviction_lock);
>>       vm->evicting = false;
>> +    vm->tlb_fence_context = dma_fence_context_alloc(1);
>>         r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level,
>>                   false, &root, xcp_id);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> index 8efa8422f4f7..b0a4fe683352 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> @@ -257,9 +257,9 @@ struct amdgpu_vm_update_params {
>>       unsigned int num_dw_left;
>>         /**
>> -     * @table_freed: return true if page table is freed when updating
>> +     * @needs_flush: true whenever we need to invalidate the TLB
>>        */
>> -    bool table_freed;
>> +    bool needs_flush;
>>         /**
>>        * @allow_override: true for memory that is not uncached: 
>> allows MTYPE
>> @@ -342,6 +342,7 @@ struct amdgpu_vm {
>>       atomic64_t        tlb_seq;
>>       struct dma_fence    *last_tlb_flush;
>>       atomic64_t        kfd_last_flushed_seq;
>> +    uint64_t        tlb_fence_context;
>>         /* How many times we had to re-generate the page tables */
>>       uint64_t        generation;
>> @@ -611,5 +612,8 @@ void amdgpu_vm_update_fault_cache(struct 
>> amdgpu_device *adev,
>>                     uint64_t addr,
>>                     uint32_t status,
>>                     unsigned int vmhub);
>> +void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev,
>> +                 struct amdgpu_vm *vm,
>> +                 struct dma_fence **fence);
>>     #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
>> index 6e31621452de..3895bd7d176a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
>> @@ -108,7 +108,9 @@ static int amdgpu_vm_cpu_update(struct 
>> amdgpu_vm_update_params *p,
>>   static int amdgpu_vm_cpu_commit(struct amdgpu_vm_update_params *p,
>>                   struct dma_fence **fence)
>>   {
>> -    /* Flush HDP */
>> +    if (p->needs_flush)
>> +        atomic64_inc(&p->vm->tlb_seq);
>> +
>>       mb();
>>       amdgpu_device_flush_hdp(p->adev, NULL);
>>       return 0;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>> index 124389a6bf48..601df0ce8290 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>> @@ -972,7 +972,7 @@ int amdgpu_vm_ptes_update(struct 
>> amdgpu_vm_update_params *params,
>>               while (cursor.pfn < frag_start) {
>>                   /* Make sure previous mapping is freed */
>>                   if (cursor.entry->bo) {
>> -                    params->table_freed = true;
>> +                    params->needs_flush = true;
>>                       amdgpu_vm_pt_free_dfs(adev, params->vm,
>>                                     &cursor,
>>                                     params->unlocked);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
>> index 349416e176a1..66e8a016126b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
>> @@ -126,6 +126,10 @@ static int amdgpu_vm_sdma_commit(struct 
>> amdgpu_vm_update_params *p,
>>         WARN_ON(ib->length_dw == 0);
>>       amdgpu_ring_pad_ib(ring, ib);
>> +
>> +    if (p->needs_flush)
>> +        atomic64_inc(&p->vm->tlb_seq);
>> +
>>       WARN_ON(ib->length_dw > p->num_dw_left);
>>       f = amdgpu_job_submit(p->job);
>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
>> new file mode 100644
>> index 000000000000..51cddfa3f1e8
>> --- /dev/null
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
>> @@ -0,0 +1,112 @@
>> +// SPDX-License-Identifier: GPL-2.0 OR MIT
>> +/*
>> + * Copyright 2023 Advanced Micro Devices, Inc.
>> + *
>> + * Permission is hereby granted, free of charge, to any person 
>> obtaining a
>> + * copy of this software and associated documentation files (the 
>> "Software"),
>> + * to deal in the Software without restriction, including without 
>> limitation
>> + * the rights to use, copy, modify, merge, publish, distribute, 
>> sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom 
>> the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice shall be 
>> included in
>> + * all copies or substantial portions of the Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
>> EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
>> MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO 
>> EVENT SHALL
>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, 
>> DAMAGES OR
>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
>> OTHERWISE,
>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
>> USE OR
>> + * OTHER DEALINGS IN THE SOFTWARE.
>> + */
>> +
>> +#include <linux/dma-fence.h>
>> +#include <linux/workqueue.h>
>> +
>> +#include "amdgpu.h"
>> +#include "amdgpu_vm.h"
>> +#include "amdgpu_gmc.h"
>> +
>> +struct amdgpu_tlb_fence {
>> +    struct dma_fence    base;
>> +    struct amdgpu_device    *adev;
>> +    struct dma_fence    *dependency;
>> +    struct work_struct    work;
>> +    spinlock_t        lock;
>> +    uint16_t        pasid;
>> +
>> +};
>> +
>> +static const char *amdgpu_tlb_fence_get_driver_name(struct dma_fence 
>> *fence)
>> +{
>> +    return "amdgpu tlb fence";
>> +}
>> +
>> +static const char *amdgpu_tlb_fence_get_timeline_name(struct 
>> dma_fence *f)
>> +{
>> +    return "amdgpu tlb timeline";
>> +}
>> +
>> +static void amdgpu_tlb_fence_work(struct work_struct *work)
>> +{
>> +    struct amdgpu_tlb_fence *f = container_of(work, typeof(*f), work);
>> +    int r;
>> +
>> +    if (f->dependency) {
>> +        dma_fence_wait(f->dependency, false);
>> +        dma_fence_put(f->dependency);
>> +        f->dependency = NULL;
>> +    }
>> +
>> +    r = amdgpu_gmc_flush_gpu_tlb_pasid(f->adev, f->pasid, 2, true, 0);
>> +    if (r) {
>> +        dev_err(f->adev->dev, "TLB flush failed for PASID %d.\n",
>> +            f->pasid);
>> +        dma_fence_set_error(&f->base, r);
>> +    }
>> +
>> +    dma_fence_signal(&f->base);
>> +    dma_fence_put(&f->base);
>> +}
>> +
>> +static const struct dma_fence_ops amdgpu_tlb_fence_ops = {
>> +    .use_64bit_seqno = true,
>> +    .get_driver_name = amdgpu_tlb_fence_get_driver_name,
>> +    .get_timeline_name = amdgpu_tlb_fence_get_timeline_name
>> +};
>> +
>> +void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, struct 
>> amdgpu_vm *vm,
>> +                struct dma_fence **fence)
>> +{
>> +    struct amdgpu_tlb_fence *f;
>> +
>> +    f = kmalloc(sizeof(*f), GFP_KERNEL);
>> +    if (!f) {
>> +        /*
>> +         * We can't fail since the PDEs and PTEs are already 
>> updated, so
>> +         * just block for the dependency and execute the TLB flush
>> +         */
>> +        if (*fence)
>> +            dma_fence_wait(*fence, false);
>> +
>> +        amdgpu_gmc_flush_gpu_tlb_pasid(adev, vm->pasid, 2, true, 0);
>> +        *fence = dma_fence_get_stub();
>> +        return;
>> +    }
>> +
>> +    f->adev = adev;
>> +    f->dependency = *fence;
>> +    f->pasid = vm->pasid;
>> +    INIT_WORK(&f->work, amdgpu_tlb_fence_work);
>> +    spin_lock_init(&f->lock);
>> +
>> +    dma_fence_init(&f->base, &amdgpu_tlb_fence_ops, &f->lock,
>> +               vm->tlb_fence_context, atomic64_read(&vm->tlb_seq));
>> +
>> +    /* TODO: We probably need a separate wq here */
>> +    dma_fence_get(&f->base);
>> +    schedule_work(&f->work);
>> +
>> +    *fence = &f->base;
>> +}
>