[PATCH v9 1/2] drm/amdgpu: implement TLB flush fence

Mon Mar 18 17:30:03 UTC 2024

[AMD Official Use Only - General]

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of
> Shashank Sharma
> Sent: Monday, March 18, 2024 12:12 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Koenig, Christian <Christian.Koenig at amd.com>; Kuehling, Felix
> <Felix.Kuehling at amd.com>; Bhardwaj, Rajneesh
> <Rajneesh.Bhardwaj at amd.com>; Deucher, Alexander
> <Alexander.Deucher at amd.com>; Sharma, Shashank
> <Shashank.Sharma at amd.com>
> Subject: [PATCH v9 1/2] drm/amdgpu: implement TLB flush fence
>
> Caution: This message originated from an External Source. Use proper caution
> when opening attachments, clicking links, or responding.
>
>
> From: Christian Koenig <christian.koenig at amd.com>
>
> The problem is that when (for example) 4k pages are replaced with a single 2M
> page we need to wait for change to be flushed out by invalidating the TLB
> before the PT can be freed.
>
> Solve this by moving the TLB flush into a DMA-fence object which can be used
> to delay the freeing of the PT BOs until it is signaled.
>
> V2: (Shashank)
>     - rebase
>     - set dma_fence_error only in case of error
>     - add tlb_flush fence only when PT/PD BO is locked (Felix)
>     - use vm->pasid when f is NULL (Mukul)
>
> V4: - add a wait for (f->dependency) in tlb_fence_work (Christian)
>     - move the misplaced fence_create call to the end (Philip)
>
> V5: - free the f->dependency properly
>
> V6: (Shashank)
>     - light code movement, moved all the clean-up in previous patch
>     - introduce params.needs_flush and its usage in this patch
>     - rebase without TLB HW sequence patch
>
> V7:
>    - Keep the vm->last_update_fence and tlb_cb code until
>      we can fix the HW sequencing (Christian)
>    - Move all the tlb_fence related code in a separate function so that
>      its easier to read and review
>
> V9: Addressed review comments from Christian
>     - start PT update only when we have callback memory allocated
>
> Cc: Christian Koenig <christian.koenig at amd.com>
> Cc: Felix Kuehling <Felix.Kuehling at amd.com>
> Cc: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
> Cc: Alex Deucher <alexander.deucher at amd.com>
> Acked-by: Felix Kuehling <Felix.Kuehling at amd.com>
> Acked-by: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
> Tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
> Reviewed-by: Shashank Sharma <shashank.sharma at amd.com>
> Signed-off-by: Christian Koenig <christian.koenig at amd.com>
> Signed-off-by: Shashank Sharma <shashank.sharma at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/Makefile           |   3 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        |  64 +++++++---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h        |   8 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c    |   4 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c     |   2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c   |   4 +
>  .../gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c  | 112
> ++++++++++++++++++
>  7 files changed, 175 insertions(+), 22 deletions(-)  create mode 100644
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile
> b/drivers/gpu/drm/amd/amdgpu/Makefile
> index 4536c8ad0e11..f24f11ac3e92 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> @@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o
> amdgpu_doorbell_mgr.o amdgpu_kms.o \
>         amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \
>         atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \
>         atombios_encoders.o amdgpu_sa.o atombios_i2c.o \
> -       amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o
> amdgpu_pll.o \
> +       amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o
> amdgpu_vm_tlb_fence.o \
> +       amdgpu_ib.o amdgpu_pll.o \
>         amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
>         amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o
> amdgpu_virt.o \
>         amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \ diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 81fb3465e197..104bf600c85f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -885,6 +885,44 @@ static void amdgpu_vm_tlb_seq_cb(struct
> dma_fence *fence,
>         kfree(tlb_cb);
>  }
>
> +/**
> + * amdgpu_vm_tlb_flush - prepare TLB flush
> + *
> + * @params: parameters for update
> + * @fence: input fence to sync TLB flush with
> + * @tlb_cb: the callback structure
> + *
> + * Increments the tlb sequence to make sure that future CS execute a VM
> flush.
> + */
> +static void
> +amdgpu_vm_tlb_flush(struct amdgpu_vm_update_params *params,
> +                   struct dma_fence **fence,
> +                   struct amdgpu_vm_tlb_seq_struct *tlb_cb) {
> +       struct amdgpu_vm *vm = params->vm;
> +
> +       if (!fence || !*fence)
> +               return;
> +
> +       tlb_cb->vm = vm;
> +       if (!dma_fence_add_callback(*fence, &tlb_cb->cb,
> +                                   amdgpu_vm_tlb_seq_cb)) {
> +               dma_fence_put(vm->last_tlb_flush);
> +               vm->last_tlb_flush = dma_fence_get(*fence);
> +       } else {
> +               amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
> +       }
> +
> +       /* Prepare a TLB flush fence to be attached to PTs */
> +       if (!params->unlocked && vm->is_compute_context) {
> +               amdgpu_vm_tlb_fence_create(params->adev, vm, fence);
> +
> +               /* Makes sure no PD/PT is freed before the flush */
> +               dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence,
> +                                  DMA_RESV_USAGE_BOOKKEEP);
> +       }
> +}
> +
>  /**
>   * amdgpu_vm_update_range - update a range in the vm page table
>   *
> @@ -916,8 +954,8 @@ int amdgpu_vm_update_range(struct
> amdgpu_device *adev, struct amdgpu_vm *vm,
>                            struct ttm_resource *res, dma_addr_t *pages_addr,
>                            struct dma_fence **fence)  {
> -       struct amdgpu_vm_update_params params;
>         struct amdgpu_vm_tlb_seq_struct *tlb_cb;
> +       struct amdgpu_vm_update_params params;
>         struct amdgpu_res_cursor cursor;
>         enum amdgpu_sync_mode sync_mode;
>         int r, idx;
> @@ -926,10 +964,8 @@ int amdgpu_vm_update_range(struct
> amdgpu_device *adev, struct amdgpu_vm *vm,
>                 return -ENODEV;
>
>         tlb_cb = kmalloc(sizeof(*tlb_cb), GFP_KERNEL);
> -       if (!tlb_cb) {
> -               r = -ENOMEM;
> -               goto error_unlock;
> -       }
> +       if (!tlb_cb)
> +               return -ENOMEM;
>
Do you not need to call drm_dev_exit() if this tlb_cb allocation fails?

Regards,
Mukul

>         /* Vega20+XGMI where PTEs get inadvertently cached in L2 texture cache,
>          * heavy-weight flush TLB unconditionally.
> @@ -948,6 +984,7 @@ int amdgpu_vm_update_range(struct
> amdgpu_device *adev, struct amdgpu_vm *vm,
>         params.immediate = immediate;
>         params.pages_addr = pages_addr;
>         params.unlocked = unlocked;
> +       params.needs_flush = flush_tlb;
>         params.allow_override = allow_override;
>
>         /* Implicitly sync to command submissions in the same VM before @@ -
> 1031,24 +1068,16 @@ int amdgpu_vm_update_range(struct amdgpu_device
> *adev, struct amdgpu_vm *vm,
>         }
>
>         r = vm->update_funcs->commit(&params, fence);
> +       if (r)
> +               goto error_free;
>
> -       if (flush_tlb || params.table_freed) {
> -               tlb_cb->vm = vm;
> -               if (fence && *fence &&
> -                   !dma_fence_add_callback(*fence, &tlb_cb->cb,
> -                                          amdgpu_vm_tlb_seq_cb)) {
> -                       dma_fence_put(vm->last_tlb_flush);
> -                       vm->last_tlb_flush = dma_fence_get(*fence);
> -               } else {
> -                       amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
> -               }
> +       if (params.needs_flush) {
> +               amdgpu_vm_tlb_flush(&params, fence, tlb_cb);
>                 tlb_cb = NULL;
>         }
>
>  error_free:
>         kfree(tlb_cb);
> -
> -error_unlock:
>         amdgpu_vm_eviction_unlock(vm);
>         drm_dev_exit(idx);
>         return r;
> @@ -2391,6 +2420,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev,
> struct amdgpu_vm *vm,
>
>         mutex_init(&vm->eviction_lock);
>         vm->evicting = false;
> +       vm->tlb_fence_context = dma_fence_context_alloc(1);
>
>         r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level,
>                                 false, &root, xcp_id); diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 8efa8422f4f7..b0a4fe683352 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -257,9 +257,9 @@ struct amdgpu_vm_update_params {
>         unsigned int num_dw_left;
>
>         /**
> -        * @table_freed: return true if page table is freed when updating
> +        * @needs_flush: true whenever we need to invalidate the TLB
>          */
> -       bool table_freed;
> +       bool needs_flush;
>
>         /**
>          * @allow_override: true for memory that is not uncached: allows MTYPE
> @@ -342,6 +342,7 @@ struct amdgpu_vm {
>         atomic64_t              tlb_seq;
>         struct dma_fence        *last_tlb_flush;
>         atomic64_t              kfd_last_flushed_seq;
> +       uint64_t                tlb_fence_context;
>
>         /* How many times we had to re-generate the page tables */
>         uint64_t                generation;
> @@ -611,5 +612,8 @@ void amdgpu_vm_update_fault_cache(struct
> amdgpu_device *adev,
>                                   uint64_t addr,
>                                   uint32_t status,
>                                   unsigned int vmhub);
> +void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev,
> +                                struct amdgpu_vm *vm,
> +                                struct dma_fence **fence);
>
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
> index 6e31621452de..3895bd7d176a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
> @@ -108,7 +108,9 @@ static int amdgpu_vm_cpu_update(struct
> amdgpu_vm_update_params *p,  static int amdgpu_vm_cpu_commit(struct
> amdgpu_vm_update_params *p,
>                                 struct dma_fence **fence)  {
> -       /* Flush HDP */
> +       if (p->needs_flush)
> +               atomic64_inc(&p->vm->tlb_seq);
> +
>         mb();
>         amdgpu_device_flush_hdp(p->adev, NULL);
>         return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> index 124389a6bf48..601df0ce8290 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> @@ -972,7 +972,7 @@ int amdgpu_vm_ptes_update(struct
> amdgpu_vm_update_params *params,
>                         while (cursor.pfn < frag_start) {
>                                 /* Make sure previous mapping is freed */
>                                 if (cursor.entry->bo) {
> -                                       params->table_freed = true;
> +                                       params->needs_flush = true;
>                                         amdgpu_vm_pt_free_dfs(adev, params->vm,
>                                                               &cursor,
>                                                               params->unlocked); diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> index 349416e176a1..66e8a016126b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> @@ -126,6 +126,10 @@ static int amdgpu_vm_sdma_commit(struct
> amdgpu_vm_update_params *p,
>
>         WARN_ON(ib->length_dw == 0);
>         amdgpu_ring_pad_ib(ring, ib);
> +
> +       if (p->needs_flush)
> +               atomic64_inc(&p->vm->tlb_seq);
> +
>         WARN_ON(ib->length_dw > p->num_dw_left);
>         f = amdgpu_job_submit(p->job);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
> new file mode 100644
> index 000000000000..51cddfa3f1e8
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
> @@ -0,0 +1,112 @@
> +// SPDX-License-Identifier: GPL-2.0 OR MIT
> +/*
> + * Copyright 2023 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person
> +obtaining a
> + * copy of this software and associated documentation files (the
> +"Software"),
> + * to deal in the Software without restriction, including without
> +limitation
> + * the rights to use, copy, modify, merge, publish, distribute,
> +sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom
> +the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be
> +included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> +EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> +MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
> EVENT
> +SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
> +DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> +OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
> THE USE
> +OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include <linux/dma-fence.h>
> +#include <linux/workqueue.h>
> +
> +#include "amdgpu.h"
> +#include "amdgpu_vm.h"
> +#include "amdgpu_gmc.h"
> +
> +struct amdgpu_tlb_fence {
> +       struct dma_fence        base;
> +       struct amdgpu_device    *adev;
> +       struct dma_fence        *dependency;
> +       struct work_struct      work;
> +       spinlock_t              lock;
> +       uint16_t                pasid;
> +
> +};
> +
> +static const char *amdgpu_tlb_fence_get_driver_name(struct dma_fence
> +*fence) {
> +       return "amdgpu tlb fence";
> +}
> +
> +static const char *amdgpu_tlb_fence_get_timeline_name(struct dma_fence
> +*f) {
> +       return "amdgpu tlb timeline";
> +}
> +
> +static void amdgpu_tlb_fence_work(struct work_struct *work) {
> +       struct amdgpu_tlb_fence *f = container_of(work, typeof(*f), work);
> +       int r;
> +
> +       if (f->dependency) {
> +               dma_fence_wait(f->dependency, false);
> +               dma_fence_put(f->dependency);
> +               f->dependency = NULL;
> +       }
> +
> +       r = amdgpu_gmc_flush_gpu_tlb_pasid(f->adev, f->pasid, 2, true, 0);
> +       if (r) {
> +               dev_err(f->adev->dev, "TLB flush failed for PASID %d.\n",
> +                       f->pasid);
> +               dma_fence_set_error(&f->base, r);
> +       }
> +
> +       dma_fence_signal(&f->base);
> +       dma_fence_put(&f->base);
> +}
> +
> +static const struct dma_fence_ops amdgpu_tlb_fence_ops = {
> +       .use_64bit_seqno = true,
> +       .get_driver_name = amdgpu_tlb_fence_get_driver_name,
> +       .get_timeline_name = amdgpu_tlb_fence_get_timeline_name
> +};
> +
> +void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, struct
> amdgpu_vm *vm,
> +                               struct dma_fence **fence) {
> +       struct amdgpu_tlb_fence *f;
> +
> +       f = kmalloc(sizeof(*f), GFP_KERNEL);
> +       if (!f) {
> +               /*
> +                * We can't fail since the PDEs and PTEs are already updated, so
> +                * just block for the dependency and execute the TLB flush
> +                */
> +               if (*fence)
> +                       dma_fence_wait(*fence, false);
> +
> +               amdgpu_gmc_flush_gpu_tlb_pasid(adev, vm->pasid, 2, true, 0);
> +               *fence = dma_fence_get_stub();
> +               return;
> +       }
> +
> +       f->adev = adev;
> +       f->dependency = *fence;
> +       f->pasid = vm->pasid;
> +       INIT_WORK(&f->work, amdgpu_tlb_fence_work);
> +       spin_lock_init(&f->lock);
> +
> +       dma_fence_init(&f->base, &amdgpu_tlb_fence_ops, &f->lock,
> +                      vm->tlb_fence_context,
> + atomic64_read(&vm->tlb_seq));
> +
> +       /* TODO: We probably need a separate wq here */
> +       dma_fence_get(&f->base);
> +       schedule_work(&f->work);
> +
> +       *fence = &f->base;
> +}
> --
> 2.43.2