[PATCH v3 1/3] drm/amdgpu: replace TLB seq callback with HW seq
Sharma, Shashank
Shashank.Sharma at amd.com
Mon Feb 26 16:22:09 UTC 2024
[AMD Official Use Only - General]
Please feel free to use:
Reviewed-by: Shashank Sharma <shashank.sharma at amd.com>
Regards
Shashank
________________________________
From: Christian König <ckoenig.leichtzumerken at gmail.com>
Sent: Monday, February 26, 2024 3:45 PM
To: Sharma, Shashank <Shashank.Sharma at amd.com>; amd-gfx at lists.freedesktop.org <amd-gfx at lists.freedesktop.org>
Cc: Koenig, Christian <Christian.Koenig at amd.com>; Deucher, Alexander <Alexander.Deucher at amd.com>; Kuehling, Felix <Felix.Kuehling at amd.com>; Bhardwaj, Rajneesh <Rajneesh.Bhardwaj at amd.com>
Subject: Re: [PATCH v3 1/3] drm/amdgpu: replace TLB seq callback with HW seq
Am 23.02.24 um 14:42 schrieb Shashank Sharma:
> From: Christian König <christian.koenig at amd.com>
>
> The callback we installed for the SDMA update were actually pretty
> horrible. since we now have seq64 use that one and HW seq writes
> instead.
>
> V2:(Shashank)
> - rebased on amd-drm-staging-next
> - changed amdgpu_seq64_gpu_addr
>
> Cc: Christian König <christian.koenig at amd.com>
> Cc: Alex Deucher <alexander.deucher at amd.com>
> Cc: Felix Kuehling <felix.kuehling at amd.com>
> Cc: Rajneesh Bhardwaj <rajneesh.bhardwaj at amd.com>
> Signed-off-by: Christian König <christian.koenig at amd.com>
Shashank can I get an rb on this patch here so that I can push it to
amd-staging-drm-next?
The patch was basically just to double check if the seq64 code works as
intended.
Thanks,
Christian.
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c | 14 ++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 79 ++++-----------------
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 27 ++-----
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c | 3 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 5 ++
> 7 files changed, 42 insertions(+), 89 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
> index 3d0d56087d41..300dc79fa2b9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.c
> @@ -199,6 +199,20 @@ void amdgpu_seq64_free(struct amdgpu_device *adev, u64 va)
> __clear_bit(bit_pos, adev->seq64.used);
> }
>
> +/**
> + * amdgpu_seq64_gpu_addr - Calculate GPU addr from va
> + *
> + * @adev: amdgpu_device pointer
> + * @va: virtual address in client address space
> + *
> + * Calculate the GART address for a VA.
> + */
> +u64 amdgpu_seq64_gpu_addr(struct amdgpu_device *adev, u64 va)
> +{
> + return va - amdgpu_seq64_get_va_base(adev) +
> + amdgpu_bo_gpu_offset(adev->seq64.sbo);
> +}
> +
> /**
> * amdgpu_seq64_fini - Cleanup seq64 driver
> *
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h
> index 4203b2ab318d..63e8ac0a2057 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_seq64.h
> @@ -43,6 +43,7 @@ void amdgpu_seq64_free(struct amdgpu_device *adev, u64 gpu_addr);
> int amdgpu_seq64_map(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> struct amdgpu_bo_va **bo_va);
> void amdgpu_seq64_unmap(struct amdgpu_device *adev, struct amdgpu_fpriv *fpriv);
> +u64 amdgpu_seq64_gpu_addr(struct amdgpu_device *adev, u64 va);
>
> #endif
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index ed4a8c5d26d7..0960e0a665d3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -111,21 +111,6 @@ struct amdgpu_prt_cb {
> struct dma_fence_cb cb;
> };
>
> -/**
> - * struct amdgpu_vm_tlb_seq_struct - Helper to increment the TLB flush sequence
> - */
> -struct amdgpu_vm_tlb_seq_struct {
> - /**
> - * @vm: pointer to the amdgpu_vm structure to set the fence sequence on
> - */
> - struct amdgpu_vm *vm;
> -
> - /**
> - * @cb: callback
> - */
> - struct dma_fence_cb cb;
> -};
> -
> /**
> * amdgpu_vm_set_pasid - manage pasid and vm ptr mapping
> *
> @@ -862,23 +847,6 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev,
> return r;
> }
>
> -/**
> - * amdgpu_vm_tlb_seq_cb - make sure to increment tlb sequence
> - * @fence: unused
> - * @cb: the callback structure
> - *
> - * Increments the tlb sequence to make sure that future CS execute a VM flush.
> - */
> -static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence,
> - struct dma_fence_cb *cb)
> -{
> - struct amdgpu_vm_tlb_seq_struct *tlb_cb;
> -
> - tlb_cb = container_of(cb, typeof(*tlb_cb), cb);
> - atomic64_inc(&tlb_cb->vm->tlb_seq);
> - kfree(tlb_cb);
> -}
> -
> /**
> * amdgpu_vm_update_range - update a range in the vm page table
> *
> @@ -911,7 +879,6 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> struct dma_fence **fence)
> {
> struct amdgpu_vm_update_params params;
> - struct amdgpu_vm_tlb_seq_struct *tlb_cb;
> struct amdgpu_res_cursor cursor;
> enum amdgpu_sync_mode sync_mode;
> int r, idx;
> @@ -919,12 +886,6 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> if (!drm_dev_enter(adev_to_drm(adev), &idx))
> return -ENODEV;
>
> - tlb_cb = kmalloc(sizeof(*tlb_cb), GFP_KERNEL);
> - if (!tlb_cb) {
> - r = -ENOMEM;
> - goto error_unlock;
> - }
> -
> /* Vega20+XGMI where PTEs get inadvertently cached in L2 texture cache,
> * heavy-weight flush TLB unconditionally.
> */
> @@ -942,6 +903,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> params.immediate = immediate;
> params.pages_addr = pages_addr;
> params.unlocked = unlocked;
> + params.needs_flush = flush_tlb;
> params.allow_override = allow_override;
>
> /* Implicitly sync to command submissions in the same VM before
> @@ -955,7 +917,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> amdgpu_vm_eviction_lock(vm);
> if (vm->evicting) {
> r = -EBUSY;
> - goto error_free;
> + goto error_unlock;
> }
>
> if (!unlocked && !dma_fence_is_signaled(vm->last_unlocked)) {
> @@ -968,7 +930,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>
> r = vm->update_funcs->prepare(¶ms, resv, sync_mode);
> if (r)
> - goto error_free;
> + goto error_unlock;
>
> amdgpu_res_first(pages_addr ? NULL : res, offset,
> (last - start + 1) * AMDGPU_GPU_PAGE_SIZE, &cursor);
> @@ -1018,7 +980,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> tmp = start + num_entries;
> r = amdgpu_vm_ptes_update(¶ms, start, tmp, addr, flags);
> if (r)
> - goto error_free;
> + goto error_unlock;
>
> amdgpu_res_next(&cursor, num_entries * AMDGPU_GPU_PAGE_SIZE);
> start = tmp;
> @@ -1026,22 +988,6 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>
> r = vm->update_funcs->commit(¶ms, fence);
>
> - if (flush_tlb || params.table_freed) {
> - tlb_cb->vm = vm;
> - if (fence && *fence &&
> - !dma_fence_add_callback(*fence, &tlb_cb->cb,
> - amdgpu_vm_tlb_seq_cb)) {
> - dma_fence_put(vm->last_tlb_flush);
> - vm->last_tlb_flush = dma_fence_get(*fence);
> - } else {
> - amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
> - }
> - tlb_cb = NULL;
> - }
> -
> -error_free:
> - kfree(tlb_cb);
> -
> error_unlock:
> amdgpu_vm_eviction_unlock(vm);
> drm_dev_exit(idx);
> @@ -2260,10 +2206,14 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> INIT_WORK(&vm->pt_free_work, amdgpu_vm_pt_free_work);
> INIT_KFIFO(vm->faults);
>
> - r = amdgpu_vm_init_entities(adev, vm);
> + r = amdgpu_seq64_alloc(adev, &vm->tlb_seq_va, &vm->tlb_seq_cpu_addr);
> if (r)
> return r;
>
> + r = amdgpu_vm_init_entities(adev, vm);
> + if (r)
> + goto error_free_seq;
> +
> vm->pte_support_ats = false;
> vm->is_compute_context = false;
>
> @@ -2283,7 +2233,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>
> vm->last_update = dma_fence_get_stub();
> vm->last_unlocked = dma_fence_get_stub();
> - vm->last_tlb_flush = dma_fence_get_stub();
> vm->generation = 0;
>
> mutex_init(&vm->eviction_lock);
> @@ -2322,10 +2271,11 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> amdgpu_bo_unref(&root_bo);
>
> error_free_delayed:
> - dma_fence_put(vm->last_tlb_flush);
> dma_fence_put(vm->last_unlocked);
> amdgpu_vm_fini_entities(vm);
>
> +error_free_seq:
> + amdgpu_seq64_free(adev, vm->tlb_seq_va);
> return r;
> }
>
> @@ -2441,7 +2391,6 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
> struct amdgpu_bo_va_mapping *mapping, *tmp;
> bool prt_fini_needed = !!adev->gmc.gmc_funcs->set_prt;
> struct amdgpu_bo *root;
> - unsigned long flags;
> int i;
>
> amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm);
> @@ -2453,11 +2402,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
> amdgpu_vm_set_pasid(adev, vm, 0);
> dma_fence_wait(vm->last_unlocked, false);
> dma_fence_put(vm->last_unlocked);
> - dma_fence_wait(vm->last_tlb_flush, false);
> - /* Make sure that all fence callbacks have completed */
> - spin_lock_irqsave(vm->last_tlb_flush->lock, flags);
> - spin_unlock_irqrestore(vm->last_tlb_flush->lock, flags);
> - dma_fence_put(vm->last_tlb_flush);
> + amdgpu_seq64_free(adev, vm->tlb_seq_va);
>
> list_for_each_entry_safe(mapping, tmp, &vm->freed, list) {
> if (mapping->flags & AMDGPU_PTE_PRT && prt_fini_needed) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 666698a57192..ac9380afcb69 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -247,9 +247,9 @@ struct amdgpu_vm_update_params {
> unsigned int num_dw_left;
>
> /**
> - * @table_freed: return true if page table is freed when updating
> + * @needs_flush: true whenever we need to invalidate the TLB
> */
> - bool table_freed;
> + bool needs_flush;
>
> /**
> * @allow_override: true for memory that is not uncached: allows MTYPE
> @@ -328,9 +328,11 @@ struct amdgpu_vm {
> struct drm_sched_entity immediate;
> struct drm_sched_entity delayed;
>
> - /* Last finished delayed update */
> + /* Sequence number indicating necessary TLB flush */
> atomic64_t tlb_seq;
> - struct dma_fence *last_tlb_flush;
> + uint64_t tlb_seq_va;
> + uint64_t *tlb_seq_cpu_addr;
> +
> atomic64_t kfd_last_flushed_seq;
>
> /* How many times we had to re-generate the page tables */
> @@ -549,22 +551,7 @@ int amdgpu_vm_pt_map_tables(struct amdgpu_device *adev, struct amdgpu_vm *vm);
> */
> static inline uint64_t amdgpu_vm_tlb_seq(struct amdgpu_vm *vm)
> {
> - unsigned long flags;
> - spinlock_t *lock;
> -
> - /*
> - * Workaround to stop racing between the fence signaling and handling
> - * the cb. The lock is static after initially setting it up, just make
> - * sure that the dma_fence structure isn't freed up.
> - */
> - rcu_read_lock();
> - lock = vm->last_tlb_flush->lock;
> - rcu_read_unlock();
> -
> - spin_lock_irqsave(lock, flags);
> - spin_unlock_irqrestore(lock, flags);
> -
> - return atomic64_read(&vm->tlb_seq);
> + return READ_ONCE(*vm->tlb_seq_cpu_addr);
> }
>
> /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
> index 6e31621452de..0f8fd0acef7b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
> @@ -108,7 +108,8 @@ static int amdgpu_vm_cpu_update(struct amdgpu_vm_update_params *p,
> static int amdgpu_vm_cpu_commit(struct amdgpu_vm_update_params *p,
> struct dma_fence **fence)
> {
> - /* Flush HDP */
> + if (p->needs_flush)
> + *p->vm->tlb_seq_cpu_addr = atomic64_inc_return(&p->vm->tlb_seq);
> mb();
> amdgpu_device_flush_hdp(p->adev, NULL);
> return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> index a160265ddc07..95dc0afdaffb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> @@ -1056,7 +1056,7 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
> while (cursor.pfn < frag_start) {
> /* Make sure previous mapping is freed */
> if (cursor.entry->bo) {
> - params->table_freed = true;
> + params->needs_flush = true;
> amdgpu_vm_pt_free_dfs(adev, params->vm,
> &cursor,
> params->unlocked);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> index 349416e176a1..91cc3121fd4b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> @@ -126,6 +126,11 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,
>
> WARN_ON(ib->length_dw == 0);
> amdgpu_ring_pad_ib(ring, ib);
> + if (p->needs_flush) {
> + p->job->uf_addr = amdgpu_seq64_gpu_addr(p->adev,
> + p->vm->tlb_seq_va);
> + p->job->uf_sequence = atomic64_inc_return(&p->vm->tlb_seq);
> + }
> WARN_ON(ib->length_dw > p->num_dw_left);
> f = amdgpu_job_submit(p->job);
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20240226/382589e8/attachment-0001.htm>
More information about the amd-gfx
mailing list