[PATCH v5 1/2] drm/amdgpu: implement TLB flush fence

Sharma, Shashank shashank.sharma at amd.com
Mon Mar 11 14:37:25 UTC 2024


On 07/03/2024 20:22, Philip Yang wrote:
>
>
> On 2024-03-06 09:41, Shashank Sharma wrote:
>> From: Christian König<christian.koenig at amd.com>
>>
>> The problem is that when (for example) 4k pages are replaced
>> with a single 2M page we need to wait for change to be flushed
>> out by invalidating the TLB before the PT can be freed.
>>
>> Solve this by moving the TLB flush into a DMA-fence object which
>> can be used to delay the freeing of the PT BOs until it is signaled.
>>
>> V2: (Shashank)
>>      - rebase
>>      - set dma_fence_error only in case of error
>>      - add tlb_flush fence only when PT/PD BO is locked (Felix)
>>      - use vm->pasid when f is NULL (Mukul)
>>
>> V4: - add a wait for (f->dependency) in tlb_fence_work (Christian)
>>      - move the misplaced fence_create call to the end (Philip)
>>
>> V5: - free the f->dependency properly (Christian)
>>
>> Cc: Christian Koenig<christian.koenig at amd.com>
>> Cc: Felix Kuehling<Felix.Kuehling at amd.com>
>> Cc: Rajneesh Bhardwaj<rajneesh.bhardwaj at amd.com>
>> Cc: Alex Deucher<alexander.deucher at amd.com>
>> Reviewed-by: Shashank Sharma<shashank.sharma at amd.com>
>> Signed-off-by: Christian König<christian.koenig at amd.com>
>> Signed-off-by: Shashank Sharma<shashank.sharma at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/Makefile           |   3 +-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        |  10 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h        |   4 +
>>   .../gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c  | 112 ++++++++++++++++++
>>   4 files changed, 128 insertions(+), 1 deletion(-)
>>   create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
>> index fa26a4e3a99d..91ab4cf29b5b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
>> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
>> @@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \
>>   	amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \
>>   	atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \
>>   	atombios_encoders.o amdgpu_sa.o atombios_i2c.o \
>> -	amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o amdgpu_pll.o \
>> +	amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_vm_tlb_fence.o \
>> +	amdgpu_ib.o amdgpu_pll.o \
>>   	amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
>>   	amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \
>>   	amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index 0960e0a665d3..310aae6fb49b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -988,6 +988,15 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>>   
>>   	r = vm->update_funcs->commit(&params, fence);
>>   
>> +	/* Prepare a TLB flush fence to be attached to PTs */
>> +	if (!unlocked && params.needs_flush && vm->is_compute_context) {
>> +		amdgpu_vm_tlb_fence_create(adev, vm, fence);
>> +
>> +		/* Makes sure no PD/PT is freed before the flush */
>> +		dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence,
>> +				   DMA_RESV_USAGE_BOOKKEEP);
>> +	}
>> +
>>   error_unlock:
>>   	amdgpu_vm_eviction_unlock(vm);
>>   	drm_dev_exit(idx);
>> @@ -2237,6 +2246,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>>   
>>   	mutex_init(&vm->eviction_lock);
>>   	vm->evicting = false;
>> +	vm->tlb_fence_context = dma_fence_context_alloc(1);
>>   
>>   	r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level,
>>   				false, &root, xcp_id);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> index 64b3f69efa57..298f604b8e5f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> @@ -341,6 +341,7 @@ struct amdgpu_vm {
>>   	atomic64_t		tlb_seq;
>>   	uint64_t		tlb_seq_va;
>>   	uint64_t		*tlb_seq_cpu_addr;
>> +	uint64_t		tlb_fence_context;
>>   
>>   	atomic64_t		kfd_last_flushed_seq;
>>   
>> @@ -594,5 +595,8 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
>>   				  uint64_t addr,
>>   				  uint32_t status,
>>   				  unsigned int vmhub);
>> +void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev,
>> +				 struct amdgpu_vm *vm,
>> +				 struct dma_fence **fence);
>>   
>>   #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
>> new file mode 100644
>> index 000000000000..51cddfa3f1e8
>> --- /dev/null
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
>> @@ -0,0 +1,112 @@
>> +// SPDX-License-Identifier: GPL-2.0 OR MIT
>> +/*
>> + * Copyright 2023 Advanced Micro Devices, Inc.
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a
>> + * copy of this software and associated documentation files (the "Software"),
>> + * to deal in the Software without restriction, including without limitation
>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice shall be included in
>> + * all copies or substantial portions of the Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
>> + * OTHER DEALINGS IN THE SOFTWARE.
>> + */
>> +
>> +#include <linux/dma-fence.h>
>> +#include <linux/workqueue.h>
>> +
>> +#include "amdgpu.h"
>> +#include "amdgpu_vm.h"
>> +#include "amdgpu_gmc.h"
>> +
>> +struct amdgpu_tlb_fence {
>> +	struct dma_fence	base;
>> +	struct amdgpu_device	*adev;
>> +	struct dma_fence	*dependency;
>> +	struct work_struct	work;
>> +	spinlock_t		lock;
>> +	uint16_t		pasid;
>> +
>> +};
>> +
>> +static const char *amdgpu_tlb_fence_get_driver_name(struct dma_fence *fence)
>> +{
>> +	return "amdgpu tlb fence";
>> +}
>> +
>> +static const char *amdgpu_tlb_fence_get_timeline_name(struct dma_fence *f)
>> +{
>> +	return "amdgpu tlb timeline";
>> +}
>> +
>> +static void amdgpu_tlb_fence_work(struct work_struct *work)
>> +{
>> +	struct amdgpu_tlb_fence *f = container_of(work, typeof(*f), work);
>> +	int r;
>> +
>> +	if (f->dependency) {
>> +		dma_fence_wait(f->dependency, false);
>> +		dma_fence_put(f->dependency);
>> +		f->dependency = NULL;
>> +	}
>> +
>> +	r = amdgpu_gmc_flush_gpu_tlb_pasid(f->adev, f->pasid, 2, true, 0);
>
> To flush all XCCs: since this is a corner case, we could start with this
> to make it correct for SPX mode for now, with an extra flush for the other
> modes.
>
>     int num_xcc = f->adev->gfx.xcc_mask ?
>                   NUM_XCC(f->adev->gfx.xcc_mask) : 1;
>     uint32_t xcc_mask = GENMASK(num_xcc - 1, 0);
>     int i;
>
>     for_each_inst(i, xcc_mask)
>         r = amdgpu_gmc_flush_gpu_tlb_pasid(f->adev, f->pasid,
>                                            TLB_FLUSH_LEGACY, true, i);

Thanks for this input, Philip.

@Christian, what is your feedback on this?
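
For reference, here is a rough, untested sketch of how that loop could be
folded into amdgpu_tlb_fence_work(). It assumes NUM_XCC(), GENMASK() and
for_each_inst() are usable from amdgpu_vm_tlb_fence.c, and it keeps the
flush type (2) used in the patch rather than TLB_FLUSH_LEGACY, since that
constant comes from the KFD headers:

static void amdgpu_tlb_fence_work(struct work_struct *work)
{
	struct amdgpu_tlb_fence *f = container_of(work, typeof(*f), work);
	uint32_t xcc_mask;
	int i, r = 0;

	if (f->dependency) {
		dma_fence_wait(f->dependency, false);
		dma_fence_put(f->dependency);
		f->dependency = NULL;
	}

	/* Flush on every XCC instance; in SPX mode this is a single flush */
	xcc_mask = f->adev->gfx.xcc_mask ?
		GENMASK(NUM_XCC(f->adev->gfx.xcc_mask) - 1, 0) : BIT(0);
	for_each_inst(i, xcc_mask) {
		/* same flush type and all_hub as in the patch above */
		r = amdgpu_gmc_flush_gpu_tlb_pasid(f->adev, f->pasid, 2, true, i);
		if (r)
			break;
	}

	if (r) {
		dev_err(f->adev->dev, "TLB flush failed for PASID %d.\n",
			f->pasid);
		dma_fence_set_error(&f->base, r);
	}

	dma_fence_signal(&f->base);
	dma_fence_put(&f->base);
}

One open question with this sketch: I am not sure whether we should bail out
on the first failed instance or still try to flush the remaining XCCs.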

- Shashank

>
> Regards,
> Philip
>> +	if (r) {
>> +		dev_err(f->adev->dev, "TLB flush failed for PASID %d.\n",
>> +			f->pasid);
>> +		dma_fence_set_error(&f->base, r);
>> +	}
>> +
>> +	dma_fence_signal(&f->base);
>> +	dma_fence_put(&f->base);
>> +}
>> +
>> +static const struct dma_fence_ops amdgpu_tlb_fence_ops = {
>> +	.use_64bit_seqno = true,
>> +	.get_driver_name = amdgpu_tlb_fence_get_driver_name,
>> +	.get_timeline_name = amdgpu_tlb_fence_get_timeline_name
>> +};
>> +
>> +void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>> +				struct dma_fence **fence)
>> +{
>> +	struct amdgpu_tlb_fence *f;
>> +
>> +	f = kmalloc(sizeof(*f), GFP_KERNEL);
>> +	if (!f) {
>> +		/*
>> +		 * We can't fail since the PDEs and PTEs are already updated, so
>> +		 * just block for the dependency and execute the TLB flush
>> +		 */
>> +		if (*fence)
>> +			dma_fence_wait(*fence, false);
>> +
>> +		amdgpu_gmc_flush_gpu_tlb_pasid(adev, vm->pasid, 2, true, 0);
>> +		*fence = dma_fence_get_stub();
>> +		return;
>> +	}
>> +
>> +	f->adev = adev;
>> +	f->dependency = *fence;
>> +	f->pasid = vm->pasid;
>> +	INIT_WORK(&f->work, amdgpu_tlb_fence_work);
>> +	spin_lock_init(&f->lock);
>> +
>> +	dma_fence_init(&f->base, &amdgpu_tlb_fence_ops, &f->lock,
>> +		       vm->tlb_fence_context, atomic64_read(&vm->tlb_seq));
>> +
>> +	/* TODO: We probably need a separate wq here */
>> +	dma_fence_get(&f->base);
>> +	schedule_work(&f->work);
>> +
>> +	*fence = &f->base;
>> +}

