[PATCH v3] drm/amdkfd: Change kfd/svm page fault drain handling
Christian König
ckoenig.leichtzumerken at gmail.com
Mon Jul 29 18:23:49 UTC 2024
On 25.07.24 at 20:19, Xiaogang.Chen wrote:
> From: Xiaogang Chen <xiaogang.chen at amd.com>
>
> When an app unmaps vm ranges (munmap), kfd/svm starts draining pending page
> faults and does not handle any incoming page fault of this process until a
> deferred work item gets executed by the default system wq. The period of
> "not handling page faults" can be long and is unpredictable, which is
> adverse to kfd page fault recovery performance.
>
> This patch uses the time stamp of the incoming page fault to decide whether
> to drop or handle the fault. When an app unmaps vm ranges, kfd records each
> gpu device's current ih ring time stamp. These time stamps are then used in
> the kfd page fault recovery routine.
>
> Any page fault that hits an unmapped range after the unmap event is an app
> bug (the app accesses a vm range after unmapping it). It is not the driver's
> job to cover that.
But that defers destroying the unmapped range until the IH ring is
cleared, doesn't it?
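To make the proposed flow concrete, here is a condensed sketch of what the
diff below does. The helper names are invented for illustration only; error
handling, locking, and the ih_soft fallback from the patch are omitted.

static void svm_record_unmap_checkpoints(struct svm_range_list *svms,
					 struct kfd_process *p)
{
	uint32_t i;

	/* munmap path: instead of scheduling a page fault drain in the
	 * deferred worker, remember how far each GPU's retry-fault IH ring
	 * has advanced at unmap time.
	 */
	for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) {
		struct amdgpu_device *adev;

		if (!p->pdds[i])
			continue;
		adev = p->pdds[i]->dev->adev;

		svms->checkpoint_ts[i] = amdgpu_ih_get_wptr(adev, &adev->irq.ih1);
	}
}

static bool svm_fault_predates_unmap(struct svm_range_list *svms,
				     int32_t gpuidx, uint64_t ts)
{
	uint64_t checkpoint = svms->checkpoint_ts[gpuidx];

	/* fault recovery path: a fault raised before the unmap checkpoint of
	 * that GPU targets memory the app already unmapped, so drop it.
	 * amdgpu_ih_ts_after(a, b) is true when b is the later time stamp.
	 */
	return checkpoint && amdgpu_ih_ts_after(ts, checkpoint);
}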
>
> By using the page fault time stamp we no longer need to drain page faults in
> the deferred work. So the time period during which kfd does not handle page
> faults is reduced and can be controlled.
>
> Signed-off-by: Xiaogang.Chen <Xiaogang.Chen at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +-
> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 3 +-
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 4 +-
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 5 +-
> drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 102 ++++++++++++++++---------
> drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 2 +-
> 7 files changed, 79 insertions(+), 43 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 3abfa66d72a2..d90b7ea3f020 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2763,7 +2763,7 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
> * shouldn't be reported any more.
> */
> bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
> - u32 vmid, u32 node_id, uint64_t addr,
> + u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
> bool write_fault)
> {
> bool is_compute_context = false;
> @@ -2789,7 +2789,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
> addr /= AMDGPU_GPU_PAGE_SIZE;
>
> if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid,
> - node_id, addr, write_fault)) {
> + node_id, addr, ts, write_fault)) {
> amdgpu_bo_unref(&root);
> return true;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 312a408b80d3..1d6a1381ede9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -548,7 +548,7 @@ amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm);
> void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info);
>
> bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
> - u32 vmid, u32 node_id, uint64_t addr,
> + u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
> bool write_fault);
>
> void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index d933e19e0cf5..3596cc2ee7e5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -132,7 +132,8 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
> /* Try to handle the recoverable page faults by filling page
> * tables
> */
> - if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault))
> + if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr,
> + entry->timestamp, write_fault))
> return 1;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 350f6b6676f1..ac08d9424feb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -595,7 +595,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
> cam_index = entry->src_data[2] & 0x3ff;
>
> ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
> - addr, write_fault);
> + addr, entry->timestamp, write_fault);
> WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
> if (ret)
> return 1;
> @@ -618,7 +618,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
> * tables
> */
> if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
> - addr, write_fault))
> + addr, entry->timestamp, write_fault))
> return 1;
> }
> }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index c51e908f6f19..771c98e104ee 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -850,10 +850,13 @@ struct svm_range_list {
> struct list_head criu_svm_metadata_list;
> spinlock_t deferred_list_lock;
> atomic_t evicted_ranges;
> - atomic_t drain_pagefaults;
> + /* stop page fault recovery for this process */
> + atomic_t stop_pf_recovery;
> struct delayed_work restore_work;
> DECLARE_BITMAP(bitmap_supported, MAX_GPU_INSTANCE);
> struct task_struct *faulting_task;
> + /* check point ts decides if page fault recovery need be dropped */
> + uint64_t checkpoint_ts[MAX_GPU_INSTANCE];
> };
>
> /* Process data */
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 407636a68814..fb0e883868b4 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -2263,16 +2263,10 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
> {
> struct kfd_process_device *pdd;
> struct kfd_process *p;
> - int drain;
> uint32_t i;
>
> p = container_of(svms, struct kfd_process, svms);
>
> -restart:
> - drain = atomic_read(&svms->drain_pagefaults);
> - if (!drain)
> - return;
> -
> for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) {
> pdd = p->pdds[i];
> if (!pdd)
> @@ -2292,8 +2286,6 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
>
> pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
> }
> - if (atomic_cmpxchg(&svms->drain_pagefaults, drain, 0) != drain)
> - goto restart;
> }
>
> static void svm_range_deferred_list_work(struct work_struct *work)
> @@ -2315,17 +2307,8 @@ static void svm_range_deferred_list_work(struct work_struct *work)
> prange->start, prange->last, prange->work_item.op);
>
> mm = prange->work_item.mm;
> -retry:
> - mmap_write_lock(mm);
>
> - /* Checking for the need to drain retry faults must be inside
> - * mmap write lock to serialize with munmap notifiers.
> - */
> - if (unlikely(atomic_read(&svms->drain_pagefaults))) {
> - mmap_write_unlock(mm);
> - svm_range_drain_retry_fault(svms);
> - goto retry;
> - }
> + mmap_write_lock(mm);
>
> /* Remove from deferred_list must be inside mmap write lock, for
> * two race cases:
> @@ -2446,6 +2429,7 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
> struct kfd_process *p;
> unsigned long s, l;
> bool unmap_parent;
> + uint32_t i;
>
> p = kfd_lookup_process_by_mm(mm);
> if (!p)
> @@ -2455,11 +2439,37 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
> pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms,
> prange, prange->start, prange->last, start, last);
>
> - /* Make sure pending page faults are drained in the deferred worker
> - * before the range is freed to avoid straggler interrupts on
> - * unmapped memory causing "phantom faults".
> + /* calculate time stamps that are used to decide which page faults need be
> + * dropped or handled before unmap pages from gpu vm
> */
> - atomic_inc(&svms->drain_pagefaults);
> + for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) {
> + struct kfd_process_device *pdd;
> + struct amdgpu_device *adev;
> + struct amdgpu_ih_ring *ih;
> + uint32_t checkpoint_wptr;
> +
> + pdd = p->pdds[i];
> + if (!pdd)
> + continue;
> +
> + adev = pdd->dev->adev;
> +
> + /* check if adev->irq.ih1 is not empty */
> + ih = &adev->irq.ih1;
> + checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih);
> + if (ih->rptr != checkpoint_wptr) {
> + WRITE_ONCE(svms->checkpoint_ts[i], checkpoint_wptr);
> + continue;
> + }
> +
> + /* check if dev->irq.ih_soft is not empty */
> + ih = &adev->irq.ih_soft;
> + checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih);
> + if (ih->rptr != checkpoint_wptr) {
> + WRITE_ONCE(svms->checkpoint_ts[i], checkpoint_wptr);
> + continue;
> + }
> + }
>
> unmap_parent = start <= prange->start && last >= prange->last;
>
> @@ -2900,7 +2910,7 @@ svm_fault_allowed(struct vm_area_struct *vma, bool write_fault)
> int
> svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
> uint32_t vmid, uint32_t node_id,
> - uint64_t addr, bool write_fault)
> + uint64_t addr, uint64_t ts, bool write_fault)
> {
> unsigned long start, last, size;
> struct mm_struct *mm = NULL;
> @@ -2910,7 +2920,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
> ktime_t timestamp = ktime_get_boottime();
> struct kfd_node *node;
> int32_t best_loc;
> - int32_t gpuidx = MAX_GPU_INSTANCE;
> + int32_t gpuid, gpuidx = MAX_GPU_INSTANCE;
> bool write_locked = false;
> struct vm_area_struct *vma;
> bool migration = false;
> @@ -2930,12 +2940,39 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>
> pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr);
>
> - if (atomic_read(&svms->drain_pagefaults)) {
> - pr_debug("draining retry fault, drop fault 0x%llx\n", addr);
> + /* kfd page fault recovery is disabled */
> + if (atomic_read(&svms->stop_pf_recovery)) {
> + pr_debug("page fault handing disabled, drop fault 0x%llx\n", addr);
> r = 0;
> goto out;
> }
>
> + node = kfd_node_by_irq_ids(adev, node_id, vmid);
> + if (!node) {
> + pr_debug("kfd node does not exist node_id: %d, vmid: %d\n", node_id,
> + vmid);
> + r = -EFAULT;
> + goto out;
> + }
> +
> + if (kfd_process_gpuid_from_node(p, node, &gpuid, &gpuidx)) {
> + pr_debug("failed to get gpuid/gpuidex for node_id: %d \n", node_id);
> + r = -EFAULT;
> + goto out;
> + }
> +
> + /* check if this page fault time stamp is before svms->checkpoint_ts */
> + if (READ_ONCE(svms->checkpoint_ts[gpuidx]) != 0 &&
> + amdgpu_ih_ts_after(ts, READ_ONCE(svms->checkpoint_ts[gpuidx]))) {
> + pr_debug("draining retry fault, drop fault 0x%llx\n", addr);
> + r = 0;
> + goto out;
> + } else
> + /* ts is after svms->checkpoint_ts now, reset svms->checkpoint_ts
> + * to zero to avoid following ts wrap around give wrong comparing
> + */
> + WRITE_ONCE(svms->checkpoint_ts[gpuidx], 0);
Pretty clear NAK on using WRITE_ONCE() like this. You have exactly zero
memory barriers here.
Regards,
Christian.
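For context, the kind of pairing the comment above alludes to would look
roughly like the sketch below. The helper names are invented and this only
illustrates the standard release/acquire pattern, not the eventual fix.

/*
 * Illustration only: plain WRITE_ONCE()/READ_ONCE() merely prevent load/store
 * tearing and impose no ordering. A release store paired with an acquire load
 * guarantees that a fault handler which observes the new checkpoint also
 * observes every unmap-side write made before the checkpoint was published.
 */
static void svm_publish_checkpoint(struct svm_range_list *svms, int gpuidx,
				   uint64_t checkpoint)
{
	/* unmap side: publish the checkpoint last, after the range bookkeeping */
	smp_store_release(&svms->checkpoint_ts[gpuidx], checkpoint);
}

static bool svm_checkpoint_says_drop(struct svm_range_list *svms, int gpuidx,
				     uint64_t ts)
{
	/* fault recovery side: the acquire keeps later reads of the range
	 * state from being reordered before this load
	 */
	uint64_t checkpoint = smp_load_acquire(&svms->checkpoint_ts[gpuidx]);

	return checkpoint && amdgpu_ih_ts_after(ts, checkpoint);
}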
> +
> if (!p->xnack_enabled) {
> pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
> r = -EFAULT;
> @@ -2952,13 +2989,6 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
> goto out;
> }
>
> - node = kfd_node_by_irq_ids(adev, node_id, vmid);
> - if (!node) {
> - pr_debug("kfd node does not exist node_id: %d, vmid: %d\n", node_id,
> - vmid);
> - r = -EFAULT;
> - goto out;
> - }
> mmap_read_lock(mm);
> retry_write_locked:
> mutex_lock(&svms->lock);
> @@ -3173,9 +3203,11 @@ void svm_range_list_fini(struct kfd_process *p)
> /*
> * Ensure no retry fault comes in afterwards, as page fault handler will
> * not find kfd process and take mm lock to recover fault.
> + * stop kfd page fault handing, then wait pending page faults got drained
> */
> - atomic_inc(&p->svms.drain_pagefaults);
> + atomic_set(&p->svms.stop_pf_recovery, 1);
> svm_range_drain_retry_fault(&p->svms);
> + atomic_set(&p->svms.stop_pf_recovery, 0);
>
> list_for_each_entry_safe(prange, next, &p->svms.list, list) {
> svm_range_unlink(prange);
> @@ -3197,7 +3229,7 @@ int svm_range_list_init(struct kfd_process *p)
> mutex_init(&svms->lock);
> INIT_LIST_HEAD(&svms->list);
> atomic_set(&svms->evicted_ranges, 0);
> - atomic_set(&svms->drain_pagefaults, 0);
> + atomic_set(&svms->stop_pf_recovery, 0);
> INIT_DELAYED_WORK(&svms->restore_work, svm_range_restore_work);
> INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work);
> INIT_LIST_HEAD(&svms->deferred_range_list);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> index 70c1776611c4..5f447c3642cb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> @@ -173,7 +173,7 @@ int svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange,
> bool clear);
> void svm_range_vram_node_free(struct svm_range *prange);
> int svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
> - uint32_t vmid, uint32_t node_id, uint64_t addr,
> + uint32_t vmid, uint32_t node_id, uint64_t addr, uint64_t ts,
> bool write_fault);
> int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence);
> void svm_range_add_list_work(struct svm_range_list *svms,