[PATCH v3] drm/amdkfd: Change kfd/svm page fault drain handling
Christian König
ckoenig.leichtzumerken at gmail.com
Mon Jul 29 18:23:49 UTC 2024
On 25.07.24 at 20:19, Xiaogang.Chen wrote:
> From: Xiaogang Chen <xiaogang.chen at amd.com>
>
> When an app unmaps vm ranges (munmap), kfd/svm starts draining pending page
> faults and does not handle any incoming page fault of this process until a
> deferred work item gets executed by the default system wq. The period of
> "not handling page faults" can be long and is unpredictable, which is
> adverse to kfd page fault recovery performance.
>
> This patch uses the time stamp of the incoming page fault to decide whether
> to drop or handle the fault. When an app unmaps vm ranges, kfd records each
> gpu device's current ih ring time stamp. These time stamps are then used in
> the kfd page fault recovery routine.
>
> Any page fault that hits an unmapped range after the unmap event is an app
> bug (the app accesses a vm range after unmapping it). It is not the driver's
> job to cover that.
But that defers destroying the unmapped range until the IH ring is
cleared, doesn't it?
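To make the proposed flow concrete, here is a condensed sketch of what the
diff below does. The helper names are invented for illustration only; error
handling, locking, and the ih_soft fallback from the patch are omitted.

static void svm_record_unmap_checkpoints(struct svm_range_list *svms,
					 struct kfd_process *p)
{
	uint32_t i;

	/* munmap path: instead of scheduling a page fault drain in the
	 * deferred worker, remember how far each GPU's retry-fault IH ring
	 * has advanced at unmap time.
	 */
	for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) {
		struct amdgpu_device *adev;

		if (!p->pdds[i])
			continue;
		adev = p->pdds[i]->dev->adev;

		svms->checkpoint_ts[i] = amdgpu_ih_get_wptr(adev, &adev->irq.ih1);
	}
}

static bool svm_fault_predates_unmap(struct svm_range_list *svms,
				     int32_t gpuidx, uint64_t ts)
{
	uint64_t checkpoint = svms->checkpoint_ts[gpuidx];

	/* fault recovery path: a fault raised before the unmap checkpoint of
	 * that GPU targets memory the app already unmapped, so drop it.
	 * amdgpu_ih_ts_after(a, b) is true when b is the later time stamp.
	 */
	return checkpoint && amdgpu_ih_ts_after(ts, checkpoint);
}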
>
> By using the page fault time stamp we no longer need to drain page faults in
> the deferred work. So the time period during which kfd does not handle page
> faults is reduced and can be controlled.
>
> Signed-off-by: Xiaogang.Chen <Xiaogang.Chen at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +-
> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 3 +-
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 4 +-
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 5 +-
> drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 102 ++++++++++++++++---------
> drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 2 +-
> 7 files changed, 79 insertions(+), 43 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 3abfa66d72a2..d90b7ea3f020 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2763,7 +2763,7 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
> * shouldn't be reported any more.
> */
> bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
> - u32 vmid, u32 node_id, uint64_t addr,
> + u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
> bool write_fault)
> {
> bool is_compute_context = false;
> @@ -2789,7 +2789,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
> addr /= AMDGPU_GPU_PAGE_SIZE;
>
> if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid,
> - node_id, addr, write_fault)) {
> + node_id, addr, ts, write_fault)) {
> amdgpu_bo_unref(&root);
> return true;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 312a408b80d3..1d6a1381ede9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -548,7 +548,7 @@ amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm);
> void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info);
>
> bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
> - u32 vmid, u32 node_id, uint64_t addr,
> + u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
> bool write_fault);
>
> void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index d933e19e0cf5..3596cc2ee7e5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -132,7 +132,8 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
> /* Try to handle the recoverable page faults by filling page
> * tables
> */
> - if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault))
> + if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr,
> + entry->timestamp, write_fault))
> return 1;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 350f6b6676f1..ac08d9424feb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -595,7 +595,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
> cam_index = entry->src_data[2] & 0x3ff;
>
> ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
> - addr, write_fault);
> + addr, entry->timestamp, write_fault);
> WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
> if (ret)
> return 1;
> @@ -618,7 +618,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
> * tables
> */
> if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
> - addr, write_fault))
> + addr, entry->timestamp, write_fault))
> return 1;
> }
> }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index c51e908f6f19..771c98e104ee 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -850,10 +850,13 @@ struct svm_range_list {
> struct list_head criu_svm_metadata_list;
> spinlock_t deferred_list_lock;
> atomic_t evicted_ranges;
> - atomic_t drain_pagefaults;
> + /* stop page fault recovery for this process */
> + atomic_t stop_pf_recovery;
> struct delayed_work restore_work;
> DECLARE_BITMAP(bitmap_supported, MAX_GPU_INSTANCE);
> struct task_struct *faulting_task;
> + /* check point ts decides if page fault recovery need be dropped */
> + uint64_t checkpoint_ts[MAX_GPU_INSTANCE];
> };
>
> /* Process data */
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 407636a68814..fb0e883868b4 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -2263,16 +2263,10 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
> {
> struct kfd_process_device *pdd;
> struct kfd_process *p;
> - int drain;
> uint32_t i;
>
> p = container_of(svms, struct kfd_process, svms);
>
> -restart:
> - drain = atomic_read(&svms->drain_pagefaults);
> - if (!drain)
> - return;
> -
> for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) {
> pdd = p->pdds[i];
> if (!pdd)
> @@ -2292,8 +2286,6 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
>
> pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
> }
> - if (atomic_cmpxchg(&svms->drain_pagefaults, drain, 0) != drain)
> - goto restart;
> }
>
> static void svm_range_deferred_list_work(struct work_struct *work)
> @@ -2315,17 +2307,8 @@ static void svm_range_deferred_list_work(struct work_struct *work)
> prange->start, prange->last, prange->work_item.op);
>
> mm = prange->work_item.mm;
> -retry:
> - mmap_write_lock(mm);
>
> - /* Checking for the need to drain retry faults must be inside
> - * mmap write lock to serialize with munmap notifiers.
> - */
> - if (unlikely(atomic_read(&svms->drain_pagefaults))) {
> - mmap_write_unlock(mm);
> - svm_range_drain_retry_fault(svms);
> - goto retry;
> - }
> + mmap_write_lock(mm);
>
> /* Remove from deferred_list must be inside mmap write lock, for
> * two race cases:
> @@ -2446,6 +2429,7 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
> struct kfd_process *p;
> unsigned long s, l;
> bool unmap_parent;
> + uint32_t i;
>
> p = kfd_lookup_process_by_mm(mm);
> if (!p)
> @@ -2455,11 +2439,37 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
> pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms,
> prange, prange->start, prange->last, start, last);
>
> - /* Make sure pending page faults are drained in the deferred worker
> - * before the range is freed to avoid straggler interrupts on
> - * unmapped memory causing "phantom faults".
> + /* calculate time stamps that are used to decide which page faults need be
> + * dropped or handled before unmap pages from gpu vm
> */
> - atomic_inc(&svms->drain_pagefaults);
> + for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) {
> + struct kfd_process_device *pdd;
> + struct amdgpu_device *adev;
> + struct amdgpu_ih_ring *ih;
> + uint32_t checkpoint_wptr;
> +
> + pdd = p->pdds[i];
> + if (!pdd)
> + continue;
> +
> + adev = pdd->dev->adev;
> +
> + /* check if adev->irq.ih1 is not empty */
> + ih = &adev->irq.ih1;
> + checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih);
> + if (ih->rptr != checkpoint_wptr) {
> + WRITE_ONCE(svms->checkpoint_ts[i], checkpoint_wptr);
> + continue;
> + }
> +
> + /* check if dev->irq.ih_soft is not empty */
> + ih = &adev->irq.ih_soft;
> + checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih);
> + if (ih->rptr != checkpoint_wptr) {
> + WRITE_ONCE(svms->checkpoint_ts[i], checkpoint_wptr);
> + continue;
> + }
> + }
>
> unmap_parent = start <= prange->start && last >= prange->last;
>
> @@ -2900,7 +2910,7 @@ svm_fault_allowed(struct vm_area_struct *vma, bool write_fault)
> int
> svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
> uint32_t vmid, uint32_t node_id,
> - uint64_t addr, bool write_fault)
> + uint64_t addr, uint64_t ts, bool write_fault)
> {
> unsigned long start, last, size;
> struct mm_struct *mm = NULL;
> @@ -2910,7 +2920,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
> ktime_t timestamp = ktime_get_boottime();
> struct kfd_node *node;
> int32_t best_loc;
> - int32_t gpuidx = MAX_GPU_INSTANCE;
> + int32_t gpuid, gpuidx = MAX_GPU_INSTANCE;
> bool write_locked = false;
> struct vm_area_struct *vma;
> bool migration = false;
> @@ -2930,12 +2940,39 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>
> pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr);
>
> - if (atomic_read(&svms->drain_pagefaults)) {
> - pr_debug("draining retry fault, drop fault 0x%llx\n", addr);
> + /* kfd page fault recovery is disabled */
> + if (atomic_read(&svms->stop_pf_recovery)) {
> + pr_debug("page fault handing disabled, drop fault 0x%llx\n", addr);
> r = 0;
> goto out;
> }
>
> + node = kfd_node_by_irq_ids(adev, node_id, vmid);
> + if (!node) {
> + pr_debug("kfd node does not exist node_id: %d, vmid: %d\n", node_id,
> + vmid);
> + r = -EFAULT;
> + goto out;
> + }
> +
> + if (kfd_process_gpuid_from_node(p, node, &gpuid, &gpuidx)) {
> + pr_debug("failed to get gpuid/gpuidex for node_id: %d \n", node_id);
> + r = -EFAULT;
> + goto out;
> + }
> +
> + /* check if this page fault time stamp is before svms->checkpoint_ts */
> + if (READ_ONCE(svms->checkpoint_ts[gpuidx]) != 0 &&
> + amdgpu_ih_ts_after(ts, READ_ONCE(svms->checkpoint_ts[gpuidx]))) {
> + pr_debug("draining retry fault, drop fault 0x%llx\n", addr);
> + r = 0;
> + goto out;
> + } else
> + /* ts is after svms->checkpoint_ts now, reset svms->checkpoint_ts
> + * to zero to avoid following ts wrap around give wrong comparing
> + */
> + WRITE_ONCE(svms->checkpoint_ts[gpuidx], 0);
Pretty clear NAK on using WRITE_ONCE() like this. You have exactly zero
memory barriers here.
Regards,
Christian.
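For context, the kind of pairing the comment above alludes to would look
roughly like the sketch below. The helper names are invented and this only
illustrates the standard release/acquire pattern, not the eventual fix.

/*
 * Illustration only: plain WRITE_ONCE()/READ_ONCE() merely prevent load/store
 * tearing and impose no ordering. A release store paired with an acquire load
 * guarantees that a fault handler which observes the new checkpoint also
 * observes every unmap-side write made before the checkpoint was published.
 */
static void svm_publish_checkpoint(struct svm_range_list *svms, int gpuidx,
				   uint64_t checkpoint)
{
	/* unmap side: publish the checkpoint last, after the range bookkeeping */
	smp_store_release(&svms->checkpoint_ts[gpuidx], checkpoint);
}

static bool svm_checkpoint_says_drop(struct svm_range_list *svms, int gpuidx,
				     uint64_t ts)
{
	/* fault recovery side: the acquire keeps later reads of the range
	 * state from being reordered before this load
	 */
	uint64_t checkpoint = smp_load_acquire(&svms->checkpoint_ts[gpuidx]);

	return checkpoint && amdgpu_ih_ts_after(ts, checkpoint);
}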
> +
> if (!p->xnack_enabled) {
> pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
> r = -EFAULT;
> @@ -2952,13 +2989,6 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
> goto out;
> }
>
> - node = kfd_node_by_irq_ids(adev, node_id, vmid);
> - if (!node) {
> - pr_debug("kfd node does not exist node_id: %d, vmid: %d\n", node_id,
> - vmid);
> - r = -EFAULT;
> - goto out;
> - }
> mmap_read_lock(mm);
> retry_write_locked:
> mutex_lock(&svms->lock);
> @@ -3173,9 +3203,11 @@ void svm_range_list_fini(struct kfd_process *p)
> /*
> * Ensure no retry fault comes in afterwards, as page fault handler will
> * not find kfd process and take mm lock to recover fault.
> + * stop kfd page fault handing, then wait pending page faults got drained
> */
> - atomic_inc(&p->svms.drain_pagefaults);
> + atomic_set(&p->svms.stop_pf_recovery, 1);
> svm_range_drain_retry_fault(&p->svms);
> + atomic_set(&p->svms.stop_pf_recovery, 0);
>
> list_for_each_entry_safe(prange, next, &p->svms.list, list) {
> svm_range_unlink(prange);
> @@ -3197,7 +3229,7 @@ int svm_range_list_init(struct kfd_process *p)
> mutex_init(&svms->lock);
> INIT_LIST_HEAD(&svms->list);
> atomic_set(&svms->evicted_ranges, 0);
> - atomic_set(&svms->drain_pagefaults, 0);
> + atomic_set(&svms->stop_pf_recovery, 0);
> INIT_DELAYED_WORK(&svms->restore_work, svm_range_restore_work);
> INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work);
> INIT_LIST_HEAD(&svms->deferred_range_list);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> index 70c1776611c4..5f447c3642cb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> @@ -173,7 +173,7 @@ int svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange,
> bool clear);
> void svm_range_vram_node_free(struct svm_range *prange);
> int svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
> - uint32_t vmid, uint32_t node_id, uint64_t addr,
> + uint32_t vmid, uint32_t node_id, uint64_t addr, uint64_t ts,
> bool write_fault);
> int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence);
> void svm_range_add_list_work(struct svm_range_list *svms,