[PATCH v3] drm/amdkfd: Run restore_workers on freezable WQs
Deng, Emily
Emily.Deng at amd.com
Fri Nov 24 13:23:51 UTC 2023
Tested-by: Emily Deng <Emily.Deng at amd.com>
>-----Original Message-----
>From: Kuehling, Felix <Felix.Kuehling at amd.com>
>Sent: Friday, November 24, 2023 6:55 AM
>To: amd-gfx at lists.freedesktop.org
>Cc: Deng, Emily <Emily.Deng at amd.com>; Pan, Xinhui
><Xinhui.Pan at amd.com>; Koenig, Christian <Christian.Koenig at amd.com>
>Subject: [PATCH v3] drm/amdkfd: Run restore_workers on freezable WQs
>
>Make restore workers freezable so we don't have to explicitly flush them in
>suspend and GPU reset code paths, and we don't accidentally try to restore
>BOs while the GPU is suspended. Not having to flush restore_work also helps
>avoid lock/fence dependencies in the GPU reset case where we're not allowed
>to wait for fences.
>
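For context on the mechanism: a freezable workqueue is drained by the
freezer during system suspend and only thawed on resume, so work queued
on it simply cannot run while the GPU is down. A minimal sketch of the
pattern, with illustrative names (not the actual driver code):

#include <linux/jiffies.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_restore_wq;
static struct delayed_work example_restore_work;

static void example_restore_fn(struct work_struct *work)
{
	/* Never runs between freeze and thaw, so it cannot touch the
	 * hardware while it is suspended or in reset. */
}

static int example_setup(void)
{
	/* WQ_FREEZABLE: the freezer drains this queue on suspend, so
	 * no explicit flush is needed in the suspend path. */
	example_restore_wq = alloc_ordered_workqueue("example_restore_wq",
						     WQ_FREEZABLE);
	if (!example_restore_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&example_restore_work, example_restore_fn);
	queue_delayed_work(example_restore_wq, &example_restore_work,
			   msecs_to_jiffies(100));
	return 0;
}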
>A side effect of this is that we can now have multiple concurrent
>threads trying to signal the same eviction fence. Rework eviction fence
>signaling and replacement to account for that.
>
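The property that makes the race benign: dma_fence_signal() is safe to
call concurrently and returns -EINVAL if the fence was already signaled,
so exactly one of the racing threads sees 0 and takes responsibility for
the follow-up work. A sketch of that pattern with an RCU-protected fence
pointer, mirroring signal_eviction_fence() in the patch:

#include <linux/dma-fence.h>
#include <linux/rcupdate.h>

/* 'ef' is an RCU-protected eviction fence pointer. Returns 0 only to
 * the one caller that actually signaled the fence. */
static int try_signal_eviction_fence(struct dma_fence __rcu **ef)
{
	struct dma_fence *fence;
	int ret;

	rcu_read_lock();
	fence = dma_fence_get_rcu_safe(ef);
	rcu_read_unlock();

	ret = dma_fence_signal(fence);	/* -EINVAL if already signaled */
	dma_fence_put(fence);

	return ret;
}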
>The GPU reset path can no longer rely on restore_process_worker to resume
>queues because evict/restore workers can run independently of it. Instead call
>a new restore_process_helper directly.
>
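So the reset/resume path now restores synchronously instead of queuing
restore_work and waiting for it. Condensed from the
kfd_resume_all_processes() hunk below:

int kfd_resume_all_processes(void)
{
	struct kfd_process *p;
	unsigned int temp;
	int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu);

	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
		/* Direct call; no dependency on the restore worker. */
		if (restore_process_helper(p))
			ret = -EFAULT;
	}
	srcu_read_unlock(&kfd_processes_srcu, idx);
	return ret;
}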
>This is an RFC and request for testing.
>
>v2:
>- Reworked eviction fence signaling
>- Introduced restore_process_helper
>
>v3:
>- Handle unsignaled eviction fences in restore_process_bos
>
>Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
>Acked-by: Christian König <christian.koenig at amd.com>
>---
> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 68 +++++++++++----
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 87 +++++++++++--------
> drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 4 +-
> 3 files changed, 104 insertions(+), 55 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>index 2e302956a279..bdec88713a09 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>@@ -1431,7 +1431,6 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
> amdgpu_amdkfd_restore_userptr_worker);
>
> *process_info = info;
>- *ef = dma_fence_get(&info->eviction_fence->base);
> }
>
> vm->process_info = *process_info;
>@@ -1462,6 +1461,8 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
> list_add_tail(&vm->vm_list_node,
> &(vm->process_info->vm_list_head));
> vm->process_info->n_vms++;
>+
>+ *ef = dma_fence_get(&vm->process_info->eviction_fence->base);
> mutex_unlock(&vm->process_info->lock);
>
> return 0;
>@@ -1473,10 +1474,7 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
> reserve_pd_fail:
> vm->process_info = NULL;
> if (info) {
>- /* Two fence references: one in info and one in *ef */
> dma_fence_put(&info->eviction_fence->base);
>- dma_fence_put(*ef);
>- *ef = NULL;
> *process_info = NULL;
> put_pid(info->pid);
> create_evict_fence_fail:
>@@ -1670,7 +1668,8 @@ int amdgpu_amdkfd_criu_resume(void *p)
> goto out_unlock;
> }
> WRITE_ONCE(pinfo->block_mmu_notifications, false);
>- schedule_delayed_work(&pinfo->restore_userptr_work, 0);
>+ queue_delayed_work(system_freezable_wq,
>+ &pinfo->restore_userptr_work, 0);
>
> out_unlock:
> mutex_unlock(&pinfo->lock);
>@@ -2475,7 +2474,8 @@ int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni,
> KFD_QUEUE_EVICTION_TRIGGER_USERPTR);
> if (r)
> pr_err("Failed to quiesce KFD\n");
>- schedule_delayed_work(&process_info->restore_userptr_work,
>+ queue_delayed_work(system_freezable_wq,
>+ &process_info->restore_userptr_work,
> msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
> }
> mutex_unlock(&process_info->notifier_lock);
>@@ -2810,7 +2810,8 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
>
> /* If validation failed, reschedule another attempt */
> if (evicted_bos) {
>- schedule_delayed_work(&process_info->restore_userptr_work,
>+ queue_delayed_work(system_freezable_wq,
>+ &process_info->restore_userptr_work,
> msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
>
> kfd_smi_event_queue_restore_rescheduled(mm);
>@@ -2819,6 +2820,23 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
> put_task_struct(usertask);
> }
>
>+static void replace_eviction_fence(struct dma_fence **ef,
>+ struct dma_fence *new_ef)
>+{
>+ struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true
>+ /* protected by process_info->lock */);
>+
>+ /* If we're replacing an unsignaled eviction fence, that fence will
>+ * never be signaled, and if anyone is still waiting on that fence,
>+ * they will hang forever. This should never happen. We should only
>+ * replace the fence in restore_work that only gets scheduled after
>+ * eviction work signaled the fence.
>+ */
>+ WARN_ONCE(!dma_fence_is_signaled(old_ef),
>+ "Replacing unsignaled eviction fence");
>+ dma_fence_put(old_ef);
>+}
>+
> /** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given
> * KFD process identified by process_info
> *
>@@ -2844,7 +2862,6 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
> struct amdgpu_vm *peer_vm;
> struct kgd_mem *mem;
> struct bo_vm_reservation_context ctx;
>- struct amdgpu_amdkfd_fence *new_fence;
> int ret = 0, i;
> struct list_head duplicate_save;
> struct amdgpu_sync sync_obj;
>@@ -2974,22 +2991,35 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
> /* Wait for validate and PT updates to finish */
> amdgpu_sync_wait(&sync_obj, false);
>
>- /* Release old eviction fence and create new one, because fence only
>- * goes from unsignaled to signaled, fence cannot be reused.
>- * Use context and mm from the old fence.
>+ /* The old eviction fence may be unsignaled if restore happens
>+ * after a GPU reset or suspend/resume. Keep the old fence in that
>+ * case. Otherwise release the old eviction fence and create new
>+ * one, because fence only goes from unsignaled to signaled once
>+ * and cannot be reused. Use context and mm from the old fence.
>+ *
>+ * If an old eviction fence signals after this check, that's OK.
>+ * Anyone signaling an eviction fence must stop the queues first
>+ * and schedule another restore worker.
> */
>- new_fence = amdgpu_amdkfd_fence_create(
>+ if (dma_fence_is_signaled(&process_info->eviction_fence->base)) {
>+ struct amdgpu_amdkfd_fence *new_fence =
>+ amdgpu_amdkfd_fence_create(
> process_info->eviction_fence->base.context,
> process_info->eviction_fence->mm,
> NULL);
>- if (!new_fence) {
>- pr_err("Failed to create eviction fence\n");
>- ret = -ENOMEM;
>- goto validate_map_fail;
>+
>+ if (!new_fence) {
>+ pr_err("Failed to create eviction fence\n");
>+ ret = -ENOMEM;
>+ goto validate_map_fail;
>+ }
>+ dma_fence_put(&process_info->eviction_fence->base);
>+ process_info->eviction_fence = new_fence;
>+ replace_eviction_fence(ef, dma_fence_get(&new_fence->base));
>+ } else {
>+ WARN_ONCE(*ef != &process_info->eviction_fence->base,
>+ "KFD eviction fence doesn't match KGD
>process_info");
> }
>- dma_fence_put(&process_info->eviction_fence->base);
>- process_info->eviction_fence = new_fence;
>- *ef = dma_fence_get(&new_fence->base);
>
> /* Attach new eviction fence to all BOs except pinned ones */
> list_for_each_entry(mem, &process_info->kfd_bo_list,
>diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>index c10d050e1a61..71df51fcc1b0 100644
>--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>@@ -664,7 +664,8 @@ int kfd_process_create_wq(void)
> if (!kfd_process_wq)
> kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0);
> if (!kfd_restore_wq)
>- kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq", 0);
>+ kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq",
>+ WQ_FREEZABLE);
>
> if (!kfd_process_wq || !kfd_restore_wq) {
> kfd_process_destroy_wq();
>@@ -1642,6 +1643,7 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
> struct amdgpu_fpriv *drv_priv;
> struct amdgpu_vm *avm;
> struct kfd_process *p;
>+ struct dma_fence *ef;
> struct kfd_node *dev;
> int ret;
>
>@@ -1661,11 +1663,12 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
>
> ret = amdgpu_amdkfd_gpuvm_acquire_process_vm(dev->adev, avm,
> &p->kgd_process_info,
>- &p->ef);
>+ &ef);
> if (ret) {
> pr_err("Failed to create process VM object\n");
> return ret;
> }
>+ RCU_INIT_POINTER(p->ef, ef);
> pdd->drm_priv = drm_file->private_data;
>
> ret = kfd_process_device_reserve_ib_mem(pdd);
>@@ -1908,6 +1911,21 @@ kfd_process_gpuid_from_node(struct kfd_process *p, struct kfd_node *node,
> return -EINVAL;
> }
>
>+static int signal_eviction_fence(struct kfd_process *p)
>+{
>+ struct dma_fence *ef;
>+ int ret;
>+
>+ rcu_read_lock();
>+ ef = dma_fence_get_rcu_safe(&p->ef);
>+ rcu_read_unlock();
>+
>+ ret = dma_fence_signal(ef);
>+ dma_fence_put(ef);
>+
>+ return ret;
>+}
>+
> static void evict_process_worker(struct work_struct *work)
> {
> int ret;
>@@ -1920,31 +1938,46 @@ static void evict_process_worker(struct work_struct *work)
> * lifetime of this thread, kfd_process p will be valid
> */
> p = container_of(dwork, struct kfd_process, eviction_work);
>- WARN_ONCE(p->last_eviction_seqno != p->ef->seqno,
>- "Eviction fence mismatch\n");
>-
>- /* Narrow window of overlap between restore and evict work
>- * item is possible. Once amdgpu_amdkfd_gpuvm_restore_process_bos
>- * unreserves KFD BOs, it is possible to evicted again. But
>- * restore has few more steps of finish. So lets wait for any
>- * previous restore work to complete
>- */
>- flush_delayed_work(&p->restore_work);
>
> pr_debug("Started evicting pasid 0x%x\n", p->pasid);
> ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_TTM);
> if (!ret) {
>- dma_fence_signal(p->ef);
>- dma_fence_put(p->ef);
>- p->ef = NULL;
>- queue_delayed_work(kfd_restore_wq, &p->restore_work,
>+ /* If another thread already signaled the eviction fence,
>+ * they are responsible for stopping the queues and scheduling
>+ * the restore work.
>+ */
>+ if (!signal_eviction_fence(p))
>+ queue_delayed_work(kfd_restore_wq, &p->restore_work,
> msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));
>+ else
>+ kfd_process_restore_queues(p);
>
> pr_debug("Finished evicting pasid 0x%x\n", p->pasid);
> } else
> pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid); }
>
>+static int restore_process_helper(struct kfd_process *p)
>+{
>+ int ret = 0;
>+
>+ /* VMs may not have been acquired yet during debugging. */
>+ if (p->kgd_process_info) {
>+ ret = amdgpu_amdkfd_gpuvm_restore_process_bos(
>+ p->kgd_process_info, &p->ef);
>+ if (ret)
>+ return ret;
>+ }
>+
>+ ret = kfd_process_restore_queues(p);
>+ if (!ret)
>+ pr_debug("Finished restoring pasid 0x%x\n", p->pasid);
>+ else
>+ pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid);
>+
>+ return ret;
>+}
>+
> static void restore_process_worker(struct work_struct *work)
> {
> struct delayed_work *dwork;
>@@ -1970,24 +2003,15 @@ static void restore_process_worker(struct work_struct *work)
> */
>
> p->last_restore_timestamp = get_jiffies_64();
>- /* VMs may not have been acquired yet during debugging. */
>- if (p->kgd_process_info)
>- ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
>- &p->ef);
>+
>+ ret = restore_process_helper(p);
> if (ret) {
> pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d
>ms\n",
> p->pasid, PROCESS_BACK_OFF_TIME_MS);
> ret = queue_delayed_work(kfd_restore_wq, &p->restore_work,
> msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS));
> WARN(!ret, "reschedule restore work failed\n");
>- return;
> }
>-
>- ret = kfd_process_restore_queues(p);
>- if (!ret)
>- pr_debug("Finished restoring pasid 0x%x\n", p->pasid);
>- else
>- pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid);
> }
>
> void kfd_suspend_all_processes(void)
>@@ -1998,14 +2022,9 @@ void kfd_suspend_all_processes(void)
>
> WARN(debug_evictions, "Evicting all processes");
> hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
>- cancel_delayed_work_sync(&p->eviction_work);
>- flush_delayed_work(&p->restore_work);
>-
> if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
> pr_err("Failed to suspend process 0x%x\n", p->pasid);
>- dma_fence_signal(p->ef);
>- dma_fence_put(p->ef);
>- p->ef = NULL;
>+ signal_eviction_fence(p);
> }
> srcu_read_unlock(&kfd_processes_srcu, idx);
> }
>@@ -2017,7 +2036,7 @@ int kfd_resume_all_processes(void)
> int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu);
>
> hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
>- if (!queue_delayed_work(kfd_restore_wq, &p->restore_work, 0)) {
>+ if (restore_process_helper(p)) {
> pr_err("Restore process %d failed during resume\n",
> p->pasid);
> ret = -EFAULT;
>diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>index b23ba92a794c..42b5279c7010 100644
>--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>@@ -1871,7 +1871,7 @@ static void svm_range_restore_work(struct work_struct *work)
> /* If validation failed, reschedule another attempt */
> if (evicted_ranges) {
> pr_debug("reschedule to restore svm range\n");
>- schedule_delayed_work(&svms->restore_work,
>+ queue_delayed_work(system_freezable_wq, &svms->restore_work,
> msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
>
> kfd_smi_event_queue_restore_rescheduled(mm);
>@@ -1947,7 +1947,7 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
> pr_debug("failed to quiesce KFD\n");
>
> pr_debug("schedule to restore svm %p ranges\n", svms);
>- schedule_delayed_work(&svms->restore_work,
>+ queue_delayed_work(system_freezable_wq, &svms->restore_work,
> msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
> } else {
> unsigned long s, l;
>--
>2.34.1