[PATCH v3] drm/amdkfd: Run restore_workers on freezable WQs

Deng, Emily Emily.Deng at amd.com
Fri Nov 24 13:23:51 UTC 2023


Tested-by: Emily Deng <Emily.Deng at amd.com>

>-----Original Message-----
>From: Kuehling, Felix <Felix.Kuehling at amd.com>
>Sent: Friday, November 24, 2023 6:55 AM
>To: amd-gfx at lists.freedesktop.org
>Cc: Deng, Emily <Emily.Deng at amd.com>; Pan, Xinhui
><Xinhui.Pan at amd.com>; Koenig, Christian <Christian.Koenig at amd.com>
>Subject: [PATCH v3] drm/amdkfd: Run restore_workers on freezable WQs
>
>Make restore workers freezable so we don't have to explicitly flush them in
>suspend and GPU reset code paths, and we don't accidentally try to restore
>BOs while the GPU is suspended. Not having to flush restore_work also helps
>avoid lock/fence dependencies in the GPU reset case where we're not allowed
>to wait for fences.
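
[Aside, not part of the patch: a minimal sketch of the freezable-workqueue
pattern relied on here. The names demo_restore_wq, restore_fn and
freezable_wq_demo_init are made up for illustration. Work queued on a
WQ_FREEZABLE workqueue (or on system_freezable_wq) is drained by the
freezer before suspend, so the handler can never run against a suspended
GPU:

    #include <linux/workqueue.h>

    static struct workqueue_struct *restore_wq;
    static struct delayed_work restore_work;

    static void restore_fn(struct work_struct *work)
    {
            /* The freezer flushes this queue before suspend, so this
             * handler never runs while the device is suspended. */
    }

    static int __init freezable_wq_demo_init(void)
    {
            /* Ordered and freezable, like kfd_restore_wq in the patch */
            restore_wq = alloc_ordered_workqueue("demo_restore_wq",
                                                 WQ_FREEZABLE);
            if (!restore_wq)
                    return -ENOMEM;

            INIT_DELAYED_WORK(&restore_work, restore_fn);
            queue_delayed_work(restore_wq, &restore_work,
                               msecs_to_jiffies(100));
            return 0;
    }

This is also why the explicit flush_delayed_work() calls in the suspend
path can go away: freezing the queue already guarantees the work is not
running.]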
>
>A side effect of this is that we can now have multiple concurrent threads trying
>to signal the same eviction fence. Rework eviction fence signaling and
>replacement to account for that.
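
[Likewise not part of the patch: a sketch of the lock-free signaling
pattern used by signal_eviction_fence() below, assuming the fence pointer
lives in an RCU-protected (__rcu) slot; signal_fence_once is a
hypothetical name:

    #include <linux/dma-fence.h>
    #include <linux/rcupdate.h>

    /* Returns 0 if this caller signaled the fence, nonzero if another
     * thread signaled it first (or the slot was NULL), so exactly one
     * winner takes over stopping the queues and scheduling restore.
     */
    static int signal_fence_once(struct dma_fence __rcu **slot)
    {
            struct dma_fence *f;
            int ret;

            rcu_read_lock();
            f = dma_fence_get_rcu_safe(slot); /* stable reference even if
                                               * the pointer is replaced
                                               * concurrently */
            rcu_read_unlock();

            ret = dma_fence_signal(f); /* -EINVAL if already signaled */
            dma_fence_put(f);          /* tolerates NULL */
            return ret;
    }

dma_fence_signal() returning an error for an already-signaled fence is
what lets concurrent signalers race safely: only the thread that sees 0
schedules the restore work.]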
>
>The GPU reset path can no longer rely on restore_process_worker to resume
>queues because evict/restore workers can run independently of it. Instead, call
>a new restore_process_helper directly.
>
>This is an RFC and request for testing.
>
>v2:
>- Reworked eviction fence signaling
>- Introduced restore_process_helper
>
>v3:
>- Handle unsignaled eviction fences in restore_process_bos
>
>Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
>Acked-by: Christian König <christian.koenig at amd.com>
>---
> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 68 +++++++++++----
> drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 87 +++++++++++--------
> drivers/gpu/drm/amd/amdkfd/kfd_svm.c          |  4 +-
> 3 files changed, 104 insertions(+), 55 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>index 2e302956a279..bdec88713a09 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>@@ -1431,7 +1431,6 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
>                                 amdgpu_amdkfd_restore_userptr_worker);
>
>               *process_info = info;
>-              *ef = dma_fence_get(&info->eviction_fence->base);
>       }
>
>       vm->process_info = *process_info;
>@@ -1462,6 +1461,8 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
>       list_add_tail(&vm->vm_list_node,
>                       &(vm->process_info->vm_list_head));
>       vm->process_info->n_vms++;
>+
>+      *ef = dma_fence_get(&vm->process_info->eviction_fence->base);
>       mutex_unlock(&vm->process_info->lock);
>
>       return 0;
>@@ -1473,10 +1474,7 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
> reserve_pd_fail:
>       vm->process_info = NULL;
>       if (info) {
>-              /* Two fence references: one in info and one in *ef */
>               dma_fence_put(&info->eviction_fence->base);
>-              dma_fence_put(*ef);
>-              *ef = NULL;
>               *process_info = NULL;
>               put_pid(info->pid);
> create_evict_fence_fail:
>@@ -1670,7 +1668,8 @@ int amdgpu_amdkfd_criu_resume(void *p)
>               goto out_unlock;
>       }
>       WRITE_ONCE(pinfo->block_mmu_notifications, false);
>-      schedule_delayed_work(&pinfo->restore_userptr_work, 0);
>+      queue_delayed_work(system_freezable_wq,
>+                         &pinfo->restore_userptr_work, 0);
>
> out_unlock:
>       mutex_unlock(&pinfo->lock);
>@@ -2475,7 +2474,8 @@ int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni,
>                                      KFD_QUEUE_EVICTION_TRIGGER_USERPTR);
>               if (r)
>                       pr_err("Failed to quiesce KFD\n");
>-              schedule_delayed_work(&process_info->restore_userptr_work,
>+              queue_delayed_work(system_freezable_wq,
>+                      &process_info->restore_userptr_work,
>                       msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
>       }
>       mutex_unlock(&process_info->notifier_lock);
>@@ -2810,7 +2810,8 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
>
>       /* If validation failed, reschedule another attempt */
>       if (evicted_bos) {
>-              schedule_delayed_work(&process_info->restore_userptr_work,
>+              queue_delayed_work(system_freezable_wq,
>+                      &process_info->restore_userptr_work,
>                       msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
>
>               kfd_smi_event_queue_restore_rescheduled(mm);
>@@ -2819,6 +2820,23 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
>       put_task_struct(usertask);
> }
>
>+static void replace_eviction_fence(struct dma_fence **ef,
>+                                 struct dma_fence *new_ef)
>+{
>+      struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true
>+              /* protected by process_info->lock */);
>+
>+      /* If we're replacing an unsignaled eviction fence, that fence will
>+       * never be signaled, and if anyone is still waiting on that fence,
>+       * they will hang forever. This should never happen. We should only
>+       * replace the fence in restore_work that only gets scheduled after
>+       * eviction work signaled the fence.
>+       */
>+      WARN_ONCE(!dma_fence_is_signaled(old_ef),
>+                "Replacing unsignaled eviction fence");
>+      dma_fence_put(old_ef);
>+}
>+
> /** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given
>  *   KFD process identified by process_info
>  *
>@@ -2844,7 +2862,6 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
>       struct amdgpu_vm *peer_vm;
>       struct kgd_mem *mem;
>       struct bo_vm_reservation_context ctx;
>-      struct amdgpu_amdkfd_fence *new_fence;
>       int ret = 0, i;
>       struct list_head duplicate_save;
>       struct amdgpu_sync sync_obj;
>@@ -2974,22 +2991,35 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
>       /* Wait for validate and PT updates to finish */
>       amdgpu_sync_wait(&sync_obj, false);
>
>-      /* Release old eviction fence and create new one, because fence only
>-       * goes from unsignaled to signaled, fence cannot be reused.
>-       * Use context and mm from the old fence.
>+      /* The old eviction fence may be unsignaled if restore happens
>+       * after a GPU reset or suspend/resume. Keep the old fence in that
>+       * case. Otherwise release the old eviction fence and create new
>+       * one, because fence only goes from unsignaled to signaled once
>+       * and cannot be reused. Use context and mm from the old fence.
>+       *
>+       * If an old eviction fence signals after this check, that's OK.
>+       * Anyone signaling an eviction fence must stop the queues first
>+       * and schedule another restore worker.
>        */
>-      new_fence = amdgpu_amdkfd_fence_create(
>+      if (dma_fence_is_signaled(&process_info->eviction_fence->base)) {
>+              struct amdgpu_amdkfd_fence *new_fence =
>+                      amdgpu_amdkfd_fence_create(
>                               process_info->eviction_fence->base.context,
>                               process_info->eviction_fence->mm,
>                               NULL);
>-      if (!new_fence) {
>-              pr_err("Failed to create eviction fence\n");
>-              ret = -ENOMEM;
>-              goto validate_map_fail;
>+
>+              if (!new_fence) {
>+                      pr_err("Failed to create eviction fence\n");
>+                      ret = -ENOMEM;
>+                      goto validate_map_fail;
>+              }
>+              dma_fence_put(&process_info->eviction_fence->base);
>+              process_info->eviction_fence = new_fence;
>+              replace_eviction_fence(ef, dma_fence_get(&new_fence->base));
>+      } else {
>+              WARN_ONCE(*ef != &process_info->eviction_fence->base,
>+                        "KFD eviction fence doesn't match KGD process_info");
>       }
>-      dma_fence_put(&process_info->eviction_fence->base);
>-      process_info->eviction_fence = new_fence;
>-      *ef = dma_fence_get(&new_fence->base);
>
>       /* Attach new eviction fence to all BOs except pinned ones */
>       list_for_each_entry(mem, &process_info->kfd_bo_list,
>diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>index c10d050e1a61..71df51fcc1b0 100644
>--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>@@ -664,7 +664,8 @@ int kfd_process_create_wq(void)
>       if (!kfd_process_wq)
>               kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0);
>       if (!kfd_restore_wq)
>-              kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq", 0);
>+              kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq",
>+                                                       WQ_FREEZABLE);
>
>       if (!kfd_process_wq || !kfd_restore_wq) {
>               kfd_process_destroy_wq();
>@@ -1642,6 +1643,7 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
>       struct amdgpu_fpriv *drv_priv;
>       struct amdgpu_vm *avm;
>       struct kfd_process *p;
>+      struct dma_fence *ef;
>       struct kfd_node *dev;
>       int ret;
>
>@@ -1661,11 +1663,12 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
>
>       ret = amdgpu_amdkfd_gpuvm_acquire_process_vm(dev->adev, avm,
>                                                    &p->kgd_process_info,
>-                                                   &p->ef);
>+                                                   &ef);
>       if (ret) {
>               pr_err("Failed to create process VM object\n");
>               return ret;
>       }
>+      RCU_INIT_POINTER(p->ef, ef);
>       pdd->drm_priv = drm_file->private_data;
>
>       ret = kfd_process_device_reserve_ib_mem(pdd);
>@@ -1908,6 +1911,21 @@ kfd_process_gpuid_from_node(struct kfd_process *p, struct kfd_node *node,
>       return -EINVAL;
> }
>
>+static int signal_eviction_fence(struct kfd_process *p)
>+{
>+      struct dma_fence *ef;
>+      int ret;
>+
>+      rcu_read_lock();
>+      ef = dma_fence_get_rcu_safe(&p->ef);
>+      rcu_read_unlock();
>+
>+      ret = dma_fence_signal(ef);
>+      dma_fence_put(ef);
>+
>+      return ret;
>+}
>+
> static void evict_process_worker(struct work_struct *work)
> {
>       int ret;
>@@ -1920,31 +1938,46 @@ static void evict_process_worker(struct work_struct *work)
>        * lifetime of this thread, kfd_process p will be valid
>        */
>       p = container_of(dwork, struct kfd_process, eviction_work);
>-      WARN_ONCE(p->last_eviction_seqno != p->ef->seqno,
>-                "Eviction fence mismatch\n");
>-
>-      /* Narrow window of overlap between restore and evict work
>-       * item is possible. Once
>amdgpu_amdkfd_gpuvm_restore_process_bos
>-       * unreserves KFD BOs, it is possible to evicted again. But
>-       * restore has few more steps of finish. So lets wait for any
>-       * previous restore work to complete
>-       */
>-      flush_delayed_work(&p->restore_work);
>
>       pr_debug("Started evicting pasid 0x%x\n", p->pasid);
>       ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_TTM);
>       if (!ret) {
>-              dma_fence_signal(p->ef);
>-              dma_fence_put(p->ef);
>-              p->ef = NULL;
>-              queue_delayed_work(kfd_restore_wq, &p->restore_work,
>+              /* If another thread already signaled the eviction fence,
>+               * they are responsible for stopping the queues and scheduling
>+               * the restore work.
>+               */
>+              if (!signal_eviction_fence(p))
>+                      queue_delayed_work(kfd_restore_wq, &p->restore_work,
>                               msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));
>+              else
>+                      kfd_process_restore_queues(p);
>
>               pr_debug("Finished evicting pasid 0x%x\n", p->pasid);
>       } else
>               pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid);
> }
>
>+static int restore_process_helper(struct kfd_process *p)
>+{
>+      int ret = 0;
>+
>+      /* VMs may not have been acquired yet during debugging. */
>+      if (p->kgd_process_info) {
>+              ret = amdgpu_amdkfd_gpuvm_restore_process_bos(
>+                      p->kgd_process_info, &p->ef);
>+              if (ret)
>+                      return ret;
>+      }
>+
>+      ret = kfd_process_restore_queues(p);
>+      if (!ret)
>+              pr_debug("Finished restoring pasid 0x%x\n", p->pasid);
>+      else
>+              pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid);
>+
>+      return ret;
>+}
>+
> static void restore_process_worker(struct work_struct *work)
> {
>       struct delayed_work *dwork;
>@@ -1970,24 +2003,15 @@ static void restore_process_worker(struct work_struct *work)
>        */
>
>       p->last_restore_timestamp = get_jiffies_64();
>-      /* VMs may not have been acquired yet during debugging. */
>-      if (p->kgd_process_info)
>-              ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
>-                                                           &p->ef);
>+
>+      ret = restore_process_helper(p);
>       if (ret) {
>               pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n",
>                        p->pasid, PROCESS_BACK_OFF_TIME_MS);
>               ret = queue_delayed_work(kfd_restore_wq, &p->restore_work,
>                                        msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS));
>               WARN(!ret, "reschedule restore work failed\n");
>-              return;
>       }
>-
>-      ret = kfd_process_restore_queues(p);
>-      if (!ret)
>-              pr_debug("Finished restoring pasid 0x%x\n", p->pasid);
>-      else
>-              pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid);
> }
>
> void kfd_suspend_all_processes(void)
>@@ -1998,14 +2022,9 @@ void kfd_suspend_all_processes(void)
>
>       WARN(debug_evictions, "Evicting all processes");
>       hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
>-              cancel_delayed_work_sync(&p->eviction_work);
>-              flush_delayed_work(&p->restore_work);
>-
>               if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
>                       pr_err("Failed to suspend process 0x%x\n", p->pasid);
>-              dma_fence_signal(p->ef);
>-              dma_fence_put(p->ef);
>-              p->ef = NULL;
>+              signal_eviction_fence(p);
>       }
>       srcu_read_unlock(&kfd_processes_srcu, idx);
> }
>@@ -2017,7 +2036,7 @@ int kfd_resume_all_processes(void)
>       int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu);
>
>       hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
>-              if (!queue_delayed_work(kfd_restore_wq, &p->restore_work, 0)) {
>+              if (restore_process_helper(p)) {
>                       pr_err("Restore process %d failed during resume\n",
>                              p->pasid);
>                       ret = -EFAULT;
>diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>index b23ba92a794c..42b5279c7010 100644
>--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>@@ -1871,7 +1871,7 @@ static void svm_range_restore_work(struct work_struct *work)
>       /* If validation failed, reschedule another attempt */
>       if (evicted_ranges) {
>               pr_debug("reschedule to restore svm range\n");
>-              schedule_delayed_work(&svms->restore_work,
>+              queue_delayed_work(system_freezable_wq, &svms->restore_work,
>                       msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
>
>               kfd_smi_event_queue_restore_rescheduled(mm);
>@@ -1947,7 +1947,7 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
>                       pr_debug("failed to quiesce KFD\n");
>
>               pr_debug("schedule to restore svm %p ranges\n", svms);
>-              schedule_delayed_work(&svms->restore_work,
>+              queue_delayed_work(system_freezable_wq, &svms->restore_work,
>                       msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
>       } else {
>               unsigned long s, l;
>--
>2.34.1


