[RFC PATCH] drm/amdkfd: Run restore_workers on freezable WQs
Christian König
christian.koenig at amd.com
Mon Oct 30 08:23:03 UTC 2023
Am 28.10.23 um 00:39 schrieb Felix Kuehling:
> Make restore workers freezable so we don't have to explicitly flush them
> in suspend and GPU reset code paths, and we don't accidentally try to
> restore BOs while the GPU is suspended. Not having to flush restore_work
> also helps avoid lock/fence dependencies in the GPU reset case where we're
> not allowed to wait for fences.
>
> This is an RFC and request for testing. I have not tested this myself yet.
> My notes below:
>
> Restore work won't be frozen during GPU reset. Does it matter? Queues will
> stay evicted until resume in any case. But restore_work may be in trouble
> if it gets run in the middle of a GPU reset. In that case, if anything
> fails, it will just reschedule itself, so should be fine as long as it
> doesn't interfere with the reset itself (respects any mechanisms in place
> to prevent HW access during the reset).
>
> What HW access does restore_work perform:
> - migrating buffers: uses GPU scheduler, should be suspended during reset
> - TLB flushes in userptr restore worker: not called directly, relies on
> HWS to flush TLBs on VMID attachment
> - TLB flushes in SVM restore worker: Does TLB flush in the mapping code
> - Resuming user mode queues: should not happen while GPU reset keeps queue
> eviction counter elevated
> Ergo: Except for the SVM case, it's OK to not flush restore work before
> GPU resets. I'll need to rethink the interaction of SVM restore_work with
> GPU resets.
It also sounds like the restore work is some high-level work and
shouldn't interact with the lower-level GPU reset.
>
> What about cancelling p->eviction_work? Eviction work would normally be
> needed to signal eviction fences, but we're doing that explicitly in
> suspend_all_processes. Does eviction_work wait for fences anywhere? Yes,
> indirectly by flushing restore_work. So we should not try to cancel it
> during a GPU reset.
>
> Problem: accessing p->ef concurrently in evict_process_worker and
> suspend_all_processes. Need a spinlock to use and update it safely.
> Problem: What if evict_process_worker gets stuck in flushing restore_work?
> We can skip all of that if p->ef is NULL (which it is during the reset).
> Even if it gets stuck, it's no problem if the reset doesn't depend on it.
> It should get unstuck after the reset.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 9 ++--
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 +
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 49 +++++++++++++------
> drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 4 +-
> 4 files changed, 44 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 54f31a420229..89e632257663 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -1644,7 +1644,8 @@ int amdgpu_amdkfd_criu_resume(void *p)
> goto out_unlock;
> }
> WRITE_ONCE(pinfo->block_mmu_notifications, false);
> - schedule_delayed_work(&pinfo->restore_userptr_work, 0);
> + queue_delayed_work(system_freezable_wq,
> + &pinfo->restore_userptr_work, 0);
>
> out_unlock:
> mutex_unlock(&pinfo->lock);
> @@ -2458,7 +2459,8 @@ int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni,
> KFD_QUEUE_EVICTION_TRIGGER_USERPTR);
> if (r)
> pr_err("Failed to quiesce KFD\n");
> - schedule_delayed_work(&process_info->restore_userptr_work,
> + queue_delayed_work(system_freezable_wq,
> + &process_info->restore_userptr_work,
> msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
> }
> mutex_unlock(&process_info->notifier_lock);
> @@ -2793,7 +2795,8 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
>
> /* If validation failed, reschedule another attempt */
> if (evicted_bos) {
> - schedule_delayed_work(&process_info->restore_userptr_work,
> + queue_delayed_work(system_freezable_wq,
> + &process_info->restore_userptr_work,
> msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
>
> kfd_smi_event_queue_restore_rescheduled(mm);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 9cc32f577e38..cf017d027fee 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -919,6 +919,7 @@ struct kfd_process {
> * during restore
> */
> struct dma_fence *ef;
> + spinlock_t ef_lock;
>
> /* Work items for evicting and restoring BOs */
> struct delayed_work eviction_work;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index fbf053001af9..a07cba58ec5e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -664,7 +664,8 @@ int kfd_process_create_wq(void)
> if (!kfd_process_wq)
> kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0);
> if (!kfd_restore_wq)
> - kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq", 0);
> + kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq",
> + WQ_FREEZABLE);
>
> if (!kfd_process_wq || !kfd_restore_wq) {
> kfd_process_destroy_wq();
> @@ -1460,6 +1461,7 @@ static struct kfd_process *create_process(const struct task_struct *thread)
>
> kref_init(&process->ref);
> mutex_init(&process->mutex);
> + spin_lock_init(&process->ef_lock);
> process->mm = thread->mm;
> process->lead_thread = thread->group_leader;
> process->n_pdds = 0;
> @@ -1904,6 +1906,19 @@ kfd_process_gpuid_from_node(struct kfd_process *p, struct kfd_node *node,
> return -EINVAL;
> }
>
> +static void signal_eviction_fence(struct kfd_process *p)
> +{
> + spin_lock(&p->ef_lock);
> + if (!p->ef)
> + goto unlock_out;
> + dma_fence_signal(p->ef);
This needs to grab the internal lock of the eviction fence; I'm not sure
that has correct ordering with the newly added ef_lock.
> + dma_fence_put(p->ef);
> + WRITE_ONCE(p->ef, NULL);
> +
> +unlock_out:
> + spin_unlock(&p->ef_lock);
> +}
> +
> static void evict_process_worker(struct work_struct *work)
> {
> int ret;
> @@ -1916,8 +1931,11 @@ static void evict_process_worker(struct work_struct *work)
> * lifetime of this thread, kfd_process p will be valid
> */
> p = container_of(dwork, struct kfd_process, eviction_work);
> - WARN_ONCE(p->last_eviction_seqno != p->ef->seqno,
> - "Eviction fence mismatch\n");
> + /* If the eviction fence is not valid, we're probably in a suspend
> + * or GPU reset cycle. There is nothing to do in this case.
> + */
> + if (!READ_ONCE(p->ef))
> + return;
This evict process work is high level; I don't think it should have any
dependency on the GPU reset.
Regards,
Christian.
>
> /* Narrow window of overlap between restore and evict work
> * item is possible. Once amdgpu_amdkfd_gpuvm_restore_process_bos
> @@ -1930,9 +1948,7 @@ static void evict_process_worker(struct work_struct *work)
> pr_debug("Started evicting pasid 0x%x\n", p->pasid);
> ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_TTM);
> if (!ret) {
> - dma_fence_signal(p->ef);
> - dma_fence_put(p->ef);
> - p->ef = NULL;
> + signal_eviction_fence(p);
> queue_delayed_work(kfd_restore_wq, &p->restore_work,
> msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));
>
> @@ -1967,9 +1983,19 @@ static void restore_process_worker(struct work_struct *work)
>
> p->last_restore_timestamp = get_jiffies_64();
> /* VMs may not have been acquired yet during debugging. */
> - if (p->kgd_process_info)
> + if (p->kgd_process_info) {
> + struct dma_fence *ef = NULL;
> +
> ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
> - &p->ef);
> + &ef);
> + if (!ret) {
> + spin_lock(&p->ef_lock);
> + WARN_ONCE(p->ef, "Eviction fence is not NULL");
> + WRITE_ONCE(p->ef, ef);
> + spin_unlock(&p->ef_lock);
> + }
> + }
> +
> if (ret) {
> pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n",
> p->pasid, PROCESS_BACK_OFF_TIME_MS);
> @@ -1994,14 +2020,9 @@ void kfd_suspend_all_processes(void)
>
> WARN(debug_evictions, "Evicting all processes");
> hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
> - cancel_delayed_work_sync(&p->eviction_work);
> - flush_delayed_work(&p->restore_work);
> -
> if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
> pr_err("Failed to suspend process 0x%x\n", p->pasid);
> - dma_fence_signal(p->ef);
> - dma_fence_put(p->ef);
> - p->ef = NULL;
> + signal_eviction_fence(p);
> }
> srcu_read_unlock(&kfd_processes_srcu, idx);
> }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index fe937670c927..aa6c34127be9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -1869,7 +1869,7 @@ static void svm_range_restore_work(struct work_struct *work)
> /* If validation failed, reschedule another attempt */
> if (evicted_ranges) {
> pr_debug("reschedule to restore svm range\n");
> - schedule_delayed_work(&svms->restore_work,
> + queue_delayed_work(system_freezable_wq, &svms->restore_work,
> msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
>
> kfd_smi_event_queue_restore_rescheduled(mm);
> @@ -1945,7 +1945,7 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
> pr_debug("failed to quiesce KFD\n");
>
> pr_debug("schedule to restore svm %p ranges\n", svms);
> - schedule_delayed_work(&svms->restore_work,
> + queue_delayed_work(system_freezable_wq, &svms->restore_work,
> msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
> } else {
> unsigned long s, l;
More information about the amd-gfx
mailing list