[PATCH v3] drm/amdkfd: Run restore_workers on freezable WQs

Lazar, Lijo lijo.lazar at amd.com
Fri Nov 24 13:40:07 UTC 2023



On 11/24/2023 4:25 AM, Felix Kuehling wrote:
> Make restore workers freezable so we don't have to explicitly flush them
> in suspend and GPU reset code paths, and we don't accidentally try to
> restore BOs while the GPU is suspended. Not having to flush restore_work
> also helps avoid lock/fence dependencies in the GPU reset case where we're
> not allowed to wait for fences.
> 
> A side effect of this is that we can now have multiple concurrent threads
> trying to signal the same eviction fence. Rework eviction fence signaling
> and replacement to account for that.
> 
> The GPU reset path can no longer rely on restore_process_worker to resume
> queues because evict/restore workers can run independently of it. Instead
> call a new restore_process_helper directly.

I'm not familiar with this code. For clarity, does this mean that 
eviction/restore may happen while a reset is in progress?

Thanks,
Lijo

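For context, the key property of a WQ_FREEZABLE workqueue is that the
freezer drains it before suspend/hibernate and thaws it on resume, so
work queued on it can never run while the GPU is suspended, and the
suspend path no longer needs to flush it explicitly. A minimal,
self-contained sketch of that pattern (all names are hypothetical, not
from this patch):

#include <linux/jiffies.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_restore_wq;
static struct delayed_work my_restore_work;

static void my_restore_fn(struct work_struct *work)
{
        /* Never runs between freeze and thaw, i.e. not while the
         * system (and GPU) is suspended.
         */
}

static int my_setup(void)
{
        /* Ordered: at most one work item runs at a time, in queueing
         * order. WQ_FREEZABLE: participates in the system freeze.
         */
        my_restore_wq = alloc_ordered_workqueue("my_restore_wq",
                                                WQ_FREEZABLE);
        if (!my_restore_wq)
                return -ENOMEM;

        INIT_DELAYED_WORK(&my_restore_work, my_restore_fn);
        queue_delayed_work(my_restore_wq, &my_restore_work,
                           msecs_to_jiffies(100));
        return 0;
}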
> 
> This is an RFC and request for testing.
> 
> v2:
> - Reworked eviction fence signaling
> - Introduced restore_process_helper
> 
> v3:
> - Handle unsignaled eviction fences in restore_process_bos
> 
> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
> Acked-by: Christian König <christian.koenig at amd.com>
> ---
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 68 +++++++++++----
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 87 +++++++++++--------
>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c          |  4 +-
>   3 files changed, 104 insertions(+), 55 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 2e302956a279..bdec88713a09 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -1431,7 +1431,6 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
>   				  amdgpu_amdkfd_restore_userptr_worker);
>   
>   		*process_info = info;
> -		*ef = dma_fence_get(&info->eviction_fence->base);
>   	}
>   
>   	vm->process_info = *process_info;
> @@ -1462,6 +1461,8 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
>   	list_add_tail(&vm->vm_list_node,
>   			&(vm->process_info->vm_list_head));
>   	vm->process_info->n_vms++;
> +
> +	*ef = dma_fence_get(&vm->process_info->eviction_fence->base);
>   	mutex_unlock(&vm->process_info->lock);
>   
>   	return 0;
> @@ -1473,10 +1474,7 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
>   reserve_pd_fail:
>   	vm->process_info = NULL;
>   	if (info) {
> -		/* Two fence references: one in info and one in *ef */
>   		dma_fence_put(&info->eviction_fence->base);
> -		dma_fence_put(*ef);
> -		*ef = NULL;
>   		*process_info = NULL;
>   		put_pid(info->pid);
>   create_evict_fence_fail:
> @@ -1670,7 +1668,8 @@ int amdgpu_amdkfd_criu_resume(void *p)
>   		goto out_unlock;
>   	}
>   	WRITE_ONCE(pinfo->block_mmu_notifications, false);
> -	schedule_delayed_work(&pinfo->restore_userptr_work, 0);
> +	queue_delayed_work(system_freezable_wq,
> +			   &pinfo->restore_userptr_work, 0);
>   
>   out_unlock:
>   	mutex_unlock(&pinfo->lock);
> @@ -2475,7 +2474,8 @@ int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni,
>   				       KFD_QUEUE_EVICTION_TRIGGER_USERPTR);
>   		if (r)
>   			pr_err("Failed to quiesce KFD\n");
> -		schedule_delayed_work(&process_info->restore_userptr_work,
> +		queue_delayed_work(system_freezable_wq,
> +			&process_info->restore_userptr_work,
>   			msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
>   	}
>   	mutex_unlock(&process_info->notifier_lock);
> @@ -2810,7 +2810,8 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
>   
>   	/* If validation failed, reschedule another attempt */
>   	if (evicted_bos) {
> -		schedule_delayed_work(&process_info->restore_userptr_work,
> +		queue_delayed_work(system_freezable_wq,
> +			&process_info->restore_userptr_work,
>   			msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
>   
>   		kfd_smi_event_queue_restore_rescheduled(mm);
> @@ -2819,6 +2820,23 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
>   	put_task_struct(usertask);
>   }
>   
> +static void replace_eviction_fence(struct dma_fence **ef,
> +				   struct dma_fence *new_ef)
> +{
> +	struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true
> +		/* protected by process_info->lock */);
> +
> +	/* If we're replacing an unsignaled eviction fence, that fence will
> +	 * never be signaled, and if anyone is still waiting on that fence,
> +	 * they will hang forever. This should never happen: we only
> +	 * replace the fence in restore_work, which is scheduled only
> +	 * after the eviction work has signaled the fence.
> +	 */
> +	WARN_ONCE(!dma_fence_is_signaled(old_ef),
> +		  "Replacing unsignaled eviction fence");
> +	dma_fence_put(old_ef);
> +}
> +
>   /** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given
>    *   KFD process identified by process_info
>    *
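The rcu_replace_pointer() above is the write side of an RCU-protected
fence pointer; the matching read side is dma_fence_get_rcu_safe() in
signal_eviction_fence() further down in the patch. A minimal sketch of
that pairing, assuming a hypothetical structure with an __rcu fence
pointer:

#include <linux/dma-fence.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>

struct my_info {
        struct dma_fence __rcu *ef;     /* replaced under my_info->lock */
        struct mutex lock;
};

/* Reader: safe against a concurrent replace-and-put of ->ef.
 * dma_fence_get_rcu_safe() loops until it holds a reference to the
 * fence that is still the current one.
 */
static struct dma_fence *my_get_fence(struct my_info *info)
{
        struct dma_fence *fence;

        rcu_read_lock();
        fence = dma_fence_get_rcu_safe(&info->ef);
        rcu_read_unlock();

        return fence;   /* caller must dma_fence_put() this */
}

/* Writer: publish the new fence, then drop the old reference. */
static void my_replace_fence(struct my_info *info,
                             struct dma_fence *new_fence)
{
        struct dma_fence *old;

        mutex_lock(&info->lock);
        old = rcu_replace_pointer(info->ef, new_fence,
                                  lockdep_is_held(&info->lock));
        mutex_unlock(&info->lock);
        dma_fence_put(old);     /* NULL-safe */
}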
> @@ -2844,7 +2862,6 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
>   	struct amdgpu_vm *peer_vm;
>   	struct kgd_mem *mem;
>   	struct bo_vm_reservation_context ctx;
> -	struct amdgpu_amdkfd_fence *new_fence;
>   	int ret = 0, i;
>   	struct list_head duplicate_save;
>   	struct amdgpu_sync sync_obj;
> @@ -2974,22 +2991,35 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
>   	/* Wait for validate and PT updates to finish */
>   	amdgpu_sync_wait(&sync_obj, false);
>   
> -	/* Release old eviction fence and create new one, because fence only
> -	 * goes from unsignaled to signaled, fence cannot be reused.
> -	 * Use context and mm from the old fence.
> +	/* The old eviction fence may be unsignaled if restore happens
> +	 * after a GPU reset or suspend/resume. Keep the old fence in that
> +	 * case. Otherwise release the old eviction fence and create a new
> +	 * one, because a fence only goes from unsignaled to signaled once
> +	 * and cannot be reused. Use the context and mm from the old fence.
> +	 *
> +	 * If an old eviction fence signals after this check, that's OK.
> +	 * Anyone signaling an eviction fence must stop the queues first
> +	 * and schedule another restore worker.
>   	 */
> -	new_fence = amdgpu_amdkfd_fence_create(
> +	if (dma_fence_is_signaled(&process_info->eviction_fence->base)) {
> +		struct amdgpu_amdkfd_fence *new_fence =
> +			amdgpu_amdkfd_fence_create(
>   				process_info->eviction_fence->base.context,
>   				process_info->eviction_fence->mm,
>   				NULL);
> -	if (!new_fence) {
> -		pr_err("Failed to create eviction fence\n");
> -		ret = -ENOMEM;
> -		goto validate_map_fail;
> +
> +		if (!new_fence) {
> +			pr_err("Failed to create eviction fence\n");
> +			ret = -ENOMEM;
> +			goto validate_map_fail;
> +		}
> +		dma_fence_put(&process_info->eviction_fence->base);
> +		process_info->eviction_fence = new_fence;
> +		replace_eviction_fence(ef, dma_fence_get(&new_fence->base));
> +	} else {
> +		WARN_ONCE(*ef != &process_info->eviction_fence->base,
> +			  "KFD eviction fence doesn't match KGD process_info");
>   	}
> -	dma_fence_put(&process_info->eviction_fence->base);
> -	process_info->eviction_fence = new_fence;
> -	*ef = dma_fence_get(&new_fence->base);
>   
>   	/* Attach new eviction fence to all BOs except pinned ones */
>   	list_for_each_entry(mem, &process_info->kfd_bo_list,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index c10d050e1a61..71df51fcc1b0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -664,7 +664,8 @@ int kfd_process_create_wq(void)
>   	if (!kfd_process_wq)
>   		kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0);
>   	if (!kfd_restore_wq)
> -		kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq", 0);
> +		kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq",
> +							 WQ_FREEZABLE);
>   
>   	if (!kfd_process_wq || !kfd_restore_wq) {
>   		kfd_process_destroy_wq();
> @@ -1642,6 +1643,7 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
>   	struct amdgpu_fpriv *drv_priv;
>   	struct amdgpu_vm *avm;
>   	struct kfd_process *p;
> +	struct dma_fence *ef;
>   	struct kfd_node *dev;
>   	int ret;
>   
> @@ -1661,11 +1663,12 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
>   
>   	ret = amdgpu_amdkfd_gpuvm_acquire_process_vm(dev->adev, avm,
>   						     &p->kgd_process_info,
> -						     &p->ef);
> +						     &ef);
>   	if (ret) {
>   		pr_err("Failed to create process VM object\n");
>   		return ret;
>   	}
> +	RCU_INIT_POINTER(p->ef, ef);
>   	pdd->drm_priv = drm_file->private_data;
>   
>   	ret = kfd_process_device_reserve_ib_mem(pdd);
> @@ -1908,6 +1911,21 @@ kfd_process_gpuid_from_node(struct kfd_process *p, struct kfd_node *node,
>   	return -EINVAL;
>   }
>   
> +static int signal_eviction_fence(struct kfd_process *p)
> +{
> +	struct dma_fence *ef;
> +	int ret;
> +
> +	rcu_read_lock();
> +	ef = dma_fence_get_rcu_safe(&p->ef);
> +	rcu_read_unlock();
> +
> +	ret = dma_fence_signal(ef);
> +	dma_fence_put(ef);
> +
> +	return ret;
> +}
> +
>   static void evict_process_worker(struct work_struct *work)
>   {
>   	int ret;
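The single-restore-scheduler logic below relies on the return value of
dma_fence_signal(), which signal_eviction_fence() passes through: it
returns 0 only for the caller that performs the one-time unsignaled ->
signaled transition, and -EINVAL if the fence was already signaled (or
is NULL). A sketch of that contract, not from the patch:

#include <linux/dma-fence.h>

/* Sketch: dma_fence signaling is a one-shot, irreversible transition,
 * and dma_fence_signal() reports which caller won the race to make it.
 */
static void fence_signal_contract(struct dma_fence *fence)
{
        WARN_ON(dma_fence_is_signaled(fence));          /* still armed */
        WARN_ON(dma_fence_signal(fence) != 0);          /* first signaler sees 0 */
        WARN_ON(!dma_fence_is_signaled(fence));         /* signaled for good */
        WARN_ON(dma_fence_signal(fence) != -EINVAL);    /* late signaler sees -EINVAL */
}

So when evict_process_worker() sees 0, it won the race and schedules the
restore; when it sees -EINVAL, whoever signaled first owns the restore,
and it only needs to resume the queues.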
> @@ -1920,31 +1938,46 @@ static void evict_process_worker(struct work_struct *work)
>   	 * lifetime of this thread, kfd_process p will be valid
>   	 */
>   	p = container_of(dwork, struct kfd_process, eviction_work);
> -	WARN_ONCE(p->last_eviction_seqno != p->ef->seqno,
> -		  "Eviction fence mismatch\n");
> -
> -	/* Narrow window of overlap between restore and evict work
> -	 * item is possible. Once amdgpu_amdkfd_gpuvm_restore_process_bos
> -	 * unreserves KFD BOs, it is possible to evicted again. But
> -	 * restore has few more steps of finish. So lets wait for any
> -	 * previous restore work to complete
> -	 */
> -	flush_delayed_work(&p->restore_work);
>   
>   	pr_debug("Started evicting pasid 0x%x\n", p->pasid);
>   	ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_TTM);
>   	if (!ret) {
> -		dma_fence_signal(p->ef);
> -		dma_fence_put(p->ef);
> -		p->ef = NULL;
> -		queue_delayed_work(kfd_restore_wq, &p->restore_work,
> +		/* If another thread already signaled the eviction fence,
> +		 * they are responsible for stopping the queues and scheduling
> +		 * the restore work.
> +		 */
> +		if (!signal_eviction_fence(p))
> +			queue_delayed_work(kfd_restore_wq, &p->restore_work,
>   				msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));
> +		else
> +			kfd_process_restore_queues(p);
>   
>   		pr_debug("Finished evicting pasid 0x%x\n", p->pasid);
>   	} else
>   		pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid);
>   }
>   
> +static int restore_process_helper(struct kfd_process *p)
> +{
> +	int ret = 0;
> +
> +	/* VMs may not have been acquired yet during debugging. */
> +	if (p->kgd_process_info) {
> +		ret = amdgpu_amdkfd_gpuvm_restore_process_bos(
> +			p->kgd_process_info, &p->ef);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	ret = kfd_process_restore_queues(p);
> +	if (!ret)
> +		pr_debug("Finished restoring pasid 0x%x\n", p->pasid);
> +	else
> +		pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid);
> +
> +	return ret;
> +}
> +
>   static void restore_process_worker(struct work_struct *work)
>   {
>   	struct delayed_work *dwork;
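With restore_process_helper() factored out above, a restore can be
invoked either deferred from the worker or synchronously from the
reset/resume path, where waiting on the worker (and its fences) is not
allowed. A sketch contrasting the two call styles as they appear in
this patch:

/* Deferred: from evict_process_worker(), on the freezable ordered wq. */
queue_delayed_work(kfd_restore_wq, &p->restore_work,
                   msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));

/* Synchronous: from kfd_resume_all_processes(), no worker round trip. */
if (restore_process_helper(p))
        pr_err("Restore process %d failed during resume\n", p->pasid);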
> @@ -1970,24 +2003,15 @@ static void restore_process_worker(struct work_struct *work)
>   	 */
>   
>   	p->last_restore_timestamp = get_jiffies_64();
> -	/* VMs may not have been acquired yet during debugging. */
> -	if (p->kgd_process_info)
> -		ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
> -							     &p->ef);
> +
> +	ret = restore_process_helper(p);
>   	if (ret) {
>   		pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n",
>   			 p->pasid, PROCESS_BACK_OFF_TIME_MS);
>   		ret = queue_delayed_work(kfd_restore_wq, &p->restore_work,
>   				msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS));
>   		WARN(!ret, "reschedule restore work failed\n");
> -		return;
>   	}
> -
> -	ret = kfd_process_restore_queues(p);
> -	if (!ret)
> -		pr_debug("Finished restoring pasid 0x%x\n", p->pasid);
> -	else
> -		pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid);
>   }
>   
>   void kfd_suspend_all_processes(void)
> @@ -1998,14 +2022,9 @@ void kfd_suspend_all_processes(void)
>   
>   	WARN(debug_evictions, "Evicting all processes");
>   	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
> -		cancel_delayed_work_sync(&p->eviction_work);
> -		flush_delayed_work(&p->restore_work);
> -
>   		if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
>   			pr_err("Failed to suspend process 0x%x\n", p->pasid);
> -		dma_fence_signal(p->ef);
> -		dma_fence_put(p->ef);
> -		p->ef = NULL;
> +		signal_eviction_fence(p);
>   	}
>   	srcu_read_unlock(&kfd_processes_srcu, idx);
>   }
> @@ -2017,7 +2036,7 @@ int kfd_resume_all_processes(void)
>   	int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu);
>   
>   	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
> -		if (!queue_delayed_work(kfd_restore_wq, &p->restore_work, 0)) {
> +		if (restore_process_helper(p)) {
>   			pr_err("Restore process %d failed during resume\n",
>   			       p->pasid);
>   			ret = -EFAULT;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index b23ba92a794c..42b5279c7010 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -1871,7 +1871,7 @@ static void svm_range_restore_work(struct work_struct *work)
>   	/* If validation failed, reschedule another attempt */
>   	if (evicted_ranges) {
>   		pr_debug("reschedule to restore svm range\n");
> -		schedule_delayed_work(&svms->restore_work,
> +		queue_delayed_work(system_freezable_wq, &svms->restore_work,
>   			msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
>   
>   		kfd_smi_event_queue_restore_rescheduled(mm);
> @@ -1947,7 +1947,7 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
>   			pr_debug("failed to quiesce KFD\n");
>   
>   		pr_debug("schedule to restore svm %p ranges\n", svms);
> -		schedule_delayed_work(&svms->restore_work,
> +		queue_delayed_work(system_freezable_wq, &svms->restore_work,
>   			msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
>   	} else {
>   		unsigned long s, l;

