[PATCH] drm/amdkfd: restore_process_worker race with GPU reset
Felix Kuehling
felix.kuehling at amd.com
Thu Aug 29 21:15:04 UTC 2024
On 2024-08-23 15:49, Philip Yang wrote:
> If a GPU reset kicks in while KFD restore_process_worker is running, this
> may cause various issues, for example the rcu stall warning below, because
> the restore work may move BOs and evict queues under VRAM pressure.
>
> Fix this race by taking the adev reset_domain read semaphore to prevent
> GPU reset during restore_process_worker; the reset read semaphore can be
> taken recursively if the adev has multiple partitions.
>
> There is then a livelock if the CP hangs while restore_process_worker
> runs: the GPU reset waits for the semaphore before it can start, and
> restore_process_worker cannot finish to release the semaphore. Resolve
> the livelock by signaling the eviction fence when evicting queues
> returns -ETIMEDOUT (for the MES path) or -ETIME (for the HWS path)
> because the CP hangs.
>
> amdgpu 0000:af:00.0: amdgpu: GPU reset(21) succeeded!
> rcu: INFO: rcu_sched self-detected stall on CPU
>
> Workqueue: kfd_restore_wq restore_process_worker [amdgpu]
> Call Trace:
> update_process_times+0x94/0xd0
> RIP: 0010:amdgpu_vm_handle_moved+0x9a/0x210 [amdgpu]
> amdgpu_amdkfd_gpuvm_restore_process_bos+0x3d6/0x7d0 [amdgpu]
> restore_process_helper+0x27/0x80 [amdgpu]
>
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
See comments inline. I'd also like Christian to take a look at this
patch since he's the expert on the reset locking stuff.
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 56 +++++++++++++++++++++++-
> 1 file changed, 55 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index a902950cc060..53a814347522 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -35,6 +35,7 @@
> #include <linux/pm_runtime.h>
> #include "amdgpu_amdkfd.h"
> #include "amdgpu.h"
> +#include "amdgpu_reset.h"
>
> struct mm_struct;
>
> @@ -1972,8 +1973,14 @@ static void evict_process_worker(struct work_struct *work)
> kfd_process_restore_queues(p);
>
> pr_debug("Finished evicting pasid 0x%x\n", p->pasid);
> - } else
> + } else if (ret == -ETIMEDOUT || ret == -ETIME) {
> +	/* If the CP hangs, signal the eviction fence so that
> +	 * restore_process_worker can finish and up_read the GPU reset
> +	 * semaphore, allowing the GPU reset to start.
> +	 */
> + signal_eviction_fence(p);
> + } else {
> pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid);
> + }
> }
>
> static int restore_process_helper(struct kfd_process *p)
> @@ -1997,6 +2004,45 @@ static int restore_process_helper(struct kfd_process *p)
> return ret;
> }
>
> +/*
> + * kfd_hold_devices_reset_semaphore
> + *
> + * return:
> + * true : hold reset domain semaphore to prevent device reset
> + * false: one of the devices is resetting or has already been reset
> + *
> + */
> +static bool kfd_hold_devices_reset_semaphore(struct kfd_process *p)
I find the naming of these functions (hold/unhold) a bit weird.
I'd suggest kfd_process_trylock_reset_sems/kfd_process_unlock_reset_sems.
> +{
> + struct amdgpu_device *adev;
> + int i;
> +
> + for (i = 0; i < p->n_pdds; i++) {
> + adev = p->pdds[i]->dev->adev;
> + if (!down_read_trylock(&adev->reset_domain->sem))
> + goto out_upread;
> + }
> + return true;
> +
> +out_upread:
> + while (i--) {
> + adev = p->pdds[i]->dev->adev;
> + up_read(&adev->reset_domain->sem);
> + }
> + return false;
> +}
> +
> +static void kfd_unhold_devices_reset_semaphore(struct kfd_process *p)
> +{
> + struct amdgpu_device *adev;
> + int i;
> +
> + for (i = 0; i < p->n_pdds; i++) {
> + adev = p->pdds[i]->dev->adev;
> + up_read(&adev->reset_domain->sem);
> + }
> +}
> +
> static void restore_process_worker(struct work_struct *work)
> {
> struct delayed_work *dwork;
> @@ -2009,6 +2055,12 @@ static void restore_process_worker(struct work_struct *work)
> * lifetime of this thread, kfd_process p will be valid
> */
> p = container_of(dwork, struct kfd_process, restore_work);
> +
> + if (!kfd_hold_devices_reset_semaphore(p)) {
> + pr_debug("GPU resetting, restore bo and queue skipped\n");
Should we reschedule the restore worker to make sure it runs again after
the reset is done?
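Something like the following, perhaps (a sketch only; it reuses the existing
kfd_restore_wq and restore_work, and the delay value is a placeholder):

```c
	if (!kfd_hold_devices_reset_semaphore(p)) {
		pr_debug("GPU resetting, rescheduling restore of pasid 0x%x\n",
			 p->pasid);
		/* Retry once the reset domain semaphore is available again */
		queue_delayed_work(kfd_restore_wq, &p->restore_work,
				   msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));
		return;
	}
```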
Thanks,
Felix
> + return;
> + }
> +
> pr_debug("Started restoring pasid 0x%x\n", p->pasid);
>
> /* Setting last_restore_timestamp before successful restoration.
> @@ -2031,6 +2083,8 @@ static void restore_process_worker(struct work_struct *work)
> msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)))
> kfd_process_restore_queues(p);
> }
> +
> + kfd_unhold_devices_reset_semaphore(p);
> }
>
> void kfd_suspend_all_processes(void)