[PATCH] drm/amd/amdkfd: Don't send command to HWS on kfd reset

Thu Nov 4 03:15:46 UTC 2021

On 2021-11-03 11:04 a.m., shaoyunl wrote:
> When KFD needs to be reset, sending commands to the HWS might cause a hang and an unnecessary timeout.
> This change avoids touching the HW in pre_reset and keeps the queues in the evicted state
> when the reset is done, so they are not put back on the runlist. These queues will be destroyed
> on process termination.
>
> Signed-off-by: shaoyunl <shaoyun.liu at amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c               | 6 +++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h                 | 2 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c              | 6 +++++-
>   4 files changed, 13 insertions(+), 3 deletions(-)
>   mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/kfd_device.c
>   mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>   mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>   mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/kfd_process.c
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> old mode 100644
> new mode 100755
> index c8aade17efef..536ef766d09e
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -1100,6 +1100,8 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>   	if (!kfd->init_complete)
>   		return 0;
>   
> +	kfd->is_resetting = true;
> +
>   	kfd_smi_event_update_gpu_reset(kfd, false);
>   
>   	kfd->dqm->ops.pre_reset(kfd->dqm);
> @@ -1132,6 +1134,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
>   
>   	kfd_smi_event_update_gpu_reset(kfd, true);
>   
> +	kfd->is_resetting = false;
> +
>   	return 0;
>   }
>   
> @@ -1168,7 +1172,7 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
>   		return ret;
>   
>   	/* for runtime resume, skip unlocking kfd */
> -	if (!run_pm) {
> +	if (!run_pm && !kfd->is_resetting) {

This is not needed. post_reset calls kfd_resume, not kgd2kfd_resume. It 
should never get here.

Regards,
   Felix

>   		count = atomic_dec_return(&kfd_locked);
>   		WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
>   		if (count == 0)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> old mode 100644
> new mode 100755
> index e9601d4dfb77..0a60317509c8
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1430,7 +1430,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>   
>   	if (!dqm->sched_running)
>   		return 0;
> -	if (dqm->is_hws_hang)
> +	if (dqm->is_hws_hang || dqm->is_resetting)
>   		return -EIO;
>   	if (!dqm->active_runlist)
>   		return retval;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> old mode 100644
> new mode 100755
> index bfe7bacccb73..e4bcc2a09ca8
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -275,6 +275,8 @@ struct kfd_dev {
>   	struct device_queue_manager *dqm;
>   
>   	bool init_complete;
> +	bool is_resetting;
> +
>   	/*
>   	 * Interrupts of interest to KFD are copied
>   	 * from the HW ring into a SW ring.
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> old mode 100644
> new mode 100755
> index f8a8fdb95832..f29b3932e3dc
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1715,7 +1715,11 @@ int kfd_process_evict_queues(struct kfd_process *p)
>   
>   		r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm,
>   							    &pdd->qpd);
> -		if (r) {
> +		/* Eviction returns -EIO if the HWS is hung or the ASIC is resetting. In this case
> +		 * we still want to mark all the queues as evicted to prevent them from being
> +		 * added back, since they have not actually been saved at this point.
> +		 */
> +		if (r && r != -EIO) {
>   			pr_err("Failed to evict process queues\n");
>   			goto fail;
>   		}