[PATCH 3/4] drm/amdkfd: Improve HWS hang detection and handling

Fri Dec 20 17:49:29 UTC 2019

On 2019-12-20 12:18, Zeng, Oak wrote:
> [AMD Official Use Only - Internal Distribution Only]
>
> With this improvement, it is still possible that two reset be scheduled. There is a period of time after HWS hang and before kfd pre-reset is called, during which, if a thread already passed the is_hws_hang check but was scheduled out, then it will also schedule another reset. The whole sequence is:
>
> Thread 1: call unmap_queues_cpsch, pass the is_hws_hang, scheduled out before sending unmap command to HWS
> Thread 2: send unmap to HWS ->hang, schedule a reset
> Thread1: before the reset worker thread is run(resetting is still false), thread1 continus, another reset is scheduled.

Rescheduling the reset worker "before the reset worker thread is run" 
results in the reset worker only running once. The work item can be on 
the queue twice at the same time. The more interesting case is if the 
reset worker is already running but hasn't called 
amdgpu_amdkfd_pre_reset yet. In that case we may end up scheduling a 
second reset. I can't think of a good way to prevent this race.

It gets more confusing when you consider that GPU resets can be 
triggered outside of KFD. So a reset can start outside a KFD reset 
worker thread and KFD can schedule another reset. I think the only place 
to really prevent this type of race would be in 
amdgpu_device_should_recover_gpu with some kind of reset decision flag 
protected by a lock.

I could also try to get rid of the worker thread for GPU resets in KFD. 
I think we created the worker to avoid locking issues, but there may be 
better ways to do this.

Regards,
   Felix

>
>
> Regards,
> Oak
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Felix Kuehling
> Sent: Friday, December 20, 2019 3:30 AM
> To: amd-gfx at lists.freedesktop.org
> Subject: [PATCH 3/4] drm/amdkfd: Improve HWS hang detection and handling
>
> Move HWS hand detection into unmap_queues_cpsch to catch hangs in all cases. If this happens during a reset, don't schedule another reset because the reset already in progress is expected to take care of it.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  3 +++
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 27 ++++++++++++++-----  .../drm/amd/amdkfd/kfd_device_queue_manager.h |  2 ++
>   3 files changed, 26 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index c6b6901bbda3..2a9e40131735 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -728,6 +728,9 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)  {
>   	if (!kfd->init_complete)
>   		return 0;
> +
> +	kfd->dqm->ops.pre_reset(kfd->dqm);
> +
>   	kgd2kfd_suspend(kfd);
>   
>   	kfd_signal_reset_event(kfd);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 558c0ad81848..a7e9ec1b3ce3 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -952,6 +952,13 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
>   	return 0;
>   }
>   
> +static void pre_reset(struct device_queue_manager *dqm) {
> +	dqm_lock(dqm);
> +	dqm->is_resetting = true;
> +	dqm_unlock(dqm);
> +}
> +
>   static int allocate_sdma_queue(struct device_queue_manager *dqm,
>   				struct queue *q)
>   {
> @@ -1099,6 +1106,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
>   	dqm_lock(dqm);
>   	/* clear hang status when driver try to start the hw scheduler */
>   	dqm->is_hws_hang = false;
> +	dqm->is_resetting = false;
>   	dqm->sched_running = true;
>   	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>   	dqm_unlock(dqm);
> @@ -1351,8 +1359,17 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>   	/* should be timed out */
>   	retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
>   				queue_preemption_timeout_ms);
> -	if (retval)
> +	if (retval) {
> +		pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
> +		dqm->is_hws_hang = true;
> +		/* It's possible we're detecting a HWS hang in the
> +		 * middle of a GPU reset. No need to schedule another
> +		 * reset in this case.
> +		 */
> +		if (!dqm->is_resetting)
> +			schedule_work(&dqm->hw_exception_work);
>   		return retval;
> +	}
>   
>   	pm_release_ib(&dqm->packets);
>   	dqm->active_runlist = false;
> @@ -1370,12 +1387,8 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
>   	if (dqm->is_hws_hang)
>   		return -EIO;
>   	retval = unmap_queues_cpsch(dqm, filter, filter_param);
> -	if (retval) {
> -		pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
> -		dqm->is_hws_hang = true;
> -		schedule_work(&dqm->hw_exception_work);
> +	if (retval)
>   		return retval;
> -	}
>   
>   	return map_queues_cpsch(dqm);
>   }
> @@ -1769,6 +1782,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
>   		dqm->ops.initialize = initialize_cpsch;
>   		dqm->ops.start = start_cpsch;
>   		dqm->ops.stop = stop_cpsch;
> +		dqm->ops.pre_reset = pre_reset;
>   		dqm->ops.destroy_queue = destroy_queue_cpsch;
>   		dqm->ops.update_queue = update_queue;
>   		dqm->ops.register_process = register_process; @@ -1787,6 +1801,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
>   		/* initialize dqm for no cp scheduling */
>   		dqm->ops.start = start_nocpsch;
>   		dqm->ops.stop = stop_nocpsch;
> +		dqm->ops.pre_reset = pre_reset;
>   		dqm->ops.create_queue = create_queue_nocpsch;
>   		dqm->ops.destroy_queue = destroy_queue_nocpsch;
>   		dqm->ops.update_queue = update_queue; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index 8991120c4fa2..871d3b628d2d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -104,6 +104,7 @@ struct device_queue_manager_ops {
>   	int	(*initialize)(struct device_queue_manager *dqm);
>   	int	(*start)(struct device_queue_manager *dqm);
>   	int	(*stop)(struct device_queue_manager *dqm);
> +	void	(*pre_reset)(struct device_queue_manager *dqm);
>   	void	(*uninitialize)(struct device_queue_manager *dqm);
>   	int	(*create_kernel_queue)(struct device_queue_manager *dqm,
>   					struct kernel_queue *kq,
> @@ -198,6 +199,7 @@ struct device_queue_manager {
>   
>   	/* hw exception  */
>   	bool			is_hws_hang;
> +	bool			is_resetting;
>   	struct work_struct	hw_exception_work;
>   	struct kfd_mem_obj	hiq_sdma_mqd;
>   	bool			sched_running;
> --
> 2.24.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Coak.zeng%40amd.com%7Ca59ace5396cb46ed384708d78526df99%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124274434007022&sdata=g%2B57MBWpTbFzbchp6%2Bi2dfmUzYXlsf77InUj3R1XaaY%3D&reserved=0