[PATCH 3/4] drm/amdkfd: Improve HWS hang detection and handling

Fri Dec 20 17:18:08 UTC 2019

[AMD Official Use Only - Internal Distribution Only]

With this improvement, it is still possible that two reset be scheduled. There is a period of time after HWS hang and before kfd pre-reset is called, during which, if a thread already passed the is_hws_hang check but was scheduled out, then it will also schedule another reset. The whole sequence is:

Thread 1: call unmap_queues_cpsch, pass the is_hws_hang, scheduled out before sending unmap command to HWS
Thread 2: send unmap to HWS ->hang, schedule a reset
Thread1: before the reset worker thread is run(resetting is still false), thread1 continus, another reset is scheduled.


Regards,
Oak

-----Original Message-----
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Felix Kuehling
Sent: Friday, December 20, 2019 3:30 AM
To: amd-gfx at lists.freedesktop.org
Subject: [PATCH 3/4] drm/amdkfd: Improve HWS hang detection and handling

Move HWS hand detection into unmap_queues_cpsch to catch hangs in all cases. If this happens during a reset, don't schedule another reset because the reset already in progress is expected to take care of it.

Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  3 +++
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 27 ++++++++++++++-----  .../drm/amd/amdkfd/kfd_device_queue_manager.h |  2 ++
 3 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index c6b6901bbda3..2a9e40131735 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -728,6 +728,9 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)  {
 	if (!kfd->init_complete)
 		return 0;
+
+	kfd->dqm->ops.pre_reset(kfd->dqm);
+
 	kgd2kfd_suspend(kfd);
 
 	kfd_signal_reset_event(kfd);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 558c0ad81848..a7e9ec1b3ce3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -952,6 +952,13 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
 	return 0;
 }
 
+static void pre_reset(struct device_queue_manager *dqm) {
+	dqm_lock(dqm);
+	dqm->is_resetting = true;
+	dqm_unlock(dqm);
+}
+
 static int allocate_sdma_queue(struct device_queue_manager *dqm,
 				struct queue *q)
 {
@@ -1099,6 +1106,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
 	dqm_lock(dqm);
 	/* clear hang status when driver try to start the hw scheduler */
 	dqm->is_hws_hang = false;
+	dqm->is_resetting = false;
 	dqm->sched_running = true;
 	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
 	dqm_unlock(dqm);
@@ -1351,8 +1359,17 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 	/* should be timed out */
 	retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
 				queue_preemption_timeout_ms);
-	if (retval)
+	if (retval) {
+		pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
+		dqm->is_hws_hang = true;
+		/* It's possible we're detecting a HWS hang in the
+		 * middle of a GPU reset. No need to schedule another
+		 * reset in this case.
+		 */
+		if (!dqm->is_resetting)
+			schedule_work(&dqm->hw_exception_work);
 		return retval;
+	}
 
 	pm_release_ib(&dqm->packets);
 	dqm->active_runlist = false;
@@ -1370,12 +1387,8 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
 	if (dqm->is_hws_hang)
 		return -EIO;
 	retval = unmap_queues_cpsch(dqm, filter, filter_param);
-	if (retval) {
-		pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
-		dqm->is_hws_hang = true;
-		schedule_work(&dqm->hw_exception_work);
+	if (retval)
 		return retval;
-	}
 
 	return map_queues_cpsch(dqm);
 }
@@ -1769,6 +1782,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		dqm->ops.initialize = initialize_cpsch;
 		dqm->ops.start = start_cpsch;
 		dqm->ops.stop = stop_cpsch;
+		dqm->ops.pre_reset = pre_reset;
 		dqm->ops.destroy_queue = destroy_queue_cpsch;
 		dqm->ops.update_queue = update_queue;
 		dqm->ops.register_process = register_process; @@ -1787,6 +1801,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		/* initialize dqm for no cp scheduling */
 		dqm->ops.start = start_nocpsch;
 		dqm->ops.stop = stop_nocpsch;
+		dqm->ops.pre_reset = pre_reset;
 		dqm->ops.create_queue = create_queue_nocpsch;
 		dqm->ops.destroy_queue = destroy_queue_nocpsch;
 		dqm->ops.update_queue = update_queue; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 8991120c4fa2..871d3b628d2d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -104,6 +104,7 @@ struct device_queue_manager_ops {
 	int	(*initialize)(struct device_queue_manager *dqm);
 	int	(*start)(struct device_queue_manager *dqm);
 	int	(*stop)(struct device_queue_manager *dqm);
+	void	(*pre_reset)(struct device_queue_manager *dqm);
 	void	(*uninitialize)(struct device_queue_manager *dqm);
 	int	(*create_kernel_queue)(struct device_queue_manager *dqm,
 					struct kernel_queue *kq,
@@ -198,6 +199,7 @@ struct device_queue_manager {
 
 	/* hw exception  */
 	bool			is_hws_hang;
+	bool			is_resetting;
 	struct work_struct	hw_exception_work;
 	struct kfd_mem_obj	hiq_sdma_mqd;
 	bool			sched_running;
--
2.24.1

_______________________________________________
amd-gfx mailing list
amd-gfx at lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Coak.zeng%40amd.com%7Ca59ace5396cb46ed384708d78526df99%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124274434007022&sdata=g%2B57MBWpTbFzbchp6%2Bi2dfmUzYXlsf77InUj3R1XaaY%3D&reserved=0