[PATCH] drm/amdkfd: fix missed queue reset on queue destroy

Jonathan Kim Jonathan.Kim at amd.com
Wed Aug 21 21:17:37 UTC 2024


If a queue is being destroyed but causes a HWS hang on removal, the KFD
may issue an unnecessary gpu reset if the destroyed queue can be fixed
by a queue reset.

This is because the queue has been removed from the KFD's queue list
prior to the preemption action on destroy so the reset call will fail to
match the HQD PQ reset information against the KFD's queue record to do
the actual reset.

Since a queue destroy request is under the same device lock as any other
preemption request (which subsumes queue reset calls), transiently
store the destroyed queue's reference so that a potential subsequent queue
reset call can check against this queue as well.

Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 10 +++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h |  1 +
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 577d121cc6d1..09e39a72ca31 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1842,6 +1842,8 @@ static int start_cpsch(struct device_queue_manager *dqm)
 		goto fail_detect_hang_buffer;
 	}
 
+	dqm->cur_destroyed_queue = NULL;
+
 	dqm_unlock(dqm);
 
 	return 0;
@@ -2105,7 +2107,7 @@ static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q
 		q->properties.queue_id, q->process->pasid);
 
 	pdd->has_reset_queue = true;
-	if (q->properties.is_active) {
+	if (q->properties.is_active && dqm->cur_destroyed_queue != q) {
 		q->properties.is_active = false;
 		decrement_queue_count(dqm, qpd, q);
 	}
@@ -2160,6 +2162,10 @@ static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uin
 	struct qcm_process_device *qpd;
 	struct queue *q;
 
+	if (dqm->cur_destroyed_queue &&
+	    dqm->cur_destroyed_queue->properties.queue_address == queue_address)
+		return dqm->cur_destroyed_queue;
+
 	list_for_each_entry(cur, &dqm->queues, list) {
 		qpd = cur->qpd;
 		list_for_each_entry(q, &qpd->queues_list, list) {
@@ -2409,6 +2415,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 
 	list_del(&q->list);
 	qpd->queue_count--;
+	dqm->cur_destroyed_queue = q;
 	if (q->properties.is_active) {
 		decrement_queue_count(dqm, qpd, q);
 		if (!dqm->dev->kfd->shared_resources.enable_mes) {
@@ -2421,6 +2428,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 			retval = remove_queue_mes(dqm, q, qpd);
 		}
 	}
+	dqm->cur_destroyed_queue = NULL;
 
 	/*
 	 * Unconditionally decrement this counter, regardless of the queue's
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 08b40826ad1e..5425c1dd7924 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -285,6 +285,7 @@ struct device_queue_manager {
 	struct dqm_detect_hang_info *detect_hang_info;
 	size_t detect_hang_info_size;
 	int detect_hang_count;
+	struct queue *cur_destroyed_queue;
 };
 
 void device_queue_manager_init_cik(
-- 
2.34.1



More information about the amd-gfx mailing list