[PATCH 2/2] drm/amdkfd: support the debugger during per-queue reset
Jonathan Kim
Jonathan.Kim at amd.com
Thu Jul 18 17:56:45 UTC 2024
In order to allow ROCm GDB to handle reset queues, raise an
EC_QUEUE_RESET exception so that the debugger can subscribe to and
query this exception.

Reset queues should still be considered suspendable, and are reported
with the status flag KFD_DBG_QUEUE_RESET_MASK. However, they should
not be resumable, since user space can no longer access a queue once
it has been reset.
Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
---
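Notes, not part of the commit message:

As a rough illustration of the intended uapi flow, the untested sketch
below shows how a debugger such as ROCm GDB could subscribe to and then
query the new EC_QUEUE_RESET exception. It assumes a debugger that is
already attached via KFD_IOC_DBG_TRAP_ENABLE on an open /dev/kfd handle
and a kernel with this patch applied; the helper names are made up and
error handling is omitted.

#include <string.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <linux/kfd_ioctl.h>

/* Illustrative sketch only, not part of the patch. */
static int subscribe_queue_reset(int kfd_fd, pid_t target_pid)
{
	struct kfd_ioctl_dbg_trap_args args;

	memset(&args, 0, sizeof(args));
	args.pid = target_pid;
	args.op = KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED;
	/* A real debugger would OR this into its full exception mask. */
	args.set_exceptions_enabled.exception_mask =
		KFD_EC_MASK(EC_QUEUE_RESET);

	return ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);
}

/* Query (and clear) a pending reset notification. Returns the id of a
 * reset queue, or -1 if no EC_QUEUE_RESET event is pending.
 */
static int query_queue_reset(int kfd_fd, pid_t target_pid)
{
	struct kfd_ioctl_dbg_trap_args args;

	memset(&args, 0, sizeof(args));
	args.pid = target_pid;
	args.op = KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT;
	args.query_debug_event.exception_mask = KFD_EC_MASK(EC_QUEUE_RESET);

	if (!ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args) &&
	    (args.query_debug_event.exception_mask &
	     KFD_EC_MASK(EC_QUEUE_RESET)))
		return args.query_debug_event.queue_id;

	return -1;
}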
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 30 +++++++++++++++----
 include/uapi/linux/kfd_ioctl.h                |  4 +++
 2 files changed, 29 insertions(+), 5 deletions(-)
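A second untested sketch, same caveats, for the suspend side: reset
queues still suspend successfully but come back flagged with
KFD_DBG_QUEUE_RESET_MASK, and since they are no longer resumable the
debugger has to drop them before a later KFD_IOC_DBG_TRAP_RESUME_QUEUES
call. The status-mask helper and function name below are local to the
example.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <linux/kfd_ioctl.h>

/* All three status bits returned in the queue id array (example-local). */
#define QUEUE_STATUS_MASK ((uint32_t)KFD_DBG_QUEUE_RESET_MASK | \
			   (uint32_t)KFD_DBG_QUEUE_ERROR_MASK | \
			   (uint32_t)KFD_DBG_QUEUE_INVALID_MASK)

/* Suspend queues, then strip out any that were reset: they may still be
 * inspected while suspended, but must not be passed to RESUME_QUEUES.
 * Returns the number of queue ids left in the array, or -1 on error.
 */
static int suspend_and_filter(int kfd_fd, pid_t target_pid,
			      uint32_t *queue_ids, uint32_t num_queues)
{
	struct kfd_ioctl_dbg_trap_args args;
	uint32_t i, kept = 0;

	memset(&args, 0, sizeof(args));
	args.pid = target_pid;
	args.op = KFD_IOC_DBG_TRAP_SUSPEND_QUEUES;
	args.suspend_queues.queue_array_ptr = (uintptr_t)queue_ids;
	args.suspend_queues.num_queues = num_queues;

	if (ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args) < 0)
		return -1;

	for (i = 0; i < num_queues; i++) {
		/* Real code would also act on the ERROR/INVALID bits. */
		if (queue_ids[i] & KFD_DBG_QUEUE_RESET_MASK) {
			fprintf(stderr, "queue 0x%x was reset, not resumable\n",
				(unsigned)(queue_ids[i] & ~QUEUE_STATUS_MASK));
			continue;
		}
		queue_ids[kept++] = queue_ids[i];
	}

	return (int)kept;
}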
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 037b75a64e66..8c41806df39e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -163,6 +163,10 @@ static void kfd_hws_hang(struct device_queue_manager *dqm)
 		list_for_each_entry(q, &qpd->queues_list, list) {
 			q->properties.is_reset = true;
 			q->process->has_reset_queue = true;
+
+			kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET),
+					 q->process, q->device, q->doorbell_id,
+					 false, NULL, 0);
 		}
 	}
 
@@ -987,7 +991,7 @@ static int suspend_single_queue(struct device_queue_manager *dqm,
 {
 	bool is_new;
 
-	if (q->properties.is_suspended)
+	if (q->properties.is_suspended || q->properties.is_reset)
 		return 0;
 
 	pr_debug("Suspending PASID %u queue [%i]\n",
@@ -1008,6 +1012,9 @@ static int suspend_single_queue(struct device_queue_manager *dqm,
 	if (dqm->dev->kfd->shared_resources.enable_mes) {
 		int r = remove_queue_mes(dqm, q, &pdd->qpd);
 
+		if (q->properties.is_reset)
+			return 0;
+
 		if (r)
 			return r;
 	}
@@ -1971,6 +1978,9 @@ static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q
 		q->properties.is_active = false;
 		decrement_queue_count(dqm, qpd, q);
 	}
+
+	kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET), q->process, q->device,
+			 q->doorbell_id, false, NULL, 0);
 }
 
 static int detect_queue_hang(struct device_queue_manager *dqm)
@@ -3036,7 +3046,8 @@ int resume_queues(struct kfd_process *p,
 				queue_ids[q_idx] &=
 					~KFD_DBG_QUEUE_INVALID_MASK;
 			} else {
-				queue_ids[q_idx] |=
+				queue_ids[q_idx] |= q->properties.is_reset ?
+					KFD_DBG_QUEUE_RESET_MASK :
 					KFD_DBG_QUEUE_ERROR_MASK;
 				break;
 			}
@@ -3071,7 +3082,7 @@ int resume_queues(struct kfd_process *p,
 							      queue_ids);
 
 			/* mask queue as error on resume fail */
-			if (q_idx != QUEUE_NOT_FOUND)
+			if (q_idx != QUEUE_NOT_FOUND && !q->properties.is_reset)
 				queue_ids[q_idx] |=
 					KFD_DBG_QUEUE_ERROR_MASK;
 		}
@@ -3118,6 +3129,7 @@ int suspend_queues(struct kfd_process *p,
 		struct qcm_process_device *qpd = &pdd->qpd;
 		struct queue *q;
 		int r, per_device_suspended = 0;
+		bool has_queue_reset_fail = false;
 
 		mutex_lock(&p->event_mutex);
 		dqm_lock(dqm);
@@ -3134,6 +3146,9 @@ int suspend_queues(struct kfd_process *p,
 			if (!err) {
 				queue_ids[q_idx] &= ~KFD_DBG_QUEUE_INVALID_MASK;
+				if (q->properties.is_reset)
+					queue_ids[q_idx] |= KFD_DBG_QUEUE_RESET_MASK;
+
 				if (exception_clear_mask && is_mes)
 					q->properties.exception_status &=
 						~exception_clear_mask;
@@ -3175,13 +3190,18 @@ int suspend_queues(struct kfd_process *p,
 				continue;
 
 			/* mask queue as error on suspend fail */
-			if (r)
+			if (r && !q->properties.is_reset) {
+				has_queue_reset_fail = true;
 				queue_ids[q_idx] |= KFD_DBG_QUEUE_ERROR_MASK;
-			else if (exception_clear_mask)
+			} else if (exception_clear_mask) {
 				q->properties.exception_status &=
 					~exception_clear_mask;
+			}
 		}
 
+		if (!has_queue_reset_fail)
+			total_suspended += per_device_suspended;
+
 		dqm_unlock(dqm);
 		mutex_unlock(&p->event_mutex);
 		amdgpu_device_flush_hdp(dqm->dev->adev, NULL);
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 285a36601dc9..4713f9a6796e 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -875,6 +875,7 @@ enum kfd_dbg_trap_exception_code {
 	EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID = 21,
 	EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID = 22,
 	EC_QUEUE_PACKET_VENDOR_UNSUPPORTED = 23,
+	EC_QUEUE_RESET = 29,
 	EC_QUEUE_PREEMPTION_ERROR = 30,
 	EC_QUEUE_NEW = 31,
 	/* per device */
@@ -907,6 +908,7 @@ enum kfd_dbg_trap_exception_code {
 	 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID) |	\
 	 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) |	\
 	 KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED) |	\
+	 KFD_EC_MASK(EC_QUEUE_RESET) |	\
 	 KFD_EC_MASK(EC_QUEUE_PREEMPTION_ERROR) |	\
 	 KFD_EC_MASK(EC_QUEUE_NEW))
 #define KFD_EC_MASK_DEVICE	(KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE) |	\
@@ -997,8 +999,10 @@ struct kfd_queue_snapshot_entry {
 };
 
 /* Queue status return for suspend/resume */
+#define KFD_DBG_QUEUE_RESET_BIT		29
 #define KFD_DBG_QUEUE_ERROR_BIT		30
 #define KFD_DBG_QUEUE_INVALID_BIT	31
+#define KFD_DBG_QUEUE_RESET_MASK	(1 << KFD_DBG_QUEUE_RESET_BIT)
 #define KFD_DBG_QUEUE_ERROR_MASK	(1 << KFD_DBG_QUEUE_ERROR_BIT)
 #define KFD_DBG_QUEUE_INVALID_MASK	(1 << KFD_DBG_QUEUE_INVALID_BIT)
--
2.34.1