[PATCH 2/2] drm/amdkfd: support the debugger during per-queue reset

Jonathan Kim Jonathan.Kim at amd.com
Fri Jul 26 15:30:49 UTC 2024


In order to allow ROCm GDB to handle reset queues, raise an
EC_QUEUE_RESET exception so that the debugger can subscribe to and
query this exception.

Reset queues should still be considered suspendable, and are reported
with the KFD_DBG_QUEUE_RESET_MASK status flag.
However, they should not be resumable, since user space will no longer
be able to access reset queues.

v2: move the per-queue reset flag to this patch;
    rebase on top of the patch 1 changes

Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 31 ++++++++++++++++---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  1 +
 include/uapi/linux/kfd_ioctl.h                |  4 +++
 3 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index e335703eff84..cb7b5bbf5c40 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -164,6 +164,10 @@ static void kfd_hws_hang(struct device_queue_manager *dqm)
 			struct kfd_process_device *pdd = qpd_to_pdd(qpd);
 
 			pdd->has_reset_queue = true;
+			q->properties.is_reset = true;
+			kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET),
+					 q->process, q->device, q->doorbell_id,
+					 false, NULL, 0);
 		}
 	}
 
@@ -986,7 +990,7 @@ static int suspend_single_queue(struct device_queue_manager *dqm,
 {
 	bool is_new;
 
-	if (q->properties.is_suspended)
+	if (q->properties.is_suspended || q->properties.is_reset)
 		return 0;
 
 	pr_debug("Suspending PASID %u queue [%i]\n",
@@ -1007,6 +1011,9 @@ static int suspend_single_queue(struct device_queue_manager *dqm,
 		if (dqm->dev->kfd->shared_resources.enable_mes) {
 			int r = remove_queue_mes(dqm, q, &pdd->qpd);
 
+			if (q->properties.is_reset)
+				return 0;
+
 			if (r)
 				return r;
 		}
@@ -1967,10 +1974,14 @@ static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q
 	       q->properties.queue_id, q->process->pasid);
 
 	pdd->has_reset_queue = true;
+	q->properties.is_reset = true;
 	if (q->properties.is_active) {
 		q->properties.is_active = false;
 		decrement_queue_count(dqm, qpd, q);
 	}
+
+	kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET), q->process, q->device,
+			 q->doorbell_id, false, NULL, 0);
 }
 
 static int detect_queue_hang(struct device_queue_manager *dqm)
@@ -3037,7 +3048,8 @@ int resume_queues(struct kfd_process *p,
 						queue_ids[q_idx] &=
 							~KFD_DBG_QUEUE_INVALID_MASK;
 					} else {
-						queue_ids[q_idx] |=
+						queue_ids[q_idx] |= q->properties.is_reset ?
+							KFD_DBG_QUEUE_RESET_MASK :
 							KFD_DBG_QUEUE_ERROR_MASK;
 						break;
 					}
@@ -3072,7 +3084,7 @@ int resume_queues(struct kfd_process *p,
 							queue_ids);
 
 					/* mask queue as error on resume fail */
-					if (q_idx != QUEUE_NOT_FOUND)
+					if (q_idx != QUEUE_NOT_FOUND && !q->properties.is_reset)
 						queue_ids[q_idx] |=
 							KFD_DBG_QUEUE_ERROR_MASK;
 				}
@@ -3119,6 +3131,7 @@ int suspend_queues(struct kfd_process *p,
 		struct qcm_process_device *qpd = &pdd->qpd;
 		struct queue *q;
 		int r, per_device_suspended = 0;
+		bool has_queue_reset_fail = false;
 
 		mutex_lock(&p->event_mutex);
 		dqm_lock(dqm);
@@ -3135,6 +3148,9 @@ int suspend_queues(struct kfd_process *p,
 
 				if (!err) {
 					queue_ids[q_idx] &= ~KFD_DBG_QUEUE_INVALID_MASK;
+					if (q->properties.is_reset)
+						queue_ids[q_idx] |= KFD_DBG_QUEUE_RESET_MASK;
+
 					if (exception_clear_mask && is_mes)
 						q->properties.exception_status &=
 							~exception_clear_mask;
@@ -3176,13 +3192,18 @@ int suspend_queues(struct kfd_process *p,
 				continue;
 
 			/* mask queue as error on suspend fail */
-			if (r)
+			if (r && !q->properties.is_reset) {
+				has_queue_reset_fail = true;
 				queue_ids[q_idx] |= KFD_DBG_QUEUE_ERROR_MASK;
-			else if (exception_clear_mask)
+			} else if (exception_clear_mask) {
 				q->properties.exception_status &=
 							~exception_clear_mask;
+			}
 		}
 
+		if (!has_queue_reset_fail)
+			total_suspended += per_device_suspended;
+
 		dqm_unlock(dqm);
 		mutex_unlock(&p->event_mutex);
 		amdgpu_device_flush_hdp(dqm->dev->adev, NULL);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 892a85408c09..192e3102c152 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -504,6 +504,7 @@ struct queue_properties {
 	bool is_being_destroyed;
 	bool is_active;
 	bool is_gws;
+	bool is_reset;
 	uint32_t pm4_target_xcc;
 	bool is_dbg_wa;
 	bool is_user_cu_masked;
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 285a36601dc9..4713f9a6796e 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -875,6 +875,7 @@ enum kfd_dbg_trap_exception_code {
 	EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID = 21,
 	EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID = 22,
 	EC_QUEUE_PACKET_VENDOR_UNSUPPORTED = 23,
+	EC_QUEUE_RESET = 29,
 	EC_QUEUE_PREEMPTION_ERROR = 30,
 	EC_QUEUE_NEW = 31,
 	/* per device */
@@ -907,6 +908,7 @@ enum kfd_dbg_trap_exception_code {
 				 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID) |	\
 				 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) |	\
 				 KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED)	|	\
+				 KFD_EC_MASK(EC_QUEUE_RESET)	|	\
 				 KFD_EC_MASK(EC_QUEUE_PREEMPTION_ERROR)	|	\
 				 KFD_EC_MASK(EC_QUEUE_NEW))
 #define KFD_EC_MASK_DEVICE	(KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE) |		\
@@ -997,8 +999,10 @@ struct kfd_queue_snapshot_entry {
 };
 
 /* Queue status return for suspend/resume */
+#define KFD_DBG_QUEUE_RESET_BIT		29
 #define KFD_DBG_QUEUE_ERROR_BIT		30
 #define KFD_DBG_QUEUE_INVALID_BIT	31
+#define KFD_DBG_QUEUE_RESET_MASK	(1 << KFD_DBG_QUEUE_RESET_BIT)
 #define KFD_DBG_QUEUE_ERROR_MASK	(1 << KFD_DBG_QUEUE_ERROR_BIT)
 #define KFD_DBG_QUEUE_INVALID_MASK	(1 << KFD_DBG_QUEUE_INVALID_BIT)
 
-- 
2.34.1



More information about the amd-gfx mailing list