[PATCH 2/2] drm/amdkfd: support the debugger during per-queue reset
Jonathan Kim
Jonathan.Kim at amd.com
Thu Jul 18 17:56:45 UTC 2024
In order to allow ROCm GDB to handle reset queues, raise an
EC_QUEUE_RESET exception so that the debugger can subscribe to and
query this exception.

Reset queues should still be considered suspendable, and are reported
with the status flag KFD_DBG_QUEUE_RESET_MASK. However, they should
not be resumable, since user space can no longer access a queue once
it has been reset.
Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
---
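Notes, not part of the commit message:

As a rough illustration of the intended uapi flow, the untested sketch
below shows how a debugger such as ROCm GDB could subscribe to and then
query the new EC_QUEUE_RESET exception. It assumes a debugger that is
already attached via KFD_IOC_DBG_TRAP_ENABLE on an open /dev/kfd handle
and a kernel with this patch applied; the helper names are made up and
error handling is omitted.

#include <string.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <linux/kfd_ioctl.h>

/* Illustrative sketch only, not part of the patch. */
static int subscribe_queue_reset(int kfd_fd, pid_t target_pid)
{
	struct kfd_ioctl_dbg_trap_args args;

	memset(&args, 0, sizeof(args));
	args.pid = target_pid;
	args.op = KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED;
	/* A real debugger would OR this into its full exception mask. */
	args.set_exceptions_enabled.exception_mask =
		KFD_EC_MASK(EC_QUEUE_RESET);

	return ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);
}

/* Query (and clear) a pending reset notification. Returns the id of a
 * reset queue, or -1 if no EC_QUEUE_RESET event is pending.
 */
static int query_queue_reset(int kfd_fd, pid_t target_pid)
{
	struct kfd_ioctl_dbg_trap_args args;

	memset(&args, 0, sizeof(args));
	args.pid = target_pid;
	args.op = KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT;
	args.query_debug_event.exception_mask = KFD_EC_MASK(EC_QUEUE_RESET);

	if (!ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args) &&
	    (args.query_debug_event.exception_mask &
	     KFD_EC_MASK(EC_QUEUE_RESET)))
		return args.query_debug_event.queue_id;

	return -1;
}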
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 30 +++++++++++++++----
 include/uapi/linux/kfd_ioctl.h                |  4 +++
 2 files changed, 29 insertions(+), 5 deletions(-)
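A second untested sketch, same caveats, for the suspend side: reset
queues still suspend successfully but come back flagged with
KFD_DBG_QUEUE_RESET_MASK, and since they are no longer resumable the
debugger has to drop them before a later KFD_IOC_DBG_TRAP_RESUME_QUEUES
call. The status-mask helper and function name below are local to the
example.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <linux/kfd_ioctl.h>

/* All three status bits returned in the queue id array (example-local). */
#define QUEUE_STATUS_MASK ((uint32_t)KFD_DBG_QUEUE_RESET_MASK | \
			   (uint32_t)KFD_DBG_QUEUE_ERROR_MASK | \
			   (uint32_t)KFD_DBG_QUEUE_INVALID_MASK)

/* Suspend queues, then strip out any that were reset: they may still be
 * inspected while suspended, but must not be passed to RESUME_QUEUES.
 * Returns the number of queue ids left in the array, or -1 on error.
 */
static int suspend_and_filter(int kfd_fd, pid_t target_pid,
			      uint32_t *queue_ids, uint32_t num_queues)
{
	struct kfd_ioctl_dbg_trap_args args;
	uint32_t i, kept = 0;

	memset(&args, 0, sizeof(args));
	args.pid = target_pid;
	args.op = KFD_IOC_DBG_TRAP_SUSPEND_QUEUES;
	args.suspend_queues.queue_array_ptr = (uintptr_t)queue_ids;
	args.suspend_queues.num_queues = num_queues;

	if (ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args) < 0)
		return -1;

	for (i = 0; i < num_queues; i++) {
		/* Real code would also act on the ERROR/INVALID bits. */
		if (queue_ids[i] & KFD_DBG_QUEUE_RESET_MASK) {
			fprintf(stderr, "queue 0x%x was reset, not resumable\n",
				(unsigned)(queue_ids[i] & ~QUEUE_STATUS_MASK));
			continue;
		}
		queue_ids[kept++] = queue_ids[i];
	}

	return (int)kept;
}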
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 037b75a64e66..8c41806df39e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -163,6 +163,10 @@ static void kfd_hws_hang(struct device_queue_manager *dqm)
 		list_for_each_entry(q, &qpd->queues_list, list) {
 			q->properties.is_reset = true;
 			q->process->has_reset_queue = true;
+
+			kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET),
+					 q->process, q->device, q->doorbell_id,
+					 false, NULL, 0);
 		}
 	}
 
@@ -987,7 +991,7 @@ static int suspend_single_queue(struct device_queue_manager *dqm,
 {
 	bool is_new;
 
-	if (q->properties.is_suspended)
+	if (q->properties.is_suspended || q->properties.is_reset)
 		return 0;
 
 	pr_debug("Suspending PASID %u queue [%i]\n",
@@ -1008,6 +1012,9 @@ static int suspend_single_queue(struct device_queue_manager *dqm,
 	if (dqm->dev->kfd->shared_resources.enable_mes) {
 		int r = remove_queue_mes(dqm, q, &pdd->qpd);
 
+		if (q->properties.is_reset)
+			return 0;
+
 		if (r)
 			return r;
 	}
@@ -1971,6 +1978,9 @@ static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q
 		q->properties.is_active = false;
 		decrement_queue_count(dqm, qpd, q);
 	}
+
+	kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET), q->process, q->device,
+			 q->doorbell_id, false, NULL, 0);
 }
 
 static int detect_queue_hang(struct device_queue_manager *dqm)
@@ -3036,7 +3046,8 @@ int resume_queues(struct kfd_process *p,
 				queue_ids[q_idx] &=
 					~KFD_DBG_QUEUE_INVALID_MASK;
 			} else {
-				queue_ids[q_idx] |=
+				queue_ids[q_idx] |= q->properties.is_reset ?
+					KFD_DBG_QUEUE_RESET_MASK :
 					KFD_DBG_QUEUE_ERROR_MASK;
 				break;
 			}
@@ -3071,7 +3082,7 @@ int resume_queues(struct kfd_process *p,
 							      queue_ids);
 
 			/* mask queue as error on resume fail */
-			if (q_idx != QUEUE_NOT_FOUND)
+			if (q_idx != QUEUE_NOT_FOUND && !q->properties.is_reset)
 				queue_ids[q_idx] |=
 					KFD_DBG_QUEUE_ERROR_MASK;
 		}
@@ -3118,6 +3129,7 @@ int suspend_queues(struct kfd_process *p,
 		struct qcm_process_device *qpd = &pdd->qpd;
 		struct queue *q;
 		int r, per_device_suspended = 0;
+		bool has_queue_reset_fail = false;
 
 		mutex_lock(&p->event_mutex);
 		dqm_lock(dqm);
@@ -3134,6 +3146,9 @@ int suspend_queues(struct kfd_process *p,
 			if (!err) {
 				queue_ids[q_idx] &= ~KFD_DBG_QUEUE_INVALID_MASK;
+				if (q->properties.is_reset)
+					queue_ids[q_idx] |= KFD_DBG_QUEUE_RESET_MASK;
+
 				if (exception_clear_mask && is_mes)
 					q->properties.exception_status &=
 						~exception_clear_mask;
@@ -3175,13 +3190,18 @@ int suspend_queues(struct kfd_process *p,
 				continue;
 
 			/* mask queue as error on suspend fail */
-			if (r)
+			if (r && !q->properties.is_reset) {
+				has_queue_reset_fail = true;
 				queue_ids[q_idx] |= KFD_DBG_QUEUE_ERROR_MASK;
-			else if (exception_clear_mask)
+			} else if (exception_clear_mask) {
 				q->properties.exception_status &=
 					~exception_clear_mask;
+			}
 		}
 
+		if (!has_queue_reset_fail)
+			total_suspended += per_device_suspended;
+
 		dqm_unlock(dqm);
 		mutex_unlock(&p->event_mutex);
 		amdgpu_device_flush_hdp(dqm->dev->adev, NULL);
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 285a36601dc9..4713f9a6796e 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -875,6 +875,7 @@ enum kfd_dbg_trap_exception_code {
 	EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID = 21,
 	EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID = 22,
 	EC_QUEUE_PACKET_VENDOR_UNSUPPORTED = 23,
+	EC_QUEUE_RESET = 29,
 	EC_QUEUE_PREEMPTION_ERROR = 30,
 	EC_QUEUE_NEW = 31,
 	/* per device */
@@ -907,6 +908,7 @@ enum kfd_dbg_trap_exception_code {
 	 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID) |	\
 	 KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) |	\
 	 KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED) |	\
+	 KFD_EC_MASK(EC_QUEUE_RESET) |	\
 	 KFD_EC_MASK(EC_QUEUE_PREEMPTION_ERROR) |	\
 	 KFD_EC_MASK(EC_QUEUE_NEW))
 #define KFD_EC_MASK_DEVICE	(KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE) |	\
@@ -997,8 +999,10 @@ struct kfd_queue_snapshot_entry {
 };
 
 /* Queue status return for suspend/resume */
+#define KFD_DBG_QUEUE_RESET_BIT		29
 #define KFD_DBG_QUEUE_ERROR_BIT		30
 #define KFD_DBG_QUEUE_INVALID_BIT	31
+#define KFD_DBG_QUEUE_RESET_MASK	(1 << KFD_DBG_QUEUE_RESET_BIT)
 #define KFD_DBG_QUEUE_ERROR_MASK	(1 << KFD_DBG_QUEUE_ERROR_BIT)
 #define KFD_DBG_QUEUE_INVALID_MASK	(1 << KFD_DBG_QUEUE_INVALID_BIT)
--
2.34.1