[RFC 1/5] drm/xe/devcoredump: Support coredumps without jobs
John.C.Harrison at Intel.com
John.C.Harrison at Intel.com
Sat Nov 9 01:59:30 UTC 2024
From: John Harrison <John.C.Harrison at Intel.com>
A devcoredump is an extremely useful debug feature. So allow it to be
used for issues where there is no DRM scheduler job available.
Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
---
drivers/gpu/drm/xe/xe_devcoredump.c | 155 +++++++++++++++++++++++-----
drivers/gpu/drm/xe/xe_devcoredump.h | 5 +-
drivers/gpu/drm/xe/xe_guc_submit.c | 2 +-
3 files changed, 137 insertions(+), 25 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
index d3570d3d573c..f0fcc898b3ae 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.c
+++ b/drivers/gpu/drm/xe/xe_devcoredump.c
@@ -237,8 +237,8 @@ static void xe_devcoredump_free(void *data)
"Xe device coredump has been deleted.\n");
}
-static void devcoredump_snapshot(struct xe_devcoredump *coredump,
- struct xe_sched_job *job)
+static void devcoredump_snapshot_job(struct xe_devcoredump *coredump,
+ struct xe_sched_job *job)
{
struct xe_devcoredump_snapshot *ss = &coredump->snapshot;
struct xe_exec_queue *q = job->q;
@@ -246,23 +246,16 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
u32 adj_logical_mask = q->logical_mask;
u32 width_mask = (0x1 << q->width) - 1;
const char *process_name = "no process";
-
unsigned int fw_ref;
- bool cookie;
int i;
- ss->snapshot_time = ktime_get_real();
- ss->boot_time = ktime_get_boottime();
-
if (q->vm && q->vm->xef)
process_name = q->vm->xef->process_name;
strscpy(ss->process_name, process_name);
ss->gt = q->gt;
coredump->job = job;
- INIT_WORK(&ss->work, xe_devcoredump_deferred_snap_work);
- cookie = dma_fence_begin_signalling();
for (i = 0; q->width > 1 && i < XE_HW_ENGINE_MAX_INSTANCE;) {
if (adj_logical_mask & BIT(i)) {
adj_logical_mask |= width_mask << i;
@@ -283,32 +276,109 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
xe_engine_snapshot_capture_for_job(job);
+ xe_force_wake_put(gt_to_fw(q->gt), fw_ref);
+}
+
+static void devcoredump_snapshot_gt(struct xe_devcoredump *coredump, struct xe_gt *gt)
+{
+ struct xe_devcoredump_snapshot *ss = &coredump->snapshot;
+ struct xe_guc *guc = &gt->uc.guc;
+ unsigned int fw_ref;
+
+ strscpy(ss->process_name, "no process");
+
+ ss->gt = gt;
+
+ /* keep going if fw fails as we still want to save the memory and SW data */
+ fw_ref = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL);
+
+ ss->guc.log = xe_guc_log_snapshot_capture(&guc->log, true);
+ ss->guc.ct = xe_guc_ct_snapshot_capture(&guc->ct);
+
+ xe_force_wake_put(gt_to_fw(gt), fw_ref);
+}
+
+static void devcoredump_snapshot_xe(struct xe_devcoredump *coredump, struct xe_device *xe)
+{
+ struct xe_devcoredump_snapshot *ss = &coredump->snapshot;
+
+ strscpy(ss->process_name, "no process");
+
+ /* Not implemented yet - need to keep a list of GTs in the snapshot */
+#if 0
+ struct xe_gt *gt;
+ int i;
+
+ for_each_gt(gt, xe, i) {
+ struct xe_gt_snapshot *ss_gt;
+ struct xe_guc *guc = &gt->uc.guc;
+ unsigned int fw_ref;
+
+ ss_gt = kzalloc(sizeof(*ss_gt), GFP_ATOMIC);
+ if (!ss_gt)
+ continue;
+
+ ss_gt->gt = gt;
+
+ /* keep going if fw fails as we still want to save the memory and SW data */
+ fw_ref = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL);
+
+ ss_gt->guc.log = xe_guc_log_snapshot_capture(&guc->log, true);
+ ss_gt->guc.ct = xe_guc_ct_snapshot_capture(&guc->ct);
+
+ xe_force_wake_put(gt_to_fw(gt), fw_ref);
+
+ list_add(&ss_gt->link, &ss->gt_list);
+ }
+#endif
+}
+
+static void devcoredump_snapshot_for_thing(struct xe_devcoredump *coredump,
+ struct xe_gt *gt, struct xe_sched_job *job)
+{
+ struct xe_devcoredump_snapshot *ss;
+ struct xe_device *xe;
+ bool cookie;
+
+ xe = coredump_to_xe(coredump);
+
+ xe_assert(xe, !coredump->captured);
+ coredump->captured = true;
+
+ ss = &coredump->snapshot;
+ ss->snapshot_time = ktime_get_real();
+ ss->boot_time = ktime_get_boottime();
+
+ INIT_WORK(&ss->work, xe_devcoredump_deferred_snap_work);
+
+ cookie = dma_fence_begin_signalling();
+
+ if (job)
+ devcoredump_snapshot_job(coredump, job);
+ else if (gt)
+ devcoredump_snapshot_gt(coredump, gt);
+ else
+ devcoredump_snapshot_xe(coredump, xe);
+
queue_work(system_unbound_wq, &ss->work);
- xe_force_wake_put(gt_to_fw(q->gt), fw_ref);
dma_fence_end_signalling(cookie);
}
-/**
- * xe_devcoredump - Take the required snapshots and initialize coredump device.
- * @job: The faulty xe_sched_job, where the issue was detected.
- *
- * This function should be called at the crash time within the serialized
- * gt_reset. It is skipped if we still have the core dump device available
- * with the information of the 'first' snapshot.
- */
-void xe_devcoredump(struct xe_sched_job *job)
+static void devcoredump_for_thing(struct xe_device *_xe, struct xe_gt *gt, struct xe_sched_job *job)
{
- struct xe_device *xe = gt_to_xe(job->q->gt);
- struct xe_devcoredump *coredump = &xe->devcoredump;
+ struct xe_devcoredump *coredump;
+ struct xe_device *xe;
+
+ xe = _xe ? _xe : gt_to_xe(gt ? gt : job->q->gt);
+ coredump = &xe->devcoredump;
if (coredump->captured) {
drm_dbg(&xe->drm, "Multiple hangs are occurring, but only the first snapshot was taken\n");
return;
}
- coredump->captured = true;
- devcoredump_snapshot(coredump, job);
+ devcoredump_snapshot_for_thing(coredump, gt, job);
drm_info(&xe->drm, "Xe device coredump has been created\n");
drm_info(&xe->drm, "Check your /sys/class/drm/card%d/device/devcoredump/data\n",
@@ -319,6 +389,45 @@ void xe_devcoredump(struct xe_sched_job *job)
XE_COREDUMP_TIMEOUT_JIFFIES);
}
+/**
+ * xe_devcoredump_for_job - Take the required snapshots and initialize coredump device.
+ * @job: The faulty xe_sched_job, where the issue was detected.
+ *
+ * This function should be called at the crash time within the serialized gt_reset.
+ * The capture is skipped if a prior device core dump snapshot is still available with
+ * information about the 'first' error.
+ */
+void xe_devcoredump_for_job(struct xe_sched_job *job)
+{
+ devcoredump_for_thing(NULL, NULL, job);
+}
+
+/**
+ * xe_devcoredump_for_gt - Take the required snapshots and initialize coredump device.
+ * @gt: The faulty GT.
+ *
+ * This function should be called when an error occurs but without access to a
+ * scheduler job. The capture is skipped if a prior device core dump snapshot is
+ * still available with information about the 'first' error.
+ */
+void xe_devcoredump_for_gt(struct xe_gt *gt)
+{
+ devcoredump_for_thing(NULL, gt, NULL);
+}
+
+/**
+ * xe_devcoredump_for_xe - Take the required snapshots and initialize coredump device.
+ * @xe: The faulty device.
+ *
+ * This function should be called when an error occurs but without access to either a
+ * scheduler job or even a GT. The capture is skipped if a prior device core dump
+ * snapshot is still available with information about the 'first' error.
+ */
+void xe_devcoredump_for_xe(struct xe_device *xe)
+{
+ devcoredump_for_thing(xe, NULL, NULL);
+}
+
static void xe_driver_devcoredump_fini(void *arg)
{
struct drm_device *drm = arg;
diff --git a/drivers/gpu/drm/xe/xe_devcoredump.h b/drivers/gpu/drm/xe/xe_devcoredump.h
index a4eebc285fc8..a83c93d0e82c 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.h
+++ b/drivers/gpu/drm/xe/xe_devcoredump.h
@@ -10,10 +10,13 @@
struct drm_printer;
struct xe_device;
+struct xe_gt;
struct xe_sched_job;
#ifdef CONFIG_DEV_COREDUMP
-void xe_devcoredump(struct xe_sched_job *job);
+void xe_devcoredump_for_job(struct xe_sched_job *job);
+void xe_devcoredump_for_gt(struct xe_gt *gt);
+void xe_devcoredump_for_xe(struct xe_device *xe);
int xe_devcoredump_init(struct xe_device *xe);
#else
static inline void xe_devcoredump(struct xe_sched_job *job)
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 5bd40e94eeba..293a1cbc2486 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -1159,7 +1159,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
trace_xe_sched_job_timedout(job);
if (!exec_queue_killed(q))
- xe_devcoredump(job);
+ xe_devcoredump_for_job(job);
/*
* Kernel jobs should never fail, nor should VM jobs if they do
--
2.47.0
More information about the Intel-xe
mailing list