[PATCH 2/2] drm/xe: Make a standalone snapshot that survives unbind

Rodrigo Vivi rodrigo.vivi at intel.com
Tue Jan 30 22:37:09 UTC 2024


Instead of having the coredump embedded in the xe device,
let's allocate it dynamically and free it only when
requested by devcoredump.

This allows the 'data' file to be read even after the xe_device
is already gone, for instance after an unbind.

Of course, the module cannot be unloaded while the coredump exists,
but this is guaranteed by devcoredump holding a reference to the xe
module. Only after the devcoredump device is deleted is this reference
put, and only then can the xe module be removed.

Our scripts and IGT helpers need to be adjusted to write
something to the data file before rmmod so that the driver can
be properly removed or reloaded.

Cc: José Roberto de Souza <jose.souza at intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi at intel.com>
---
 drivers/gpu/drm/xe/xe_devcoredump.c  | 32 ++++++++++------------------
 drivers/gpu/drm/xe/xe_device_types.h |  2 +-
 2 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
index 30e7edbb8b6f..64886773b70b 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.c
+++ b/drivers/gpu/drm/xe/xe_devcoredump.c
@@ -49,11 +49,6 @@
 
 #ifdef CONFIG_DEV_COREDUMP
 
-static struct xe_device *coredump_to_xe(const struct xe_devcoredump *coredump)
-{
-	return container_of(coredump, struct xe_device, devcoredump);
-}
-
 static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q)
 {
 	return &q->gt->uc.guc;
@@ -69,10 +64,6 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
 	struct timespec64 ts;
 	int i;
 
-	/* Our device is gone already... */
-	if (!data || !coredump_to_xe(coredump))
-		return -ENODEV;
-
 	iter.data = buffer;
 	iter.offset = 0;
 	iter.start = offset;
@@ -109,10 +100,6 @@ static void xe_devcoredump_free(void *data)
 	struct xe_devcoredump *coredump = data;
 	int i;
 
-	/* Our device is gone. Nothing to do... */
-	if (!data || !coredump_to_xe(coredump))
-		return;
-
 	xe_device_snapshot_free(coredump->snapshot.xe);
 	xe_guc_ct_snapshot_free(coredump->snapshot.ct);
 	xe_guc_exec_queue_snapshot_free(coredump->snapshot.ge);
@@ -121,8 +108,8 @@ static void xe_devcoredump_free(void *data)
 			xe_hw_engine_snapshot_free(coredump->snapshot.hwe[i]);
 
 	coredump->captured = false;
-	drm_info(&coredump_to_xe(coredump)->drm,
-		 "Xe device coredump has been deleted.\n");
+	coredump = NULL;
+	kfree(coredump);
 }
 
 static void devcoredump_snapshot(struct xe_devcoredump *coredump,
@@ -181,21 +168,24 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
 void xe_devcoredump(struct xe_sched_job *job)
 {
 	struct xe_device *xe = gt_to_xe(job->q->gt);
-	struct xe_devcoredump *coredump = &xe->devcoredump;
 
-	if (coredump->captured) {
-		drm_dbg(&xe->drm, "Multiple hangs are occurring, but only the first snapshot was taken\n");
+	if (xe->devcoredump && xe->devcoredump->captured) {
+		drm_info(&xe->drm, "Multiple hangs are occurring, but only the first snapshot was taken\n");
 		return;
 	}
 
-	coredump->captured = true;
-	devcoredump_snapshot(coredump, job);
+	xe->devcoredump = kzalloc(sizeof(*xe->devcoredump), GFP_KERNEL);
+	if (!xe->devcoredump)
+		drm_err(&xe->drm, "devcoredump failed\n");
+
+	xe->devcoredump->captured = true;
+	devcoredump_snapshot(xe->devcoredump, job);
 
 	drm_info(&xe->drm, "Xe device coredump has been created\n");
 	drm_info(&xe->drm, "Check your /sys/class/drm/card%d/device/devcoredump/data\n",
 		 xe->drm.primary->index);
 
-	dev_coredumpm(xe->drm.dev, THIS_MODULE, coredump, 0, GFP_KERNEL,
+	dev_coredumpm(xe->drm.dev, THIS_MODULE, xe->devcoredump, 0, GFP_KERNEL,
 		      xe_devcoredump_read, xe_devcoredump_free);
 }
 #endif
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index 50dac1a5b053..4372f5cc98b6 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -214,7 +214,7 @@ struct xe_device {
 	struct drm_device drm;
 
 	/** @devcoredump: device coredump */
-	struct xe_devcoredump devcoredump;
+	struct xe_devcoredump *devcoredump;
 
 	/** @info: device info */
 	struct intel_device_info {
-- 
2.43.0



More information about the Intel-xe mailing list