[PATCH 2/2] drm/xe: Add lock around devcoredump capture status

Wed Nov 20 23:22:57 UTC 2024

From: John Harrison <John.C.Harrison at Intel.com>

There are now multiple places that can trigger a coredump. Some of
which can happen in parallel. There is already a check against
capturing multiple dumps sequentially, but without locking it doesn't
guarantee to work against concurrent dumps. And if two dumps do happen
in parallel, they can end up doing Bad Things such as one call stack
freeing the data the other call stack is still processing. Which leads
to a crashed kernel. So add the lock.

Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
---
 drivers/gpu/drm/xe/xe_devcoredump.c       | 18 ++++++++++++++----
 drivers/gpu/drm/xe/xe_devcoredump_types.h |  2 ++
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
index b5aebe8fb06d..05f3c9185ef3 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.c
+++ b/drivers/gpu/drm/xe/xe_devcoredump.c
@@ -235,9 +235,13 @@ static void xe_devcoredump_free(void *data)
 
 	/* To prevent stale data on next snapshot, clear everything */
 	memset(&coredump->snapshot, 0, sizeof(coredump->snapshot));
-	coredump->captured = false;
+
 	drm_info(&coredump_to_xe(coredump)->drm,
 		 "Xe device coredump has been deleted.\n");
+
+	spin_lock_irq(&coredump->lock);
+	coredump->captured = false;
+	spin_unlock_irq(&coredump->lock);
 }
 
 static void devcoredump_snapshot(struct xe_devcoredump *coredump,
@@ -310,14 +314,18 @@ void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job, const cha
 	struct xe_device *xe = gt_to_xe(q->gt);
 	struct xe_devcoredump *coredump = &xe->devcoredump;
 	va_list varg;
+	bool captured;
 
-	if (coredump->captured) {
+	spin_lock_irq(&coredump->lock);
+	captured = coredump->captured;
+	coredump->captured = true;
+	spin_unlock_irq(&coredump->lock);
+
+	if (captured) {
 		drm_dbg(&xe->drm, "Multiple hangs are occurring, but only the first snapshot was taken\n");
 		return;
 	}
 
-	coredump->captured = true;
-
 	va_start(varg, fmt);
 	coredump->snapshot.reason = kvasprintf(GFP_KERNEL, fmt, varg);
 	va_end(varg);
@@ -342,6 +350,8 @@ static void xe_driver_devcoredump_fini(void *arg)
 
 int xe_devcoredump_init(struct xe_device *xe)
 {
+	spin_lock_init(&xe->devcoredump.lock);
+
 	return devm_add_action_or_reset(xe->drm.dev, xe_driver_devcoredump_fini, &xe->drm);
 }
 
diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
index e6234e887102..6b776abd8bf4 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
+++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
@@ -80,6 +80,8 @@ struct xe_devcoredump_snapshot {
  * for reading the information.
  */
 struct xe_devcoredump {
+	/** @lock: protects access to 'captured' to prevent concurrent captures */
+	spinlock_t lock;
 	/** @captured: The snapshot of the first hang has already been taken. */
 	bool captured;
 	/** @snapshot: Snapshot is captured at time of the first crash */
-- 
2.47.0