[PATCH 1/2] drm/xe: Add a reason string to the devcoredump
Matthew Brost
matthew.brost at intel.com
Thu Nov 21 03:38:30 UTC 2024
On Wed, Nov 20, 2024 at 03:22:56PM -0800, John.C.Harrison at Intel.com wrote:
> From: John Harrison <John.C.Harrison at Intel.com>
>
> There are debug level prints giving more information about the cause
> of the hang immediately before core dumps are created. However, not
> everyone has debug level prints enabled or saves the dmesg log at all.
> So include that information in the dump file itself. Also, at least
> one of those prints included the pid as well as the process name. So
> include that in the capture too.
>
This looks helpful.
A couple of nits below.
> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
> ---
> drivers/gpu/drm/xe/xe_devcoredump.c | 20 +++++++++++++++++---
> drivers/gpu/drm/xe/xe_devcoredump.h | 5 +++--
> drivers/gpu/drm/xe/xe_devcoredump_types.h | 4 ++++
> drivers/gpu/drm/xe/xe_guc_submit.c | 14 ++++++++++----
> 4 files changed, 34 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> index 0e5edf14a241..b5aebe8fb06d 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -99,6 +99,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
> p = drm_coredump_printer(&iter);
>
> drm_puts(&p, "**** Xe Device Coredump ****\n");
> + drm_printf(&p, "Reason: %s\n", ss->reason);
> drm_puts(&p, "kernel: " UTS_RELEASE "\n");
> drm_puts(&p, "module: " KBUILD_MODNAME "\n");
>
> @@ -106,7 +107,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
> drm_printf(&p, "Snapshot time: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
> ts = ktime_to_timespec64(ss->boot_time);
> drm_printf(&p, "Uptime: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
> - drm_printf(&p, "Process: %s\n", ss->process_name);
> + drm_printf(&p, "Process: %s [%d]\n", ss->process_name, ss->pid);
> xe_device_snapshot_print(xe, &p);
>
> drm_printf(&p, "\n**** GT #%d ****\n", ss->gt->info.id);
> @@ -138,6 +139,9 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
> {
> int i;
>
> + kvfree(ss->reason);
Do we need the kv* variants (kvasprintf / kvfree) here? The reason
string seems like it would be small.
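FWIW, kvfree() is also safe on kmalloc'd memory, so this works either
way. If the plain k* helpers are preferred, the free side would just be
(untested sketch, same field names as the patch):

	kfree(ss->reason);
	ss->reason = NULL;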
> + ss->reason = NULL;
> +
> xe_guc_log_snapshot_free(ss->guc.log);
> ss->guc.log = NULL;
>
> @@ -253,8 +257,11 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> ss->snapshot_time = ktime_get_real();
> ss->boot_time = ktime_get_boottime();
>
> - if (q->vm && q->vm->xef)
> + if (q->vm && q->vm->xef) {
> process_name = q->vm->xef->process_name;
> + ss->pid = q->vm->xef->pid;
> + }
> +
> strscpy(ss->process_name, process_name);
>
> ss->gt = q->gt;
> @@ -297,10 +304,12 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> * gt_reset. It is skipped if we still have the core dump device available
> * with the information of the 'first' snapshot.
> */
Seems like the kernel doc above should be updated to describe the new
'fmt' / varargs parameters.
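Something like this, maybe (exact wording is just a suggestion):

	/**
	 * xe_devcoredump - Take the required snapshots and initialize coredump device.
	 * @q: The faulty xe_exec_queue, where the issue was detected.
	 * @job: The faulty xe_sched_job, where the issue was detected.
	 * @fmt: Printf format plus args to describe the reason for the coredump
	 *
	 * This function should be called at the crash time within the serialized
	 * gt_reset. It is skipped if we still have the core dump device available
	 * with the information of the 'first' snapshot.
	 */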
> -void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job)
> +__printf(3, 4)
> +void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job, const char *fmt, ...)
> {
> struct xe_device *xe = gt_to_xe(q->gt);
> struct xe_devcoredump *coredump = &xe->devcoredump;
> + va_list varg;
>
> if (coredump->captured) {
> drm_dbg(&xe->drm, "Multiple hangs are occurring, but only the first snapshot was taken\n");
> @@ -308,6 +317,11 @@ void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job)
> }
>
> coredump->captured = true;
> +
> + va_start(varg, fmt);
> + coredump->snapshot.reason = kvasprintf(GFP_KERNEL, fmt, varg);
See above. Off the top of my head I don't know if there is a non-kv*
version of this function, but if there is, maybe use that.
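(For reference, if I'm reading lib/kasprintf.c right, the 'v' in
kvasprintf() is the va_list variant, as in vsnprintf(), and it
allocates with kmalloc under the hood:

	char *kasprintf(gfp_t gfp, const char *fmt, ...);
	char *kvasprintf(gfp_t gfp, const char *fmt, va_list args);

so kvasprintf() is the one that can take the va_list here, and a plain
kfree() on the result would also work.)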
Nits aside, LGTM:
Reviewed-by: Matthew Brost <matthew.brost at intel.com>
> + va_end(varg);
> +
> devcoredump_snapshot(coredump, q, job);
>
> drm_info(&xe->drm, "Xe device coredump has been created\n");
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.h b/drivers/gpu/drm/xe/xe_devcoredump.h
> index c04a534e3384..6a17e6d60102 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.h
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.h
> @@ -14,11 +14,12 @@ struct xe_exec_queue;
> struct xe_sched_job;
>
> #ifdef CONFIG_DEV_COREDUMP
> -void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job);
> +void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job, const char *fmt, ...);
> int xe_devcoredump_init(struct xe_device *xe);
> #else
> static inline void xe_devcoredump(struct xe_exec_queue *q,
> - struct xe_sched_job *job)
> + struct xe_sched_job *job,
> + const char *fmt, ...)
> {
> }
>
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> index be4d59ea9ac8..e6234e887102 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> @@ -28,6 +28,10 @@ struct xe_devcoredump_snapshot {
> ktime_t boot_time;
> /** @process_name: Name of process that triggered this gpu hang */
> char process_name[TASK_COMM_LEN];
> + /** @pid: Process id of process that triggered this gpu hang */
> + pid_t pid;
> + /** @reason: The reason the coredump was triggered */
> + char *reason;
>
> /** @gt: Affected GT, used by forcewake for delayed capture */
> struct xe_gt *gt;
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index e132294fde51..7d0e7cb977ad 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -898,7 +898,8 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
> if (!ret) {
> xe_gt_warn(q->gt, "Schedule disable failed to respond, guc_id=%d\n",
> q->guc->id);
> - xe_devcoredump(q, NULL);
> + xe_devcoredump(q, NULL, "Schedule disable failed to respond, guc_id=%d\n",
> + q->guc->id);
> xe_sched_submission_start(sched);
> xe_gt_reset_async(q->gt);
> return;
> @@ -906,7 +907,7 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
> }
>
> if (!exec_queue_killed(q) && !xe_lrc_ring_is_idle(q->lrc[0]))
> - xe_devcoredump(q, NULL);
> + xe_devcoredump(q, NULL, "LR job cleanup, guc_id=%d", q->guc->id);
>
> xe_sched_submission_start(sched);
> }
> @@ -1132,7 +1133,9 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> xe_gt_warn(guc_to_gt(guc),
> "Schedule disable failed to respond, guc_id=%d",
> q->guc->id);
> - xe_devcoredump(q, job);
> + xe_devcoredump(q, job,
> + "Schedule disable failed to respond, guc_id=%d, ret=%d, guc_read=%d",
> + q->guc->id, ret, xe_guc_read_stopped(guc));
> set_exec_queue_extra_ref(q);
> xe_exec_queue_get(q); /* GT reset owns this */
> set_exec_queue_banned(q);
> @@ -1162,7 +1165,10 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> trace_xe_sched_job_timedout(job);
>
> if (!exec_queue_killed(q))
> - xe_devcoredump(q, job);
> + xe_devcoredump(q, job,
> + "Timedout job - seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx",
> + xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
> + q->guc->id, q->flags);
>
> /*
> * Kernel jobs should never fail, nor should VM jobs if they do
> --
> 2.47.0
>