[PATCH 3/3] Revert "drm/xe: Add a reason string to the devcoredump"

Thomas Hellström thomas.hellstrom at linux.intel.com
Wed Nov 27 08:13:02 UTC 2024


On Wed, 2024-11-27 at 14:00 +0530, Himal Prasad Ghimiray wrote:
> This reverts commit 06e2cb496396613e583a72a85e0822fb43b2ead5. The
> commit was pushed unintentionally.
> 
> Cc: John Harrison <John.C.Harrison at Intel.com>
> Cc: Matthew Brost <matthew.brost at intel.com>
> Cc: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
Reviewed-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>

> ---
>  drivers/gpu/drm/xe/xe_devcoredump.c       | 21 +++------------------
>  drivers/gpu/drm/xe/xe_devcoredump.h       |  5 ++---
>  drivers/gpu/drm/xe/xe_devcoredump_types.h |  4 ----
>  drivers/gpu/drm/xe/xe_guc_submit.c        | 14 ++++----------
>  4 files changed, 9 insertions(+), 35 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> index f4c77f525819..0e5edf14a241 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -99,7 +99,6 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
>  	p = drm_coredump_printer(&iter);
>  
>  	drm_puts(&p, "**** Xe Device Coredump ****\n");
> -	drm_printf(&p, "Reason: %s\n", ss->reason);
>  	drm_puts(&p, "kernel: " UTS_RELEASE "\n");
>  	drm_puts(&p, "module: " KBUILD_MODNAME "\n");
>  
> @@ -107,7 +106,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
>  	drm_printf(&p, "Snapshot time: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
>  	ts = ktime_to_timespec64(ss->boot_time);
>  	drm_printf(&p, "Uptime: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
> -	drm_printf(&p, "Process: %s [%d]\n", ss->process_name, ss->pid);
> +	drm_printf(&p, "Process: %s\n", ss->process_name);
>  	xe_device_snapshot_print(xe, &p);
>  
>  	drm_printf(&p, "\n**** GT #%d ****\n", ss->gt->info.id);
> @@ -139,9 +138,6 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
>  {
>  	int i;
>  
> -	kfree(ss->reason);
> -	ss->reason = NULL;
> -
>  	xe_guc_log_snapshot_free(ss->guc.log);
>  	ss->guc.log = NULL;
>  
> @@ -257,11 +253,8 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
>  	ss->snapshot_time = ktime_get_real();
>  	ss->boot_time = ktime_get_boottime();
>  
> -	if (q->vm && q->vm->xef) {
> +	if (q->vm && q->vm->xef)
>  		process_name = q->vm->xef->process_name;
> -		ss->pid = q->vm->xef->pid;
> -	}
> -
>  	strscpy(ss->process_name, process_name);
>  
>  	ss->gt = q->gt;
> @@ -299,18 +292,15 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
>   * xe_devcoredump - Take the required snapshots and initialize coredump device.
>   * @q: The faulty xe_exec_queue, where the issue was detected.
>   * @job: The faulty xe_sched_job, where the issue was detected.
> - * @fmt: Printf format + args to describe the reason for the core dump
>   *
>   * This function should be called at the crash time within the serialized
>   * gt_reset. It is skipped if we still have the core dump device available
>   * with the information of the 'first' snapshot.
>   */
> -__printf(3, 4)
> -void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job, const char *fmt, ...)
> +void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job)
>  {
>  	struct xe_device *xe = gt_to_xe(q->gt);
>  	struct xe_devcoredump *coredump = &xe->devcoredump;
> -	va_list varg;
>  
>  	if (coredump->captured) {
>  		drm_dbg(&xe->drm, "Multiple hangs are occurring, but only the first snapshot was taken\n");
> @@ -318,11 +308,6 @@ void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job, const cha
>  	}
>  
>  	coredump->captured = true;
> -
> -	va_start(varg, fmt);
> -	coredump->snapshot.reason = kvasprintf(GFP_ATOMIC, fmt, varg);
> -	va_end(varg);
> -
>  	devcoredump_snapshot(coredump, q, job);
>  
>  	drm_info(&xe->drm, "Xe device coredump has been created\n");
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.h b/drivers/gpu/drm/xe/xe_devcoredump.h
> index 6a17e6d60102..c04a534e3384 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.h
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.h
> @@ -14,12 +14,11 @@ struct xe_exec_queue;
>  struct xe_sched_job;
>  
>  #ifdef CONFIG_DEV_COREDUMP
> -void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job, const char *fmt, ...);
> +void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job);
>  int xe_devcoredump_init(struct xe_device *xe);
>  #else
>  static inline void xe_devcoredump(struct xe_exec_queue *q,
> -				  struct xe_sched_job *job,
> -				  const char *fmt, ...)
> +				  struct xe_sched_job *job)
>  {
>  }
>  
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> index e6234e887102..be4d59ea9ac8 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> @@ -28,10 +28,6 @@ struct xe_devcoredump_snapshot {
>  	ktime_t boot_time;
>  	/** @process_name: Name of process that triggered this gpu hang */
>  	char process_name[TASK_COMM_LEN];
> -	/** @pid: Process id of process that triggered this gpu hang */
> -	pid_t pid;
> -	/** @reason: The reason the coredump was triggered */
> -	char *reason;
>  
>  	/** @gt: Affected GT, used by forcewake for delayed capture */
>  	struct xe_gt *gt;
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 9c36329fe857..d00af8d8acbe 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -901,8 +901,7 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
>  		if (!ret) {
>  			xe_gt_warn(q->gt, "Schedule disable failed to respond, guc_id=%d\n",
>  				   q->guc->id);
> -			xe_devcoredump(q, NULL, "Schedule disable failed to respond, guc_id=%d\n",
> -				       q->guc->id);
> +			xe_devcoredump(q, NULL);
>  			xe_sched_submission_start(sched);
>  			xe_gt_reset_async(q->gt);
>  			return;
> @@ -910,7 +909,7 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
>  	}
>  
>  	if (!exec_queue_killed(q) && !xe_lrc_ring_is_idle(q->lrc[0]))
> -		xe_devcoredump(q, NULL, "LR job cleanup, guc_id=%d", q->guc->id);
> +		xe_devcoredump(q, NULL);
>  
>  	xe_sched_submission_start(sched);
>  }
> @@ -1137,9 +1136,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>  				xe_gt_warn(guc_to_gt(guc),
>  					   "Schedule disable failed to respond, guc_id=%d",
>  					   q->guc->id);
> -			xe_devcoredump(q, job,
> -				       "Schedule disable failed to respond, guc_id=%d, ret=%d, guc_read=%d",
> -				       q->guc->id, ret, xe_guc_read_stopped(guc));
> +			xe_devcoredump(q, job);
>  			set_exec_queue_extra_ref(q);
>  			xe_exec_queue_get(q);	/* GT reset owns this */
>  			set_exec_queue_banned(q);
> @@ -1169,10 +1166,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>  	trace_xe_sched_job_timedout(job);
>  
>  	if (!exec_queue_killed(q))
> -		xe_devcoredump(q, job,
> -			       "Timedout job - seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx",
> -			       xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
> -			       q->guc->id, q->flags);
> +		xe_devcoredump(q, job);
>  
>  	/*
>  	 * Kernel jobs should never fail, nor should VM jobs if they do
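
For context on what the revert removes: the reason string was formatted
once at capture time with kvasprintf(GFP_ATOMIC, ...) and freed together
with the snapshot. A minimal userspace sketch of that pattern, with
vasprintf(3) standing in for kvasprintf() and all names purely
illustrative rather than taken from the driver:

#define _GNU_SOURCE
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

struct snapshot {
	char *reason;	/* formatted once at capture time, freed with the snapshot */
};

/* Format a printf-style reason into the snapshot, analogous to the
 * reverted kvasprintf() call in xe_devcoredump(). The format attribute
 * mirrors the __printf(3, 4) annotation the revert drops. */
__attribute__((format(printf, 2, 3)))
static void snapshot_set_reason(struct snapshot *ss, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	if (vasprintf(&ss->reason, fmt, args) < 0)
		ss->reason = NULL;	/* allocation failed; no reason recorded */
	va_end(args);
}

int main(void)
{
	struct snapshot ss = { 0 };

	snapshot_set_reason(&ss, "Timedout job - guc_id=%d, flags=0x%lx", 3, 0x4ul);
	printf("Reason: %s\n", ss.reason ? ss.reason : "unknown");
	free(ss.reason);
	return 0;
}

After the revert, callers are back to the two-argument
xe_devcoredump(q, job), so the context those format strings carried is
only available where a neighbouring xe_gt_warn() still logs it.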


