[PATCH 5/7] drm/xe: Add exec queue param to devcoredump
Matthew Brost
matthew.brost at intel.com
Fri Nov 8 22:56:55 UTC 2024
On Fri, Nov 08, 2024 at 05:21:08PM -0500, Rodrigo Vivi wrote:
> On Fri, Nov 08, 2024 at 09:43:10AM -0800, Matthew Brost wrote:
> > A job may be unavailable at capture time (e.g., LR mode) while an exec
> > queue is. Add an exec queue param for such use cases.
>
> why?! if so, don't we have other problems?
>
Jobs in LR mode are freed immediately after run_job is called, by
returning NULL to the DRM scheduler rather than a fence. This is very
much intentional and not a problem.
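
To make that concrete, the LR path through run_job looks roughly like
the sketch below. This is from memory, not a verbatim copy of
xe_guc_submit.c; the submission details are elided and the exact field
names (job->q, job->fence) should be taken as illustrative:

static struct dma_fence *
guc_exec_queue_run_job(struct drm_sched_job *drm_job)
{
	struct xe_sched_job *job = to_xe_sched_job(drm_job);
	struct xe_exec_queue *q = job->q;

	/* ... register the queue with the GuC and submit the job ... */

	if (xe_exec_queue_is_lr(q))
		/*
		 * No fence is returned for LR queues, so the DRM
		 * scheduler treats the job as done and frees it right
		 * away; only the exec queue outlives submission.
		 */
		return NULL;

	return dma_fence_get(job->fence);
}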
We can, however, get an IOMMU CAT error or engine reset triggered from
a bad LR job, which in turn triggers xe_guc_exec_queue_lr_cleanup, where
we still have the queue and want to take a devcoredump.
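
Something along these lines is what I have in mind for the caller; a
sketch of how the new parameter gets used from the LR cleanup path,
assuming a later patch in the series wires it up this way (the lr_tdr
work item and exec_queue_killed() check are shown from memory, not the
exact diff):

static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
{
	struct xe_guc_exec_queue *ge =
		container_of(w, struct xe_guc_exec_queue, lr_tdr);
	struct xe_exec_queue *q = ge->q;

	/* LR jobs are long gone by now, so pass only the queue. */
	if (!exec_queue_killed(q))
		xe_devcoredump(q, NULL);

	/* ... existing cleanup: disable scheduling, ban the queue ... */
}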
Make sense?
Matt
> >
> > Cc: Zhanjun Dong <zhanjun.dong at intel.com>
> > Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
> > Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> > ---
> > drivers/gpu/drm/xe/xe_devcoredump.c | 15 +++++++++------
> > drivers/gpu/drm/xe/xe_devcoredump.h | 6 ++++--
> > drivers/gpu/drm/xe/xe_guc_submit.c | 2 +-
> > 3 files changed, 14 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> > index d3570d3d573c..c32cbb46ef8c 100644
> > --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> > +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> > @@ -238,10 +238,10 @@ static void xe_devcoredump_free(void *data)
> > }
> >
> > static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> > + struct xe_exec_queue *q,
> > struct xe_sched_job *job)
> > {
> > struct xe_devcoredump_snapshot *ss = &coredump->snapshot;
> > - struct xe_exec_queue *q = job->q;
> > struct xe_guc *guc = exec_queue_to_guc(q);
> > u32 adj_logical_mask = q->logical_mask;
> > u32 width_mask = (0x1 << q->width) - 1;
> > @@ -278,10 +278,12 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> > ss->guc.log = xe_guc_log_snapshot_capture(&guc->log, true);
> > ss->guc.ct = xe_guc_ct_snapshot_capture(&guc->ct);
> > ss->ge = xe_guc_exec_queue_snapshot_capture(q);
> > - ss->job = xe_sched_job_snapshot_capture(job);
> > + if (job)
> > + ss->job = xe_sched_job_snapshot_capture(job);
> > ss->vm = xe_vm_snapshot_capture(q->vm);
> >
> > - xe_engine_snapshot_capture_for_job(job);
> > + if (job)
> > + xe_engine_snapshot_capture_for_job(job);
> >
> > queue_work(system_unbound_wq, &ss->work);
> >
> > @@ -291,15 +293,16 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> >
> > /**
> > * xe_devcoredump - Take the required snapshots and initialize coredump device.
> > + * @q: The faulty xe_exec_queue, where the issue was detected.
> > * @job: The faulty xe_sched_job, where the issue was detected.
> > *
> > * This function should be called at the crash time within the serialized
> > * gt_reset. It is skipped if we still have the core dump device available
> > * with the information of the 'first' snapshot.
> > */
> > -void xe_devcoredump(struct xe_sched_job *job)
> > +void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job)
> > {
> > - struct xe_device *xe = gt_to_xe(job->q->gt);
> > + struct xe_device *xe = gt_to_xe(q->gt);
> > struct xe_devcoredump *coredump = &xe->devcoredump;
> >
> > if (coredump->captured) {
> > @@ -308,7 +311,7 @@ void xe_devcoredump(struct xe_sched_job *job)
> > }
> >
> > coredump->captured = true;
> > - devcoredump_snapshot(coredump, job);
> > + devcoredump_snapshot(coredump, q, job);
> >
> > drm_info(&xe->drm, "Xe device coredump has been created\n");
> > drm_info(&xe->drm, "Check your /sys/class/drm/card%d/device/devcoredump/data\n",
> > diff --git a/drivers/gpu/drm/xe/xe_devcoredump.h b/drivers/gpu/drm/xe/xe_devcoredump.h
> > index a4eebc285fc8..c04a534e3384 100644
> > --- a/drivers/gpu/drm/xe/xe_devcoredump.h
> > +++ b/drivers/gpu/drm/xe/xe_devcoredump.h
> > @@ -10,13 +10,15 @@
> >
> > struct drm_printer;
> > struct xe_device;
> > +struct xe_exec_queue;
> > struct xe_sched_job;
> >
> > #ifdef CONFIG_DEV_COREDUMP
> > -void xe_devcoredump(struct xe_sched_job *job);
> > +void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job);
> > int xe_devcoredump_init(struct xe_device *xe);
> > #else
> > -static inline void xe_devcoredump(struct xe_sched_job *job)
> > +static inline void xe_devcoredump(struct xe_exec_queue *q,
> > + struct xe_sched_job *job)
> > {
> > }
> >
> > diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> > index 2cf4750bc24d..974c7af7064d 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> > @@ -1162,7 +1162,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> > trace_xe_sched_job_timedout(job);
> >
> > if (!exec_queue_killed(q))
> > - xe_devcoredump(job);
> > + xe_devcoredump(q, job);
> >
> > /*
> > * Kernel jobs should never fail, nor should VM jobs if they do
> > --
> > 2.34.1
> >