[PATCH 2/9] drm/xe: Change devcoredump functions parameters to xe_sched_job
Summers, Stuart
stuart.summers at intel.com
Mon Jan 22 18:39:45 UTC 2024
On Mon, 2024-01-22 at 09:04 -0800, José Roberto de Souza wrote:
> When devcoredump starts to dump the VMs' contents, it will be
> necessary to know the starting addresses of the batch buffers of
> the job that hung.
>
> This information is set in xe_sched_job, and xe_sched_job is not
> easily accessible from xe_exec_queue, so change the parameter here;
> the next patch will append the batch buffer addresses to the
> devcoredump snapshot capture.
This looks reasonable to me, and I like that we're moving some of
this a little closer to the drm layer.
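As a side note for anyone following the series: as far as I can
tell the job side already carries what the follow-up needs, since
xe_sched_job ends in a batch_addr[] array with one entry per unit
of queue width. A minimal sketch of the kind of capture I'd expect
the next patch to add (the snapshot->batch_addr field and its
placement are my guesses, not something in this series):

	/* hypothetical follow-up inside xe_guc_exec_queue_snapshot_capture() */
	snapshot->batch_addr = kmalloc_array(q->width, sizeof(u64),
					     GFP_ATOMIC);
	if (snapshot->batch_addr)
		/* one batch buffer start address per exec queue slot */
		memcpy(snapshot->batch_addr, job->batch_addr,
		       q->width * sizeof(u64));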
Reviewed-by: Stuart Summers <stuart.summers at intel.com>
Thanks,
Stuart
>
> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
> Cc: Maarten Lankhorst <dev at lankhorst.se>
> Signed-off-by: José Roberto de Souza <jose.souza at intel.com>
> ---
> drivers/gpu/drm/xe/xe_devcoredump.c | 12 ++++++----
> drivers/gpu/drm/xe/xe_devcoredump.h | 6 ++---
> drivers/gpu/drm/xe/xe_guc_submit.c | 36 ++++++++++++++++++++++-------
> drivers/gpu/drm/xe/xe_guc_submit.h | 4 ++--
> 4 files changed, 40 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> index 68abc0b195beb..0f23ecc74b162 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -16,6 +16,7 @@
> #include "xe_guc_ct.h"
> #include "xe_guc_submit.h"
> #include "xe_hw_engine.h"
> +#include "xe_sched_job.h"
>
> /**
> * DOC: Xe device coredump
> @@ -123,9 +124,10 @@ static void xe_devcoredump_free(void *data)
> }
>
> static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> - struct xe_exec_queue *q)
> + struct xe_sched_job *job)
> {
> struct xe_devcoredump_snapshot *ss = &coredump->snapshot;
> + struct xe_exec_queue *q = job->q;
> struct xe_guc *guc = exec_queue_to_guc(q);
> struct xe_hw_engine *hwe;
> enum xe_hw_engine_id id;
> @@ -150,7 +152,7 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> xe_force_wake_get(gt_to_fw(q->gt), XE_FORCEWAKE_ALL);
>
> coredump->snapshot.ct = xe_guc_ct_snapshot_capture(&guc->ct, true);
> - coredump->snapshot.ge = xe_guc_exec_queue_snapshot_capture(q);
> + coredump->snapshot.ge = xe_guc_exec_queue_snapshot_capture(job);
>
> for_each_hw_engine(hwe, q->gt, id) {
> if (hwe->class != q->hwe->class ||
> @@ -173,9 +175,9 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> * gt_reset. It is skipped if we still have the core dump device available
> * with the information of the 'first' snapshot.
> */
> -void xe_devcoredump(struct xe_exec_queue *q)
> +void xe_devcoredump(struct xe_sched_job *job)
> {
> - struct xe_device *xe = gt_to_xe(q->gt);
> + struct xe_device *xe = gt_to_xe(job->q->gt);
> struct xe_devcoredump *coredump = &xe->devcoredump;
>
> if (coredump->captured) {
> @@ -184,7 +186,7 @@ void xe_devcoredump(struct xe_exec_queue *q)
> }
>
> coredump->captured = true;
> - devcoredump_snapshot(coredump, q);
> + devcoredump_snapshot(coredump, job);
>
> drm_info(&xe->drm, "Xe device coredump has been created\n");
> drm_info(&xe->drm, "Check your
> /sys/class/drm/card%d/device/devcoredump/data\n",
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.h b/drivers/gpu/drm/xe/xe_devcoredump.h
> index 6ac218a5c1945..df8671f0b5eb2 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.h
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.h
> @@ -7,12 +7,12 @@
> #define _XE_DEVCOREDUMP_H_
>
> struct xe_device;
> -struct xe_exec_queue;
> +struct xe_sched_job;
>
> #ifdef CONFIG_DEV_COREDUMP
> -void xe_devcoredump(struct xe_exec_queue *q);
> +void xe_devcoredump(struct xe_sched_job *job);
> #else
> -static inline void xe_devcoredump(struct xe_exec_queue *q)
> +static inline void xe_devcoredump(struct xe_sched_job *job)
> {
> }
> #endif
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 7c29b8333c719..dfcc7a0af0a23 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -934,7 +934,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> drm_notice(&xe->drm, "Timedout job: seqno=%u, guc_id=%d, flags=0x%lx",
> xe_sched_job_seqno(job), q->guc->id, q->flags);
> simple_error_capture(q);
> - xe_devcoredump(q);
> + xe_devcoredump(job);
> } else {
> drm_dbg(&xe->drm, "Timedout signaled job: seqno=%u, guc_id=%d, flags=0x%lx",
> xe_sched_job_seqno(job), q->guc->id, q->flags);
> @@ -1789,12 +1789,12 @@ guc_exec_queue_wq_snapshot_print(struct xe_guc_submit_exec_queue_snapshot *snaps
> * caller, using `xe_guc_exec_queue_snapshot_free`.
> */
> struct xe_guc_submit_exec_queue_snapshot *
> -xe_guc_exec_queue_snapshot_capture(struct xe_exec_queue *q)
> +xe_guc_exec_queue_snapshot_capture(struct xe_sched_job *job)
> {
> + struct xe_exec_queue *q = job->q;
> struct xe_guc *guc = exec_queue_to_guc(q);
> struct xe_device *xe = guc_to_xe(guc);
> struct xe_gpu_scheduler *sched = &q->guc->sched;
> - struct xe_sched_job *job;
> struct xe_guc_submit_exec_queue_snapshot *snapshot;
> int i;
>
> @@ -1852,14 +1852,16 @@ xe_guc_exec_queue_snapshot_capture(struct xe_exec_queue *q)
> if (!snapshot->pending_list) {
> drm_err(&xe->drm, "Skipping GuC Engine pending_list snapshot.\n");
> } else {
> + struct xe_sched_job *job_iter;
> +
> i = 0;
> - list_for_each_entry(job, &sched->base.pending_list, drm.list) {
> + list_for_each_entry(job_iter, &sched->base.pending_list, drm.list) {
> snapshot->pending_list[i].seqno =
> - xe_sched_job_seqno(job);
> + xe_sched_job_seqno(job_iter);
> snapshot->pending_list[i].fence =
> - dma_fence_is_signaled(job->fence) ? 1 : 0;
> + dma_fence_is_signaled(job_iter->fence) ? 1 : 0;
> snapshot->pending_list[i].finished =
> - dma_fence_is_signaled(&job->drm.s_fence->finished)
> + dma_fence_is_signaled(&job_iter->drm.s_fence->finished)
> ? 1 : 0;
> i++;
> }
> @@ -1945,10 +1947,28 @@ void xe_guc_exec_queue_snapshot_free(struct xe_guc_submit_exec_queue_snapshot *s
> static void guc_exec_queue_print(struct xe_exec_queue *q, struct drm_printer *p)
> {
> struct xe_guc_submit_exec_queue_snapshot *snapshot;
> + struct xe_gpu_scheduler *sched = &q->guc->sched;
> + struct xe_sched_job *job;
> + bool found = false;
> +
> + spin_lock(&sched->base.job_list_lock);
> + list_for_each_entry(job, &sched->base.pending_list, drm.list) {
> + if (job->q == q) {
> + xe_sched_job_get(job);
> + found = true;
> + break;
> + }
> + }
> + spin_unlock(&sched->base.job_list_lock);
>
> - snapshot = xe_guc_exec_queue_snapshot_capture(q);
> + if (!found)
> + return;
> +
> + snapshot = xe_guc_exec_queue_snapshot_capture(job);
> xe_guc_exec_queue_snapshot_print(snapshot, p);
> xe_guc_exec_queue_snapshot_free(snapshot);
> +
> + xe_sched_job_put(job);
> }
>
> /**
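One note on the guc_exec_queue_print() change above, mostly for the
record: taking the reference under job_list_lock before capturing
the snapshot looks necessary to me, since as I read it the scheduler
could otherwise retire and free the job the moment the lock is
dropped. The same pattern as the hunk, annotated with my reading of
the lifetime rule (comments are mine, not from the patch):

	spin_lock(&sched->base.job_list_lock);
	/* entries on pending_list are only guaranteed alive under the lock */
	xe_sched_job_get(job);	/* pin the job before dropping the lock */
	spin_unlock(&sched->base.job_list_lock);

	/* job stays valid here even if it completes and is retired */
	snapshot = xe_guc_exec_queue_snapshot_capture(job);
	xe_guc_exec_queue_snapshot_print(snapshot, p);
	xe_guc_exec_queue_snapshot_free(snapshot);

	xe_sched_job_put(job);	/* drop our reference once we are done */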
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h b/drivers/gpu/drm/xe/xe_guc_submit.h
> index fc97869c5b865..723dc2bd8df91 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.h
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.h
> @@ -9,8 +9,8 @@
> #include <linux/types.h>
>
> struct drm_printer;
> -struct xe_exec_queue;
> struct xe_guc;
> +struct xe_sched_job;
>
> int xe_guc_submit_init(struct xe_guc *guc);
>
> @@ -27,7 +27,7 @@ int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
> int xe_guc_exec_queue_reset_failure_handler(struct xe_guc *guc, u32 *msg, u32 len);
>
> struct xe_guc_submit_exec_queue_snapshot *
> -xe_guc_exec_queue_snapshot_capture(struct xe_exec_queue *q);
> +xe_guc_exec_queue_snapshot_capture(struct xe_sched_job *job);
> void
> xe_guc_exec_queue_snapshot_print(struct xe_guc_submit_exec_queue_snapshot *snapshot,
> struct drm_printer *p);