[PATCH v9 4/4] drm/xe/guc: Extract GuC capture lists to register snapshot

Teres Alexis, Alan Previn alan.previn.teres.alexis at intel.com
Thu Jun 13 23:26:22 UTC 2024


alan: Hi there Zhanjun, as per our offline conversation, I was surprised
to see you squash the "GuC-error-capture-extraction" patch together with
the "Plumb GuC-capture into dev coredump" patch, but I now realize I made
a regrettable typo in my rev8 review comments - when I mentioned possibly
squashing pre-allocate-nodes with node-extraction I said "patch 4 and
patch 6" but what I meant was "patch 4 and patch 5". I'm terribly sorry
about this mistake. Since we know you have to redo this patch anyway
(based on our offline conversation) to resolve the race between drm-tdr
and guc-context-reset, thanks for agreeing to keep extraction separate
again. And of course we don't need pre-allocation of nodes now.

On Thu, 2024-06-06 at 17:07 -0700, Zhanjun Dong wrote:
> Upon the G2H Notify-Err-Capture event, parse through the
> GuC Log Buffer (error-capture subregion) and generate one or
> more capture nodes. A single node represents a single "engine-
> instance-capture-dump" and contains at least 3 register lists:
> global, engine-class and engine-instance. An internal linked
> list is maintained to store one or more nodes.
> Because the linked-list node generation happens before the call
> to devcoredump, duplicate the global and engine-class register
> lists for each engine-instance register dump if we find
> dependent-engine resets in an engine-capture-group.
> When xe_devcoredump calls into snapshot_from_capture_engine,
> we detach the matching node (guc-id, LRCA, etc.) from the linked
> list above and attach it to the snapshot_regs structure when the
> LRCA/guc-id/engine-instance match.
> 
> To avoid dynamically allocating the output nodes during GT reset,
> pre-allocate a fixed number of empty nodes up front (at the
> time of ADS registration) that we can consume from, or return to,
> an internal cached list of nodes.
> Add the GuC capture data structure definition.
> 
> Add xe_hw_engine_snapshot_from_capture to take a snapshot from the
> capture node list.
> Move the snapshot register struct out of the engine snapshot struct.
> Add an offset in the snapshot register to the register definition
> list in xe_guc_capture.c.
> A snapshot can be split into global, engine-class, engine-instance
> and steering register zones; a few macros are defined to separate
> the zones.
> Support combining two 32-bit registers into one 64-bit register in
> the snapshot, performing endian conversion if needed.
> 
> Signed-off-by: Zhanjun Dong <zhanjun.dong at intel.com>
> ---
>  drivers/gpu/drm/xe/abi/guc_actions_abi.h  |   7 +
>  drivers/gpu/drm/xe/regs/xe_gt_regs.h      |   2 +
>  drivers/gpu/drm/xe/xe_devcoredump.c       |   4 +
>  drivers/gpu/drm/xe/xe_devcoredump_types.h |   2 +
>  drivers/gpu/drm/xe/xe_guc.h               |  23 +
>  drivers/gpu/drm/xe/xe_guc_capture.c       | 876 +++++++++++++++++++++-
>  drivers/gpu/drm/xe/xe_guc_capture.h       |   9 +
>  drivers/gpu/drm/xe/xe_guc_capture_fwif.h  |  45 ++
>  drivers/gpu/drm/xe/xe_guc_ct.c            |   2 +
>  drivers/gpu/drm/xe/xe_guc_fwif.h          |   6 +
>  drivers/gpu/drm/xe/xe_guc_submit.c        |  63 +-
>  drivers/gpu/drm/xe/xe_guc_submit.h        |   2 +
>  drivers/gpu/drm/xe/xe_hw_engine.c         | 218 ++++--
>  drivers/gpu/drm/xe/xe_hw_engine_types.h   | 159 ++--
>  drivers/gpu/drm/xe/xe_lrc.h               |   1 +
>  15 files changed, 1244 insertions(+), 175 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/abi/guc_actions_abi.h b/drivers/gpu/drm/xe/abi/guc_actions_abi.h
> index 79ba98a169f9..ed1eeea34e8e 100644
> --- a/drivers/gpu/drm/xe/abi/guc_actions_abi.h
> +++ b/drivers/gpu/drm/xe/abi/guc_actions_abi.h
> @@ -182,6 +182,13 @@ enum xe_guc_sleep_state_status {
>  #define GUC_LOG_CONTROL_VERBOSITY_MASK (0xF << GUC_LOG_CONTROL_VERBOSITY_SHIFT)
>  #define GUC_LOG_CONTROL_DEFAULT_LOGGING        (1 << 8)
>  
> +enum intel_guc_state_capture_event_status {
> +       XE_GUC_STATE_CAPTURE_EVENT_STATUS_SUCCESS = 0x0,
> +       XE_GUC_STATE_CAPTURE_EVENT_STATUS_NOSPACE = 0x1,
> +};
> +
> +#define XE_GUC_STATE_CAPTURE_EVENT_STATUS_MASK      0x000000FF
> +
>  #define XE_GUC_TLB_INVAL_TYPE_SHIFT 0
>  #define XE_GUC_TLB_INVAL_MODE_SHIFT 8
>  /* Flush PPC or SMRO caches along with TLB invalidation request */
> diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> index d09b2473259f..c6bd50738e2b 100644
> --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> @@ -574,4 +574,6 @@
>  #define   GT_CS_MASTER_ERROR_INTERRUPT         REG_BIT(3)
>  #define   GT_RENDER_USER_INTERRUPT             REG_BIT(0)
>  
> +#define SFC_DONE(n)                            XE_REG(0x1cc000 + (n) * 0x1000)
> +
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> index d7f2d19a77c1..5e80710d3cc8 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -16,6 +16,7 @@
>  #include "xe_force_wake.h"
>  #include "xe_gt.h"
>  #include "xe_gt_printk.h"
> +#include "xe_guc_capture.h"
>  #include "xe_guc_ct.h"
>  #include "xe_guc_submit.h"
>  #include "xe_hw_engine.h"
> @@ -149,10 +150,12 @@ static void xe_devcoredump_free(void *data)
>                 if (coredump->snapshot.hwe[i])
>                         xe_hw_engine_snapshot_free(coredump->snapshot.hwe[i]);
>         xe_vm_snapshot_free(coredump->snapshot.vm);
> +       xe_guc_capture_free(&coredump->snapshot.gt->uc.guc);
>  
>         /* To prevent stale data on next snapshot, clear everything */
>         memset(&coredump->snapshot, 0, sizeof(coredump->snapshot));
>         coredump->captured = false;
> +       coredump->job = NULL;
>         drm_info(&coredump_to_xe(coredump)->drm,
>                  "Xe device coredump has been deleted.\n");
>  }
> @@ -186,6 +189,7 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
>                 put_task_struct(task);
>  
>         ss->gt = q->gt;
> +       coredump->job = job;
>         INIT_WORK(&ss->work, xe_devcoredump_deferred_snap_work);
>  
>         cookie = dma_fence_begin_signalling();
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> index 923cdf72a816..c39ab73a9f6a 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> @@ -61,6 +61,8 @@ struct xe_devcoredump {
>         bool captured;
>         /** @snapshot: Snapshot is captured at time of the first crash */
>         struct xe_devcoredump_snapshot snapshot;
> +       /** @job: Pointer to the faulting job */
> +       struct xe_sched_job *job;
>  };
>  
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_guc.h b/drivers/gpu/drm/xe/xe_guc.h
> index ddfa855458ab..e1afda9070f4 100644
> --- a/drivers/gpu/drm/xe/xe_guc.h
> +++ b/drivers/gpu/drm/xe/xe_guc.h
> @@ -59,6 +59,29 @@ static inline u16 xe_engine_class_to_guc_class(enum xe_engine_class class)
>         }
>  }
>  
> +static inline u16 xe_guc_class_to_capture_class(uint class)
> +{
> +       switch (class) {
> +       case GUC_RENDER_CLASS:
> +       case GUC_COMPUTE_CLASS:
> +               return GUC_CAPTURE_LIST_CLASS_RENDER_COMPUTE;
> +       case GUC_GSC_OTHER_CLASS:
> +               return GUC_CAPTURE_LIST_CLASS_GSC_OTHER;
> +       case GUC_VIDEO_CLASS:
> +       case GUC_VIDEOENHANCE_CLASS:
> +       case GUC_BLITTER_CLASS:
> +               return class;
> +       default:
> +               XE_WARN_ON(class);
> +               return -1;
> +       }
> +}
> +
> +static inline u16 xe_engine_class_to_guc_capture_class(enum xe_engine_class class)
> +{
> +       return xe_guc_class_to_capture_class(xe_engine_class_to_guc_class(class));
> +}
> +
>  static inline struct xe_gt *guc_to_gt(struct xe_guc *guc)
>  {
>         return container_of(guc, struct xe_gt, uc.guc);
> 

alan:snip.

> +static void
> +__guc_capture_create_prealloc_nodes(struct xe_guc *guc)
> +{
> +       struct __guc_capture_parsed_output *node = NULL;
> +       int i;
> +
> +       for (i = 0; i < PREALLOC_NODES_MAX_COUNT; ++i) {
> +               node = guc_capture_alloc_one_node(guc);
> +               if (!node) {
> +                       xe_gt_warn(guc_to_gt(guc), "Register capture pre-alloc-cache failure\n");
> +                       /* don't free the priors, use what we got and clean up at shutdown */
> +                       return;
> +               }
> +               guc_capture_add_node_to_cachelist(guc->capture, node);
> +       }
> +}
> +
btw, why are we still using this pre-allocated-nodes framework and
these functions? I thought we decided we didn't need it for the case of
Xe (since we'll serialize the extraction-flow vs any reset-flow vs the
devcoredump-printing by putting them on the same workqueue). In that
case we can remove the pre-allocated nodes' cachelist code; extraction
paths that need a new node can allocate one dynamically by calling
guc_capture_alloc_one_node directly, as in the sketch below.
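
alan: rough sketch of what I mean, for illustration only (the helper
name guc_capture_get_node is made up here; only functions already in
this patch are called, and the extraction flow is assumed to be
serialized on the capture workqueue):

	static struct __guc_capture_parsed_output *
	guc_capture_get_node(struct xe_guc *guc)
	{
		struct __guc_capture_parsed_output *node;

		/* no cachelist: allocate on demand, extraction is serialized */
		node = guc_capture_alloc_one_node(guc);
		if (!node)
			xe_gt_warn(guc_to_gt(guc), "Register capture node alloc failed\n");

		return node;
	}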

alan:snip. same comment as above for the hunk below.
> +static void
> +guc_capture_create_prealloc_nodes(struct xe_guc *guc)
> +{
> +       /* skip if we've already done the pre-alloc */
> +       if (guc->capture->max_mmio_per_node)
> +               return;
> +
> +       guc->capture->max_mmio_per_node = guc_get_max_reglist_count(guc);
> +       __guc_capture_create_prealloc_nodes(guc);
> +}
> +

> +static void cp_reg_to_snapshot(int type, u16 hwe_guc_class, u32 offset, u32 value,
> +                              struct snapshot_regs *regs)
> +{
> +       int i;
> +       const struct __guc_mmio_reg_descr_group *list;
> +
> +       /* Get register list for the type/class */
> +       list = xe_guc_capture_get_reg_desc_list(GUC_CAPTURE_LIST_INDEX_PF, type,
> +                                               xe_guc_class_to_capture_class(hwe_guc_class));
> +       if (!list)
> +               return;
> +
> +       for (i = 0; i < list->num_regs; i++)
> +               if (offset == list->list[i].reg.addr) {
> +                       u32 *field = (u32 *)((uintptr_t)regs + list->list[i].position_in_snapshot);
> +                       *field = value;
> +                       return;
> +               }
> +}
> +
> +static void guc_capture_parse_reglist(struct __guc_capture_parsed_output *node,
> +                                     struct xe_hw_engine_snapshot *snapshot, u16 hwe_guc_class)
> +{
> +       int i, type;
> +
> +       if (!node)
> +               return;
> +
> +       for (type = GUC_CAPTURE_LIST_TYPE_GLOBAL; type < GUC_CAPTURE_LIST_TYPE_MAX; type++) {
> +               struct gcap_reg_list_info *reginfo = &node->reginfo[type];
> +               struct guc_mmio_reg *regs = reginfo->regs;
> +
> +               for (i = 0; i < reginfo->num_regs; i++)
> +                       cp_reg_to_snapshot(type, hwe_guc_class, regs[i].offset, regs[i].value,
> +                                          &snapshot->reg);
> +       }
> +}
> +
> +/**
> + * xe_hw_engine_find_and_copy_guc_capture_snapshot - Take an engine snapshot from GuC capture.
> + * @hwe: Xe HW Engine.
> + * @snapshot: Xe HW Engine snapshot object to save data to, copied from error capture
> + *
> + * This can be printed out in a later stage like during dev_coredump
> + * analysis.
> + *
> + * Returns: None
> + */
> +void
> +xe_hw_engine_find_and_copy_guc_capture_snapshot(struct xe_hw_engine *hwe,
> +                                               struct xe_hw_engine_snapshot *snapshot)
> +{
> +       struct xe_gt *gt = hwe->gt;
> +       struct xe_device *xe = gt_to_xe(gt);
> +       struct xe_guc *guc = &gt->uc.guc;
> +       struct __guc_capture_parsed_output *n, *ntmp;
> +       struct xe_devcoredump *devcoredump = &xe->devcoredump;
> +       struct list_head *list = &guc->capture->outlist;
> +       struct xe_sched_job *job = devcoredump->job;
> +       struct xe_guc_submit_exec_queue_snapshot *ge = devcoredump->snapshot.ge;
> +       u16 guc_id = ge->guc.id;
> +       u32 lrca;
> +       u16 hwe_guc_class = xe_engine_class_to_guc_class(hwe->class);
> +
> +       lrca = xe_lrc_ggtt_addr(job->q->lrc[0]) & LRC_GTT_ADDRESS_MASK;
> +
> +       /*
> +        * Look for a matching GuC reported error capture node from
> +        * the internal output link-list based on engine class and instance.
> +        */
> +       list_for_each_entry_safe(n, ntmp, list, link) {
> +               if (n->eng_class == hwe_guc_class && n->eng_inst == hwe->instance &&
> +                   n->guc_id == guc_id && (n->lrca & LRC_GTT_ADDRESS_MASK) == lrca) {
> +                       guc_capture_parse_reglist(n, snapshot, hwe_guc_class);
> +                       list_del(&n->link);
> +                       return;
> +               }
> +       }
> +}
> +
> +void xe_guc_capture_free(struct xe_guc *guc)
> +{
> +       if (guc->capture && !list_empty(&guc->capture->outlist)) {
> +               struct __guc_capture_parsed_output *n, *ntmp;
> +
> +               list_for_each_entry_safe(n, ntmp, &guc->capture->outlist, link) {
> +                       list_del(&n->link);
> +                       /* Put the node back on the cache list; no need to
> +                        * re-init here, guc_capture_get_prealloc_node will
> +                        * init it later.
> +                        */
> +                       guc_capture_add_node_to_cachelist(guc->capture, n);
> +               }
> +       }
> +}
> +
>  int xe_guc_capture_init(struct xe_guc *guc)
>  {
>         guc->capture = drmm_kzalloc(guc_to_drm(guc), sizeof(*guc->capture), GFP_KERNEL);
> @@ -574,7 +1404,9 @@ int xe_guc_capture_init(struct xe_guc *guc)
>                 return -ENOMEM;
>  
>         guc->capture->reglists = guc_capture_get_device_reglist(guc);
> -
>         check_guc_capture_size(guc);
> +       INIT_LIST_HEAD(&guc->capture->outlist);
> +       INIT_LIST_HEAD(&guc->capture->cachelist);
> +
>         return 0;
>  }
> diff --git a/drivers/gpu/drm/xe/xe_guc_capture.h b/drivers/gpu/drm/xe/xe_guc_capture.h
> index a62b1dbd47a6..c0bada99c9ec 100644
> --- a/drivers/gpu/drm/xe/xe_guc_capture.h
> +++ b/drivers/gpu/drm/xe/xe_guc_capture.h
> @@ -10,6 +10,8 @@
>  #include "regs/xe_reg_defs.h"
>  
>  struct xe_guc;
> +struct xe_hw_engine;
> +struct xe_hw_engine_snapshot;
>  
>  /*
>   * struct __guc_mmio_reg_descr / struct __guc_mmio_reg_descr_group
> @@ -25,6 +27,7 @@ struct __guc_mmio_reg_descr {
>         u32 flags;
>         u32 mask;
>         const char *regname;
> +       u32 position_in_snapshot;
>  };
>  
>  struct __guc_mmio_reg_descr_group {
> @@ -36,9 +39,15 @@ struct __guc_mmio_reg_descr_group {
>         struct __guc_mmio_reg_descr *extlist; /* only used for steered registers */
>  };
>  
> +void xe_guc_capture_process(struct xe_guc *guc);
>  int xe_guc_capture_getlist(struct xe_guc *guc, u32 owner, u32 type, u32 classid, void **outptr);
>  int xe_guc_capture_getlistsize(struct xe_guc *guc, u32 owner, u32 type, u32 classid, size_t *size);
>  int xe_guc_capture_getnullheader(struct xe_guc *guc, void **outptr, size_t *size);
> +const struct __guc_mmio_reg_descr_group *
> +xe_guc_capture_get_reg_desc_list(u32 owner, u32 type, u32 engine_classid);
> +void xe_hw_engine_find_and_copy_guc_capture_snapshot(struct xe_hw_engine *hwe,
> +                                                    struct xe_hw_engine_snapshot *snapshot);
> +void xe_guc_capture_free(struct xe_guc *guc);
>  int xe_guc_capture_init(struct xe_guc *guc);
>  
>  #endif /* _XE_GUC_CAPTURE_H */
> diff --git a/drivers/gpu/drm/xe/xe_guc_capture_fwif.h b/drivers/gpu/drm/xe/xe_guc_capture_fwif.h
> index 199e3c0108a4..5ef8c20fe9bc 100644
> --- a/drivers/gpu/drm/xe/xe_guc_capture_fwif.h
> +++ b/drivers/gpu/drm/xe/xe_guc_capture_fwif.h
> @@ -10,6 +10,51 @@
>  
>  #include "xe_guc_fwif.h"
>  
> +/*
> + * struct __guc_capture_bufstate
> + *
> + * Book-keeping structure used to track read and write pointers
> + * as we extract error capture data from the GuC-log-buffer's
> + * error-capture region as a stream of dwords.
> + */
> +struct __guc_capture_bufstate {
> +       u32 size;
> +       u32 data_offset;
> +       u32 rd;
> +       u32 wr;
> +};
> +
> +/*
> + * struct __guc_capture_parsed_output - extracted error capture node
> + *
> + * A single unit of extracted error-capture output data grouped together
> + * at an engine-instance level. We keep these nodes in a linked list.
> + * See cachelist and outlist below.
> + */
> +struct __guc_capture_parsed_output {
> +       /*
> +        * A single set of 3 capture lists: a global list,
> +        * an engine-class list and an engine-instance list.
> +        * outlist in __guc_capture_parsed_output will keep
> +        * a linked list of these nodes that will eventually
> +        * be detached from outlist and attached to
> +        * xe_devcoredump in response to a context reset
> +        */
> +       struct list_head link;
> +       bool is_partial;
> +       u32 eng_class;
> +       u32 eng_inst;
> +       u32 guc_id;
> +       u32 lrca;
> +       struct gcap_reg_list_info {
> +               u32 vfid;
> +               u32 num_regs;
> +               struct guc_mmio_reg *regs;
> +       } reginfo[GUC_CAPTURE_LIST_TYPE_MAX];
> +#define GCAP_PARSED_REGLIST_INDEX_GLOBAL   BIT(GUC_CAPTURE_LIST_TYPE_GLOBAL)
> +#define GCAP_PARSED_REGLIST_INDEX_ENGCLASS BIT(GUC_CAPTURE_LIST_TYPE_ENGINE_CLASS)
> +};
> +
>  /*
>   * struct guc_debug_capture_list_header / struct guc_debug_capture_list
>   *
> diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
> index c1f258348f5c..865b58bb4fd9 100644
> --- a/drivers/gpu/drm/xe/xe_guc_ct.c
> +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
> @@ -1045,6 +1045,8 @@ static int process_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
>                 /* Selftest only at the moment */
>                 break;
>         case XE_GUC_ACTION_STATE_CAPTURE_NOTIFICATION:
> +               ret = xe_guc_error_capture_handler(guc, payload, adj_len);
> +               break;
>         case XE_GUC_ACTION_NOTIFY_FLUSH_LOG_BUFFER_TO_FILE:
>                 /* FIXME: Handle this */
>                 break;
> diff --git a/drivers/gpu/drm/xe/xe_guc_fwif.h b/drivers/gpu/drm/xe/xe_guc_fwif.h
> index 908298791c93..f8f9c76eb7ac 100644
> --- a/drivers/gpu/drm/xe/xe_guc_fwif.h
> +++ b/drivers/gpu/drm/xe/xe_guc_fwif.h
> @@ -206,6 +206,12 @@ enum guc_capture_type {
>         GUC_CAPTURE_LIST_TYPE_MAX,
>  };
>  
> +/* GuC supports only a limited range of registers to be captured for
> + * debug purposes; for unsupported registers, a direct read is the only
> + * way to save the data. GuC capture handling will ignore all lists
> + * with the type GUC_CAPTURE_LIST_TYPE_DIRECT_READ.
> + */
> +#define GUC_CAPTURE_LIST_TYPE_DIRECT_READ GUC_CAPTURE_LIST_TYPE_MAX
> +
>  /* Class indices for capture_class and capture_instance arrays */
>  enum {
>         GUC_CAPTURE_LIST_CLASS_RENDER_COMPUTE = 0,
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 47aab04cf34f..f02f4c0c9568 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -25,6 +25,7 @@
>  #include "xe_gt.h"
>  #include "xe_gt_printk.h"
>  #include "xe_guc.h"
> +#include "xe_guc_capture.h"
>  #include "xe_guc_ct.h"
>  #include "xe_guc_exec_queue_types.h"
>  #include "xe_guc_id_mgr.h"
> @@ -769,7 +770,7 @@ static void guc_exec_queue_free_job(struct drm_sched_job *drm_job)
>         xe_sched_job_put(job);
>  }
>  
> -static int guc_read_stopped(struct xe_guc *guc)
> +int xe_guc_read_stopped(struct xe_guc *guc)
>  {
>         return atomic_read(&guc->submission_state.stopped);
>  }
> @@ -791,7 +792,7 @@ static void disable_scheduling_deregister(struct xe_guc *guc,
>         set_min_preemption_timeout(guc, q);
>         smp_rmb();
>         ret = wait_event_timeout(guc->ct.wq, !exec_queue_pending_enable(q) ||
> -                                guc_read_stopped(guc), HZ * 5);
> +                                xe_guc_read_stopped(guc), HZ * 5);
>         if (!ret) {
>                 struct xe_gpu_scheduler *sched = &q->guc->sched;
>  
> @@ -906,7 +907,7 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
>                  */
>                 ret = wait_event_timeout(guc->ct.wq,
>                                          !exec_queue_pending_disable(q) ||
> -                                        guc_read_stopped(guc), HZ * 5);
> +                                        xe_guc_read_stopped(guc), HZ * 5);
>                 if (!ret) {
>                         drm_warn(&xe->drm, "Schedule disable failed to respond");
>                         xe_sched_submission_start(sched);
> @@ -929,6 +930,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>         int err = -ETIME;
>         int i = 0;
>         bool wedged;
> +       bool reset_status = exec_queue_reset(q);
> +       bool guc_en = xe_device_uc_enabled(xe);
>  
>         /*
>          * TDR has fired before free job worker. Common if exec queue
> @@ -948,7 +951,15 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>         xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
>                    "VM job timed out on non-killed execqueue\n");
>  
> -       if (!exec_queue_killed(q))
> +       /* Take devcoredump when:
> +        * 1. GuC is not enabled, or
> +        * 2. GuC is enabled and GuC reset status == 1.
> +        * When GuC is enabled, register values are captured by GuC, and GuC
> +        * notifies the host with a capture-notification message right before
> +        * the reset; GuC reset status == 1 also means the capture is ready.
> +        * If it is not ready, the snapshot is taken after the wait event
> +        * later in this function.
> +        */
> +       if (!exec_queue_killed(q) && (!guc_en || reset_status))
>                 xe_devcoredump(job);
>  
>         trace_xe_sched_job_timedout(job);
> @@ -996,8 +1007,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>                 smp_rmb();
>                 ret = wait_event_timeout(guc->ct.wq,
>                                          !exec_queue_pending_disable(q) ||
> -                                        guc_read_stopped(guc), HZ * 5);
> -               if (!ret || guc_read_stopped(guc)) {
> +                                        xe_guc_read_stopped(guc), HZ * 5);
> +               if (!ret || xe_guc_read_stopped(guc)) {
>                         drm_warn(&xe->drm, "Schedule disable failed to respond");
>                         xe_sched_add_pending_job(sched, job);
>                         xe_sched_submission_start(sched);
> @@ -1007,6 +1018,10 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>                 }
>         }
>  
> +       /* If capture/reset was not ready when entering this function,
> +        * now is the time to take the snapshot.
> +        */
> +       if (!exec_queue_killed(q) && guc_en && !reset_status)
> +               xe_devcoredump(job);
> +
>         /* Stop fence signaling */
>         xe_hw_fence_irq_stop(q->fence_irq);
>  
> @@ -1112,7 +1127,7 @@ static void suspend_fence_signal(struct xe_exec_queue *q)
>         struct xe_device *xe = guc_to_xe(guc);
>  
>         xe_assert(xe, exec_queue_suspended(q) || exec_queue_killed(q) ||
> -                 guc_read_stopped(guc));
> +                 xe_guc_read_stopped(guc));
>         xe_assert(xe, q->guc->suspend_pending);
>  
>         q->guc->suspend_pending = false;
> @@ -1128,9 +1143,9 @@ static void __guc_exec_queue_process_msg_suspend(struct xe_sched_msg *msg)
>         if (guc_exec_queue_allowed_to_change_state(q) && !exec_queue_suspended(q) &&
>             exec_queue_enabled(q)) {
>                 wait_event(guc->ct.wq, q->guc->resume_time != RESUME_PENDING ||
> -                          guc_read_stopped(guc));
> +                          xe_guc_read_stopped(guc));
>  
> -               if (!guc_read_stopped(guc)) {
> +               if (!xe_guc_read_stopped(guc)) {
>                         MAKE_SCHED_CONTEXT_ACTION(q, DISABLE);
>                         s64 since_resume_ms =
>                                 ktime_ms_delta(ktime_get(),
> @@ -1258,7 +1273,7 @@ static int guc_exec_queue_init(struct xe_exec_queue *q)
>  
>         q->entity = &ge->entity;
>  
> -       if (guc_read_stopped(guc))
> +       if (xe_guc_read_stopped(guc))
>                 xe_sched_stop(sched);
>  
>         mutex_unlock(&guc->submission_state.lock);
> @@ -1385,7 +1400,7 @@ static void guc_exec_queue_suspend_wait(struct xe_exec_queue *q)
>         struct xe_guc *guc = exec_queue_to_guc(q);
>  
>         wait_event(q->guc->suspend_wait, !q->guc->suspend_pending ||
> -                  guc_read_stopped(guc));
> +                  xe_guc_read_stopped(guc));
>  }
>  
>  static void guc_exec_queue_resume(struct xe_exec_queue *q)
> @@ -1495,7 +1510,7 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc)
>  
>  void xe_guc_submit_reset_wait(struct xe_guc *guc)
>  {
> -       wait_event(guc->ct.wq, !guc_read_stopped(guc));
> +       wait_event(guc->ct.wq, !xe_guc_read_stopped(guc));
>  }
>  
>  void xe_guc_submit_stop(struct xe_guc *guc)
> @@ -1504,7 +1519,7 @@ void xe_guc_submit_stop(struct xe_guc *guc)
>         unsigned long index;
>         struct xe_device *xe = guc_to_xe(guc);
>  
> -       xe_assert(xe, guc_read_stopped(guc) == 1);
> +       xe_assert(xe, xe_guc_read_stopped(guc) == 1);
>  
>         mutex_lock(&guc->submission_state.lock);
>  
> @@ -1542,7 +1557,7 @@ int xe_guc_submit_start(struct xe_guc *guc)
>         unsigned long index;
>         struct xe_device *xe = guc_to_xe(guc);
>  
> -       xe_assert(xe, guc_read_stopped(guc) == 1);
> +       xe_assert(xe, xe_guc_read_stopped(guc) == 1);
>  
>         mutex_lock(&guc->submission_state.lock);
>         atomic_dec(&guc->submission_state.stopped);
> @@ -1698,8 +1713,6 @@ int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
>         xe_gt_info(gt, "Engine reset: engine_class=%s, logical_mask: 0x%x, guc_id=%d",
>                    xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
>  
> -       /* FIXME: Do error capture, most likely async */
> -
>         trace_xe_exec_queue_reset(q);
>  
>         /*
> @@ -1715,6 +1728,24 @@ int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
>         return 0;
>  }
>  
> +int xe_guc_error_capture_handler(struct xe_guc *guc, u32 *msg, u32 len)
> +{
> +       u32 status;
> +
> +       if (unlikely(len != 1)) {
> +               xe_gt_dbg(guc_to_gt(guc), "Invalid length %u", len);
> +               return -EPROTO;
> +       }
> +
> +       status = msg[0] & XE_GUC_STATE_CAPTURE_EVENT_STATUS_MASK;
> +       if (status == XE_GUC_STATE_CAPTURE_EVENT_STATUS_NOSPACE)
> +               xe_gt_warn(guc_to_gt(guc), "G2H-Error capture no space");
> +
> +       xe_guc_capture_process(guc);
> +
> +       return 0;
> +}
> +
>  int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
>                                                u32 len)
>  {
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h b/drivers/gpu/drm/xe/xe_guc_submit.h
> index 4ad5f4c1b084..d92256de473e 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.h
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.h
> @@ -19,12 +19,14 @@ void xe_guc_submit_reset_wait(struct xe_guc *guc);
>  void xe_guc_submit_stop(struct xe_guc *guc);
>  int xe_guc_submit_start(struct xe_guc *guc);
>  
> +int xe_guc_read_stopped(struct xe_guc *guc);
>  int xe_guc_sched_done_handler(struct xe_guc *guc, u32 *msg, u32 len);
>  int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len);
>  int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len);
>  int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
>                                                u32 len);
>  int xe_guc_exec_queue_reset_failure_handler(struct xe_guc *guc, u32 *msg, u32 len);
> +int xe_guc_error_capture_handler(struct xe_guc *guc, u32 *msg, u32 len);
>  
>  struct xe_guc_submit_exec_queue_snapshot *
>  xe_guc_exec_queue_snapshot_capture(struct xe_exec_queue *q);
> diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c
> index 0a83506e1ad8..3bc88fbad952 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine.c
> +++ b/drivers/gpu/drm/xe/xe_hw_engine.c
> @@ -20,6 +20,9 @@
>  #include "xe_gt_printk.h"
>  #include "xe_gt_mcr.h"
>  #include "xe_gt_topology.h"
> +#include "xe_guc.h"
> +#include "xe_guc_capture.h"
> +#include "xe_guc_capture_fwif.h"
>  #include "xe_hw_fence.h"
>  #include "xe_irq.h"
>  #include "xe_lrc.h"
> @@ -287,6 +290,7 @@ static void hw_engine_mmio_write32(struct xe_hw_engine *hwe, struct xe_reg reg,
>  static u32 hw_engine_mmio_read32(struct xe_hw_engine *hwe, struct xe_reg reg)
>  {
>         xe_gt_assert(hwe->gt, !(reg.addr & hwe->mmio_base));
> +
>         xe_force_wake_assert_held(gt_to_fw(hwe->gt), hwe->domain);
>  
>         reg.addr += hwe->mmio_base;
> @@ -825,6 +829,62 @@ xe_hw_engine_snapshot_instdone_capture(struct xe_hw_engine *hwe,
>         }
>  }
>  
> +static void
> +xe_hw_engine_snapshot_from_hw_by_type(struct xe_hw_engine *hwe,
> +                                     struct xe_hw_engine_snapshot *snapshot, int type)
> +{
> +       const struct __guc_mmio_reg_descr_group *list;
> +       u16 capture_class = xe_engine_class_to_guc_capture_class(hwe->class);
> +       int i;
> +
> +       list = xe_guc_capture_get_reg_desc_list(GUC_CAPTURE_LIST_INDEX_PF, type, capture_class);
> +       if (!list)
> +               return;
> +
> +       for (i = 0; i < list->num_regs; i++) {
> +               u32 *field;
> +
> +               /* loop until extra operation registers zone */
> +               if (list->list[i].reg.addr == XE_GUC_SNAPSHOT_EXTRA_OPERATION_REGS_START_REG_ADDR)
> +                       break;
> +
> +               field = (u32 *)((uintptr_t)&snapshot->reg +
> +                               list->list[i].position_in_snapshot);
> +               if (type == GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE)
> +                       *field = hw_engine_mmio_read32(hwe, list->list[i].reg);
> +               else
> +                       *field = xe_mmio_read32(hwe->gt, list->list[i].reg);
> +       }
> +}
> +
> +/**
> + * xe_hw_engine_snapshot_from_hw - Take a quick engine snapshot from HW.
> + * @hwe: Xe HW Engine.
> + * @snapshot: Pointer to the Xe HW Engine snapshot object to save data to.
> + *
> + * This can be printed out in a later stage like during dev_coredump
> + * analysis.
> + *
> + * Returns: None
> + */
> +static void
> +xe_hw_engine_snapshot_from_hw(struct xe_hw_engine *hwe, struct xe_hw_engine_snapshot *snapshot)
> +{
> +       int type;
> +
> +       for (type = GUC_CAPTURE_LIST_TYPE_GLOBAL; type < GUC_CAPTURE_LIST_TYPE_MAX; type++)
> +               xe_hw_engine_snapshot_from_hw_by_type(hwe, snapshot, type);
> +
> +       /* Extra operation required registers zone - start */
> +       if (xe_gt_has_indirect_ring_state(hwe->gt))
> +               snapshot->reg.indirect_ring_state =
> +                       hw_engine_mmio_read32(hwe, INDIRECT_RING_STATE(0));
> +       /* Extra operation required registers zone - end */
> +
> +       /* Capture steering registers */
> +       xe_hw_engine_snapshot_instdone_capture(hwe, snapshot);
> +}
> +
> +
>  /**
>   * xe_hw_engine_snapshot_capture - Take a quick snapshot of the HW Engine.
>   * @hwe: Xe HW Engine.
> @@ -839,8 +899,12 @@ struct xe_hw_engine_snapshot *
>  xe_hw_engine_snapshot_capture(struct xe_hw_engine *hwe)
>  {
>         struct xe_hw_engine_snapshot *snapshot;
> +       struct xe_gt *gt = hwe->gt;
> +       struct xe_device *xe = gt_to_xe(gt);
> +       struct xe_guc *guc = &gt->uc.guc;
> 
alan:snip

> +       /* If GuC is enabled, the job is set and the capture outlist is not
> +        * empty, take the snapshot from the GuC capture; otherwise read it
> +        * from the engine.
> +        */
> +       if (xe_device_uc_enabled(xe) && xe->wedged.mode >= 1 &&
> +           !list_empty(&guc->capture->outlist) && xe->devcoredump.job)
> +               xe_hw_engine_find_and_copy_guc_capture_snapshot(hwe, snapshot);
> +       else
> +               xe_hw_engine_snapshot_from_hw(hwe, snapshot);

alan: in the above if-else check, we are assuming that GuC's capture
list not being empty means we have the captured node for this
devcoredump trigger... however, on larger-scale platforms we could have
multiple captured nodes from multiple engines, so we should not rely on
just whether the list is empty; rather, the check should ask the GuC
capture code to find a matching node based on LRCA, guc-id, engine,
etc. (i.e. basically call
xe_hw_engine_find_and_copy_guc_capture_snapshot directly and let it
report whether we actually have a dump, as sketched below).
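
alan: rough sketch only - this assumes we change
xe_hw_engine_find_and_copy_guc_capture_snapshot to return true when a
matching node was found and copied (the bool return is my suggestion,
not part of this patch):

	/* prefer a matching GuC capture node; fall back to reading HW */
	if (!(xe_device_uc_enabled(xe) && xe->wedged.mode >= 1 &&
	      xe->devcoredump.job &&
	      xe_hw_engine_find_and_copy_guc_capture_snapshot(hwe, snapshot)))
		xe_hw_engine_snapshot_from_hw(hwe, snapshot);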

alan:snip.

I'll review the rest of this patch after rev9 is out, since there are
other changes incoming as per my review comments at the top.

