[PATCH v9 4/4] drm/xe/guc: Extract GuC capture lists to register snapshot

Michal Wajdeczko michal.wajdeczko at intel.com
Fri Jun 14 12:31:10 UTC 2024



On 07.06.2024 02:07, Zhanjun Dong wrote:
> Upon the G2H Notify-Err-Capture event, parse through the
> GuC Log Buffer (error-capture-subregion) and generate one or
> more capture-nodes. A single node represents a single "engine-
> instance-capture-dump" and contains at least 3 register lists:
> global, engine-class and engine-instance. An internal link
> list is maintained to store one or more nodes.
> Because the link-list node generation happens before the call
> to devcoredump, duplicate global and engine-class register
> lists for each engine-instance register dump if we find
> dependent-engine resets in an engine-capture-group.
> When xe_devcoredump calls into snapshot_from_capture_engine,
> we detach the matching node (guc-id, LRCA, etc) from the link list
> above and attach it to the snapshot_regs structure when we have a
> matching LRCA/guc-id/engine-instance.
> 
> To avoid dynamically allocating the output nodes during gt reset,
> pre-allocate a fixed number of empty nodes up front (at the
> time of ADS registration) that we can consume from or return to
> an internal cached list of nodes.
> Add guc capture data structure definition.
> 
> Add xe_hw_engine_snapshot_from_capture to take snapshot from capture
> node list.
> Move snapshot register struct out of engine snapshot struct.
> Add offset in snapshot register to register definition list at
> xe_guc_capture.c.
> The snapshot can be split into global, engine-class, engine-instance
> and steering register zones; a few macros are defined to separate the zones.
> Support combining two 32-bit registers into a single 64-bit register
> in the snapshot, performing an endianness conversion if needed.
> 
> Signed-off-by: Zhanjun Dong <zhanjun.dong at intel.com>
> ---
>  drivers/gpu/drm/xe/abi/guc_actions_abi.h  |   7 +
>  drivers/gpu/drm/xe/regs/xe_gt_regs.h      |   2 +
>  drivers/gpu/drm/xe/xe_devcoredump.c       |   4 +
>  drivers/gpu/drm/xe/xe_devcoredump_types.h |   2 +
>  drivers/gpu/drm/xe/xe_guc.h               |  23 +
>  drivers/gpu/drm/xe/xe_guc_capture.c       | 876 +++++++++++++++++++++-
>  drivers/gpu/drm/xe/xe_guc_capture.h       |   9 +
>  drivers/gpu/drm/xe/xe_guc_capture_fwif.h  |  45 ++
>  drivers/gpu/drm/xe/xe_guc_ct.c            |   2 +
>  drivers/gpu/drm/xe/xe_guc_fwif.h          |   6 +
>  drivers/gpu/drm/xe/xe_guc_submit.c        |  63 +-
>  drivers/gpu/drm/xe/xe_guc_submit.h        |   2 +
>  drivers/gpu/drm/xe/xe_hw_engine.c         | 218 ++++--
>  drivers/gpu/drm/xe/xe_hw_engine_types.h   | 159 ++--
>  drivers/gpu/drm/xe/xe_lrc.h               |   1 +
>  15 files changed, 1244 insertions(+), 175 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/abi/guc_actions_abi.h b/drivers/gpu/drm/xe/abi/guc_actions_abi.h
> index 79ba98a169f9..ed1eeea34e8e 100644
> --- a/drivers/gpu/drm/xe/abi/guc_actions_abi.h
> +++ b/drivers/gpu/drm/xe/abi/guc_actions_abi.h
> @@ -182,6 +182,13 @@ enum xe_guc_sleep_state_status {
>  #define GUC_LOG_CONTROL_VERBOSITY_MASK	(0xF << GUC_LOG_CONTROL_VERBOSITY_SHIFT)
>  #define GUC_LOG_CONTROL_DEFAULT_LOGGING	(1 << 8)
>  
> +enum intel_guc_state_capture_event_status {
> +	XE_GUC_STATE_CAPTURE_EVENT_STATUS_SUCCESS = 0x0,
> +	XE_GUC_STATE_CAPTURE_EVENT_STATUS_NOSPACE = 0x1,
> +};
> +
> +#define XE_GUC_STATE_CAPTURE_EVENT_STATUS_MASK      0x000000FF
> +
>  #define XE_GUC_TLB_INVAL_TYPE_SHIFT 0
>  #define XE_GUC_TLB_INVAL_MODE_SHIFT 8
>  /* Flush PPC or SMRO caches along with TLB invalidation request */
> diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> index d09b2473259f..c6bd50738e2b 100644
> --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
> @@ -574,4 +574,6 @@
>  #define   GT_CS_MASTER_ERROR_INTERRUPT		REG_BIT(3)
>  #define   GT_RENDER_USER_INTERRUPT		REG_BIT(0)
>  
> +#define SFC_DONE(n)				XE_REG(0x1cc000 + (n) * 0x1000)
> +
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> index d7f2d19a77c1..5e80710d3cc8 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -16,6 +16,7 @@
>  #include "xe_force_wake.h"
>  #include "xe_gt.h"
>  #include "xe_gt_printk.h"
> +#include "xe_guc_capture.h"
>  #include "xe_guc_ct.h"
>  #include "xe_guc_submit.h"
>  #include "xe_hw_engine.h"
> @@ -149,10 +150,12 @@ static void xe_devcoredump_free(void *data)
>  		if (coredump->snapshot.hwe[i])
>  			xe_hw_engine_snapshot_free(coredump->snapshot.hwe[i]);
>  	xe_vm_snapshot_free(coredump->snapshot.vm);
> +	xe_guc_capture_free(&coredump->snapshot.gt->uc.guc);
>  
>  	/* To prevent stale data on next snapshot, clear everything */
>  	memset(&coredump->snapshot, 0, sizeof(coredump->snapshot));
>  	coredump->captured = false;
> +	coredump->job = NULL;
>  	drm_info(&coredump_to_xe(coredump)->drm,
>  		 "Xe device coredump has been deleted.\n");
>  }
> @@ -186,6 +189,7 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
>  		put_task_struct(task);
>  
>  	ss->gt = q->gt;
> +	coredump->job = job;
>  	INIT_WORK(&ss->work, xe_devcoredump_deferred_snap_work);
>  
>  	cookie = dma_fence_begin_signalling();
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> index 923cdf72a816..c39ab73a9f6a 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> @@ -61,6 +61,8 @@ struct xe_devcoredump {
>  	bool captured;
>  	/** @snapshot: Snapshot is captured at time of the first crash */
>  	struct xe_devcoredump_snapshot snapshot;
> +	/** @job: Point to the issue job */
> +	struct xe_sched_job *job;
>  };
>  
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_guc.h b/drivers/gpu/drm/xe/xe_guc.h
> index ddfa855458ab..e1afda9070f4 100644
> --- a/drivers/gpu/drm/xe/xe_guc.h
> +++ b/drivers/gpu/drm/xe/xe_guc.h
> @@ -59,6 +59,29 @@ static inline u16 xe_engine_class_to_guc_class(enum xe_engine_class class)
>  	}
>  }
>  
> +static inline u16 xe_guc_class_to_capture_class(uint class)
> +{
> +	switch (class) {
> +	case GUC_RENDER_CLASS:
> +	case GUC_COMPUTE_CLASS:
> +		return GUC_CAPTURE_LIST_CLASS_RENDER_COMPUTE;
> +	case GUC_GSC_OTHER_CLASS:
> +		return GUC_CAPTURE_LIST_CLASS_GSC_OTHER;
> +	case GUC_VIDEO_CLASS:
> +	case GUC_VIDEOENHANCE_CLASS:
> +	case GUC_BLITTER_CLASS:
> +		return class;
> +	default:
> +		XE_WARN_ON(class);
> +		return -1;

this doesn't look like a safe value, nor does it appear that you handle it correctly at the call sites

> +	}
> +}
> +
> +static inline u16 xe_engine_class_to_guc_capture_class(enum xe_engine_class class)
> +{
> +	return xe_guc_class_to_capture_class(xe_guc_class_to_capture_class(class));

are you sure this is correct ?

> +}
> +
>  static inline struct xe_gt *guc_to_gt(struct xe_guc *guc)
>  {
>  	return container_of(guc, struct xe_gt, uc.guc);
> diff --git a/drivers/gpu/drm/xe/xe_guc_capture.c b/drivers/gpu/drm/xe/xe_guc_capture.c
> index 0c90def290de..f18933503672 100644
> --- a/drivers/gpu/drm/xe/xe_guc_capture.c
> +++ b/drivers/gpu/drm/xe/xe_guc_capture.c
> @@ -26,10 +26,13 @@
>  #include "xe_guc_capture_fwif.h"
>  #include "xe_guc_ct.h"
>  #include "xe_guc_log.h"
> +#include "xe_guc_submit_types.h"
>  #include "xe_guc_submit.h"
>  #include "xe_hw_engine_types.h"
> +#include "xe_lrc.h"
>  #include "xe_macros.h"
>  #include "xe_map.h"
> +#include "xe_sched_job.h"
>  
>  /*
>   * Define all device tables of GuC error capture register lists
> @@ -37,28 +40,81 @@
>   *       from the engine-mmio-base
>   */
>  #define COMMON_XELP_BASE_GLOBAL \
> -	{ FORCEWAKE_GT,		    0,      0}
> +	{ FORCEWAKE_GT,				0,	0, "FORCEWAKE_GT",\
> +	  offsetof(struct snapshot_regs, forcewake_gt)	}
>  
>  #define COMMON_BASE_ENGINE_INSTANCE \
> -	{ RING_ESR(0),              0,      0}, \
> -	{ RING_EMR(0),              0,      0}, \
> -	{ RING_EIR(0),              0,      0}, \
> -	{ RING_EXECLIST_STATUS_HI(0), 0,    0}, \
> -	{ RING_EXECLIST_STATUS_LO(0), 0,    0}, \
> -	{ RING_DMA_FADD(0),         0,      0}, \
> -	{ RING_DMA_FADD_UDW(0),     0,      0}, \
> -	{ RING_IPEHR(0),            0,      0}, \
> -	{ RING_BBADDR(0),           0,      0}, \
> -	{ RING_BBADDR_UDW(0),       0,      0}, \
> -	{ RING_ACTHD(0),            0,      0}, \
> -	{ RING_ACTHD_UDW(0),        0,      0}, \
> -	{ RING_START(0),            0,      0}, \
> -	{ RING_HEAD(0),             0,      0}, \
> -	{ RING_TAIL(0),             0,      0}, \
> -	{ RING_CTL(0),              0,      0}, \
> -	{ RING_MI_MODE(0),          0,      0}, \
> -	{ RING_HWS_PGA(0),          0,      0}, \
> -	{ RING_MODE(0),             0,      0}
> +	{ RING_HWSTAM(0),			0,	0, "HWSTAM",\
> +	  offsetof(struct snapshot_regs, ring_hwstam)	}, \
> +	{ RING_HWS_PGA(0),			0,	0, "RING_HWS_PGA",\
> +	  offsetof(struct snapshot_regs, ring_hws_pga)	}, \
> +	{ RING_HEAD(0),				0,	0, "RING_HEAD",\
> +	  offsetof(struct snapshot_regs, ring_head)	}, \
> +	{ RING_TAIL(0),				0,	0, "RING_TAIL",\
> +	  offsetof(struct snapshot_regs, ring_tail)	}, \
> +	{ RING_CTL(0),				0,	0, "RING_CTL",\
> +	  offsetof(struct snapshot_regs, ring_ctl)	}, \
> +	{ RING_MI_MODE(0),			0,	0, "RING_MI_MODE",\
> +	  offsetof(struct snapshot_regs, ring_mi_mode)	}, \
> +	{ RING_MODE(0),				0,	0, "RING_MODE",\
> +	  offsetof(struct snapshot_regs, ring_mode)	}, \
> +	{ RING_ESR(0),				0,	0, "RING_ESR",\
> +	  offsetof(struct snapshot_regs, ring_esr)	}, \
> +	{ RING_EMR(0),				0,	0, "RING_EMR",\
> +	  offsetof(struct snapshot_regs, ring_emr)	}, \
> +	{ RING_EIR(0),				0,	0, "RING_EIR",\
> +	  offsetof(struct snapshot_regs, ring_eir)	}, \
> +	{ RING_IMR(0),				0,	0, "RING_IMR",\
> +	  offsetof(struct snapshot_regs, ring_imr)	}, \
> +	{ RING_IPEHR(0),			0,	0, "IPEHR",\
> +	  offsetof(struct snapshot_regs, ipehr)		}, \
> +	/* 64 bit register - Start */			   \
> +	/* defined XE_GUC_SNAPSHOT_REGS_U64_START_REG_ADDR to the address of 1st register below */ \
> +	/* into xe_hw_engine_types.h				*/ \
> +	{ RING_ACTHD(0),			0,	0, "ACTHD",\
> +	  offsetof(struct snapshot_regs, ring_acthd)	}, \
> +	{ RING_ACTHD_UDW(0),			0,	0, NULL,\
> +	  offsetof(struct snapshot_regs, ring_acthd) + 4}, \
> +	{ RING_BBADDR(0),			0,	0, "RING_BBADDR",\
> +	  offsetof(struct snapshot_regs, ring_bbaddr)	}, \
> +	{ RING_BBADDR_UDW(0),			0,	0, NULL,\
> +	  offsetof(struct snapshot_regs, ring_bbaddr) + 4}, \
> +	{ RING_START(0),			0,	0, "RING_START",\
> +	  offsetof(struct snapshot_regs, ring_start)	}, \
> +	{ RING_START_UDW(0),			0,	0, NULL,\
> +	  offsetof(struct snapshot_regs, ring_start) + 4}, \
> +	{ RING_DMA_FADD(0),			0,	0, "RING_DMA_FADD",\
> +	  offsetof(struct snapshot_regs, ring_dma_fadd)	}, \
> +	{ RING_DMA_FADD_UDW(0),			0,	0, NULL,\
> +	  offsetof(struct snapshot_regs, ring_dma_fadd) + 4}, \
> +	{ RING_EXECLIST_STATUS_LO(0),		0,	0, "RING_EXECLIST_STATUS",\
> +	  offsetof(struct snapshot_regs, ring_execlist_status)}, \
> +	{ RING_EXECLIST_STATUS_HI(0),		0,	0, NULL,\
> +	  offsetof(struct snapshot_regs, ring_execlist_status) + 4}, \
> +	{ RING_EXECLIST_SQ_CONTENTS_LO(0),	0,	0, "RING_EXECLIST_SQ_CONTENTS",\
> +	  offsetof(struct snapshot_regs, ring_execlist_sq_contents)}, \
> +	{ RING_EXECLIST_SQ_CONTENTS_HI(0),	0,	0, NULL,\
> +	  offsetof(struct snapshot_regs, ring_execlist_sq_contents) + 4}, \
> +	/* 64 bit register - End */			   \
> +	/* Extra handling registers */				   \
> +	/* define XE_GUC_SNAPSHOT_EXTRA_OPERATION_REGS_START_REG_ADDR to the address of 1st */ \
> +	/* register below into xe_hw_engine_types.h				*/ \
> +	{ INDIRECT_RING_STATE(0),		0,	0, "INDIRECT_RING_STATE",\
> +	  offsetof(struct snapshot_regs, indirect_ring_state)}
> +
> +#define COMMON_XELP_RC_CLASS \
> +	{ RCU_MODE,				0,	0, "RCU_MODE",\
> +	  offsetof(struct snapshot_regs, rcu_mode)	}
> +
> +#define XELP_DIRECT_READ_VEC_CLASS \
> +	{ SFC_DONE(0),				0,	0, "SFC_DONE[0]", \
> +	  offsetof(struct snapshot_regs, sfc_done_0)	}, \
> +	{ SFC_DONE(1),				0,	0, "SFC_DONE[1]", \
> +	  offsetof(struct snapshot_regs, sfc_done_1)	}, \
> +	{ SFC_DONE(2),				0,	0, "SFC_DONE[2]", \
> +	  offsetof(struct snapshot_regs, sfc_done_2)	}, \
> +	{ SFC_DONE(3),				0,	0, "SFC_DONE[3]", \
> +	  offsetof(struct snapshot_regs, sfc_done_3)	}
>  
>  /* XE_LP Global */
>  static const struct __guc_mmio_reg_descr xe_lp_global_regs[] = {
> @@ -70,6 +126,11 @@ static const struct __guc_mmio_reg_descr xe_rc_inst_regs[] = {
>  	COMMON_BASE_ENGINE_INSTANCE,
>  };
>  
> +/* Render / Compute Per-Engine-Instance */
> +static const struct __guc_mmio_reg_descr xe_rc_class_regs[] = {
> +	COMMON_XELP_RC_CLASS,
> +};
> +
>  /* Media Decode/Encode Per-Engine-Instance */
>  static const struct __guc_mmio_reg_descr xe_vd_inst_regs[] = {
>  	COMMON_BASE_ENGINE_INSTANCE,
> @@ -80,6 +141,11 @@ static const struct __guc_mmio_reg_descr xe_vec_inst_regs[] = {
>  	COMMON_BASE_ENGINE_INSTANCE,
>  };
>  
> +/* Video Enhancement Per-Engine-Class */
> +static const struct __guc_mmio_reg_descr xe_vec_direct_read_regs[] = {
> +	XELP_DIRECT_READ_VEC_CLASS,
> +};
> +
>  /* Blitter Per-Engine-Instance */
>  static const struct __guc_mmio_reg_descr xe_blt_inst_regs[] = {
>  	COMMON_BASE_ENGINE_INSTANCE,
> @@ -112,12 +178,13 @@ static const struct __guc_mmio_reg_descr empty_regs_list[] = {
>  /* List of lists */
>  static const struct __guc_mmio_reg_descr_group xe_lp_lists[] = {
>  	MAKE_REGLIST(xe_lp_global_regs, PF, GLOBAL, 0),
> -	MAKE_REGLIST(empty_regs_list, PF, ENGINE_CLASS, GUC_CAPTURE_LIST_CLASS_RENDER_COMPUTE),
> +	MAKE_REGLIST(xe_rc_class_regs, PF, ENGINE_CLASS, GUC_CAPTURE_LIST_CLASS_RENDER_COMPUTE),
>  	MAKE_REGLIST(xe_rc_inst_regs, PF, ENGINE_INSTANCE,
>  		     GUC_CAPTURE_LIST_CLASS_RENDER_COMPUTE),
>  	MAKE_REGLIST(empty_regs_list, PF, ENGINE_CLASS, GUC_CAPTURE_LIST_CLASS_VIDEO),
>  	MAKE_REGLIST(xe_vd_inst_regs, PF, ENGINE_INSTANCE, GUC_CAPTURE_LIST_CLASS_VIDEO),
>  	MAKE_REGLIST(empty_regs_list, PF, ENGINE_CLASS, GUC_CAPTURE_LIST_CLASS_VIDEOENHANCE),
> +	MAKE_REGLIST(xe_vec_direct_read_regs, PF, DIRECT_READ, GUC_CAPTURE_LIST_CLASS_VIDEOENHANCE),
>  	MAKE_REGLIST(xe_vec_inst_regs, PF, ENGINE_INSTANCE,
>  		     GUC_CAPTURE_LIST_CLASS_VIDEOENHANCE),
>  	MAKE_REGLIST(empty_regs_list, PF, ENGINE_CLASS, GUC_CAPTURE_LIST_CLASS_BLITTER),
> @@ -148,6 +215,7 @@ static const char * const capture_engine_class_names[] = {
>   */
>  #define get_item_with_default(ar, index) (ar[(index) >= ARRAY_SIZE(ar) ? ARRAY_SIZE(ar) -  1 : \
>  									 (index)])
> +static void guc_capture_create_prealloc_nodes(struct xe_guc *guc);
>  
>  static const struct __guc_mmio_reg_descr_group *
>  guc_capture_get_one_list(const struct __guc_mmio_reg_descr_group *reglists,
> @@ -167,6 +235,12 @@ guc_capture_get_one_list(const struct __guc_mmio_reg_descr_group *reglists,
>  	return NULL;
>  }
>  
> +const struct __guc_mmio_reg_descr_group *
> +xe_guc_capture_get_reg_desc_list(u32 owner, u32 type, u32 engine_classid)
> +{
> +	return guc_capture_get_one_list(xe_lp_lists, owner, type, engine_classid);
> +}
> +
>  static struct __guc_mmio_reg_descr_group *
>  guc_capture_get_one_ext_list(struct __guc_mmio_reg_descr_group *reglists,
>  			     u32 owner, u32 type, u32 id)
> @@ -430,6 +504,12 @@ xe_guc_capture_getlist(struct xe_guc *guc, u32 owner, u32 type, u32 classid, voi
>  		return cache->status;
>  	}
>  
> +	/*
> +	 * ADS population of input registers is a good
> +	 * time to pre-allocate cachelist output nodes
> +	 */
> +	guc_capture_create_prealloc_nodes(guc);
> +
>  	ret = xe_guc_capture_getlistsize(guc, owner, type, classid, &size);
>  	if (ret) {
>  		cache->is_valid = true;
> @@ -567,6 +647,756 @@ static void check_guc_capture_size(struct xe_guc *guc)
>  			  buffer_size, spare_size, capture_size);
>  }
>  
> +static void
> +guc_capture_add_node_to_list(struct __guc_capture_parsed_output *node,
> +			     struct list_head *list)
> +{
> +	list_add_tail(&node->link, list);
> +}
> +
> +static void
> +guc_capture_add_node_to_outlist(struct xe_guc_state_capture *guc,
> +				struct __guc_capture_parsed_output *node)
> +{
> +	guc_capture_add_node_to_list(node, &guc->outlist);
> +}
> +
> +static void
> +guc_capture_add_node_to_cachelist(struct xe_guc_state_capture *guc,
> +				  struct __guc_capture_parsed_output *node)
> +{
> +	guc_capture_add_node_to_list(node, &guc->cachelist);
> +}
> +
> +static void
> +guc_capture_init_node(struct xe_guc *guc, struct __guc_capture_parsed_output *node)
> +{
> +	struct guc_mmio_reg *tmp[GUC_CAPTURE_LIST_TYPE_MAX];
> +	int i;
> +
> +	for (i = 0; i < GUC_CAPTURE_LIST_TYPE_MAX; ++i) {
> +		tmp[i] = node->reginfo[i].regs;
> +		memset(tmp[i], 0, sizeof(struct guc_mmio_reg) *
> +		       guc->capture->max_mmio_per_node);
> +	}
> +	memset(node, 0, sizeof(*node));
> +	for (i = 0; i < GUC_CAPTURE_LIST_TYPE_MAX; ++i)
> +		node->reginfo[i].regs = tmp[i];
> +
> +	INIT_LIST_HEAD(&node->link);
> +}
> +
> +/*
> + * KMD Init time flows:
> + * --------------------
> + *     --> alloc A: GuC input capture regs lists (registered to GuC via ADS).
> + *                  xe_guc_ads acquires the register lists by calling
> + *                  xe_guc_capture_list_size and xe_guc_capture_list_get 'n' times,
> + *                  where n = 1 for global-reg-list +
> + *                            num_engine_classes for class-reg-list +
> + *                            num_engine_classes for instance-reg-list
> + *                               (since all instances of the same engine-class type
> + *                                have an identical engine-instance register-list).
> + *                  ADS module also calls separately for PF vs VF.
> + *
> + *     --> alloc B: GuC output capture buf (registered via guc_init_params(log_param))
> + *                  Size = #define CAPTURE_BUFFER_SIZE (warns if on too-small)
> + *                  Note2: 'x 3' to hold multiple capture groups
> + *
> + * GUC Runtime notify capture:
> + * --------------------------
> + *     --> G2H STATE_CAPTURE_NOTIFICATION
> + *                   L--> xe_guc_capture_process
> + *                           L--> Loop through B (head..tail) and for each engine instance's
> + *                                err-state-captured register-list we find, we alloc 'C':
> + *      --> alloc C: A capture-output-node structure that includes misc capture info along
> + *                   with 3 register list dumps (global, engine-class and engine-instance)
> + *                   This node is created from a pre-allocated list of blank nodes in
> + *                   guc->capture->cachelist and populated with the error-capture
> + *                   data from GuC and then it's added into guc->capture->outlist linked
> + *                   list. This list is used for matchup and printout by xe_devcoredump_read
> + *                   and xe_hw_engine_snapshot_print, (when user invokes the devcoredump sysfs).
> + *
> + * GUC --> notify context reset:
> + * -----------------------------
> + *     --> guc_exec_queue_timedout_job
> + *                   L--> xe_devcoredump
> + *                          L--> devcoredump_snapshot(..IS_GUC_CAPTURE)
> + *                               --> xe_hw_engine_snapshot_capture(..IS_GUC_CAPTURE)
> + *                               L--> xe_hw_engine_find_and_copy_guc_capture_snapshot is where
> + *                                    detach C from internal linked list and add it into
> + *                                    xe_hw_engine_snapshot struct (if the context and
> + *                                    engine of the event notification matches a node
> + *                                    in the link list).
> + *
> + * User Sysfs / Debugfs
> + * --------------------
> + *      --> xe_devcoredump_read->
> + *             L--> xxx_snapshot_print
> + *                    L--> xe_hw_engine_snapshot_print
> + *                         register lists values of the xe_hw_engine_snapshot
> + *                         saved from the error-engine-dump.
> + *
> + */
> +
> +static int guc_capture_buf_cnt(struct __guc_capture_bufstate *buf)
> +{
> +	if (buf->wr >= buf->rd)
> +		return (buf->wr - buf->rd);
> +	return (buf->size - buf->rd) + buf->wr;
> +}
> +
> +static int guc_capture_buf_cnt_to_end(struct __guc_capture_bufstate *buf)
> +{
> +	if (buf->rd > buf->wr)
> +		return (buf->size - buf->rd);
> +	return (buf->wr - buf->rd);
> +}
> +
> +/*
> + * GuC's error-capture output is a ring buffer populated in a byte-stream fashion:
> + *
> + * The GuC Log buffer region for error-capture is managed like a ring buffer.
> + * The GuC firmware dumps error capture logs into this ring in a byte-stream flow.
> + * Additionally, as per the current and foreseeable future, all packed error-
> + * capture output structures are dword aligned.
> + *
> + * That said, if the GuC firmware is in the midst of writing a structure that is larger
> + * than one dword but the tail end of the err-capture buffer-region has lesser space left,
> + * we would need to extract that structure one dword at a time straddled across the end,
> + * onto the start of the ring.
> + *
> + * Below function, guc_capture_log_remove_dw is a helper for that. All callers of this
> + * function would typically do a straight-up memcpy from the ring contents and will only
> + * call this helper if their structure-extraction is straddling across the end of the
> + * ring. GuC firmware does not add any padding. The reason for the no-padding is to ease
> + * scalability for future expansion of output data types without requiring a redesign
> + * of the flow controls.
> + */
> +static int
> +guc_capture_log_remove_dw(struct xe_guc *guc, struct __guc_capture_bufstate *buf,
> +			  u32 *dw)
> +{
> +	int tries = 2;
> +	int avail = 0;
> +
> +	if (!guc_capture_buf_cnt(buf))
> +		return 0;
> +
> +	while (tries--) {
> +		avail = guc_capture_buf_cnt_to_end(buf);
> +		if (avail >= sizeof(u32)) {
> +			*dw = xe_map_rd(guc_to_xe(guc), &guc->log.bo->vmap,
> +					buf->data_offset + buf->rd, u32);
> +			buf->rd += 4;
> +			return 4;
> +		}
> +		if (avail)
> +			xe_gt_dbg(guc_to_gt(guc), "Register capture log not dword aligned, skipping.\n");
> +		buf->rd = 0;
> +	}
> +
> +	return 0;
> +}
> +
> +static bool
> +guc_capture_data_extracted(struct xe_guc *guc, struct __guc_capture_bufstate *b,
> +			   int size, void *dest)
> +{
> +	if (guc_capture_buf_cnt_to_end(b) >= size) {
> +		xe_map_memcpy_from(guc_to_xe(guc), dest, &guc->log.bo->vmap,
> +				   b->data_offset + b->rd, size);
> +		b->rd += size;
> +		return true;
> +	}
> +	return false;
> +}
> +
> +static int
> +guc_capture_log_get_group_hdr(struct xe_guc *guc, struct __guc_capture_bufstate *buf,
> +			      struct guc_state_capture_group_header_t *ghdr)
> +{
> +	int read = 0;
> +	int fullsize = sizeof(struct guc_state_capture_group_header_t);
> +
> +	if (fullsize > guc_capture_buf_cnt(buf))
> +		return -1;
> +
> +	if (guc_capture_data_extracted(guc, buf, fullsize, (void *)ghdr))
> +		return 0;
> +
> +	read += guc_capture_log_remove_dw(guc, buf, &ghdr->owner);
> +	read += guc_capture_log_remove_dw(guc, buf, &ghdr->info);
> +	if (read != fullsize)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static int
> +guc_capture_log_get_data_hdr(struct xe_guc *guc, struct __guc_capture_bufstate *buf,
> +			     struct guc_state_capture_header_t *hdr)
> +{
> +	int read = 0;
> +	int fullsize = sizeof(struct guc_state_capture_header_t);
> +
> +	if (fullsize > guc_capture_buf_cnt(buf))
> +		return -1;
> +
> +	if (guc_capture_data_extracted(guc, buf, fullsize, (void *)hdr))
> +		return 0;
> +
> +	read += guc_capture_log_remove_dw(guc, buf, &hdr->owner);
> +	read += guc_capture_log_remove_dw(guc, buf, &hdr->info);
> +	read += guc_capture_log_remove_dw(guc, buf, &hdr->lrca);
> +	read += guc_capture_log_remove_dw(guc, buf, &hdr->guc_id);
> +	read += guc_capture_log_remove_dw(guc, buf, &hdr->num_mmios);
> +	if (read != fullsize)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static int
> +guc_capture_log_get_register(struct xe_guc *guc, struct __guc_capture_bufstate *buf,
> +			     struct guc_mmio_reg *reg)
> +{
> +	int read = 0;
> +	int fullsize = sizeof(struct guc_mmio_reg);
> +
> +	if (fullsize > guc_capture_buf_cnt(buf))
> +		return -1;
> +
> +	if (guc_capture_data_extracted(guc, buf, fullsize, (void *)reg))
> +		return 0;
> +
> +	read += guc_capture_log_remove_dw(guc, buf, &reg->offset);
> +	read += guc_capture_log_remove_dw(guc, buf, &reg->value);
> +	read += guc_capture_log_remove_dw(guc, buf, &reg->flags);
> +	read += guc_capture_log_remove_dw(guc, buf, &reg->mask);
> +	if (read != fullsize)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static struct __guc_capture_parsed_output *
> +guc_capture_get_prealloc_node(struct xe_guc *guc)
> +{
> +	struct __guc_capture_parsed_output *found = NULL;
> +
> +	if (!list_empty(&guc->capture->cachelist)) {
> +		struct __guc_capture_parsed_output *n, *ntmp;
> +
> +		/* get first avail node from the cache list */
> +		list_for_each_entry_safe(n, ntmp, &guc->capture->cachelist, link) {
> +			found = n;
> +			break;
> +		}
> +	} else {
> +		struct __guc_capture_parsed_output *n, *ntmp;
> +
> +		/* traverse down and steal back the oldest node already allocated */
> +		list_for_each_entry_safe(n, ntmp, &guc->capture->outlist, link) {
> +			found = n;
> +		}
> +	}
> +	if (found) {
> +		list_del(&found->link);
> +		guc_capture_init_node(guc, found);
> +	}
> +
> +	return found;
> +}
> +
> +static struct __guc_capture_parsed_output *
> +guc_capture_clone_node(struct xe_guc *guc, struct __guc_capture_parsed_output *original,
> +		       u32 keep_reglist_mask)
> +{
> +	struct __guc_capture_parsed_output *new;
> +	int i;
> +
> +	new = guc_capture_get_prealloc_node(guc);
> +	if (!new)
> +		return NULL;
> +	if (!original)
> +		return new;
> +
> +	new->is_partial = original->is_partial;
> +
> +	/* copy reg-lists that we want to clone */
> +	for (i = 0; i < GUC_CAPTURE_LIST_TYPE_MAX; ++i) {
> +		if (keep_reglist_mask & BIT(i)) {
> +			XE_WARN_ON(original->reginfo[i].num_regs  >
> +				   guc->capture->max_mmio_per_node);
> +
> +			memcpy(new->reginfo[i].regs, original->reginfo[i].regs,
> +			       original->reginfo[i].num_regs * sizeof(struct guc_mmio_reg));
> +
> +			new->reginfo[i].num_regs = original->reginfo[i].num_regs;
> +			new->reginfo[i].vfid  = original->reginfo[i].vfid;
> +
> +			if (i == GUC_CAPTURE_LIST_TYPE_ENGINE_CLASS) {
> +				new->eng_class = original->eng_class;
> +			} else if (i == GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE) {
> +				new->eng_inst = original->eng_inst;
> +				new->guc_id = original->guc_id;
> +				new->lrca = original->lrca;
> +			}
> +		}
> +	}
> +
> +	return new;
> +}
> +
> +static int
> +guc_capture_extract_reglists(struct xe_guc *guc, struct __guc_capture_bufstate *buf)
> +{
> +	struct xe_gt *gt = guc_to_gt(guc);
> +	struct guc_state_capture_group_header_t ghdr = {0};
> +	struct guc_state_capture_header_t hdr = {0};
> +	struct __guc_capture_parsed_output *node = NULL;
> +	struct guc_mmio_reg *regs = NULL;
> +	int i, numlists, numregs, ret = 0;
> +	enum guc_capture_type datatype;
> +	struct guc_mmio_reg tmp;
> +	bool is_partial = false;
> +
> +	i = guc_capture_buf_cnt(buf);
> +	if (!i)
> +		return -ENODATA;
> +
> +	if (i % sizeof(u32)) {
> +		xe_gt_warn(gt, "Got mis-aligned register capture entries\n");
> +		ret = -EIO;
> +		goto bailout;
> +	}
> +
> +	/* first get the capture group header */
> +	if (guc_capture_log_get_group_hdr(guc, buf, &ghdr)) {
> +		ret = -EIO;
> +		goto bailout;
> +	}
> +	/*
> +	 * we would typically expect a layout as below where n would be expected to be
> +	 * anywhere between 3 to n where n > 3 if we are seeing multiple dependent engine
> +	 * instances being reset together.
> +	 * ____________________________________________
> +	 * | Capture Group                            |
> +	 * | ________________________________________ |
> +	 * | | Capture Group Header:                | |
> +	 * | |  - num_captures = 5                  | |
> +	 * | |______________________________________| |
> +	 * | ________________________________________ |
> +	 * | | Capture1:                            | |
> +	 * | |  Hdr: GLOBAL, numregs=a              | |
> +	 * | | ____________________________________ | |
> +	 * | | | Reglist                          | | |
> +	 * | | | - reg1, reg2, ... rega           | | |
> +	 * | | |__________________________________| | |
> +	 * | |______________________________________| |
> +	 * | ________________________________________ |
> +	 * | | Capture2:                            | |
> +	 * | |  Hdr: CLASS=RENDER/COMPUTE, numregs=b| |
> +	 * | | ____________________________________ | |
> +	 * | | | Reglist                          | | |
> +	 * | | | - reg1, reg2, ... regb           | | |
> +	 * | | |__________________________________| | |
> +	 * | |______________________________________| |
> +	 * | ________________________________________ |
> +	 * | | Capture3:                            | |
> +	 * | |  Hdr: INSTANCE=RCS, numregs=c        | |
> +	 * | | ____________________________________ | |
> +	 * | | | Reglist                          | | |
> +	 * | | | - reg1, reg2, ... regc           | | |
> +	 * | | |__________________________________| | |
> +	 * | |______________________________________| |
> +	 * | ________________________________________ |
> +	 * | | Capture4:                            | |
> +	 * | |  Hdr: CLASS=RENDER/COMPUTE, numregs=d| |
> +	 * | | ____________________________________ | |
> +	 * | | | Reglist                          | | |
> +	 * | | | - reg1, reg2, ... regd           | | |
> +	 * | | |__________________________________| | |
> +	 * | |______________________________________| |
> +	 * | ________________________________________ |
> +	 * | | Capture5:                            | |
> +	 * | |  Hdr: INSTANCE=CCS0, numregs=e       | |
> +	 * | | ____________________________________ | |
> +	 * | | | Reglist                          | | |
> +	 * | | | - reg1, reg2, ... rege           | | |
> +	 * | | |__________________________________| | |
> +	 * | |______________________________________| |
> +	 * |__________________________________________|
> +	 */
> +	is_partial = FIELD_GET(CAP_GRP_HDR_CAPTURE_TYPE, ghdr.info);
> +	numlists = FIELD_GET(CAP_GRP_HDR_NUM_CAPTURES, ghdr.info);
> +
> +	while (numlists--) {
> +		if (guc_capture_log_get_data_hdr(guc, buf, &hdr)) {
> +			ret = -EIO;
> +			break;
> +		}
> +
> +		datatype = FIELD_GET(CAP_HDR_CAPTURE_TYPE, hdr.info);
> +		if (datatype > GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE) {
> +			/* unknown capture type - skip over to next capture set */
> +			numregs = FIELD_GET(CAP_HDR_NUM_MMIOS, hdr.num_mmios);
> +			while (numregs--) {
> +				if (guc_capture_log_get_register(guc, buf, &tmp)) {
> +					ret = -EIO;
> +					break;
> +				}
> +			}
> +			continue;
> +		} else if (node) {
> +			/*
> +			 * Based on the current capture type and what we have so far,
> +			 * decide if we should add the current node into the internal
> +			 * linked list for match-up when xe_devcoredump calls later
> +			 * (and alloc a blank node for the next set of reglists)
> +			 * or continue with the same node or clone the current node
> +			 * but only retain the global or class registers (such as the
> +			 * case of dependent engine resets).
> +			 */
> +			if (datatype == GUC_CAPTURE_LIST_TYPE_GLOBAL) {
> +				guc_capture_add_node_to_outlist(guc->capture, node);
> +				node = NULL;
> +			} else if (datatype == GUC_CAPTURE_LIST_TYPE_ENGINE_CLASS &&
> +				   node->reginfo[GUC_CAPTURE_LIST_TYPE_ENGINE_CLASS].num_regs) {
> +				/* Add to list, clone node and duplicate global list */
> +				guc_capture_add_node_to_outlist(guc->capture, node);
> +				node = guc_capture_clone_node(guc, node,
> +							      GCAP_PARSED_REGLIST_INDEX_GLOBAL);
> +			} else if (datatype == GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE &&
> +				   node->reginfo[GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE].num_regs) {
> +				/* Add to list, clone node and duplicate global + class lists */
> +				guc_capture_add_node_to_outlist(guc->capture, node);
> +				node = guc_capture_clone_node(guc, node,
> +							      (GCAP_PARSED_REGLIST_INDEX_GLOBAL |
> +							      GCAP_PARSED_REGLIST_INDEX_ENGCLASS));
> +			}
> +		}
> +
> +		if (!node) {
> +			node = guc_capture_get_prealloc_node(guc);
> +			if (!node) {
> +				ret = -ENOMEM;
> +				break;
> +			}
> +			if (datatype != GUC_CAPTURE_LIST_TYPE_GLOBAL)
> +				xe_gt_dbg(gt, "Register capture missing global dump: %08x!\n",
> +					  datatype);
> +		}
> +		node->is_partial = is_partial;
> +		node->reginfo[datatype].vfid = FIELD_GET(CAP_HDR_CAPTURE_VFID, hdr.owner);
> +
> +		switch (datatype) {
> +		case GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE:
> +			node->eng_class = FIELD_GET(CAP_HDR_ENGINE_CLASS, hdr.info);
> +			node->eng_inst = FIELD_GET(CAP_HDR_ENGINE_INSTANCE, hdr.info);
> +			node->lrca = hdr.lrca;
> +			node->guc_id = hdr.guc_id;
> +			break;
> +		case GUC_CAPTURE_LIST_TYPE_ENGINE_CLASS:
> +			node->eng_class = FIELD_GET(CAP_HDR_ENGINE_CLASS, hdr.info);
> +			break;
> +		default:
> +			break;
> +		}
> +
> +		numregs = FIELD_GET(CAP_HDR_NUM_MMIOS, hdr.num_mmios);
> +		if (numregs > guc->capture->max_mmio_per_node) {
> +			xe_gt_dbg(gt, "Register capture list extraction clipped by prealloc!\n");
> +			numregs = guc->capture->max_mmio_per_node;
> +		}
> +		node->reginfo[datatype].num_regs = numregs;
> +		regs = node->reginfo[datatype].regs;
> +		i = 0;
> +		while (numregs--) {
> +			if (guc_capture_log_get_register(guc, buf, &regs[i++])) {
> +				ret = -EIO;
> +				break;
> +			}
> +		}
> +	}
> +
> +bailout:
> +	if (node) {
> +		/* If we have data, add to linked list for match-up when xe_devcoredump calls */
> +		for (i = GUC_CAPTURE_LIST_TYPE_GLOBAL; i < GUC_CAPTURE_LIST_TYPE_MAX; ++i) {
> +			if (node->reginfo[i].regs) {
> +				guc_capture_add_node_to_outlist(guc->capture, node);
> +				node = NULL;
> +				break;
> +			}
> +		}
> +		if (node) /* else return it back to cache list */
> +			guc_capture_add_node_to_cachelist(guc->capture, node);
> +	}
> +	return ret;
> +}
> +
> +static int __guc_capture_flushlog_complete(struct xe_guc *guc)
> +{
> +	u32 action[] = {
> +		XE_GUC_ACTION_LOG_BUFFER_FILE_FLUSH_COMPLETE,
> +		GUC_CAPTURE_LOG_BUFFER
> +	};
> +
> +	return xe_guc_ct_send_g2h_handler(&guc->ct, action, ARRAY_SIZE(action));
> +}
> +
> +static void __guc_capture_process_output(struct xe_guc *guc)
> +{
> +	unsigned int buffer_size, read_offset, write_offset, full_count;
> +	struct xe_uc *uc = container_of(guc, typeof(*uc), guc);
> +	struct guc_log_buffer_state log_buf_state_local;
> +	struct __guc_capture_bufstate buf;
> +	bool new_overflow;
> +	int ret;
> +	u32 log_buf_state_offset;
> +	u32 src_data_offset;
> +
> +	log_buf_state_offset = sizeof(struct guc_log_buffer_state) * GUC_CAPTURE_LOG_BUFFER;
> +	src_data_offset = xe_guc_get_log_buffer_offset(&guc->log, GUC_CAPTURE_LOG_BUFFER);
> +
> +	/*
> +	 * Make a copy of the state structure, inside GuC log buffer
> +	 * (which is uncached mapped), on the stack to avoid reading
> +	 * from it multiple times.
> +	 */
> +	xe_map_memcpy_from(guc_to_xe(guc), &log_buf_state_local, &guc->log.bo->vmap,
> +			   log_buf_state_offset, sizeof(struct guc_log_buffer_state));
> +
> +	buffer_size = xe_guc_get_log_buffer_size(&guc->log, GUC_CAPTURE_LOG_BUFFER);
> +	read_offset = log_buf_state_local.read_ptr;
> +	write_offset = log_buf_state_local.sampled_write_ptr;
> +	full_count = log_buf_state_local.buffer_full_cnt;
> +
> +	/* Bookkeeping stuff */
> +	guc->log.stats[GUC_CAPTURE_LOG_BUFFER].flush += log_buf_state_local.flush_to_file;
> +	new_overflow = xe_guc_check_log_buf_overflow(&guc->log, GUC_CAPTURE_LOG_BUFFER,
> +						     full_count);
> +
> +	/* Now copy the actual logs. */
> +	if (unlikely(new_overflow)) {
> +		/* copy the whole buffer in case of overflow */
> +		read_offset = 0;
> +		write_offset = buffer_size;
> +	} else if (unlikely((read_offset > buffer_size) ||
> +			(write_offset > buffer_size))) {
> +		xe_gt_err(guc_to_gt(guc),
> +			  "Register capture buffer in invalid state: read = 0x%X, size = 0x%X!\n",
> +			  read_offset, buffer_size);
> +		/* copy whole buffer as offsets are unreliable */
> +		read_offset = 0;
> +		write_offset = buffer_size;
> +	}
> +
> +	buf.size = buffer_size;
> +	buf.rd = read_offset;
> +	buf.wr = write_offset;
> +	buf.data_offset = src_data_offset;
> +
> +	if (!xe_guc_read_stopped(guc)) {
> +		do {
> +			ret = guc_capture_extract_reglists(guc, &buf);
> +		} while (ret >= 0);
> +	}
> +
> +	/* Update the state of log buffer err-cap state */
> +	xe_map_wr(guc_to_xe(guc), &guc->log.bo->vmap,
> +		  log_buf_state_offset + offsetof(struct guc_log_buffer_state, read_ptr), u32,
> +		  write_offset);
> +	/* Clear the flush_to_file from local first, the local was loaded by above
> +	 * xe_map_memcpy_from.
> +	 */
> +	log_buf_state_local.flush_to_file = 0;
> +	/* Then write out the "updated local" through xe_map_wr() */
> +	xe_map_wr(guc_to_xe(guc), &guc->log.bo->vmap,
> +		  log_buf_state_offset + offsetof(struct guc_log_buffer_state, flags), u32,
> +		  log_buf_state_local.flags);
> +	__guc_capture_flushlog_complete(guc);
> +}
> +

public functions require kernel-doc comments

> +void xe_guc_capture_process(struct xe_guc *guc)
> +{
> +	if (guc->capture)
> +		__guc_capture_process_output(guc);
> +}
> +
> +static struct __guc_capture_parsed_output *
> +guc_capture_alloc_one_node(struct xe_guc *guc)
> +{
> +	struct drm_device *drm = guc_to_drm(guc);
> +	struct __guc_capture_parsed_output *new;
> +	int i;
> +
> +	new = drmm_kzalloc(drm, sizeof(*new), GFP_KERNEL);
> +	if (!new)
> +		return NULL;
> +
> +	for (i = 0; i < GUC_CAPTURE_LIST_TYPE_MAX; ++i) {
> +		new->reginfo[i].regs = drmm_kzalloc(drm, guc->capture->max_mmio_per_node *
> +					       sizeof(struct guc_mmio_reg), GFP_KERNEL);
> +		if (!new->reginfo[i].regs) {
> +			while (i)
> +				drmm_kfree(drm, new->reginfo[--i].regs);
> +			drmm_kfree(drm, new);
> +			return NULL;
> +		}
> +	}
> +	guc_capture_init_node(guc, new);
> +
> +	return new;
> +}
> +
> +static void
> +__guc_capture_create_prealloc_nodes(struct xe_guc *guc)
> +{
> +	struct __guc_capture_parsed_output *node = NULL;
> +	int i;
> +
> +	for (i = 0; i < PREALLOC_NODES_MAX_COUNT; ++i) {
> +		node = guc_capture_alloc_one_node(guc);
> +		if (!node) {
> +			xe_gt_warn(guc_to_gt(guc), "Register capture pre-alloc-cache failure\n");
> +			/* dont free the priors, use what we got and cleanup at shutdown */
> +			return;
> +		}
> +		guc_capture_add_node_to_cachelist(guc->capture, node);
> +	}
> +}
> +
> +static int
> +guc_get_max_reglist_count(struct xe_guc *guc)
> +{
> +	int i, j, k, tmp, maxregcount = 0;
> +
> +	for (i = 0; i < GUC_CAPTURE_LIST_INDEX_MAX; ++i) {
> +		for (j = 0; j < GUC_CAPTURE_LIST_TYPE_MAX; ++j) {
> +			for (k = 0; k < GUC_MAX_ENGINE_CLASSES; ++k) {
> +				if (j == GUC_CAPTURE_LIST_TYPE_GLOBAL && k > 0)
> +					continue;
> +
> +				tmp = guc_cap_list_num_regs(guc->capture, i, j, k);
> +				if (tmp > maxregcount)
> +					maxregcount = tmp;
> +			}
> +		}
> +	}
> +	if (!maxregcount)
> +		maxregcount = PREALLOC_NODES_DEFAULT_NUMREGS;
> +
> +	return maxregcount;
> +}
> +
> +static void
> +guc_capture_create_prealloc_nodes(struct xe_guc *guc)
> +{
> +	/* skip if we've already done the pre-alloc */
> +	if (guc->capture->max_mmio_per_node)
> +		return;
> +
> +	guc->capture->max_mmio_per_node = guc_get_max_reglist_count(guc);
> +	__guc_capture_create_prealloc_nodes(guc);
> +}
> +
> +static void cp_reg_to_snapshot(int type, u16 hwe_guc_class, u32 offset, u32 value,
> +			       struct snapshot_regs *regs)
> +{
> +	int i;
> +	const struct __guc_mmio_reg_descr_group *list;
> +
> +	/* Get register list for the type/class */
> +	list = xe_guc_capture_get_reg_desc_list(GUC_CAPTURE_LIST_INDEX_PF, type,
> +						xe_guc_class_to_capture_class(hwe_guc_class));
> +	if (!list)
> +		return;
> +
> +	for (i = 0; i < list->num_regs; i++)
> +		if (offset == list->list[i].reg.addr) {
> +			u32 *field = (u32 *)((uintptr_t)regs + list->list[i].position_in_snapshot);
> +			*field = value;
> +			return;
> +		}
> +}
> +
> +static void guc_capture_parse_reglist(struct __guc_capture_parsed_output *node,
> +				      struct xe_hw_engine_snapshot *snapshot, u16 hwe_guc_class)
> +{
> +	int i, type;
> +
> +	if (!node)
> +		return;
> +
> +	for (type = GUC_CAPTURE_LIST_TYPE_GLOBAL; type < GUC_CAPTURE_LIST_TYPE_MAX; type++) {
> +		struct gcap_reg_list_info *reginfo = &node->reginfo[type];
> +		struct guc_mmio_reg *regs = reginfo->regs;
> +
> +		for (i = 0; i < reginfo->num_regs; i++)
> +			cp_reg_to_snapshot(type, hwe_guc_class, regs[i].offset, regs[i].value,
> +					   &snapshot->reg);
> +	}
> +}
> +
> +/**
> + * xe_hw_engine_find_and_copy_guc_capture_snapshot - Take a engine snapshot from GuC capture.
> + * @hwe: Xe HW Engine.
> + * @snapshot: Xe HW Engine snapshot object to save data, copied from error capture
> + *
> + * This can be printed out in a later stage like during dev_coredump
> + * analysis.
> + *
> + * Returns: None
> + */
> +void
> +xe_hw_engine_find_and_copy_guc_capture_snapshot(struct xe_hw_engine *hwe,
> +						struct xe_hw_engine_snapshot *snapshot)
> +{
> +	struct xe_gt *gt = hwe->gt;
> +	struct xe_device *xe = gt_to_xe(gt);
> +	struct xe_guc *guc = &gt->uc.guc;
> +	struct __guc_capture_parsed_output *n, *ntmp;
> +	struct xe_devcoredump *devcoredump = &xe->devcoredump;
> +	struct list_head *list = &guc->capture->outlist;
> +	struct xe_sched_job *job = devcoredump->job;
> +	struct xe_guc_submit_exec_queue_snapshot *ge = devcoredump->snapshot.ge;
> +	u16 guc_id = ge->guc.id;
> +	u32 lrca;
> +	u16 hwe_guc_class = xe_engine_class_to_guc_class(hwe->class);
> +
> +	lrca = xe_lrc_ggtt_addr(job->q->lrc[0]) & LRC_GTT_ADDRESS_MASK;
> +
> +	/*
> +	 * Look for a matching GuC reported error capture node from
> +	 * the internal output link-list based on engine class and instance.
> +	 */
> +	list_for_each_entry_safe(n, ntmp, list, link) {
> +		if (n->eng_class == hwe_guc_class && n->eng_inst == hwe->instance &&
> +		    n->guc_id == guc_id && (n->lrca & LRC_GTT_ADDRESS_MASK) == lrca) {
> +			guc_capture_parse_reglist(n, snapshot, hwe_guc_class);
> +			list_del(&n->link);
> +			return;
> +		}
> +	}
> +}
> +
> +void xe_guc_capture_free(struct xe_guc *guc)
> +{
> +	if (guc->capture && !list_empty(&guc->capture->outlist)) {
> +		struct __guc_capture_parsed_output *n, *ntmp;
> +
> +		list_for_each_entry_safe(n, ntmp, &guc->capture->outlist, link) {
> +			list_del(&n->link);
> +			/* put node back to cache list */
> +			/* No need to init here, guc_capture_get_prealloc_node init it later */
> +			guc_capture_add_node_to_cachelist(guc->capture, n);
> +		}
> +	}
> +}
> +
>  int xe_guc_capture_init(struct xe_guc *guc)
>  {
>  	guc->capture = drmm_kzalloc(guc_to_drm(guc), sizeof(*guc->capture), GFP_KERNEL);
> @@ -574,7 +1404,9 @@ int xe_guc_capture_init(struct xe_guc *guc)
>  		return -ENOMEM;
>  
>  	guc->capture->reglists = guc_capture_get_device_reglist(guc);
> -
>  	check_guc_capture_size(guc);
> +	INIT_LIST_HEAD(&guc->capture->outlist);
> +	INIT_LIST_HEAD(&guc->capture->cachelist);
> +
>  	return 0;
>  }
> diff --git a/drivers/gpu/drm/xe/xe_guc_capture.h b/drivers/gpu/drm/xe/xe_guc_capture.h
> index a62b1dbd47a6..c0bada99c9ec 100644
> --- a/drivers/gpu/drm/xe/xe_guc_capture.h
> +++ b/drivers/gpu/drm/xe/xe_guc_capture.h
> @@ -10,6 +10,8 @@
>  #include "regs/xe_reg_defs.h"
>  
>  struct xe_guc;
> +struct xe_hw_engine;
> +struct xe_hw_engine_snapshot;
>  
>  /*
>   * struct __guc_mmio_reg_descr / struct __guc_mmio_reg_descr_group
> @@ -25,6 +27,7 @@ struct __guc_mmio_reg_descr {
>  	u32 flags;
>  	u32 mask;
>  	const char *regname;
> +	u32 position_in_snapshot;
>  };
>  
>  struct __guc_mmio_reg_descr_group {
> @@ -36,9 +39,15 @@ struct __guc_mmio_reg_descr_group {
>  	struct __guc_mmio_reg_descr *extlist; /* only used for steered registers */
>  };
>  
> +void xe_guc_capture_process(struct xe_guc *guc);
>  int xe_guc_capture_getlist(struct xe_guc *guc, u32 owner, u32 type, u32 classid, void **outptr);
>  int xe_guc_capture_getlistsize(struct xe_guc *guc, u32 owner, u32 type, u32 classid, size_t *size);
>  int xe_guc_capture_getnullheader(struct xe_guc *guc, void **outptr, size_t *size);
> +const struct __guc_mmio_reg_descr_group *
> +xe_guc_capture_get_reg_desc_list(u32 owner, u32 type, u32 engine_classid);
> +void xe_hw_engine_find_and_copy_guc_capture_snapshot(struct xe_hw_engine *hwe,
> +						     struct xe_hw_engine_snapshot *snapshot);
> +void xe_guc_capture_free(struct xe_guc *guc);
>  int xe_guc_capture_init(struct xe_guc *guc);
>  
>  #endif /* _XE_GUC_CAPTURE_H */
> diff --git a/drivers/gpu/drm/xe/xe_guc_capture_fwif.h b/drivers/gpu/drm/xe/xe_guc_capture_fwif.h
> index 199e3c0108a4..5ef8c20fe9bc 100644
> --- a/drivers/gpu/drm/xe/xe_guc_capture_fwif.h
> +++ b/drivers/gpu/drm/xe/xe_guc_capture_fwif.h
> @@ -10,6 +10,51 @@
>  
>  #include "xe_guc_fwif.h"
>  
> +/*
> + * struct __guc_capture_bufstate
> + *
> + * Book-keeping structure used to track read and write pointers
> + * as we extract error capture data from the GuC-log-buffer's
> + * error-capture region as a stream of dwords.
> + */
> +struct __guc_capture_bufstate {
> +	u32 size;
> +	u32 data_offset;
> +	u32 rd;
> +	u32 wr;
> +};
> +
> +/*
> + * struct __guc_capture_parsed_output - extracted error capture node
> + *
> + * A single unit of extracted error-capture output data grouped together
> + * at an engine-instance level. We keep these nodes in a linked list.
> + * See cachelist and outlist below.
> + */
> +struct __guc_capture_parsed_output {
> +	/*
> +	 * A single set of 3 capture lists: a global-list
> +	 * an engine-class-list and an engine-instance list.
> +	 * outlist in __guc_capture_parsed_output will keep
> +	 * a linked list of these nodes that will eventually
> +	 * be detached from outlist and attached into to
> +	 * xe_codedump in response to a context reset
> +	 */
> +	struct list_head link;
> +	bool is_partial;
> +	u32 eng_class;
> +	u32 eng_inst;
> +	u32 guc_id;
> +	u32 lrca;
> +	struct gcap_reg_list_info {
> +		u32 vfid;
> +		u32 num_regs;
> +		struct guc_mmio_reg *regs;
> +	} reginfo[GUC_CAPTURE_LIST_TYPE_MAX];
> +#define GCAP_PARSED_REGLIST_INDEX_GLOBAL   BIT(GUC_CAPTURE_LIST_TYPE_GLOBAL)
> +#define GCAP_PARSED_REGLIST_INDEX_ENGCLASS BIT(GUC_CAPTURE_LIST_TYPE_ENGINE_CLASS)
> +};
> +
>  /*
>   * struct guc_debug_capture_list_header / struct guc_debug_capture_list
>   *
> diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
> index c1f258348f5c..865b58bb4fd9 100644
> --- a/drivers/gpu/drm/xe/xe_guc_ct.c
> +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
> @@ -1045,6 +1045,8 @@ static int process_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
>  		/* Selftest only at the moment */
>  		break;
>  	case XE_GUC_ACTION_STATE_CAPTURE_NOTIFICATION:
> +		ret = xe_guc_error_capture_handler(guc, payload, adj_len);
> +		break;
>  	case XE_GUC_ACTION_NOTIFY_FLUSH_LOG_BUFFER_TO_FILE:
>  		/* FIXME: Handle this */
>  		break;
> diff --git a/drivers/gpu/drm/xe/xe_guc_fwif.h b/drivers/gpu/drm/xe/xe_guc_fwif.h
> index 908298791c93..f8f9c76eb7ac 100644
> --- a/drivers/gpu/drm/xe/xe_guc_fwif.h
> +++ b/drivers/gpu/drm/xe/xe_guc_fwif.h
> @@ -206,6 +206,12 @@ enum guc_capture_type {
>  	GUC_CAPTURE_LIST_TYPE_MAX,
>  };
>  
> +/* GuC support limited registers range to be captured for debug purpose,
> + * for unsupported registers, direct read is the only way to save the data.
> + * GuC capture handling will ignore all lists with this type: GUC_CAPTURE_LIST_TYPE_DIRECT_READ
> + */
> +#define GUC_CAPTURE_LIST_TYPE_DIRECT_READ GUC_CAPTURE_LIST_TYPE_MAX
> +
>  /* Class indecies for capture_class and capture_instance arrays */
>  enum {
>  	GUC_CAPTURE_LIST_CLASS_RENDER_COMPUTE = 0,
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 47aab04cf34f..f02f4c0c9568 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -25,6 +25,7 @@
>  #include "xe_gt.h"
>  #include "xe_gt_printk.h"
>  #include "xe_guc.h"
> +#include "xe_guc_capture.h"
>  #include "xe_guc_ct.h"
>  #include "xe_guc_exec_queue_types.h"
>  #include "xe_guc_id_mgr.h"
> @@ -769,7 +770,7 @@ static void guc_exec_queue_free_job(struct drm_sched_job *drm_job)
>  	xe_sched_job_put(job);
>  }
>  
> -static int guc_read_stopped(struct xe_guc *guc)
> +int xe_guc_read_stopped(struct xe_guc *guc)
>  {
>  	return atomic_read(&guc->submission_state.stopped);
>  }
> @@ -791,7 +792,7 @@ static void disable_scheduling_deregister(struct xe_guc *guc,
>  	set_min_preemption_timeout(guc, q);
>  	smp_rmb();
>  	ret = wait_event_timeout(guc->ct.wq, !exec_queue_pending_enable(q) ||
> -				 guc_read_stopped(guc), HZ * 5);
> +				 xe_guc_read_stopped(guc), HZ * 5);
>  	if (!ret) {
>  		struct xe_gpu_scheduler *sched = &q->guc->sched;
>  
> @@ -906,7 +907,7 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
>  		 */
>  		ret = wait_event_timeout(guc->ct.wq,
>  					 !exec_queue_pending_disable(q) ||
> -					 guc_read_stopped(guc), HZ * 5);
> +					 xe_guc_read_stopped(guc), HZ * 5);
>  		if (!ret) {
>  			drm_warn(&xe->drm, "Schedule disable failed to respond");
>  			xe_sched_submission_start(sched);
> @@ -929,6 +930,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>  	int err = -ETIME;
>  	int i = 0;
>  	bool wedged;
> +	bool reset_status = exec_queue_reset(q);
> +	bool guc_en = xe_device_uc_enabled(xe);
>  
>  	/*
>  	 * TDR has fired before free job worker. Common if exec queue
> @@ -948,7 +951,15 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>  	xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
>  		   "VM job timed out on non-killed execqueue\n");
>  
> -	if (!exec_queue_killed(q))
> +	/* take devcoredump on:
> +	 * 1. GuC not enabled
> +	 * 2. GuC enabled with GuC reset status == 1
> +	 * When GuC enabled, register value is captured by GuC, GuC will notify host
> +	 * with capture notification message, which is right before reset.
> +	 * GuC reset status 1 also means capture ready.
> +	 * If not ready, will take snapshot after wait event within this function.
> +	 */
> +	if (!exec_queue_killed(q) && (!guc_en || (guc_en && reset_status)))
>  		xe_devcoredump(job);
>  
>  	trace_xe_sched_job_timedout(job);
> @@ -996,8 +1007,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>  		smp_rmb();
>  		ret = wait_event_timeout(guc->ct.wq,
>  					 !exec_queue_pending_disable(q) ||
> -					 guc_read_stopped(guc), HZ * 5);
> -		if (!ret || guc_read_stopped(guc)) {
> +					 xe_guc_read_stopped(guc), HZ * 5);
> +		if (!ret || xe_guc_read_stopped(guc)) {
>  			drm_warn(&xe->drm, "Schedule disable failed to respond");
>  			xe_sched_add_pending_job(sched, job);
>  			xe_sched_submission_start(sched);
> @@ -1007,6 +1018,10 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>  		}
>  	}
>  
> +	/* When entring this function, if capture/reset not ready, now is time to take snapshot */
> +	if (!exec_queue_killed(q) && guc_en && !reset_status)
> +		xe_devcoredump(job);
> +
>  	/* Stop fence signaling */
>  	xe_hw_fence_irq_stop(q->fence_irq);
>  
> @@ -1112,7 +1127,7 @@ static void suspend_fence_signal(struct xe_exec_queue *q)
>  	struct xe_device *xe = guc_to_xe(guc);
>  
>  	xe_assert(xe, exec_queue_suspended(q) || exec_queue_killed(q) ||
> -		  guc_read_stopped(guc));
> +		  xe_guc_read_stopped(guc));
>  	xe_assert(xe, q->guc->suspend_pending);
>  
>  	q->guc->suspend_pending = false;
> @@ -1128,9 +1143,9 @@ static void __guc_exec_queue_process_msg_suspend(struct xe_sched_msg *msg)
>  	if (guc_exec_queue_allowed_to_change_state(q) && !exec_queue_suspended(q) &&
>  	    exec_queue_enabled(q)) {
>  		wait_event(guc->ct.wq, q->guc->resume_time != RESUME_PENDING ||
> -			   guc_read_stopped(guc));
> +			   xe_guc_read_stopped(guc));
>  
> -		if (!guc_read_stopped(guc)) {
> +		if (!xe_guc_read_stopped(guc)) {
>  			MAKE_SCHED_CONTEXT_ACTION(q, DISABLE);
>  			s64 since_resume_ms =
>  				ktime_ms_delta(ktime_get(),
> @@ -1258,7 +1273,7 @@ static int guc_exec_queue_init(struct xe_exec_queue *q)
>  
>  	q->entity = &ge->entity;
>  
> -	if (guc_read_stopped(guc))
> +	if (xe_guc_read_stopped(guc))
>  		xe_sched_stop(sched);
>  
>  	mutex_unlock(&guc->submission_state.lock);
> @@ -1385,7 +1400,7 @@ static void guc_exec_queue_suspend_wait(struct xe_exec_queue *q)
>  	struct xe_guc *guc = exec_queue_to_guc(q);
>  
>  	wait_event(q->guc->suspend_wait, !q->guc->suspend_pending ||
> -		   guc_read_stopped(guc));
> +		   xe_guc_read_stopped(guc));
>  }
>  
>  static void guc_exec_queue_resume(struct xe_exec_queue *q)
> @@ -1495,7 +1510,7 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc)
>  
>  void xe_guc_submit_reset_wait(struct xe_guc *guc)
>  {
> -	wait_event(guc->ct.wq, !guc_read_stopped(guc));
> +	wait_event(guc->ct.wq, !xe_guc_read_stopped(guc));
>  }
>  
>  void xe_guc_submit_stop(struct xe_guc *guc)
> @@ -1504,7 +1519,7 @@ void xe_guc_submit_stop(struct xe_guc *guc)
>  	unsigned long index;
>  	struct xe_device *xe = guc_to_xe(guc);
>  
> -	xe_assert(xe, guc_read_stopped(guc) == 1);
> +	xe_assert(xe, xe_guc_read_stopped(guc) == 1);
>  
>  	mutex_lock(&guc->submission_state.lock);
>  
> @@ -1542,7 +1557,7 @@ int xe_guc_submit_start(struct xe_guc *guc)
>  	unsigned long index;
>  	struct xe_device *xe = guc_to_xe(guc);
>  
> -	xe_assert(xe, guc_read_stopped(guc) == 1);
> +	xe_assert(xe, xe_guc_read_stopped(guc) == 1);
>  
>  	mutex_lock(&guc->submission_state.lock);
>  	atomic_dec(&guc->submission_state.stopped);
> @@ -1698,8 +1713,6 @@ int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
>  	xe_gt_info(gt, "Engine reset: engine_class=%s, logical_mask: 0x%x, guc_id=%d",
>  		   xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
>  
> -	/* FIXME: Do error capture, most likely async */
> -
>  	trace_xe_exec_queue_reset(q);
>  
>  	/*
> @@ -1715,6 +1728,24 @@ int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
>  	return 0;
>  }
>  

missing kernel-doc

> +int xe_guc_error_capture_handler(struct xe_guc *guc, u32 *msg, u32 len)

maybe this should be defined as xe_guc_capture_msg_handler() and placed
in xe_guc_capture.c, as it doesn't look like it needs anything from the
xe_guc_submit.c code

> +{
> +	u32 status;
> +
> +	if (unlikely(len != 1)) {

magic "1"

> +		xe_gt_dbg(guc_to_gt(guc), "Invalid length %u", len);
> +		return -EPROTO;
> +	}
> +
> +	status = msg[0] & XE_GUC_STATE_CAPTURE_EVENT_STATUS_MASK;
> +	if (status == XE_GUC_STATE_CAPTURE_EVENT_STATUS_NOSPACE)
> +		xe_gt_warn(guc_to_gt(guc), "G2H-Error capture no space");

btw, is there anything to capture if GuC reported 'NOSPACE'?

> +
> +	xe_guc_capture_process(guc);
> +
> +	return 0;
> +}
> +
>  int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
>  					       u32 len)
>  {
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h b/drivers/gpu/drm/xe/xe_guc_submit.h
> index 4ad5f4c1b084..d92256de473e 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.h
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.h
> @@ -19,12 +19,14 @@ void xe_guc_submit_reset_wait(struct xe_guc *guc);
>  void xe_guc_submit_stop(struct xe_guc *guc);
>  int xe_guc_submit_start(struct xe_guc *guc);
>  
> +int xe_guc_read_stopped(struct xe_guc *guc);
>  int xe_guc_sched_done_handler(struct xe_guc *guc, u32 *msg, u32 len);
>  int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len);
>  int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len);
>  int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
>  					       u32 len);
>  int xe_guc_exec_queue_reset_failure_handler(struct xe_guc *guc, u32 *msg, u32 len);
> +int xe_guc_error_capture_handler(struct xe_guc *guc, u32 *msg, u32 len);
>  
>  struct xe_guc_submit_exec_queue_snapshot *
>  xe_guc_exec_queue_snapshot_capture(struct xe_exec_queue *q);
> diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c
> index 0a83506e1ad8..3bc88fbad952 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine.c
> +++ b/drivers/gpu/drm/xe/xe_hw_engine.c
> @@ -20,6 +20,9 @@
>  #include "xe_gt_printk.h"
>  #include "xe_gt_mcr.h"
>  #include "xe_gt_topology.h"
> +#include "xe_guc.h"
> +#include "xe_guc_capture.h"
> +#include "xe_guc_capture_fwif.h"
>  #include "xe_hw_fence.h"
>  #include "xe_irq.h"
>  #include "xe_lrc.h"
> @@ -287,6 +290,7 @@ static void hw_engine_mmio_write32(struct xe_hw_engine *hwe, struct xe_reg reg,
>  static u32 hw_engine_mmio_read32(struct xe_hw_engine *hwe, struct xe_reg reg)
>  {
>  	xe_gt_assert(hwe->gt, !(reg.addr & hwe->mmio_base));
> +

unrelated

>  	xe_force_wake_assert_held(gt_to_fw(hwe->gt), hwe->domain);
>  
>  	reg.addr += hwe->mmio_base;
> @@ -825,6 +829,62 @@ xe_hw_engine_snapshot_instdone_capture(struct xe_hw_engine *hwe,
>  	}
>  }
>  
> +static void
> +xe_hw_engine_snapshot_from_hw_by_type(struct xe_hw_engine *hwe,
> +				      struct xe_hw_engine_snapshot *snapshot, int type)
> +{
> +	const struct __guc_mmio_reg_descr_group *list;
> +	u16 capture_class = xe_engine_class_to_guc_capture_class(hwe->class);
> +	int i;
> +
> +	list = xe_guc_capture_get_reg_desc_list(GUC_CAPTURE_LIST_INDEX_PF, type, capture_class);
> +	if (!list)
> +		return;
> +
> +	for (i = 0; i < list->num_regs; i++) {
> +		u32 *field;
> +
> +		/* loop until extra operation registers zone */
> +		if (list->list[i].reg.addr == XE_GUC_SNAPSHOT_EXTRA_OPERATION_REGS_START_REG_ADDR)
> +			break;
> +
> +		field = (u32 *)((uintptr_t)&snapshot->reg +
> +				list->list[i].position_in_snapshot);
> +		if (type == GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE)
> +			*field = hw_engine_mmio_read32(hwe, list->list[i].reg);
> +		else
> +			*field = xe_mmio_read32(hwe->gt, list->list[i].reg);
> +	}
> +}
> +
> +/**

this is a static function, so there is no need for a true kernel-doc comment

> + * xe_hw_engine_snapshot_from_hw - Take a quick engine snapshot from HW.
> + * @hwe: Xe HW Engine.
> + * @snapshot: Point to the Xe HW Engine snapshot object to save data.
> + *
> + * This can be printed out in a later stage like during dev_coredump
> + * analysis.
> + *
> + * Returns: None
> + */
> +static void
> +xe_hw_engine_snapshot_from_hw(struct xe_hw_engine *hwe, struct xe_hw_engine_snapshot *snapshot)
> +{
> +	int type;
> +
> +	for (type = GUC_CAPTURE_LIST_TYPE_GLOBAL; type < GUC_CAPTURE_LIST_TYPE_MAX; type++)
> +		xe_hw_engine_snapshot_from_hw_by_type(hwe, snapshot, type);
> +
> +	/* Extra operation required registers zone - start */
> +	if (xe_gt_has_indirect_ring_state(hwe->gt))
> +		snapshot->reg.indirect_ring_state =
> +			hw_engine_mmio_read32(hwe, INDIRECT_RING_STATE(0));
> +	/* Extra operation required registers zone - End */
> +
> +	/* Capture steering registers */
> +	xe_hw_engine_snapshot_instdone_capture(hwe, snapshot);
> +}
> +
>  /**
>   * xe_hw_engine_snapshot_capture - Take a quick snapshot of the HW Engine.
>   * @hwe: Xe HW Engine.
> @@ -839,8 +899,12 @@ struct xe_hw_engine_snapshot *
>  xe_hw_engine_snapshot_capture(struct xe_hw_engine *hwe)
>  {
>  	struct xe_hw_engine_snapshot *snapshot;
> +	struct xe_gt *gt = hwe->gt;
> +	struct xe_device *xe = gt_to_xe(gt);
> +	struct xe_guc *guc = &gt->uc.guc;
>  	size_t len;
> -	u64 val;
> +	u32 i;
> +	bool endian_convert_required;
>  
>  	if (!xe_hw_engine_is_valid(hwe))
>  		return NULL;
> @@ -850,6 +914,9 @@ xe_hw_engine_snapshot_capture(struct xe_hw_engine *hwe)
>  	if (!snapshot)
>  		return NULL;
>  
> +	i = 0x01020304;
> +	endian_convert_required = (i != le32_to_cpu(i));
> +
>  	/* Because XE_MAX_DSS_FUSE_BITS is defined in xe_gt_types.h and it
>  	 * includes xe_hw_engine_types.h the length of this 3 registers can't be
>  	 * set in struct xe_hw_engine_snapshot, so here doing additional
> @@ -881,62 +948,35 @@ xe_hw_engine_snapshot_capture(struct xe_hw_engine *hwe)
>  	snapshot->mmio_base = hwe->mmio_base;
>  
>  	/* no more VF accessible data below this point */
> -	if (IS_SRIOV_VF(gt_to_xe(hwe->gt)))
> +	if (IS_SRIOV_VF(xe))
>  		return snapshot;
>  
> -	snapshot->reg.ring_execlist_status =
> -		hw_engine_mmio_read32(hwe, RING_EXECLIST_STATUS_LO(0));
> -	val = hw_engine_mmio_read32(hwe, RING_EXECLIST_STATUS_HI(0));
> -	snapshot->reg.ring_execlist_status |= val << 32;
> -
> -	snapshot->reg.ring_execlist_sq_contents =
> -		hw_engine_mmio_read32(hwe, RING_EXECLIST_SQ_CONTENTS_LO(0));
> -	val = hw_engine_mmio_read32(hwe, RING_EXECLIST_SQ_CONTENTS_HI(0));
> -	snapshot->reg.ring_execlist_sq_contents |= val << 32;
> -
> -	snapshot->reg.ring_acthd = hw_engine_mmio_read32(hwe, RING_ACTHD(0));
> -	val = hw_engine_mmio_read32(hwe, RING_ACTHD_UDW(0));
> -	snapshot->reg.ring_acthd |= val << 32;
> -
> -	snapshot->reg.ring_bbaddr = hw_engine_mmio_read32(hwe, RING_BBADDR(0));
> -	val = hw_engine_mmio_read32(hwe, RING_BBADDR_UDW(0));
> -	snapshot->reg.ring_bbaddr |= val << 32;
> -
> -	snapshot->reg.ring_dma_fadd =
> -		hw_engine_mmio_read32(hwe, RING_DMA_FADD(0));
> -	val = hw_engine_mmio_read32(hwe, RING_DMA_FADD_UDW(0));
> -	snapshot->reg.ring_dma_fadd |= val << 32;
> -
> -	snapshot->reg.ring_hwstam = hw_engine_mmio_read32(hwe, RING_HWSTAM(0));
> -	snapshot->reg.ring_hws_pga = hw_engine_mmio_read32(hwe, RING_HWS_PGA(0));
> -	snapshot->reg.ring_start = hw_engine_mmio_read32(hwe, RING_START(0));
> -	if (GRAPHICS_VERx100(hwe->gt->tile->xe) >= 2000) {
> -		val = hw_engine_mmio_read32(hwe, RING_START_UDW(0));
> -		snapshot->reg.ring_start |= val << 32;
> -	}
> -	if (xe_gt_has_indirect_ring_state(hwe->gt)) {
> -		snapshot->reg.indirect_ring_state =
> -			hw_engine_mmio_read32(hwe, INDIRECT_RING_STATE(0));
> +	/* Check GuC settings, job is set and capture outlist not empty,
> +	 * otherwise take it from engine
> +	 */
> +	if (xe_device_uc_enabled(xe) && xe->wedged.mode >= 1 &&
> +	    !list_empty(&guc->capture->outlist) && xe->devcoredump.job)
> +		xe_hw_engine_find_and_copy_guc_capture_snapshot(hwe, snapshot);
> +	else
> +		xe_hw_engine_snapshot_from_hw(hwe, snapshot);
> +
> +	/* Read registers defined in "Direct read" list */
> +	xe_hw_engine_snapshot_from_hw_by_type(hwe, snapshot, GUC_CAPTURE_LIST_TYPE_DIRECT_READ);
> +
> +	/* appy mask for ring head and tail */
> +	snapshot->reg.ring_head &= HEAD_ADDR;
> +	snapshot->reg.ring_tail &= TAIL_ADDR;
> +
> +	/* adjust u64 endine in snapshot if needed */
> +	if (endian_convert_required) {
> +		for (i = 0; i < XE_GUC_SNAPSHOT_REGS_U32_START_OFFSET; i += sizeof(u64)) {
> +			u64 *pdata = (u64 *)((ulong)&snapshot->reg + i);
> +			u32 *pl = (u32 *)pdata;
> +			u32 *ph = (u32 *)((ulong)pdata + 4);
> +			*pdata = ((u64)*ph) << 32 | *pl;
> +		}
>  	}
>  
> -	snapshot->reg.ring_head =
> -		hw_engine_mmio_read32(hwe, RING_HEAD(0)) & HEAD_ADDR;
> -	snapshot->reg.ring_tail =
> -		hw_engine_mmio_read32(hwe, RING_TAIL(0)) & TAIL_ADDR;
> -	snapshot->reg.ring_ctl = hw_engine_mmio_read32(hwe, RING_CTL(0));
> -	snapshot->reg.ring_mi_mode =
> -		hw_engine_mmio_read32(hwe, RING_MI_MODE(0));
> -	snapshot->reg.ring_mode = hw_engine_mmio_read32(hwe, RING_MODE(0));
> -	snapshot->reg.ring_imr = hw_engine_mmio_read32(hwe, RING_IMR(0));
> -	snapshot->reg.ring_esr = hw_engine_mmio_read32(hwe, RING_ESR(0));
> -	snapshot->reg.ring_emr = hw_engine_mmio_read32(hwe, RING_EMR(0));
> -	snapshot->reg.ring_eir = hw_engine_mmio_read32(hwe, RING_EIR(0));
> -	snapshot->reg.ipehr = hw_engine_mmio_read32(hwe, RING_IPEHR(0));
> -	xe_hw_engine_snapshot_instdone_capture(hwe, snapshot);
> -
> -	if (snapshot->hwe->class == XE_ENGINE_CLASS_COMPUTE)
> -		snapshot->reg.rcu_mode = xe_mmio_read32(hwe->gt, RCU_MODE);
> -
>  	return snapshot;
>  }
>  
> @@ -993,6 +1033,8 @@ xe_hw_engine_snapshot_instdone_print(struct xe_hw_engine_snapshot *snapshot, str
>  void xe_hw_engine_snapshot_print(struct xe_hw_engine_snapshot *snapshot,
>  				 struct drm_printer *p)
>  {
> +	int i, type;
> +
>  	if (!snapshot)
>  		return;
>  
> @@ -1001,34 +1043,52 @@ void xe_hw_engine_snapshot_print(struct xe_hw_engine_snapshot *snapshot,
>  		   snapshot->logical_instance);
>  	drm_printf(p, "\tForcewake: domain 0x%x, ref %d\n",
>  		   snapshot->forcewake.domain, snapshot->forcewake.ref);
> -	drm_printf(p, "\tHWSTAM: 0x%08x\n", snapshot->reg.ring_hwstam);
> -	drm_printf(p, "\tRING_HWS_PGA: 0x%08x\n", snapshot->reg.ring_hws_pga);
> -	drm_printf(p, "\tRING_EXECLIST_STATUS: 0x%016llx\n",
> -		   snapshot->reg.ring_execlist_status);
> -	drm_printf(p, "\tRING_EXECLIST_SQ_CONTENTS: 0x%016llx\n",
> -		   snapshot->reg.ring_execlist_sq_contents);
> -	drm_printf(p, "\tRING_START: 0x%016llx\n", snapshot->reg.ring_start);
> -	drm_printf(p, "\tRING_HEAD: 0x%08x\n", snapshot->reg.ring_head);
> -	drm_printf(p, "\tRING_TAIL: 0x%08x\n", snapshot->reg.ring_tail);
> -	drm_printf(p, "\tRING_CTL: 0x%08x\n", snapshot->reg.ring_ctl);
> -	drm_printf(p, "\tRING_MI_MODE: 0x%08x\n", snapshot->reg.ring_mi_mode);
> -	drm_printf(p, "\tRING_MODE: 0x%08x\n",
> -		   snapshot->reg.ring_mode);
> -	drm_printf(p, "\tRING_IMR: 0x%08x\n", snapshot->reg.ring_imr);
> -	drm_printf(p, "\tRING_ESR: 0x%08x\n", snapshot->reg.ring_esr);
> -	drm_printf(p, "\tRING_EMR: 0x%08x\n", snapshot->reg.ring_emr);
> -	drm_printf(p, "\tRING_EIR: 0x%08x\n", snapshot->reg.ring_eir);
> -	drm_printf(p, "\tACTHD: 0x%016llx\n", snapshot->reg.ring_acthd);
> -	drm_printf(p, "\tBBADDR: 0x%016llx\n", snapshot->reg.ring_bbaddr);
> -	drm_printf(p, "\tDMA_FADDR: 0x%016llx\n", snapshot->reg.ring_dma_fadd);
> -	drm_printf(p, "\tINDIRECT_RING_STATE: 0x%08x\n",
> -		   snapshot->reg.indirect_ring_state);
> -	drm_printf(p, "\tIPEHR: 0x%08x\n", snapshot->reg.ipehr);
> +
> +	/* Print will include direct read list in this main loop */
> +	for (type = GUC_CAPTURE_LIST_TYPE_GLOBAL; type <= GUC_CAPTURE_LIST_TYPE_DIRECT_READ;
> +	     type++) {
> +		const struct __guc_mmio_reg_descr_group *list;
> +		u16 capture_class = xe_engine_class_to_guc_capture_class(snapshot->hwe->class);
> +
> +		/* Capture engine registers */
> +		list = xe_guc_capture_get_reg_desc_list(GUC_CAPTURE_LIST_INDEX_PF, type,
> +							capture_class);
> +		if (!list)
> +			continue;
> +
> +		/* loop over the 32-bit registers until the 64-bit registers begin */
> +		for (i = 0; i < list->num_regs; i++) {
> +			u32 *field;
> +
> +			if (list->list[i].reg.addr == XE_GUC_SNAPSHOT_REGS_U64_START_REG_ADDR)
> +				break;
> +			field = (u32 *)((uintptr_t)&snapshot->reg +
> +					list->list[i].position_in_snapshot);
> +			drm_printf(p, "\t%s: 0x%08x\n", list->list[i].regname, *field);
> +		}
> +
> +		if (type != GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE)
> +			continue; /* 64-bit and special registers are for engine instance only */
> +
> +		/* loop over the 64-bit registers until the special registers begin */
> +		for (; i < list->num_regs; i += 2) {
> +			u64 *field;
> +
> +			if (list->list[i].reg.addr ==
> +			    XE_GUC_SNAPSHOT_EXTRA_OPERATION_REGS_START_REG_ADDR)
> +				break;
> +			field = (u64 *)((uintptr_t)&snapshot->reg +
> +					list->list[i].position_in_snapshot);
> +			drm_printf(p, "\t%s: 0x%016llx\n", list->list[i].regname, *field);
> +		}
> +
> +		/* Handling special registers - Start */
> +		drm_printf(p, "\tINDIRECT_RING_STATE: 0x%08x\n", snapshot->reg.indirect_ring_state);
> +		/* Handling special registers - End */
> +	}
> +
>  	xe_hw_engine_snapshot_instdone_print(snapshot, p);
>  
> -	if (snapshot->hwe->class == XE_ENGINE_CLASS_COMPUTE)
> -		drm_printf(p, "\tRCU_MODE: 0x%08x\n",
> -			   snapshot->reg.rcu_mode);
>  	drm_puts(p, "\n");
>  }
>  
> diff --git a/drivers/gpu/drm/xe/xe_hw_engine_types.h b/drivers/gpu/drm/xe/xe_hw_engine_types.h
> index 580bbd7e83b2..617101dca272 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine_types.h
> +++ b/drivers/gpu/drm/xe/xe_hw_engine_types.h
> @@ -150,6 +150,106 @@ struct xe_hw_engine {
>  	struct xe_hw_engine_class_intf *eclass;
>  };
>  
> +/**
> + * struct snapshot_regs - Hardware engine register snapshot
> + *
> + * Contains the snapshot of useful hardware engine registers.
> + */
> +struct snapshot_regs {
> +	/* Engine instance type - start */
> +	/* 64 bit registers zone - start */
> +	/*
> +	 * u64 data captured as two u32s from GuC or by hw read.
> +	 * Data saved into this u64 zone is always written in the format:
> +	 *  offset + 0000: [low 32]
> +	 *  offset + 0004: [high 32]
> +	 * Once all data is captured, it is converted to CPU endian order, if
> +	 * needed, at the end of xe_hw_engine_snapshot_capture
> +	 */
> +	#define XE_GUC_SNAPSHOT_REGS_U64_START_REG_ADDR RING_ACTHD(0).addr
> +	/** @ring_acthd: RING_ACTHD */
> +	u64 ring_acthd;
> +	/** @ring_bbaddr: RING_BBADDR */
> +	u64 ring_bbaddr;
> +	/** @ring_start: RING_START */
> +	u64 ring_start;
> +	/** @ring_dma_fadd: RING_DMA_FADD */
> +	u64 ring_dma_fadd;
> +	/** @ring_execlist_status: RING_EXECLIST_STATUS */
> +	u64 ring_execlist_status;
> +	/** @ring_execlist_sq_contents: RING_EXECLIST_SQ_CONTENTS */
> +	u64 ring_execlist_sq_contents;
> +	/* 64 bit registers zone - end */
> +
> +	/* 32 bit registers zone - start */
> +	/** @reg.ring_hwstam: RING_HWSTAM */
> +	u32 ring_hwstam;
> +	#define XE_GUC_SNAPSHOT_REGS_U32_START_OFFSET offsetof(struct snapshot_regs, ring_hwstam)
> +
> +	/** @reg.ring_hws_pga: RING_HWS_PGA */
> +	u32 ring_hws_pga;
> +	/** @reg.ring_head: RING_HEAD */
> +	u32 ring_head;
> +	/** @reg.ring_tail: RING_TAIL */
> +	u32 ring_tail;
> +	/** @reg.ring_ctl: RING_CTL */
> +	u32 ring_ctl;
> +	/** @reg.ring_mi_mode: RING_MI_MODE */
> +	u32 ring_mi_mode;
> +	/** @reg.ring_mode: RING_MODE */
> +	u32 ring_mode;
> +	/** @reg.ring_imr: RING_IMR */
> +	u32 ring_imr;
> +	/** @reg.ring_esr: RING_ESR */
> +	u32 ring_esr;
> +	/** @reg.ring_emr: RING_EMR */
> +	u32 ring_emr;
> +	/** @reg.ring_eir: RING_EIR */
> +	u32 ring_eir;
> +	/** @reg.ipehr: IPEHR */
> +	u32 ipehr;
> +	/* Engine instance type - end */
> +
> +	/* Engine class type - start */
> +	/** @reg.rcu_mode: RCU_MODE */
> +	u32 rcu_mode;
> +	/** @reg.sfc_done_0: SFC_DONE[0] */
> +	u32 sfc_done_0;
> +	/** @reg.sfc_done_1: SFC_DONE[1] */
> +	u32 sfc_done_1;
> +	/** @reg.sfc_done_2: SFC_DONE[2] */
> +	u32 sfc_done_2;
> +	/** @reg.sfc_done_3: SFC_DONE[3] */
> +	u32 sfc_done_3;
> +	/* Engine class type - end */
> +
> +	/* Global type - start */
> +	/** @reg.forcewake_gt: FORCEWAKE_GT */
> +	u32 forcewake_gt;
> +	/* Global type - end */
> +
> +	/* Extra operation Registers zone - start */
> +	/* registers requires extra code to handling */
> +	#define XE_GUC_SNAPSHOT_EXTRA_OPERATION_REGS_START_REG_ADDR INDIRECT_RING_STATE(0).addr
> +	/** @reg.indirect_ring_state: INDIRECT_RING_STATE */
> +	u32 indirect_ring_state;
> +	/* Extra operation Registers zone - end */
> +
> +	/* Steering registers */
> +	struct {
> +		/** @reg.instdone.ring: RING_INSTDONE */
> +		u32 ring;
> +		/** @reg.instdone.slice_common: SC_INSTDONE */
> +		u32 *slice_common;
> +		/** @reg.instdone.slice_common_extra: SC_INSTDONE_EXTRA */
> +		u32 *slice_common_extra;
> +		/** @reg.instdone.slice_common_extra2: SC_INSTDONE_EXTRA2 */
> +		u32 *slice_common_extra2;
> +		/** @reg.instdone.sampler: SAMPLER_INSTDONE */
> +		u32 *sampler;
> +		/** @reg.instdone.row: ROW_INSTDONE */
> +		u32 *row;
> +		/** @reg.instdone.geom_svg: INSTDONE_GEOM_SVGUNIT */
> +		u32 *geom_svg;
> +	} instdone;
> +};
> +
>  /**
>   * struct xe_hw_engine_snapshot - Hardware engine snapshot
>   *
> @@ -172,64 +272,7 @@ struct xe_hw_engine_snapshot {
>  	/** @mmio_base: MMIO base address of this hw engine*/
>  	u32 mmio_base;
>  	/** @reg: Useful MMIO register snapshot */
> -	struct {
> -		/** @reg.ring_execlist_status: RING_EXECLIST_STATUS */
> -		u64 ring_execlist_status;
> -		/** @reg.ring_execlist_sq_contents: RING_EXECLIST_SQ_CONTENTS */
> -		u64 ring_execlist_sq_contents;
> -		/** @reg.ring_acthd: RING_ACTHD */
> -		u64 ring_acthd;
> -		/** @reg.ring_bbaddr: RING_BBADDR */
> -		u64 ring_bbaddr;
> -		/** @reg.ring_dma_fadd: RING_DMA_FADD */
> -		u64 ring_dma_fadd;
> -		/** @reg.ring_hwstam: RING_HWSTAM */
> -		u32 ring_hwstam;
> -		/** @reg.ring_hws_pga: RING_HWS_PGA */
> -		u32 ring_hws_pga;
> -		/** @reg.ring_start: RING_START */
> -		u64 ring_start;
> -		/** @reg.ring_head: RING_HEAD */
> -		u32 ring_head;
> -		/** @reg.ring_tail: RING_TAIL */
> -		u32 ring_tail;
> -		/** @reg.ring_ctl: RING_CTL */
> -		u32 ring_ctl;
> -		/** @reg.ring_mi_mode: RING_MI_MODE */
> -		u32 ring_mi_mode;
> -		/** @reg.ring_mode: RING_MODE */
> -		u32 ring_mode;
> -		/** @reg.ring_imr: RING_IMR */
> -		u32 ring_imr;
> -		/** @reg.ring_esr: RING_ESR */
> -		u32 ring_esr;
> -		/** @reg.ring_emr: RING_EMR */
> -		u32 ring_emr;
> -		/** @reg.ring_eir: RING_EIR */
> -		u32 ring_eir;
> -		/** @reg.indirect_ring_state: INDIRECT_RING_STATE */
> -		u32 indirect_ring_state;
> -		/** @reg.ipehr: IPEHR */
> -		u32 ipehr;
> -		/** @reg.rcu_mode: RCU_MODE */
> -		u32 rcu_mode;
> -		struct {
> -			/** @reg.instdone.ring: RING_INSTDONE */
> -			u32 ring;
> -			/** @reg.instdone.slice_common: SC_INSTDONE */
> -			u32 *slice_common;
> -			/** @reg.instdone.slice_common_extra: SC_INSTDONE_EXTRA */
> -			u32 *slice_common_extra;
> -			/** @reg.instdone.slice_common_extra2: SC_INSTDONE_EXTRA2 */
> -			u32 *slice_common_extra2;
> -			/** @reg.instdone.sampler: SAMPLER_INSTDONE */
> -			u32 *sampler;
> -			/** @reg.instdone.row: ROW_INSTDONE */
> -			u32 *row;
> -			/** @reg.instdone.geom_svg: INSTDONE_GEOM_SVGUNIT */
> -			u32 *geom_svg;
> -		} instdone;
> -	} reg;
> +	struct snapshot_regs reg;
>  };
>  
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_lrc.h b/drivers/gpu/drm/xe/xe_lrc.h
> index 882c3437ba5c..8c83601fc695 100644
> --- a/drivers/gpu/drm/xe/xe_lrc.h
> +++ b/drivers/gpu/drm/xe/xe_lrc.h
> @@ -21,6 +21,7 @@ struct xe_lrc_snapshot;
>  struct xe_vm;
>  
>  #define LRC_PPHWSP_SCRATCH_ADDR (0x34 * 4)
> +#define LRC_GTT_ADDRESS_MASK	GENMASK(31, 12)
>  
>  struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
>  			     u32 ring_size);


More information about the Intel-xe mailing list