[PATCH 1/2] drm/xe: Convert xe_device_snapshot to a standalone snapshot

Souza, Jose jose.souza at intel.com
Wed Jan 31 13:44:46 UTC 2024


On Tue, 2024-01-30 at 17:37 -0500, Rodrigo Vivi wrote:
> devcoredump direction is that the snapshot could be taken
> even after the driver unbind. At that point the xe device
> will be gone. So, let's convert to a proper snapshot.

Would not be better to add a devcoredump uapi to remove it during Xe unload?

> 
> Cc: José Roberto de Souza <jose.souza at intel.com>
> Signed-off-by: Rodrigo Vivi <rodrigo.vivi at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_devcoredump.c       |  5 +-
>  drivers/gpu/drm/xe/xe_devcoredump_types.h |  3 +-
>  drivers/gpu/drm/xe/xe_device.c            | 80 ++++++++++++++++++++---
>  drivers/gpu/drm/xe/xe_device.h            |  6 +-
>  drivers/gpu/drm/xe/xe_device_types.h      | 13 ++++
>  5 files changed, 94 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> index e701f0d07b67..30e7edbb8b6f 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -63,7 +63,6 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>  				   size_t count, void *data, size_t datalen)
>  {
>  	struct xe_devcoredump *coredump = data;
> -	struct xe_device *xe = coredump_to_xe(coredump);
>  	struct xe_devcoredump_snapshot *ss;
>  	struct drm_printer p;
>  	struct drm_print_iterator iter;
> @@ -90,7 +89,7 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>  	drm_printf(&p, "Snapshot time: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
>  	ts = ktime_to_timespec64(ss->boot_time);
>  	drm_printf(&p, "Uptime: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
> -	xe_device_snapshot_print(xe, &p);
> +	xe_device_snapshot_print(ss->xe, &p);
>  
>  	drm_printf(&p, "\n**** GuC CT ****\n");
>  	xe_guc_ct_snapshot_print(coredump->snapshot.ct, &p);
> @@ -114,6 +113,7 @@ static void xe_devcoredump_free(void *data)
>  	if (!data || !coredump_to_xe(coredump))
>  		return;
>  
> +	xe_device_snapshot_free(coredump->snapshot.xe);
>  	xe_guc_ct_snapshot_free(coredump->snapshot.ct);
>  	xe_guc_exec_queue_snapshot_free(coredump->snapshot.ge);
>  	for (i = 0; i < XE_NUM_HW_ENGINES; i++)
> @@ -153,6 +153,7 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
>  
>  	xe_force_wake_get(gt_to_fw(q->gt), XE_FORCEWAKE_ALL);
>  
> +	coredump->snapshot.xe = xe_device_snapshot_capture(gt_to_xe(q->gt));
>  	coredump->snapshot.ct = xe_guc_ct_snapshot_capture(&guc->ct, true);
>  	coredump->snapshot.ge = xe_guc_exec_queue_snapshot_capture(job);
>  
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> index 50106efcbc29..8950b1ca7456 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> @@ -26,7 +26,8 @@ struct xe_devcoredump_snapshot {
>  	/** @boot_time:  Relative boot time so the uptime can be calculated. */
>  	ktime_t boot_time;
>  
> -	/* GuC snapshots */
> +	/** @xe: Xe Device Info snapshot */
> +	struct xe_device_snapshot *xe;
>  	/** @ct: GuC CT snapshot */
>  	struct xe_guc_ct_snapshot *ct;
>  	/** @ge: Guc Engine snapshot */
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index 6faa7865b1aa..6999bb7aea42 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -728,22 +728,84 @@ void xe_device_mem_access_put(struct xe_device *xe)
>  	xe_assert(xe, ref >= 0);
>  }
>  
> -void xe_device_snapshot_print(struct xe_device *xe, struct drm_printer *p)
> +/**
> + * xe_device_snapshot_capture - Take a quick snapshot of the Xe Device info.
> + * @xe: faulty Xe device.
> + *
> + * This can be printed out in a later stage like during dev_coredump
> + * analysis.
> + *
> + * Returns: a Xe device snapshot that must be freed by the
> + * caller, using `xe_device_snapshot_free`.
> + */
> +struct xe_device_snapshot *
> +xe_device_snapshot_capture(struct xe_device *xe)
>  {
> +	struct xe_device_snapshot *snapshot;
>  	struct xe_gt *gt;
>  	u8 id;
>  
> -	drm_printf(p, "PCI ID: 0x%04x\n", xe->info.devid);
> -	drm_printf(p, "PCI revision: 0x%02x\n", xe->info.revid);
> +	snapshot = kzalloc(sizeof(*snapshot), GFP_ATOMIC);
> +
> +	if (!snapshot) {
> +		drm_err(&xe->drm, "Skipping Xe Device snapshot.\n");
> +		return NULL;
> +	}
>  
> +	snapshot->devid = xe->info.devid;
> +	snapshot->revid = xe->info.revid;
> +	snapshot->gt_count = xe->info.gt_count;
> +
> +	snapshot->gt = kmalloc_array(xe->info.gt_count,
> +				    sizeof(struct gt_info_snapshot), GFP_ATOMIC);
>  	for_each_gt(gt, xe, id) {
> -		drm_printf(p, "GT id: %u\n", id);
> +		snapshot->gt[id].type = gt->info.type;
> +		snapshot->gt[id].gmdid = gt->info.gmdid;
> +		snapshot->gt[id].reference_clock = gt->info.reference_clock;
> +	}
> +
> +	return snapshot;
> +}
> +
> +/**
> + * xe_decice_snapshot_print - Print out a given Xe device snapshot.
> + * @snapshot: Xe device snapshot object.
> + * @p: drm_printer where it will be printed out.
> + *
> + * This function prints out a given GuC Submit Engine snapshot object.
> + */
> +void xe_device_snapshot_print(struct xe_device_snapshot *ss,
> +			      struct drm_printer *p)
> +{
> +	int i;
> +
> +	drm_printf(p, "PCI ID: 0x%04x\n", ss->devid);
> +	drm_printf(p, "PCI revision: 0x%02x\n", ss->revid);
> +
> +	for(i = 0; i < ss->gt_count; i++) {
> +		drm_printf(p, "GT id: %u\n", i);
>  		drm_printf(p, "\tType: %s\n",
> -			   gt->info.type == XE_GT_TYPE_MAIN ? "main" : "media");
> +			   ss->gt[i].type == XE_GT_TYPE_MAIN ? "main" : "media");
>  		drm_printf(p, "\tIP ver: %u.%u.%u\n",
> -			   REG_FIELD_GET(GMD_ID_ARCH_MASK, gt->info.gmdid),
> -			   REG_FIELD_GET(GMD_ID_RELEASE_MASK, gt->info.gmdid),
> -			   REG_FIELD_GET(GMD_ID_REVID, gt->info.gmdid));
> -		drm_printf(p, "\tCS reference clock: %u\n", gt->info.reference_clock);
> +			   REG_FIELD_GET(GMD_ID_ARCH_MASK, ss->gt[i].gmdid),
> +			   REG_FIELD_GET(GMD_ID_RELEASE_MASK, ss->gt[i].gmdid),
> +			   REG_FIELD_GET(GMD_ID_REVID, ss->gt[i].gmdid));
> +		drm_printf(p, "\tCS reference clock: %u\n", ss->gt[i].reference_clock);
>  	}
>  }
> +
> +/**
> + * xe_device_snapshot_free - Free all allocated objects for a given xe device snapshot.
> + * @snapshot: Xe device snapshot object.
> + *
> + * This function free all the memory that needed to be allocated at capture
> + * time.
> + */
> +void xe_device_snapshot_free(struct xe_device_snapshot *snapshot)
> +{
> +	if (!snapshot)
> +		return;
> +
> +	kfree(snapshot->gt);
> +	kfree(snapshot);
> +}
> diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
> index 270124da1e00..409bfbcffb4a 100644
> --- a/drivers/gpu/drm/xe/xe_device.h
> +++ b/drivers/gpu/drm/xe/xe_device.h
> @@ -175,6 +175,10 @@ static inline bool xe_device_has_memirq(struct xe_device *xe)
>  
>  u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size);
>  
> -void xe_device_snapshot_print(struct xe_device *xe, struct drm_printer *p);
> +struct xe_device_snapshot *
> +xe_device_snapshot_capture(struct xe_device *xe);
> +void xe_device_snapshot_print(struct xe_device_snapshot *snapshot,
> +			      struct drm_printer *p);
> +void xe_device_snapshot_free(struct xe_device_snapshot *snapshot);
>  
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index eb2b806a1d23..50dac1a5b053 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -540,6 +540,19 @@ struct xe_device {
>  #endif
>  };
>  
> +struct gt_info_snapshot {
> +	enum xe_gt_type type;
> +	u32 gmdid;
> +	u32 reference_clock;
> +};
> +
> +struct xe_device_snapshot {
> +	u16 devid;
> +	u8 revid;
> +	u8 gt_count;
> +	struct gt_info_snapshot *gt;
> +};
> +
>  /**
>   * struct xe_file - file handle for XE driver
>   */



More information about the Intel-xe mailing list