[PATCH 1/2] drm/xe: Convert xe_device_snapshot to a standalone snapshot
Souza, Jose
jose.souza at intel.com
Wed Jan 31 13:44:46 UTC 2024
On Tue, 2024-01-30 at 17:37 -0500, Rodrigo Vivi wrote:
> devcoredump direction is that the snapshot could be taken
> even after the driver unbind. At that point the xe device
> will be gone. So, let's convert to a proper snapshot.
Would not be better to add a devcoredump uapi to remove it during Xe unload?
>
> Cc: José Roberto de Souza <jose.souza at intel.com>
> Signed-off-by: Rodrigo Vivi <rodrigo.vivi at intel.com>
> ---
> drivers/gpu/drm/xe/xe_devcoredump.c | 5 +-
> drivers/gpu/drm/xe/xe_devcoredump_types.h | 3 +-
> drivers/gpu/drm/xe/xe_device.c | 80 ++++++++++++++++++++---
> drivers/gpu/drm/xe/xe_device.h | 6 +-
> drivers/gpu/drm/xe/xe_device_types.h | 13 ++++
> 5 files changed, 94 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> index e701f0d07b67..30e7edbb8b6f 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -63,7 +63,6 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> size_t count, void *data, size_t datalen)
> {
> struct xe_devcoredump *coredump = data;
> - struct xe_device *xe = coredump_to_xe(coredump);
> struct xe_devcoredump_snapshot *ss;
> struct drm_printer p;
> struct drm_print_iterator iter;
> @@ -90,7 +89,7 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> drm_printf(&p, "Snapshot time: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
> ts = ktime_to_timespec64(ss->boot_time);
> drm_printf(&p, "Uptime: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
> - xe_device_snapshot_print(xe, &p);
> + xe_device_snapshot_print(ss->xe, &p);
>
> drm_printf(&p, "\n**** GuC CT ****\n");
> xe_guc_ct_snapshot_print(coredump->snapshot.ct, &p);
> @@ -114,6 +113,7 @@ static void xe_devcoredump_free(void *data)
> if (!data || !coredump_to_xe(coredump))
> return;
>
> + xe_device_snapshot_free(coredump->snapshot.xe);
> xe_guc_ct_snapshot_free(coredump->snapshot.ct);
> xe_guc_exec_queue_snapshot_free(coredump->snapshot.ge);
> for (i = 0; i < XE_NUM_HW_ENGINES; i++)
> @@ -153,6 +153,7 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
>
> xe_force_wake_get(gt_to_fw(q->gt), XE_FORCEWAKE_ALL);
>
> + coredump->snapshot.xe = xe_device_snapshot_capture(gt_to_xe(q->gt));
> coredump->snapshot.ct = xe_guc_ct_snapshot_capture(&guc->ct, true);
> coredump->snapshot.ge = xe_guc_exec_queue_snapshot_capture(job);
>
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> index 50106efcbc29..8950b1ca7456 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> @@ -26,7 +26,8 @@ struct xe_devcoredump_snapshot {
> /** @boot_time: Relative boot time so the uptime can be calculated. */
> ktime_t boot_time;
>
> - /* GuC snapshots */
> + /** @xe: Xe Device Info snapshot */
> + struct xe_device_snapshot *xe;
> /** @ct: GuC CT snapshot */
> struct xe_guc_ct_snapshot *ct;
> /** @ge: Guc Engine snapshot */
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index 6faa7865b1aa..6999bb7aea42 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -728,22 +728,84 @@ void xe_device_mem_access_put(struct xe_device *xe)
> xe_assert(xe, ref >= 0);
> }
>
> -void xe_device_snapshot_print(struct xe_device *xe, struct drm_printer *p)
> +/**
> + * xe_device_snapshot_capture - Take a quick snapshot of the Xe Device info.
> + * @xe: faulty Xe device.
> + *
> + * This can be printed out in a later stage like during dev_coredump
> + * analysis.
> + *
> + * Returns: a Xe device snapshot that must be freed by the
> + * caller, using `xe_device_snapshot_free`.
> + */
> +struct xe_device_snapshot *
> +xe_device_snapshot_capture(struct xe_device *xe)
> {
> + struct xe_device_snapshot *snapshot;
> struct xe_gt *gt;
> u8 id;
>
> - drm_printf(p, "PCI ID: 0x%04x\n", xe->info.devid);
> - drm_printf(p, "PCI revision: 0x%02x\n", xe->info.revid);
> + snapshot = kzalloc(sizeof(*snapshot), GFP_ATOMIC);
> +
> + if (!snapshot) {
> + drm_err(&xe->drm, "Skipping Xe Device snapshot.\n");
> + return NULL;
> + }
>
> + snapshot->devid = xe->info.devid;
> + snapshot->revid = xe->info.revid;
> + snapshot->gt_count = xe->info.gt_count;
> +
> + snapshot->gt = kmalloc_array(xe->info.gt_count,
> + sizeof(struct gt_info_snapshot), GFP_ATOMIC);
> for_each_gt(gt, xe, id) {
> - drm_printf(p, "GT id: %u\n", id);
> + snapshot->gt[id].type = gt->info.type;
> + snapshot->gt[id].gmdid = gt->info.gmdid;
> + snapshot->gt[id].reference_clock = gt->info.reference_clock;
> + }
> +
> + return snapshot;
> +}
> +
> +/**
> + * xe_decice_snapshot_print - Print out a given Xe device snapshot.
> + * @snapshot: Xe device snapshot object.
> + * @p: drm_printer where it will be printed out.
> + *
> + * This function prints out a given GuC Submit Engine snapshot object.
> + */
> +void xe_device_snapshot_print(struct xe_device_snapshot *ss,
> + struct drm_printer *p)
> +{
> + int i;
> +
> + drm_printf(p, "PCI ID: 0x%04x\n", ss->devid);
> + drm_printf(p, "PCI revision: 0x%02x\n", ss->revid);
> +
> + for(i = 0; i < ss->gt_count; i++) {
> + drm_printf(p, "GT id: %u\n", i);
> drm_printf(p, "\tType: %s\n",
> - gt->info.type == XE_GT_TYPE_MAIN ? "main" : "media");
> + ss->gt[i].type == XE_GT_TYPE_MAIN ? "main" : "media");
> drm_printf(p, "\tIP ver: %u.%u.%u\n",
> - REG_FIELD_GET(GMD_ID_ARCH_MASK, gt->info.gmdid),
> - REG_FIELD_GET(GMD_ID_RELEASE_MASK, gt->info.gmdid),
> - REG_FIELD_GET(GMD_ID_REVID, gt->info.gmdid));
> - drm_printf(p, "\tCS reference clock: %u\n", gt->info.reference_clock);
> + REG_FIELD_GET(GMD_ID_ARCH_MASK, ss->gt[i].gmdid),
> + REG_FIELD_GET(GMD_ID_RELEASE_MASK, ss->gt[i].gmdid),
> + REG_FIELD_GET(GMD_ID_REVID, ss->gt[i].gmdid));
> + drm_printf(p, "\tCS reference clock: %u\n", ss->gt[i].reference_clock);
> }
> }
> +
> +/**
> + * xe_device_snapshot_free - Free all allocated objects for a given xe device snapshot.
> + * @snapshot: Xe device snapshot object.
> + *
> + * This function free all the memory that needed to be allocated at capture
> + * time.
> + */
> +void xe_device_snapshot_free(struct xe_device_snapshot *snapshot)
> +{
> + if (!snapshot)
> + return;
> +
> + kfree(snapshot->gt);
> + kfree(snapshot);
> +}
> diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
> index 270124da1e00..409bfbcffb4a 100644
> --- a/drivers/gpu/drm/xe/xe_device.h
> +++ b/drivers/gpu/drm/xe/xe_device.h
> @@ -175,6 +175,10 @@ static inline bool xe_device_has_memirq(struct xe_device *xe)
>
> u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size);
>
> -void xe_device_snapshot_print(struct xe_device *xe, struct drm_printer *p);
> +struct xe_device_snapshot *
> +xe_device_snapshot_capture(struct xe_device *xe);
> +void xe_device_snapshot_print(struct xe_device_snapshot *snapshot,
> + struct drm_printer *p);
> +void xe_device_snapshot_free(struct xe_device_snapshot *snapshot);
>
> #endif
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index eb2b806a1d23..50dac1a5b053 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -540,6 +540,19 @@ struct xe_device {
> #endif
> };
>
> +struct gt_info_snapshot {
> + enum xe_gt_type type;
> + u32 gmdid;
> + u32 reference_clock;
> +};
> +
> +struct xe_device_snapshot {
> + u16 devid;
> + u8 revid;
> + u8 gt_count;
> + struct gt_info_snapshot *gt;
> +};
> +
> /**
> * struct xe_file - file handle for XE driver
> */
More information about the Intel-xe
mailing list