[PATCH v5 8/8] drm/xe/guc: Add GuC log to devcoredump captures
Matthew Brost
matthew.brost at intel.com
Sun Aug 4 22:36:30 UTC 2024
On Mon, Jul 29, 2024 at 04:17:52PM -0700, John.C.Harrison at Intel.com wrote:
> From: John Harrison <John.C.Harrison at Intel.com>
>
> Add an option to include the GuC log in devcoredump captures. Note that
> this is currently optional and disabled by default. The reason being
> that useful GuC logs are large, very large when converted to an ASCII
> hex dump! And as they are not always necessary/useful for debugging a
> hang, it is not desirable to force all core dump captures to be huge.
>
> NB: The intent is to add support for buffer compression to the core
> dumps. Then the log can be included as standard without being too
> onerous. At that point the module parameter override can be removed.
>
I briefly looked at this series and have a few questions / suggestions.
1. Remove mod param to include GuC log from devcoredump and always
include it. We had a bug in devcoredump which has been fixed in [1]
where large devcoredumps took forever. With that fix in place,
capturing a devcoredump of around 256M takes 0.3 seconds, so adding another
8M or so seems fine.
2. Instead of printing the GuC log as hex, can we just dump it as binary?
This is what we do for VM mappings with the dump flag set and for the LRC. grep
'ascii85_encode' for an example of this. It would be nice to keep binary
dumping uniform. Then surely we can update our scripts / tools to parse
this output.
3. Can we move GuC log in devcoredump to
xe_devcoredump_deferred_snap_work and just use kvmalloc? I suppose this
makes debugfs capture a little trickier?
Matt
[1] https://patchwork.freedesktop.org/series/136770/
> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
> ---
> drivers/gpu/drm/xe/xe_devcoredump.c | 22 +++++++++++++++-------
> drivers/gpu/drm/xe/xe_devcoredump_types.h | 12 ++++++++----
> drivers/gpu/drm/xe/xe_module.c | 3 +++
> drivers/gpu/drm/xe/xe_module.h | 1 +
> 4 files changed, 27 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
> index 08a0bb3ee7c0..b7c241bd95d5 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
> @@ -17,8 +17,10 @@
> #include "xe_gt.h"
> #include "xe_gt_printk.h"
> #include "xe_guc_ct.h"
> +#include "xe_guc_log.h"
> #include "xe_guc_submit.h"
> #include "xe_hw_engine.h"
> +#include "xe_module.h"
> #include "xe_sched_job.h"
> #include "xe_vm.h"
>
> @@ -74,7 +76,7 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
> if (xe_force_wake_get(gt_to_fw(ss->gt), XE_FORCEWAKE_ALL))
> xe_gt_info(ss->gt, "failed to get forcewake for coredump capture\n");
> xe_vm_snapshot_capture_delayed(ss->vm);
> - xe_guc_exec_queue_snapshot_capture_delayed(ss->ge);
> + xe_guc_exec_queue_snapshot_capture_delayed(ss->guc.ge);
> xe_force_wake_put(gt_to_fw(ss->gt), XE_FORCEWAKE_ALL);
> }
>
> @@ -116,9 +118,13 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
> drm_printf(&p, "Process: %s\n", ss->process_name);
> xe_device_snapshot_print(xe, &p);
>
> + if (xe_modparam.enable_guc_log_in_coredump) {
> + drm_printf(&p, "\n**** GuC Log ****\n");
> + xe_guc_log_snapshot_print(xe, coredump->snapshot.guc.log, &p, false);
> + }
> drm_printf(&p, "\n**** GuC CT ****\n");
> - xe_guc_ct_snapshot_print(xe, coredump->snapshot.ct, &p, false);
> - xe_guc_exec_queue_snapshot_print(coredump->snapshot.ge, &p);
> + xe_guc_ct_snapshot_print(xe, coredump->snapshot.guc.ct, &p, false);
> + xe_guc_exec_queue_snapshot_print(coredump->snapshot.guc.ge, &p);
>
> drm_printf(&p, "\n**** Job ****\n");
> xe_sched_job_snapshot_print(coredump->snapshot.job, &p);
> @@ -145,8 +151,9 @@ static void xe_devcoredump_free(void *data)
>
> cancel_work_sync(&coredump->snapshot.work);
>
> - xe_guc_ct_snapshot_free(coredump->snapshot.ct);
> - xe_guc_exec_queue_snapshot_free(coredump->snapshot.ge);
> + xe_guc_log_snapshot_free(coredump->snapshot.guc.log);
> + xe_guc_ct_snapshot_free(coredump->snapshot.guc.ct);
> + xe_guc_exec_queue_snapshot_free(coredump->snapshot.guc.ge);
> xe_sched_job_snapshot_free(coredump->snapshot.job);
> for (i = 0; i < XE_NUM_HW_ENGINES; i++)
> if (coredump->snapshot.hwe[i])
> @@ -199,8 +206,9 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
> if (xe_force_wake_get(gt_to_fw(q->gt), XE_FORCEWAKE_ALL))
> xe_gt_info(ss->gt, "failed to get forcewake for coredump capture\n");
>
> - coredump->snapshot.ct = xe_guc_ct_snapshot_capture(&guc->ct, true);
> - coredump->snapshot.ge = xe_guc_exec_queue_snapshot_capture(q);
> + coredump->snapshot.guc.log = xe_guc_log_snapshot_capture(&guc->log, true);
> + coredump->snapshot.guc.ct = xe_guc_ct_snapshot_capture(&guc->ct, true);
> + coredump->snapshot.guc.ge = xe_guc_exec_queue_snapshot_capture(q);
> coredump->snapshot.job = xe_sched_job_snapshot_capture(job);
> coredump->snapshot.vm = xe_vm_snapshot_capture(q->vm);
>
> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> index 923cdf72a816..6ac8da1631f9 100644
> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
> @@ -35,10 +35,14 @@ struct xe_devcoredump_snapshot {
> struct work_struct work;
>
> /* GuC snapshots */
> - /** @ct: GuC CT snapshot */
> - struct xe_guc_ct_snapshot *ct;
> - /** @ge: Guc Engine snapshot */
> - struct xe_guc_submit_exec_queue_snapshot *ge;
> + struct {
> + /** @ct: GuC CT snapshot */
> + struct xe_guc_ct_snapshot *ct;
> + /** @log: GuC log snapshot */
> + struct xe_guc_log_snapshot *log;
> + /** @ge: Guc Engine snapshot */
> + struct xe_guc_submit_exec_queue_snapshot *ge;
> + } guc;
>
> /** @hwe: HW Engine snapshot array */
> struct xe_hw_engine_snapshot *hwe[XE_NUM_HW_ENGINES];
> diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
> index 7bb99e451fcc..dd837125f397 100644
> --- a/drivers/gpu/drm/xe/xe_module.c
> +++ b/drivers/gpu/drm/xe/xe_module.c
> @@ -37,6 +37,9 @@ MODULE_PARM_DESC(vram_bar_size, "Set the vram bar size(in MiB)");
> module_param_named(guc_log_level, xe_modparam.guc_log_level, int, 0600);
> MODULE_PARM_DESC(guc_log_level, "GuC firmware logging level (0=disable, 1..5=enable with verbosity min..max)");
>
> +module_param_named_unsafe(enable_guc_log_in_coredump, xe_modparam.enable_guc_log_in_coredump, bool, 0600);
> +MODULE_PARM_DESC(enable_guc_log_in_coredump, "Include a capture of the GuC log in devcoredumps");
> +
> module_param_named_unsafe(guc_firmware_path, xe_modparam.guc_firmware_path, charp, 0400);
> MODULE_PARM_DESC(guc_firmware_path,
> "GuC firmware path to use instead of the default one");
> diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h
> index 61a0d28a28c8..81be7fb05bd1 100644
> --- a/drivers/gpu/drm/xe/xe_module.h
> +++ b/drivers/gpu/drm/xe/xe_module.h
> @@ -12,6 +12,7 @@
> struct xe_modparam {
> bool force_execlist;
> bool enable_display;
> + bool enable_guc_log_in_coredump;
> u32 force_vram_bar_size;
> int guc_log_level;
> char *guc_firmware_path;
> --
> 2.43.2
>
More information about the Intel-xe
mailing list