[PATCH v5 8/8] drm/xe/guc: Add GuC log to devcoredump captures

John Harrison john.c.harrison at intel.com
Mon Aug 5 18:41:05 UTC 2024


On 8/4/2024 15:36, Matthew Brost wrote:
> On Mon, Jul 29, 2024 at 04:17:52PM -0700, John.C.Harrison at Intel.com wrote:
>> From: John Harrison <John.C.Harrison at Intel.com>
>>
>> Add an ption to include the GuC log in devcoredump captures. Note that
>> this is currently optional and disabled by default. The reason being
>> that useful GuC logs are large, very large when converted to an ASCII
>> hex dump! And as they are not always necessary/useful for debugging a
>> hang, it is not desirable to force all core dump captures to be huge.
>>
>> NB: The intent is to add support for buffer compression to the core
>> dumps. Then the log can be included as standard without being too
>> onerous. At that point the module parameter override can be removed.
>>
> Briefly looked at this series and questions / few suggestions.
>
> 1. Remove mod param to include GuC log from devcoredump and always
> include it. We had a bug in devcoredump which has been fixed in [1]
> where large devcoredumps took forever. This has been fixed, now
> capturing a devcoredump around 256M takes .3 seconds so adding another
> 8M or so seems fine.
Yeah, that was the main reason for not wanting to always include it just 
yet!

>
> 2. Instead of printing GuC log as hex, can we just dumped it as binary?
> This is what we do VM mappings with dump flag set and LRC. grep
> 'ascii85_encode' for an example of this. It would be nice to keep binary
> dumping uniform. Then surely we can update our scripts / tools to parse
> this output.
Note, that's not binary. That's ASCII85 encoding. There are some drivers 
which genuinely include binary blobs e.g. ath10k 
(https://starkeblog.com/firmware/wifi/linux/kernel/2021/08/11/dev-coredump-and-firmware-images.html). 
But yes, it is way more compact than a full hexdump.

Do you know what userland tool is currently used to decode the ASCII85 
objects? The intel_error_decode tool in IGT does not appear to have been 
updated to support devcoredump files.


>
> 3. Can we move GuC log in devcoredump to
> xe_devcoredump_deferred_snap_work and just use kvmalloc? I suppose this
> makes debugfs capture a little tricker?
No idea. I haven't had chance to look at your changes in detail yet. So 
not sure what is involved. For certain, I would much rather capture the 
GuC log, CTBs, etc at the point of the error rather than in a worker 
thread some random amount of time later. The longer the delay between 
error and capture, the more chance there is for things to move on, wrap, 
overwrite, etc. And if you are using kvmalloc, that must be in some kind 
of asynchronous worker thread?

John.

>
> Matt
>
> [1] https://patchwork.freedesktop.org/series/136770/
>
>> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
>> ---
>>   drivers/gpu/drm/xe/xe_devcoredump.c       | 22 +++++++++++++++-------
>>   drivers/gpu/drm/xe/xe_devcoredump_types.h | 12 ++++++++----
>>   drivers/gpu/drm/xe/xe_module.c            |  3 +++
>>   drivers/gpu/drm/xe/xe_module.h            |  1 +
>>   4 files changed, 27 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
>> index 08a0bb3ee7c0..b7c241bd95d5 100644
>> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
>> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
>> @@ -17,8 +17,10 @@
>>   #include "xe_gt.h"
>>   #include "xe_gt_printk.h"
>>   #include "xe_guc_ct.h"
>> +#include "xe_guc_log.h"
>>   #include "xe_guc_submit.h"
>>   #include "xe_hw_engine.h"
>> +#include "xe_module.h"
>>   #include "xe_sched_job.h"
>>   #include "xe_vm.h"
>>   
>> @@ -74,7 +76,7 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
>>   	if (xe_force_wake_get(gt_to_fw(ss->gt), XE_FORCEWAKE_ALL))
>>   		xe_gt_info(ss->gt, "failed to get forcewake for coredump capture\n");
>>   	xe_vm_snapshot_capture_delayed(ss->vm);
>> -	xe_guc_exec_queue_snapshot_capture_delayed(ss->ge);
>> +	xe_guc_exec_queue_snapshot_capture_delayed(ss->guc.ge);
>>   	xe_force_wake_put(gt_to_fw(ss->gt), XE_FORCEWAKE_ALL);
>>   }
>>   
>> @@ -116,9 +118,13 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>>   	drm_printf(&p, "Process: %s\n", ss->process_name);
>>   	xe_device_snapshot_print(xe, &p);
>>   
>> +	if (xe_modparam.enable_guc_log_in_coredump) {
>> +		drm_printf(&p, "\n**** GuC Log ****\n");
>> +		xe_guc_log_snapshot_print(xe, coredump->snapshot.guc.log, &p, false);
>> +	}
>>   	drm_printf(&p, "\n**** GuC CT ****\n");
>> -	xe_guc_ct_snapshot_print(xe, coredump->snapshot.ct, &p, false);
>> -	xe_guc_exec_queue_snapshot_print(coredump->snapshot.ge, &p);
>> +	xe_guc_ct_snapshot_print(xe, coredump->snapshot.guc.ct, &p, false);
>> +	xe_guc_exec_queue_snapshot_print(coredump->snapshot.guc.ge, &p);
>>   
>>   	drm_printf(&p, "\n**** Job ****\n");
>>   	xe_sched_job_snapshot_print(coredump->snapshot.job, &p);
>> @@ -145,8 +151,9 @@ static void xe_devcoredump_free(void *data)
>>   
>>   	cancel_work_sync(&coredump->snapshot.work);
>>   
>> -	xe_guc_ct_snapshot_free(coredump->snapshot.ct);
>> -	xe_guc_exec_queue_snapshot_free(coredump->snapshot.ge);
>> +	xe_guc_log_snapshot_free(coredump->snapshot.guc.log);
>> +	xe_guc_ct_snapshot_free(coredump->snapshot.guc.ct);
>> +	xe_guc_exec_queue_snapshot_free(coredump->snapshot.guc.ge);
>>   	xe_sched_job_snapshot_free(coredump->snapshot.job);
>>   	for (i = 0; i < XE_NUM_HW_ENGINES; i++)
>>   		if (coredump->snapshot.hwe[i])
>> @@ -199,8 +206,9 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
>>   	if (xe_force_wake_get(gt_to_fw(q->gt), XE_FORCEWAKE_ALL))
>>   		xe_gt_info(ss->gt, "failed to get forcewake for coredump capture\n");
>>   
>> -	coredump->snapshot.ct = xe_guc_ct_snapshot_capture(&guc->ct, true);
>> -	coredump->snapshot.ge = xe_guc_exec_queue_snapshot_capture(q);
>> +	coredump->snapshot.guc.log = xe_guc_log_snapshot_capture(&guc->log, true);
>> +	coredump->snapshot.guc.ct = xe_guc_ct_snapshot_capture(&guc->ct, true);
>> +	coredump->snapshot.guc.ge = xe_guc_exec_queue_snapshot_capture(q);
>>   	coredump->snapshot.job = xe_sched_job_snapshot_capture(job);
>>   	coredump->snapshot.vm = xe_vm_snapshot_capture(q->vm);
>>   
>> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
>> index 923cdf72a816..6ac8da1631f9 100644
>> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
>> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
>> @@ -35,10 +35,14 @@ struct xe_devcoredump_snapshot {
>>   	struct work_struct work;
>>   
>>   	/* GuC snapshots */
>> -	/** @ct: GuC CT snapshot */
>> -	struct xe_guc_ct_snapshot *ct;
>> -	/** @ge: Guc Engine snapshot */
>> -	struct xe_guc_submit_exec_queue_snapshot *ge;
>> +	struct {
>> +		/** @ct: GuC CT snapshot */
>> +		struct xe_guc_ct_snapshot *ct;
>> +		/** @log: GuC log snapshot */
>> +		struct xe_guc_log_snapshot *log;
>> +		/** @ge: Guc Engine snapshot */
>> +		struct xe_guc_submit_exec_queue_snapshot *ge;
>> +	} guc;
>>   
>>   	/** @hwe: HW Engine snapshot array */
>>   	struct xe_hw_engine_snapshot *hwe[XE_NUM_HW_ENGINES];
>> diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
>> index 7bb99e451fcc..dd837125f397 100644
>> --- a/drivers/gpu/drm/xe/xe_module.c
>> +++ b/drivers/gpu/drm/xe/xe_module.c
>> @@ -37,6 +37,9 @@ MODULE_PARM_DESC(vram_bar_size, "Set the vram bar size(in MiB)");
>>   module_param_named(guc_log_level, xe_modparam.guc_log_level, int, 0600);
>>   MODULE_PARM_DESC(guc_log_level, "GuC firmware logging level (0=disable, 1..5=enable with verbosity min..max)");
>>   
>> +module_param_named_unsafe(enable_guc_log_in_coredump, xe_modparam.enable_guc_log_in_coredump, bool, 0600);
>> +MODULE_PARM_DESC(enable_guc_log_in_coredump, "Include a capture of the GuC log in devcoredumps");
>> +
>>   module_param_named_unsafe(guc_firmware_path, xe_modparam.guc_firmware_path, charp, 0400);
>>   MODULE_PARM_DESC(guc_firmware_path,
>>   		 "GuC firmware path to use instead of the default one");
>> diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h
>> index 61a0d28a28c8..81be7fb05bd1 100644
>> --- a/drivers/gpu/drm/xe/xe_module.h
>> +++ b/drivers/gpu/drm/xe/xe_module.h
>> @@ -12,6 +12,7 @@
>>   struct xe_modparam {
>>   	bool force_execlist;
>>   	bool enable_display;
>> +	bool enable_guc_log_in_coredump;
>>   	u32 force_vram_bar_size;
>>   	int guc_log_level;
>>   	char *guc_firmware_path;
>> -- 
>> 2.43.2
>>



More information about the Intel-xe mailing list