[PATCH v4 3/7] drm/xe/guc: Use a two stage dump for GuC logs and add more info
Michal Wajdeczko
michal.wajdeczko at intel.com
Tue Jun 11 22:49:47 UTC 2024
On 11.06.2024 03:20, John.C.Harrison at Intel.com wrote:
> From: John Harrison <John.C.Harrison at Intel.com>
>
> Split the GuC log dump into a two stage snapshot and print mechanism.
> This allows the log to be captured at the point of an error (which may
> be in a resticted context) and then dump it out later (from a regular
typo
> context such as a worker function or a sysfs file handler).
>
> Also add a bunch of other useful pieces of information that can help
> (or are fundamentally required!) to decode and parse the log.
>
> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
> ---
> drivers/gpu/drm/xe/regs/xe_guc_regs.h | 1 +
> drivers/gpu/drm/xe/xe_guc_log.c | 150 ++++++++++++++++++++------
> drivers/gpu/drm/xe/xe_guc_log.h | 6 ++
> drivers/gpu/drm/xe/xe_guc_log_types.h | 29 +++++
> 4 files changed, 155 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/regs/xe_guc_regs.h b/drivers/gpu/drm/xe/regs/xe_guc_regs.h
> index a5fd14307f94..b27b73680c12 100644
> --- a/drivers/gpu/drm/xe/regs/xe_guc_regs.h
> +++ b/drivers/gpu/drm/xe/regs/xe_guc_regs.h
> @@ -84,6 +84,7 @@
> #define HUC_LOADING_AGENT_GUC REG_BIT(1)
> #define GUC_WOPCM_OFFSET_VALID REG_BIT(0)
> #define GUC_MAX_IDLE_COUNT XE_REG(0xc3e4)
> +#define GUC_PMTIMESTAMP XE_REG(0xc3e8)
>
> #define GUC_SEND_INTERRUPT XE_REG(0xc4c8)
> #define GUC_SEND_TRIGGER REG_BIT(0)
> diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c
> index a35309926271..a35d3cc0c369 100644
> --- a/drivers/gpu/drm/xe/xe_guc_log.c
> +++ b/drivers/gpu/drm/xe/xe_guc_log.c
> @@ -7,12 +7,20 @@
>
> #include <drm/drm_managed.h>
>
> +#include "regs/xe_guc_regs.h"
> #include "xe_bo.h"
> #include "xe_gt.h"
> #include "xe_gt_printk.h"
> #include "xe_map.h"
> +#include "xe_mmio.h"
> #include "xe_module.h"
>
> +static struct xe_guc *
> +log_to_guc(struct xe_guc_log *log)
any reason why to split signature into two lines?
> +{
> + return container_of(log, struct xe_guc, log);
> +}
> +
> static struct xe_gt *
> log_to_gt(struct xe_guc_log *log)
> {
> @@ -108,62 +116,142 @@ static void xe_hexdump_blob(struct xe_device *xe, const void *blob, size_t size,
>
> #define GUC_LOG_CHUNK_SIZE SZ_2M
>
> -void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p, bool atomic)
no kernel-doc
> +struct xe_guc_log_snapshot *xe_guc_log_snapshot_alloc(struct xe_guc_log *log, bool atomic)
> {
> - struct xe_device *xe = log_to_xe(log);
> - size_t size, remain;
> - void **copy;
> - int num_chunks, i;
> + struct xe_guc_log_snapshot *snapshot;
> + size_t remain;
> + int i;
>
> - xe_assert(xe, log->bo);
> + snapshot = kzalloc(sizeof(*snapshot), atomic ? GFP_ATOMIC : GFP_KERNEL);
> + if (!snapshot)
> + return NULL;
>
> /*
> * NB: kmalloc has a hard limit well below the maximum GuC log buffer size.
> * Also, can't use vmalloc as might be called from atomic context. So need
> * to break the buffer up into smaller chunks that can be allocated.
> */
> - size = log->bo->size;
> - num_chunks = (size + GUC_LOG_CHUNK_SIZE - 1) / GUC_LOG_CHUNK_SIZE;
> + snapshot->size = log->bo->size;
> + snapshot->num_chunks = (snapshot->size + GUC_LOG_CHUNK_SIZE - 1) / GUC_LOG_CHUNK_SIZE;
>
> - copy = kcalloc(num_chunks, sizeof(*copy), atomic ? GFP_ATOMIC : GFP_KERNEL);
> - if (!copy) {
> - drm_printf(p, "Failed to allocate array x%d", num_chunks);
> - return;
> - }
> + snapshot->copy = kcalloc(snapshot->num_chunks, sizeof(*snapshot->copy),
> + atomic ? GFP_ATOMIC : GFP_KERNEL);
> + if (!snapshot->copy)
> + goto fail_snap;
>
> - remain = size;
> - for (i = 0; i < num_chunks; i++) {
> + remain = snapshot->size;
> + for (i = 0; i < snapshot->num_chunks; i++) {
> size_t size = min(GUC_LOG_CHUNK_SIZE, remain);
>
> - copy[i] = kmalloc(size, atomic ? GFP_ATOMIC : GFP_KERNEL);
> - if (!copy[i]) {
> - drm_printf(p, "Failed to allocate %ld at chunk %d of %d",
> - size, i, num_chunks);
> - goto out;
> - }
> + snapshot->copy[i] = kmalloc(size, atomic ? GFP_ATOMIC : GFP_KERNEL);
> + if (!snapshot->copy[i])
> + goto fail_copy;
> remain -= size;
> }
>
> - remain = size;
> - for (i = 0; i < num_chunks; i++) {
> + return snapshot;
> +
> +fail_copy:
> + for (i = 0; i < snapshot->num_chunks; i++)
> + kfree(snapshot->copy[i]);
> + kfree(snapshot->copy);
> +fail_snap:
> + kfree(snapshot);
> + return NULL;
> +}
> +
> +void xe_guc_log_snapshot_free(struct xe_guc_log_snapshot *snapshot)
> +{
> + int i;
> +
> + if (!snapshot)
> + return;
> +
> + if (!snapshot->copy) {
> + for (i = 0; i < snapshot->num_chunks; i++)
> + kfree(snapshot->copy[i]);
> + kfree(snapshot->copy);
> + }
> +
> + kfree(snapshot);
> +}
> +
> +struct xe_guc_log_snapshot *xe_guc_log_snapshot_capture(struct xe_guc_log *log, bool atomic)
> +{
> + struct xe_guc_log_snapshot *snapshot;
> + struct xe_device *xe = log_to_xe(log);
> + struct xe_guc *guc = log_to_guc(log);
> + struct xe_gt *gt = log_to_gt(log);
> + size_t remain;
> + int i;
> +
> + if (!log->bo) {
> + xe_gt_err(gt, "GuC log not allocated!\n");
> + return NULL;
> + }
> +
> + snapshot = xe_guc_log_snapshot_alloc(log, atomic);
> + if (!snapshot) {
> + xe_gt_err(gt, "GuC log snapshot not allocated!\n");
> + return NULL;
> + }
> +
> + remain = snapshot->size;
> + for (i = 0; i < snapshot->num_chunks; i++) {
> size_t size = min(GUC_LOG_CHUNK_SIZE, remain);
>
> - xe_map_memcpy_from(xe, copy[i], &log->bo->vmap, i * GUC_LOG_CHUNK_SIZE, size);
> + xe_map_memcpy_from(xe, snapshot->copy[i], &log->bo->vmap,
> + i * GUC_LOG_CHUNK_SIZE, size);
> remain -= size;
> }
>
> - remain = size;
> - for (i = 0; i < num_chunks; i++) {
> + snapshot->ktime = ktime_get_boottime_ns();
> + snapshot->stamp = xe_mmio_read32(gt, GUC_PMTIMESTAMP);
> + snapshot->ref_clk = gt->info.reference_clock;
> + snapshot->level = log->level;
> + snapshot->ver_found = guc->fw.versions.found[XE_UC_FW_VER_RELEASE];
> + snapshot->ver_want = guc->fw.versions.wanted;
> + snapshot->path = guc->fw.path;
shouldn't we use kstrdup() instead ?
> +
> + return snapshot;
> +}
> +
> +void xe_guc_log_snapshot_print(struct xe_device *xe, struct xe_guc_log_snapshot *snapshot,
> + struct drm_printer *p, bool atomic)
> +{
> + size_t remain;
> + int i;
> +
> + if (!snapshot) {
> + drm_printf(p, "GuC log snapshot not allocated!\n");
> + return;
> + }
> +
> + drm_printf(p, "GuC version %u.%u.%u (wanted %u.%u.%u)\n",
> + snapshot->ver_found.major, snapshot->ver_found.minor, snapshot->ver_found.patch,
> + snapshot->ver_want.major, snapshot->ver_want.minor, snapshot->ver_want.patch);
> + drm_printf(p, "GuC firmware: %s\n", snapshot->path);
> + drm_printf(p, "Kernel timestamp: 0x%08llX [%llu]\n", snapshot->ktime, snapshot->ktime);
> + drm_printf(p, "GuC timestamp: 0x%08X [%u]\n", snapshot->stamp, snapshot->stamp);
> + drm_printf(p, "CS timestamp frequency: %u Hz\n", snapshot->ref_clk);
> + drm_printf(p, "Log level: %u\n", snapshot->level);
> +
> + remain = snapshot->size;
> + for (i = 0; i < snapshot->num_chunks; i++) {
> size_t size = min(GUC_LOG_CHUNK_SIZE, remain);
>
> - xe_hexdump_blob(xe, copy[i], size, p, atomic);
> + xe_hexdump_blob(xe, snapshot->copy[i], size, p, atomic);
we really should get rid of xe in xe_hexdump_blob
> remain -= size;
> }
> +}
> +
> +void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p, bool atomic)
> +{
> + struct xe_guc_log_snapshot *snapshot;
>
> -out:
> - for (i = 0; i < num_chunks; i++)
> - kfree(copy[i]);
> - kfree(copy);
> + snapshot = xe_guc_log_snapshot_capture(log, atomic);
> + xe_guc_log_snapshot_print(log_to_xe(log), snapshot, p, atomic);
> + xe_guc_log_snapshot_free(snapshot);
> }
>
> int xe_guc_log_init(struct xe_guc_log *log)
> diff --git a/drivers/gpu/drm/xe/xe_guc_log.h b/drivers/gpu/drm/xe/xe_guc_log.h
> index 5149b492c3b8..6e4d0b369c19 100644
> --- a/drivers/gpu/drm/xe/xe_guc_log.h
> +++ b/drivers/gpu/drm/xe/xe_guc_log.h
> @@ -9,6 +9,7 @@
> #include "xe_guc_log_types.h"
>
> struct drm_printer;
> +struct xe_device;
>
> #if IS_ENABLED(CONFIG_DRM_XE_LARGE_GUC_BUFFER)
> #define CRASH_BUFFER_SIZE SZ_1M
> @@ -38,6 +39,11 @@ struct drm_printer;
>
> int xe_guc_log_init(struct xe_guc_log *log);
> void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p, bool atomic);
> +struct xe_guc_log_snapshot *xe_guc_log_snapshot_alloc(struct xe_guc_log *log, bool atomic);
> +struct xe_guc_log_snapshot *xe_guc_log_snapshot_capture(struct xe_guc_log *log, bool atomic);
> +void xe_guc_log_snapshot_print(struct xe_device *xe, struct xe_guc_log_snapshot *snapshot,
> + struct drm_printer *p, bool atomic);
> +void xe_guc_log_snapshot_free(struct xe_guc_log_snapshot *snapshot);
>
> static inline u32
> xe_guc_log_get_level(struct xe_guc_log *log)
> diff --git a/drivers/gpu/drm/xe/xe_guc_log_types.h b/drivers/gpu/drm/xe/xe_guc_log_types.h
> index 125080d138a7..e3a6a44c2f18 100644
> --- a/drivers/gpu/drm/xe/xe_guc_log_types.h
> +++ b/drivers/gpu/drm/xe/xe_guc_log_types.h
> @@ -8,8 +8,37 @@
>
> #include <linux/types.h>
>
> +#include "xe_uc_fw_types.h"
> +
> struct xe_bo;
>
> +/**
> + * struct xe_guc_log_snapshot:
> + * Capture of the GuC log plus various state useful for decoding the log
> + */
> +struct xe_guc_log_snapshot {
> + /** @size: Size in bytes of the @copy allocation */
> + size_t size;
> + /** @copy: Host memory copy of the log buffer for later dumping, split into chunks */
> + void **copy;
> + /** @num_chunks: Number of chunks withint @copy */
typo
> + int num_chunks;
> + /** @ktime: Kernel time the snapshot was taken */
> + u64 ktime;
> + /** @stamp: GuC timestamp at which the snapshot was taken */
> + u32 stamp;
> + /** @ref_clk: GuC timestamp frequency */
> + u32 ref_clk;
> + /** @level: GuC log verbosity level */
> + u32 level;
> + /** @ver_found: GuC firmware version */
> + struct xe_uc_fw_version ver_found;
didn't you mention that platform and/or revision is also mandatory to
correctly decode GuC log ?
> + /** @ver_want: GuC firmware version that driver expected */
> + struct xe_uc_fw_version ver_want;
I'm still not convinced that it's relevant to the actual GuC log what
driver wanted to use - this should be part of the GuC snapshot instead
> + /** @path: Path of GuC firmware blob */
> + const char *path;
ditto
> +};
> +
> /**
> * struct xe_guc_log - GuC log
> */
More information about the Intel-xe
mailing list