[PATCH v6 2/6] drm/xe/guc: Don't store capture nodes in xe_devcoredump_snapshot
Teres Alexis, Alan Previn
alan.previn.teres.alexis at intel.com
Thu Jan 30 17:57:56 UTC 2025
On Tue, 2025-01-28 at 10:36 -0800, Teres Alexis, Alan Previn wrote:
> GuC-Err-Capture should not be storing register snapshot
> nodes directly inside of the top level xe_devcoredump_snapshot
> structure that it doesn't control. Furthermore, that is
> not right from a driver subsystem layering perspective.
>
>
alan:snip
> diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c
> index a99e3160724b..26006d72904f 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine.c
> +++ b/drivers/gpu/drm/xe/xe_hw_engine.c
> @@ -25,6 +25,7 @@
> #include "xe_gt_mcr.h"
> #include "xe_gt_topology.h"
> #include "xe_guc_capture.h"
> +#include "xe_guc_capture_snapshot_types.h"
> #include "xe_hw_engine_group.h"
> #include "xe_hw_fence.h"
> #include "xe_irq.h"
> @@ -867,22 +868,20 @@ xe_hw_engine_snapshot_capture(struct xe_hw_engine *hwe, struct xe_exec_queue *q)
> return snapshot;
>
> if (q) {
> - /* If got guc capture, set source to GuC */
> - node = xe_guc_capture_get_matching_and_lock(q);
> - if (node) {
> - struct xe_device *xe = gt_to_xe(hwe->gt);
> - struct xe_devcoredump *coredump = &xe->devcoredump;
> -
> - coredump->snapshot.matched_node = node;
> - xe_gt_dbg(hwe->gt, "Found and locked GuC-err-capture node");
> - return snapshot;
> + /* First, retrieve the manual GuC-Error-Capture node if it exists */
> + node = xe_guc_capture_get_matching_and_lock(q, XE_ENGINE_CAPTURE_SOURCE_MANUAL);
> + /* Find preferred node type sourced from firmware if available */
> + snapshot->matched_node = xe_guc_capture_get_matching_and_lock(q, XE_ENGINE_CAPTURE_SOURCE_GUC);
> + if (!snapshot->matched_node) {
> + xe_gt_dbg(hwe->gt, "No fw sourced GuC-Err-Capture for queue %s", q->name);
> + snapshot->matched_node = node;
> + } else if (node) {
> + xe_guc_capture_put_matched_nodes(&hwe->gt->uc.guc, node);
> }
> + if (!snapshot->matched_node)
> + xe_gt_warn(hwe->gt, "Can't retrieve any GuC-Err-Capture node");
alan: a couple of the CI full-test failures were caused by this. It turns out that
we have other code paths that can attempt to generate an xe_devcoredump without being triggered
by a timed-out-job event. John Harrison gave feedback that such cases are still valid, so this should
be an xe_gt_dbg, not an xe_gt_warn. Additionally, we agreed that there is value in reporting
such cases in the dump file. So, instead of differentiating engine dumps only as "GuC source" vs
"Manual source", we could add finer differentiation: GuC-src vs Manual-early vs Manual-late.
Will add that in the next rev.
> }
>
> - /* otherwise, do manual capture */
> - xe_engine_manual_capture(hwe, snapshot);
> - xe_gt_dbg(hwe->gt, "Proceeding with manual engine snapshot");
> -
> return snapshot;
> }
>
> @@ -900,12 +899,7 @@ void xe_hw_engine_snapshot_free(struct xe_hw_engine_snapshot *snapshot)
> return;
>
> gt = snapshot->hwe->gt;
> - /*
> - * xe_guc_capture_put_matched_nodes is called here and from
> - * xe_devcoredump_snapshot_free, to cover the 2 calling paths
> - * of hw_engines - debugfs and devcoredump free.
> - */
> - xe_guc_capture_put_matched_nodes(&gt->uc.guc);
> + xe_guc_capture_put_matched_nodes(&gt->uc.guc, snapshot->matched_node);
>
> kfree(snapshot->name);
> kfree(snapshot);
> diff --git a/drivers/gpu/drm/xe/xe_hw_engine_types.h b/drivers/gpu/drm/xe/xe_hw_engine_types.h
> index de69e2628f2f..de1f82c11bcf 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine_types.h
> +++ b/drivers/gpu/drm/xe/xe_hw_engine_types.h
> @@ -152,6 +152,7 @@ struct xe_hw_engine {
> struct xe_hw_engine_group *hw_engine_group;
> };
>
> +struct xe_guc_capture_snapshot;
> /**
> * struct xe_hw_engine_snapshot - Hardware engine snapshot
> *
> @@ -175,6 +176,13 @@ struct xe_hw_engine_snapshot {
> u32 mmio_base;
> /** @kernel_reserved: Engine reserved, can't be used by userspace */
> bool kernel_reserved;
> + /**
> + * @matched_node: GuC Capture snapshot:
> + * The matched capture node for the timedout job
> + * this single-node tracker works because devcoredump will always only
> + * produce one hw-engine capture per devcoredump event
> + */
> + struct xe_guc_capture_snapshot *matched_node;
> };
>
> #endif
More information about the dri-devel
mailing list