[PATCH 3/4] drm/xe: Force wedged state and block GT reset upon any GPU hang

Lucas De Marchi lucas.demarchi at intel.com
Tue Apr 16 17:19:50 UTC 2024


On Tue, Apr 09, 2024 at 06:15:06PM GMT, Rodrigo Vivi wrote:
>In many validation scenarios when debugging GPU hangs,
>it is useful to preserve the GT state from the moment
>the timeout occurred.
>
>This patch introduces a module parameter that can be used
>in situations like this.
>
>If the xe.wedged_mode module parameter is set to 2, Xe will be
>declared wedged on every single execution timeout (a.k.a. GPU hang)
>right after devcoredump snapshot capture, without attempting any
>kind of GT reset and blocking any kind of execution entirely.
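btw, for anyone trying this out: assuming the parameter lands as
wedged_mode (matching the xe_modparam.wedged_mode field used below;
the xe_module.c hunk isn't quoted here), enabling it would look
something like:

	# at module load time
	modprobe xe wedged_mode=2

	# or, if the parameter is writable at runtime
	echo 2 | sudo tee /sys/module/xe/parameters/wedged_mode
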
>
>v2: Really block gt_reset from guc side. (Lucas)
>    s/wedged/busted (Lucas)
>
>v3: - s/busted/wedged
>    - Really use global_flags (Dafna)
>    - More robust timeout handling when wedging it.
>
>v4: A really robust clean exit done by Matt Brost.
>    No more kernel warns on unbind.
>
>Cc: Matthew Brost <matthew.brost at intel.com>
>Cc: Dafna Hirschfeld <dhirschfeld at habana.ai>
>Cc: Lucas De Marchi <lucas.demarchi at intel.com>
>Cc: Alan Previn <alan.previn.teres.alexis at intel.com>
>Cc: Himanshu Somaiya <himanshu.somaiya at intel.com>
>Signed-off-by: Rodrigo Vivi <rodrigo.vivi at intel.com>
>---
> drivers/gpu/drm/xe/xe_device.c              | 32 ++++++++
> drivers/gpu/drm/xe/xe_device.h              | 15 +---
> drivers/gpu/drm/xe/xe_exec_queue.h          |  9 +++
> drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c |  2 +-
> drivers/gpu/drm/xe/xe_guc_ads.c             |  9 ++-
> drivers/gpu/drm/xe/xe_guc_submit.c          | 90 +++++++++++++++++----
> drivers/gpu/drm/xe/xe_module.c              |  5 ++
> drivers/gpu/drm/xe/xe_module.h              |  1 +
> 8 files changed, 132 insertions(+), 31 deletions(-)
>
>diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
>index 67de795e43b3..7928a5470cee 100644
>--- a/drivers/gpu/drm/xe/xe_device.c
>+++ b/drivers/gpu/drm/xe/xe_device.c
>@@ -785,3 +785,35 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
> {
> 	return address & GENMASK_ULL(xe->info.va_bits - 1, 0);
> }
>+
>+/**
>+ * xe_device_declare_wedged - Declare device wedged
>+ * @xe: xe device instance
>+ *
>+ * This is a final state that can only be cleared with a device
>+ * re-probe (unbind + bind).
>+ * In this state every IOCTL will be blocked so the GT cannot be used.
>+ * In general it will be called upon any critical error such as GT reset
>+ * failure or GuC loading failure.
>+ * If the xe.wedged_mode module parameter is set to 2, this function will be
>+ * called on every single execution timeout (a.k.a. GPU hang) right after
>+ * devcoredump snapshot capture. In this mode, GT reset won't be attempted so
>+ * the state of the issue is preserved for further debugging.
>+ */
>+void xe_device_declare_wedged(struct xe_device *xe)

nit: this function could have been added to this file already in the
previous patches.

>+{
>+	if (xe_modparam.wedged_mode == 0)
>+		return;

^ so this would be the only diff

>+
>+	if (!atomic_xchg(&xe->wedged, 1)) {
>+		xe->needs_flr_on_fini = true;
>+		drm_err(&xe->drm,
>+			"CRITICAL: Xe has declared device %s as wedged.\n"
>+			"IOCTLs and executions are blocked until device is probed again with unbind and bind operations:\n"
>+			"echo '%s' | sudo tee /sys/bus/pci/drivers/xe/unbind\n"
>+			"echo '%s' | sudo tee /sys/bus/pci/drivers/xe/bind\n"

this brings back the old version. Alternatively, we could simplify the
error message with:

  		drm_err(&xe->drm,
  			"CRITICAL: Xe has declared device %s as wedged.\n"
  			"IOCTLs and executions are blocked. Only a rebind may clear the failure\n"
  			"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
  			dev_name(xe->drm.dev));


up to you.

>+			"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
>+			dev_name(xe->drm.dev), dev_name(xe->drm.dev),
>+			dev_name(xe->drm.dev));
>+	}
>+}
>diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
>index c532209c5bbd..0fea5c18f76d 100644
>--- a/drivers/gpu/drm/xe/xe_device.h
>+++ b/drivers/gpu/drm/xe/xe_device.h
>@@ -181,19 +181,6 @@ static inline bool xe_device_wedged(struct xe_device *xe)
> 	return atomic_read(&xe->wedged);
> }
>
>-static inline void xe_device_declare_wedged(struct xe_device *xe)
>-{
>-	if (!atomic_xchg(&xe->wedged, 1)) {
>-		xe->needs_flr_on_fini = true;
>-		drm_err(&xe->drm,
>-			"CRITICAL: Xe has declared device %s as wedged.\n"
>-			"IOCTLs and executions are blocked until device is probed again with unbind and bind operations:\n"
>-			"echo '%s' > /sys/bus/pci/drivers/xe/unbind\n"
>-			"echo '%s' > /sys/bus/pci/drivers/xe/bind\n"
>-			"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
>-			dev_name(xe->drm.dev), dev_name(xe->drm.dev),
>-			dev_name(xe->drm.dev));
>-	}
>-}
>+void xe_device_declare_wedged(struct xe_device *xe);
>
> #endif
>diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h
>index 02ce8d204622..48f6da53a292 100644
>--- a/drivers/gpu/drm/xe/xe_exec_queue.h
>+++ b/drivers/gpu/drm/xe/xe_exec_queue.h
>@@ -26,6 +26,15 @@ void xe_exec_queue_fini(struct xe_exec_queue *q);
> void xe_exec_queue_destroy(struct kref *ref);
> void xe_exec_queue_assign_name(struct xe_exec_queue *q, u32 instance);
>
>+static inline struct xe_exec_queue *
>+xe_exec_queue_get_unless_zero(struct xe_exec_queue *q)
>+{
>+	if (kref_get_unless_zero(&q->refcount))
>+		return q;
>+
>+	return NULL;
>+}
>+
> struct xe_exec_queue *xe_exec_queue_lookup(struct xe_file *xef, u32 id);
>
> static inline struct xe_exec_queue *xe_exec_queue_get(struct xe_exec_queue *q)
>diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
>index 93df2d7969b3..8e9c4b990fbb 100644
>--- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
>+++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
>@@ -245,7 +245,7 @@ int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt)
> 			return seqno;
>
> 		xe_gt_tlb_invalidation_wait(gt, seqno);
>-	} else if (xe_device_uc_enabled(xe)) {
>+	} else if (xe_device_uc_enabled(xe) && !xe_device_wedged(xe)) {
> 		xe_gt_WARN_ON(gt, xe_force_wake_get(gt_to_fw(gt), XE_FW_GT));
> 		if (xe->info.platform == XE_PVC || GRAPHICS_VER(xe) >= 20) {
> 			xe_mmio_write32(gt, PVC_GUC_TLB_INV_DESC1,
>diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
>index 757cbbb87869..dbd88ae20aa3 100644
>--- a/drivers/gpu/drm/xe/xe_guc_ads.c
>+++ b/drivers/gpu/drm/xe/xe_guc_ads.c
>@@ -20,6 +20,7 @@
> #include "xe_lrc.h"
> #include "xe_map.h"
> #include "xe_mmio.h"
>+#include "xe_module.h"
> #include "xe_platform_types.h"
> #include "xe_wa.h"
>
>@@ -394,11 +395,17 @@ int xe_guc_ads_init_post_hwconfig(struct xe_guc_ads *ads)
>
> static void guc_policies_init(struct xe_guc_ads *ads)
> {
>+	u32 global_flags = 0;
>+
> 	ads_blob_write(ads, policies.dpc_promote_time,
> 		       GLOBAL_POLICY_DEFAULT_DPC_PROMOTE_TIME_US);
> 	ads_blob_write(ads, policies.max_num_work_items,
> 		       GLOBAL_POLICY_MAX_NUM_WI);
>-	ads_blob_write(ads, policies.global_flags, 0);
>+
>+	if (xe_modparam.wedged_mode == 2)
>+		global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;
>+
>+	ads_blob_write(ads, policies.global_flags, global_flags);
> 	ads_blob_write(ads, policies.is_valid, 1);
> }
>
>diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
>index c7d38469fb46..0bea17536659 100644
>--- a/drivers/gpu/drm/xe/xe_guc_submit.c
>+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
>@@ -35,6 +35,7 @@
> #include "xe_macros.h"
> #include "xe_map.h"
> #include "xe_mocs.h"
>+#include "xe_module.h"
> #include "xe_ring_ops_types.h"
> #include "xe_sched_job.h"
> #include "xe_trace.h"
>@@ -59,6 +60,7 @@ exec_queue_to_guc(struct xe_exec_queue *q)
> #define ENGINE_STATE_SUSPENDED		(1 << 5)
> #define EXEC_QUEUE_STATE_RESET		(1 << 6)
> #define ENGINE_STATE_KILLED		(1 << 7)
>+#define EXEC_QUEUE_STATE_WEDGED		(1 << 8)
>
> static bool exec_queue_registered(struct xe_exec_queue *q)
> {
>@@ -175,9 +177,20 @@ static void set_exec_queue_killed(struct xe_exec_queue *q)
> 	atomic_or(ENGINE_STATE_KILLED, &q->guc->state);
> }
>
>-static bool exec_queue_killed_or_banned(struct xe_exec_queue *q)
>+static bool exec_queue_wedged(struct xe_exec_queue *q)
> {
>-	return exec_queue_killed(q) || exec_queue_banned(q);
>+	return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_WEDGED;
>+}
>+
>+static void set_exec_queue_wedged(struct xe_exec_queue *q)
>+{
>+	atomic_or(EXEC_QUEUE_STATE_WEDGED, &q->guc->state);
>+}
>+
>+static bool exec_queue_killed_or_banned_or_wedged(struct xe_exec_queue *q)
>+{
>+	return exec_queue_banned(q) || (atomic_read(&q->guc->state) &
>+		(EXEC_QUEUE_STATE_WEDGED | ENGINE_STATE_KILLED));
> }
>
> #ifdef CONFIG_PROVE_LOCKING
>@@ -240,6 +253,17 @@ static void guc_submit_fini(struct drm_device *drm, void *arg)
> 	free_submit_wq(guc);
> }
>
>+static void guc_submit_wedged_fini(struct drm_device *drm, void *arg)
>+{
>+	struct xe_guc *guc = arg;
>+	struct xe_exec_queue *q;
>+	unsigned long index;
>+
>+	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
>+		if (exec_queue_wedged(q))
>+			xe_exec_queue_put(q);
>+}
>+
> static const struct xe_exec_queue_ops guc_exec_queue_ops;
>
> static void primelockdep(struct xe_guc *guc)
>@@ -708,7 +732,7 @@ guc_exec_queue_run_job(struct drm_sched_job *drm_job)
>
> 	trace_xe_sched_job_run(job);
>
>-	if (!exec_queue_killed_or_banned(q) && !xe_sched_job_is_error(job)) {
>+	if (!exec_queue_killed_or_banned_or_wedged(q) && !xe_sched_job_is_error(job)) {
> 		if (!exec_queue_registered(q))
> 			register_engine(q);
> 		if (!lr)	/* LR jobs are emitted in the exec IOCTL */
>@@ -844,6 +868,28 @@ static void xe_guc_exec_queue_trigger_cleanup(struct xe_exec_queue *q)
> 		xe_sched_tdr_queue_imm(&q->guc->sched);
> }
>
>+static void guc_submit_wedged(struct xe_guc *guc)
>+{
>+	struct xe_exec_queue *q;
>+	unsigned long index;
>+	int err;
>+
>+	xe_device_declare_wedged(guc_to_xe(guc));
>+	xe_guc_submit_reset_prepare(guc);
>+	xe_guc_ct_stop(&guc->ct);
>+
>+	err = drmm_add_action_or_reset(&guc_to_xe(guc)->drm,
>+				       guc_submit_wedged_fini, guc);
>+	if (err)
>+		return;
>+
>+	mutex_lock(&guc->submission_state.lock);
>+	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
>+		if (xe_exec_queue_get_unless_zero(q))
>+			set_exec_queue_wedged(q);
>+	mutex_unlock(&guc->submission_state.lock);
>+}
>+
> static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
> {
> 	struct xe_guc_exec_queue *ge =
>@@ -852,10 +898,16 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
> 	struct xe_guc *guc = exec_queue_to_guc(q);
> 	struct xe_device *xe = guc_to_xe(guc);
> 	struct xe_gpu_scheduler *sched = &ge->sched;
>+	bool wedged = xe_device_wedged(xe);
>
> 	xe_assert(xe, xe_exec_queue_is_lr(q));
> 	trace_xe_exec_queue_lr_cleanup(q);
>
>+	if (!wedged && xe_modparam.wedged_mode == 2) {

I wonder if these checks shouldn't be inside guc_submit_wedged()

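e.g. something like this (a rough sketch, untested; the bool return so
the caller collapses to one line is just my take, everything else
reuses helpers already introduced in this patch):

	static bool guc_submit_wedged(struct xe_guc *guc)
	{
		struct xe_device *xe = guc_to_xe(guc);
		struct xe_exec_queue *q;
		unsigned long index;
		int err;

		/* already wedged, e.g. by a critical error elsewhere */
		if (xe_device_wedged(xe))
			return true;

		if (xe_modparam.wedged_mode != 2)
			return false;

		xe_device_declare_wedged(xe);
		xe_guc_submit_reset_prepare(guc);
		xe_guc_ct_stop(&guc->ct);

		err = drmm_add_action_or_reset(&xe->drm,
					       guc_submit_wedged_fini, guc);
		if (err)
			return true;

		mutex_lock(&guc->submission_state.lock);
		xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
			if (xe_exec_queue_get_unless_zero(q))
				set_exec_queue_wedged(q);
		mutex_unlock(&guc->submission_state.lock);

		return true;
	}

so the callers become just

	wedged = guc_submit_wedged(guc);
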

anyway,

Reviewed-by: Lucas De Marchi <lucas.demarchi at intel.com>

Lucas De Marchi

