[PATCH] drm/i915/gt: Mark the GT as dead when mmio is unreliable
Cavitt, Jonathan
jonathan.cavitt at intel.com
Wed Aug 7 14:06:12 UTC 2024
-----Original Message-----
From: Andi Shyti <andi.shyti at linux.intel.com>
Sent: Wednesday, August 7, 2024 2:10 AM
To: intel-gfx <intel-gfx at lists.freedesktop.org>; dri-devel <dri-devel at lists.freedesktop.org>
Cc: Chris Wilson <chris.p.wilson at linux.intel.com>; Das, Nirmoy <nirmoy.das at intel.com>; Cavitt, Jonathan <jonathan.cavitt at intel.com>; Andi Shyti <andi.shyti at linux.intel.com>
Subject: [PATCH] drm/i915/gt: Mark the GT as dead when mmio is unreliable
>
> From: Chris Wilson <chris.p.wilson at intel.com>
>
> After we detect that mmio is returning all 0xff, we believe that the GPU
> has dropped off the pci bus and is dead. Mark the device as wedged such
> that we can propagate the failure back to userspace and wait for
> recovery.
>
> Signed-off-by: Chris Wilson <chris.p.wilson at intel.com>
> Signed-off-by: Andi Shyti <andi.shyti at linux.intel.com>
LGTM.
Reviewed-by: Jonathan Cavitt <jonathan.cavitt at intel.com>
-Jonathan Cavitt
> ---
> drivers/gpu/drm/i915/gt/intel_gt.h | 6 ++++++
> drivers/gpu/drm/i915/gt/intel_gt_types.h | 2 ++
> drivers/gpu/drm/i915/gt/intel_reset.c | 12 +++++++++++-
> drivers/gpu/drm/i915/intel_uncore.c | 7 +++++--
> 4 files changed, 24 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h
> index b5e114d284ad..b73555889d50 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.h
> @@ -208,4 +208,10 @@ enum i915_map_type intel_gt_coherent_map_type(struct intel_gt *gt,
> void intel_gt_bind_context_set_ready(struct intel_gt *gt);
> void intel_gt_bind_context_set_unready(struct intel_gt *gt);
> bool intel_gt_is_bind_context_ready(struct intel_gt *gt);
> +
> +static inline void intel_gt_set_wedged_async(struct intel_gt *gt)
> +{
> + queue_work(system_highpri_wq, &gt->wedge);
> +}
> +
> #endif /* __INTEL_GT_H__ */
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> index cfdd2ad5e954..bcee084b1f27 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> @@ -292,6 +292,8 @@ struct intel_gt {
> struct gt_defaults defaults;
> struct kobject *sysfs_defaults;
>
> + struct work_struct wedge;
> +
> struct i915_perf_gt perf;
>
> /** link: &ggtt.gt_list */
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> index 735cd23a43c6..8f1ea95471ef 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> @@ -1013,6 +1013,15 @@ static void __intel_gt_set_wedged(struct intel_gt *gt)
> GT_TRACE(gt, "end\n");
> }
>
> +static void set_wedged_work(struct work_struct *w)
> +{
> + struct intel_gt *gt = container_of(w, struct intel_gt, wedge);
> + intel_wakeref_t wf;
> +
> + with_intel_runtime_pm(gt->uncore->rpm, wf)
> + __intel_gt_set_wedged(gt);
> +}
> +
> void intel_gt_set_wedged(struct intel_gt *gt)
> {
> intel_wakeref_t wakeref;
> @@ -1614,6 +1623,7 @@ void intel_gt_init_reset(struct intel_gt *gt)
> init_waitqueue_head(&gt->reset.queue);
> mutex_init(&gt->reset.mutex);
> init_srcu_struct(&gt->reset.backoff_srcu);
> + INIT_WORK(&gt->wedge, set_wedged_work);
>
> /*
> * While undesirable to wait inside the shrinker, complain anyway.
> @@ -1640,7 +1650,7 @@ static void intel_wedge_me(struct work_struct *work)
> struct intel_wedge_me *w = container_of(work, typeof(*w), work.work);
>
> gt_err(w->gt, "%s timed out, cancelling all in-flight rendering.\n", w->name);
> - intel_gt_set_wedged(w->gt);
> + set_wedged_work(&w->gt->wedge);
> }
>
> void __intel_init_wedge(struct intel_wedge_me *w,
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index 2eba289d88ad..6aa179a3e92a 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -24,6 +24,7 @@
> #include <drm/drm_managed.h>
> #include <linux/pm_runtime.h>
>
> +#include "gt/intel_gt.h"
> #include "gt/intel_engine_regs.h"
> #include "gt/intel_gt_regs.h"
>
> @@ -180,14 +181,16 @@ fw_domain_wait_ack_clear(const struct intel_uncore_forcewake_domain *d)
> if (!wait_ack_clear(d, FORCEWAKE_KERNEL))
> return;
>
> - if (fw_ack(d) == ~0)
> + if (fw_ack(d) == ~0) {
> drm_err(&d->uncore->i915->drm,
> "%s: MMIO unreliable (forcewake register returns 0xFFFFFFFF)!\n",
> intel_uncore_forcewake_domain_to_str(d->id));
> - else
> + intel_gt_set_wedged_async(d->uncore->gt);
> + } else {
> drm_err(&d->uncore->i915->drm,
> "%s: timed out waiting for forcewake ack to clear.\n",
> intel_uncore_forcewake_domain_to_str(d->id));
> + }
>
> add_taint_for_CI(d->uncore->i915, TAINT_WARN); /* CI now unreliable */
> }
> --
> 2.45.2
>
>
More information about the Intel-gfx
mailing list