[PATCH] drm/i915/gt: Mark the GT as dead when mmio is unreliable

Cavitt, Jonathan jonathan.cavitt at intel.com
Wed Aug 7 14:06:12 UTC 2024


-----Original Message-----
From: Andi Shyti <andi.shyti at linux.intel.com> 
Sent: Wednesday, August 7, 2024 2:10 AM
To: intel-gfx <intel-gfx at lists.freedesktop.org>; dri-devel <dri-devel at lists.freedesktop.org>
Cc: Chris Wilson <chris.p.wilson at linux.intel.com>; Das, Nirmoy <nirmoy.das at intel.com>; Cavitt, Jonathan <jonathan.cavitt at intel.com>; Andi Shyti <andi.shyti at linux.intel.com>
Subject: [PATCH] drm/i915/gt: Mark the GT as dead when mmio is unreliable
> 
> From: Chris Wilson <chris.p.wilson at intel.com>
> 
> After we detect that MMIO is returning all 0xff, we believe that the GPU
> has dropped off the PCI bus and is dead. Mark the device as wedged such
> that we can propagate the failure back to userspace and wait for
> recovery.
> 
> Signed-off-by: Chris Wilson <chris.p.wilson at intel.com>
> Signed-off-by: Andi Shyti <andi.shyti at linux.intel.com>

LGTM.
Reviewed-by: Jonathan Cavitt <jonathan.cavitt at intel.com>
-Jonathan Cavitt

> ---
>  drivers/gpu/drm/i915/gt/intel_gt.h       |  6 ++++++
>  drivers/gpu/drm/i915/gt/intel_gt_types.h |  2 ++
>  drivers/gpu/drm/i915/gt/intel_reset.c    | 12 +++++++++++-
>  drivers/gpu/drm/i915/intel_uncore.c      |  7 +++++--
>  4 files changed, 24 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h
> index b5e114d284ad..b73555889d50 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.h
> @@ -208,4 +208,10 @@ enum i915_map_type intel_gt_coherent_map_type(struct intel_gt *gt,
>  void intel_gt_bind_context_set_ready(struct intel_gt *gt);
>  void intel_gt_bind_context_set_unready(struct intel_gt *gt);
>  bool intel_gt_is_bind_context_ready(struct intel_gt *gt);
> +
> +static inline void intel_gt_set_wedged_async(struct intel_gt *gt)
> +{
> +	queue_work(system_highpri_wq, &gt->wedge);
> +}
> +
>  #endif /* __INTEL_GT_H__ */
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> index cfdd2ad5e954..bcee084b1f27 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> @@ -292,6 +292,8 @@ struct intel_gt {
>  	struct gt_defaults defaults;
>  	struct kobject *sysfs_defaults;
>  
> +	struct work_struct wedge;
> +
>  	struct i915_perf_gt perf;
>  
>  	/** link: &ggtt.gt_list */
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> index 735cd23a43c6..8f1ea95471ef 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> @@ -1013,6 +1013,15 @@ static void __intel_gt_set_wedged(struct intel_gt *gt)
>  	GT_TRACE(gt, "end\n");
>  }
>  
> +static void set_wedged_work(struct work_struct *w)
> +{
> +	struct intel_gt *gt = container_of(w, struct intel_gt, wedge);
> +	intel_wakeref_t wf;
> +
> +	with_intel_runtime_pm(gt->uncore->rpm, wf)
> +		__intel_gt_set_wedged(gt);
> +}
> +
>  void intel_gt_set_wedged(struct intel_gt *gt)
>  {
>  	intel_wakeref_t wakeref;
> @@ -1614,6 +1623,7 @@ void intel_gt_init_reset(struct intel_gt *gt)
>  	init_waitqueue_head(&gt->reset.queue);
>  	mutex_init(&gt->reset.mutex);
>  	init_srcu_struct(&gt->reset.backoff_srcu);
> +	INIT_WORK(&gt->wedge, set_wedged_work);
>  
>  	/*
>  	 * While undesirable to wait inside the shrinker, complain anyway.
> @@ -1640,7 +1650,7 @@ static void intel_wedge_me(struct work_struct *work)
>  	struct intel_wedge_me *w = container_of(work, typeof(*w), work.work);
>  
>  	gt_err(w->gt, "%s timed out, cancelling all in-flight rendering.\n", w->name);
> -	intel_gt_set_wedged(w->gt);
> +	set_wedged_work(&w->gt->wedge);
>  }
>  
>  void __intel_init_wedge(struct intel_wedge_me *w,
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index 2eba289d88ad..6aa179a3e92a 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -24,6 +24,7 @@
>  #include <drm/drm_managed.h>
>  #include <linux/pm_runtime.h>
>  
> +#include "gt/intel_gt.h"
>  #include "gt/intel_engine_regs.h"
>  #include "gt/intel_gt_regs.h"
>  
> @@ -180,14 +181,16 @@ fw_domain_wait_ack_clear(const struct intel_uncore_forcewake_domain *d)
>  	if (!wait_ack_clear(d, FORCEWAKE_KERNEL))
>  		return;
>  
> -	if (fw_ack(d) == ~0)
> +	if (fw_ack(d) == ~0) {
>  		drm_err(&d->uncore->i915->drm,
>  			"%s: MMIO unreliable (forcewake register returns 0xFFFFFFFF)!\n",
>  			intel_uncore_forcewake_domain_to_str(d->id));
> -	else
> +		intel_gt_set_wedged_async(d->uncore->gt);
> +	} else {
>  		drm_err(&d->uncore->i915->drm,
>  			"%s: timed out waiting for forcewake ack to clear.\n",
>  			intel_uncore_forcewake_domain_to_str(d->id));
> +	}
>  
>  	add_taint_for_CI(d->uncore->i915, TAINT_WARN); /* CI now unreliable */
>  }
> -- 
> 2.45.2
> 
> 


More information about the Intel-gfx mailing list