[PATCH 3/4] drm/xe: Force wedged state and block GT reset upon any GPU hang

Matthew Brost <matthew.brost@intel.com>
Thu Apr 4 17:52:15 UTC 2024


On Wed, Apr 03, 2024 at 11:07:31AM -0400, Rodrigo Vivi wrote:
> In many validation situations when debugging GPU hangs,
> it is useful to preserve the GT state from the moment
> the timeout occurred.
> 
> This patch introduces a module parameter that can be used
> in situations like this.
> 
> If the xe.wedged_mode module parameter is set to 2, Xe will be
> declared wedged on every single execution timeout (a.k.a. GPU hang),
> right after the devcoredump snapshot is captured, without attempting
> any kind of GT reset and blocking any further execution entirely.
> 
> v2: Really block gt_reset from guc side. (Lucas)
>     s/wedged/busted (Lucas)
> 
> v3: - s/busted/wedged
>     - Really use global_flags (Dafna)
>     - More robust timeout handling when wedging it.
> 
> Cc: Dafna Hirschfeld <dhirschfeld@habana.ai>
> Cc: Lucas De Marchi <lucas.demarchi@intel.com>
> Cc: Alan Previn <alan.previn.teres.alexis@intel.com>
> Cc: Himanshu Somaiya <himanshu.somaiya@intel.com>
> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
> ---
>  drivers/gpu/drm/xe/xe_device.c     | 32 +++++++++++++++++++++
>  drivers/gpu/drm/xe/xe_device.h     | 15 +---------
>  drivers/gpu/drm/xe/xe_guc_ads.c    |  9 +++++-
>  drivers/gpu/drm/xe/xe_guc_submit.c | 46 +++++++++++++++++++++++++-----
>  drivers/gpu/drm/xe/xe_module.c     |  5 ++++
>  drivers/gpu/drm/xe/xe_module.h     |  1 +
>  6 files changed, 86 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index 7015ef9b00a0..8e380a404a26 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -776,3 +776,35 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
>  {
>  	return address & GENMASK_ULL(xe->info.va_bits - 1, 0);
>  }
> +
> +/**
> + * xe_device_declare_wedged - Declare device wedged
> + * @xe: xe device instance
> + *
> + * This is a final state that can only be cleared with a module
> + * re-probe (unbind + bind).
> + * In this state every IOCTL will be blocked so the GT cannot be used.
> + * In general it will be called upon any critical error such as a GT
> + * reset failure or a GuC load failure.
> + * If the xe.wedged_mode module parameter is set to 2, this function will be
> + * called on every single execution timeout (a.k.a. GPU hang) right after the
> + * devcoredump snapshot capture. In this mode, GT reset won't be attempted, so
> + * the state of the issue is preserved for further debugging.
> + */
> +void xe_device_declare_wedged(struct xe_device *xe)
> +{
> +	if (xe_modparam.wedged_mode == 0)
> +		return;
> +
> +	if (!atomic_xchg(&xe->wedged, 1)) {
> +		xe->needs_flr_on_fini = true;
> +		drm_err(&xe->drm,
> +			"CRITICAL: Xe has declared device %s as wedged.\n"
> +			"IOCTLs and executions are blocked until device is probed again with unbind and bind operations:\n"
> +			"echo '%s' | sudo tee /sys/bus/pci/drivers/xe/unbind\n"
> +			"echo '%s' | sudo tee /sys/bus/pci/drivers/xe/bind\n"
> +			"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
> +			dev_name(xe->drm.dev), dev_name(xe->drm.dev),
> +			dev_name(xe->drm.dev));
> +	}
> +}
> diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
> index c532209c5bbd..0fea5c18f76d 100644
> --- a/drivers/gpu/drm/xe/xe_device.h
> +++ b/drivers/gpu/drm/xe/xe_device.h
> @@ -181,19 +181,6 @@ static inline bool xe_device_wedged(struct xe_device *xe)
>  	return atomic_read(&xe->wedged);
>  }
>  
> -static inline void xe_device_declare_wedged(struct xe_device *xe)
> -{
> -	if (!atomic_xchg(&xe->wedged, 1)) {
> -		xe->needs_flr_on_fini = true;
> -		drm_err(&xe->drm,
> -			"CRITICAL: Xe has declared device %s as wedged.\n"
> -			"IOCTLs and executions are blocked until device is probed again with unbind and bind operations:\n"
> -			"echo '%s' > /sys/bus/pci/drivers/xe/unbind\n"
> -			"echo '%s' > /sys/bus/pci/drivers/xe/bind\n"
> -			"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
> -			dev_name(xe->drm.dev), dev_name(xe->drm.dev),
> -			dev_name(xe->drm.dev));
> -	}
> -}
> +void xe_device_declare_wedged(struct xe_device *xe);
>  
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
> index e025f3e10c9b..37f30c333c93 100644
> --- a/drivers/gpu/drm/xe/xe_guc_ads.c
> +++ b/drivers/gpu/drm/xe/xe_guc_ads.c
> @@ -18,6 +18,7 @@
>  #include "xe_lrc.h"
>  #include "xe_map.h"
>  #include "xe_mmio.h"
> +#include "xe_module.h"
>  #include "xe_platform_types.h"
>  
>  /* Slack of a few additional entries per engine */
> @@ -313,11 +314,17 @@ int xe_guc_ads_init_post_hwconfig(struct xe_guc_ads *ads)
>  
>  static void guc_policies_init(struct xe_guc_ads *ads)
>  {
> +	u32 global_flags = 0;
> +
>  	ads_blob_write(ads, policies.dpc_promote_time,
>  		       GLOBAL_POLICY_DEFAULT_DPC_PROMOTE_TIME_US);
>  	ads_blob_write(ads, policies.max_num_work_items,
>  		       GLOBAL_POLICY_MAX_NUM_WI);
> -	ads_blob_write(ads, policies.global_flags, 0);
> +
> +	if (xe_modparam.wedged_mode == 2)
> +		global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;
> +
> +	ads_blob_write(ads, policies.global_flags, global_flags);
>  	ads_blob_write(ads, policies.is_valid, 1);
>  }
>  
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 0a2a54f69f50..0eba01582f7c 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -35,6 +35,7 @@
>  #include "xe_macros.h"
>  #include "xe_map.h"
>  #include "xe_mocs.h"
> +#include "xe_module.h"
>  #include "xe_ring_ops_types.h"
>  #include "xe_sched_job.h"
>  #include "xe_trace.h"
> @@ -900,16 +901,44 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
>  	xe_sched_submission_start(sched);
>  }
>  
> +static void guc_submit_signal_pending_jobs(struct xe_gpu_scheduler *sched,
> +					   int err)
> +{
> +	struct xe_sched_job *job;
> +	int i = 0;
> +
> +	/* Mark all outstanding jobs as bad, thus completing them */
> +	spin_lock(&sched->base.job_list_lock);
> +	list_for_each_entry(job, &sched->base.pending_list, drm.list)
> +		xe_sched_job_set_error(job, !i++ ? err : -ECANCELED);
> +	spin_unlock(&sched->base.job_list_lock);
> +}
> +
> +static void guc_submit_device_wedged(struct xe_exec_queue *q)
> +{
> +	struct xe_gpu_scheduler *sched = &q->guc->sched;
> +	struct xe_guc *guc = exec_queue_to_guc(q);
> +
> +	xe_sched_submission_stop(sched);
> +	xe_guc_exec_queue_trigger_cleanup(q);
> +	guc_submit_signal_pending_jobs(sched, -ETIME);

This will not signal the job that timed out, as that job has been removed
from the pending list. In the normal TDR path the timed out job is
re-added via an xe_sched_add_pending_job call.

Also, with xe_sched_submission_stop() I don't think free_job() will
ever be called on the pending jobs. Is that the intent?
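
For reference, the normal TDR path further down in this function does
roughly the following (paraphrasing the code quoted below):

	/* Re-add the timed out job so the signaling loop sees it */
	xe_sched_add_pending_job(sched, job);
	/* Restart submission so free_job() eventually gets called */
	xe_sched_submission_start(sched);

	xe_guc_exec_queue_trigger_cleanup(q);
	guc_submit_signal_pending_jobs(sched, err);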

 
> +	xe_guc_submit_reset_prepare(guc);
> +	xe_guc_submit_stop(guc);

This is also going to stop free_job() from being called for all jobs, on
all exec queues.

I guess I am a little confused about what this function is trying to
accomplish. Can you explain? It would help me review this.
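
If the intent is to fail everything queued on this exec queue, I would
have expected something closer to the below (untested sketch; passing
the timed out job in is my assumption, not what this patch does):

	static void guc_submit_device_wedged(struct xe_exec_queue *q,
					     struct xe_sched_job *job)
	{
		struct xe_gpu_scheduler *sched = &q->guc->sched;

		xe_sched_submission_stop(sched);
		/* Keep the timed out job visible to the signaling loop */
		xe_sched_add_pending_job(sched, job);
		guc_submit_signal_pending_jobs(sched, -ETIME);
		...
	}

But maybe I am missing something.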

> +}
> +
>  static enum drm_gpu_sched_stat
>  guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>  {
>  	struct xe_sched_job *job = to_xe_sched_job(drm_job);
> -	struct xe_sched_job *tmp_job;
>  	struct xe_exec_queue *q = job->q;
>  	struct xe_gpu_scheduler *sched = &q->guc->sched;
>  	struct xe_device *xe = guc_to_xe(exec_queue_to_guc(q));
>  	int err = -ETIME;
> -	int i = 0;
> +
> +	if (xe_device_wedged(xe)) {
> +		guc_submit_device_wedged(q);
> +		return DRM_GPU_SCHED_STAT_ENODEV;
> +	}
>  
>  	/*
>  	 * TDR has fired before free job worker. Common if exec queue
> @@ -933,6 +962,12 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>  
>  	trace_xe_sched_job_timedout(job);
>  
> +	if (xe_modparam.wedged_mode == 2) {
> +		xe_device_declare_wedged(xe);
> +		guc_submit_device_wedged(q);
> +		return DRM_GPU_SCHED_STAT_ENODEV;
> +	}
> +
>  	/* Kill the run_job entry point */
>  	xe_sched_submission_stop(sched);
>  
> @@ -994,13 +1029,10 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>  	 */
>  	xe_sched_add_pending_job(sched, job);
>  	xe_sched_submission_start(sched);
> +

Nit: Looks unrelated.

Matt

>  	xe_guc_exec_queue_trigger_cleanup(q);
>  
> -	/* Mark all outstanding jobs as bad, thus completing them */
> -	spin_lock(&sched->base.job_list_lock);
> -	list_for_each_entry(tmp_job, &sched->base.pending_list, drm.list)
> -		xe_sched_job_set_error(tmp_job, !i++ ? err : -ECANCELED);
> -	spin_unlock(&sched->base.job_list_lock);
> +	guc_submit_signal_pending_jobs(sched, err);
>  
>  	/* Start fence signaling */
>  	xe_hw_fence_irq_start(q->fence_irq);
> diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
> index 110b69864656..5e023df0bea9 100644
> --- a/drivers/gpu/drm/xe/xe_module.c
> +++ b/drivers/gpu/drm/xe/xe_module.c
> @@ -17,6 +17,7 @@ struct xe_modparam xe_modparam = {
>  	.enable_display = true,
>  	.guc_log_level = 5,
>  	.force_probe = CONFIG_DRM_XE_FORCE_PROBE,
> +	.wedged_mode = 1,
>  	/* the rest are 0 by default */
>  };
>  
> @@ -48,6 +49,10 @@ module_param_named_unsafe(force_probe, xe_modparam.force_probe, charp, 0400);
>  MODULE_PARM_DESC(force_probe,
>  		 "Force probe options for specified devices. See CONFIG_DRM_XE_FORCE_PROBE for details.");
>  
> +module_param_named_unsafe(wedged_mode, xe_modparam.wedged_mode, int, 0600);
> +MODULE_PARM_DESC(wedged_mode,
> +		 "Module's default policy for the wedged mode - 0=never, 1=upon-critical-errors[default], 2=upon-any-hang");
> +
>  struct init_funcs {
>  	int (*init)(void);
>  	void (*exit)(void);
> diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h
> index 88ef0e8b2bfd..bc6f370c9a8e 100644
> --- a/drivers/gpu/drm/xe/xe_module.h
> +++ b/drivers/gpu/drm/xe/xe_module.h
> @@ -18,6 +18,7 @@ struct xe_modparam {
>  	char *huc_firmware_path;
>  	char *gsc_firmware_path;
>  	char *force_probe;
> +	int wedged_mode;
>  };
>  
>  extern struct xe_modparam xe_modparam;
> -- 
> 2.44.0
> 

