[PATCH 3/4] drm/xe: Force wedged state and block GT reset upon any GPU hang

Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>
Wed Apr 24 03:20:45 UTC 2024


On 24-04-2024 03:48, Rodrigo Vivi wrote:
> In many validation situations when debugging GPU Hangs,
> it is useful to preserve the GT situation from the moment
> that the timeout occurred.
>
> This patch introduces a module parameter that could be used
> on situations like this.
>
> If the xe.wedged_mode module parameter is set to 2, Xe will be declared
> wedged on every single execution timeout (a.k.a. GPU hang) right
> after devcoredump snapshot capture and without attempting any
> kind of GT reset and blocking entirely any kind of execution.
>
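
Condensed, the mode-2 behavior described above boils down to the following
check, which both timeout handlers in the diff below perform. This is a
simplified sketch, not a verbatim excerpt; the patch open-codes the check
rather than adding a helper like this:

    /* Hypothetical helper; the patch inlines this in the TDR paths. */
    static bool maybe_wedge_on_timeout(struct xe_exec_queue *q)
    {
            struct xe_device *xe = guc_to_xe(exec_queue_to_guc(q));
            bool wedged = xe_device_wedged(xe);

            if (!wedged && xe_modparam.wedged_mode == 2) {
                    /* devcoredump was already captured; stop all
                     * submission instead of attempting any GT reset */
                    guc_submit_wedged(exec_queue_to_guc(q));
                    wedged = true;
            }

            return wedged;
    }
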
> v2: Really block gt_reset from guc side. (Lucas)
>      s/wedged/busted (Lucas)
>
> v3: - s/busted/wedged
>      - Really use global_flags (Dafna)
>      - More robust timeout handling when wedging it.
>
> v4: A really robust clean exit done by Matt Brost.
>      No more kernel warns on unbind.
>
> v5: Simplify error message (Lucas)
>
> Cc: Matthew Brost <matthew.brost@intel.com>
> Cc: Dafna Hirschfeld <dhirschfeld@habana.ai>
> Cc: Lucas De Marchi <lucas.demarchi@intel.com>
> Cc: Alan Previn <alan.previn.teres.alexis@intel.com>
> Cc: Himanshu Somaiya <himanshu.somaiya@intel.com>
> Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com>
> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
> ---
>   drivers/gpu/drm/xe/xe_device.c              | 29 +++++++
>   drivers/gpu/drm/xe/xe_device.h              | 15 +---
>   drivers/gpu/drm/xe/xe_exec_queue.h          |  9 +++
>   drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c |  2 +-
>   drivers/gpu/drm/xe/xe_guc_ads.c             |  9 ++-
>   drivers/gpu/drm/xe/xe_guc_submit.c          | 90 +++++++++++++++++----
>   drivers/gpu/drm/xe/xe_module.c              |  5 ++
>   drivers/gpu/drm/xe/xe_module.h              |  1 +
>   8 files changed, 129 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index 76a7b37a4a53..d45db6ff1fa3 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -764,3 +764,32 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
>   {
>   	return address & GENMASK_ULL(xe->info.va_bits - 1, 0);
>   }
> +
> +/**
> + * xe_device_declare_wedged - Declare device wedged
> + * @xe: xe device instance
> + *
> + * This is a final state that can only be cleared with a module
> + * re-probe (unbind + bind).
> + * In this state every IOCTL will be blocked so the GT cannot be used.
> + * In general it will be called upon any critical error such as gt reset
> + * failure or guc loading failure.
> + * If the xe.wedged_mode module parameter is set to 2, this function will be called
> + * on every single execution timeout (a.k.a. GPU hang) right after devcoredump
> + * snapshot capture. In this mode, GT reset won't be attempted so the state of
> + * the issue is preserved for further debugging.
> + */
> +void xe_device_declare_wedged(struct xe_device *xe)
> +{
> +	if (xe_modparam.wedged_mode == 0)
> +		return;
> +
> +	if (!atomic_xchg(&xe->wedged, 1)) {
> +		xe->needs_flr_on_fini = true;
> +		drm_err(&xe->drm,
> +			"CRITICAL: Xe has declared device %s as wedged.\n"
> +			"IOCTLs and executions are blocked. Only a rebind may clear the failure\n"
> +			"Please file a _new_ bug report athttps://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
> +			dev_name(xe->drm.dev));
> +	}
> +}
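
For context on "every IOCTL will be blocked" in the kernel-doc above: the
gate itself is not part of this hunk. A minimal sketch of how the common
IOCTL path consumes xe_device_wedged() follows; the call site and error
code track what this series sets up elsewhere, so treat this as
illustrative rather than the exact code:

    static long xe_drm_ioctl(struct file *file, unsigned int cmd,
                             unsigned long arg)
    {
            struct drm_file *file_priv = file->private_data;
            struct xe_device *xe = to_xe_device(file_priv->minor->dev);

            if (xe_device_wedged(xe))
                    return -ECANCELED; /* blocked until unbind + bind */

            return drm_ioctl(file, cmd, arg);
    }
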
> diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
> index d2e4249d37ce..9ede45fc062a 100644
> --- a/drivers/gpu/drm/xe/xe_device.h
> +++ b/drivers/gpu/drm/xe/xe_device.h
> @@ -172,19 +172,6 @@ static inline bool xe_device_wedged(struct xe_device *xe)
>   	return atomic_read(&xe->wedged);
>   }
>   
> -static inline void xe_device_declare_wedged(struct xe_device *xe)
> -{
> -	if (!atomic_xchg(&xe->wedged, 1)) {
> -		xe->needs_flr_on_fini = true;
> -		drm_err(&xe->drm,
> -			"CRITICAL: Xe has declared device %s as wedged.\n"
> -			"IOCTLs and executions are blocked until device is probed again with unbind and bind operations:\n"
> -			"echo '%s' > /sys/bus/pci/drivers/xe/unbind\n"
> -			"echo '%s' > /sys/bus/pci/drivers/xe/bind\n"
> -			"Please file a _new_ bug report athttps://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
> -			dev_name(xe->drm.dev), dev_name(xe->drm.dev),
> -			dev_name(xe->drm.dev));
> -	}
> -}
> +void xe_device_declare_wedged(struct xe_device *xe);
>   
>   #endif
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h
> index 02ce8d204622..48f6da53a292 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.h
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.h
> @@ -26,6 +26,15 @@ void xe_exec_queue_fini(struct xe_exec_queue *q);
>   void xe_exec_queue_destroy(struct kref *ref);
>   void xe_exec_queue_assign_name(struct xe_exec_queue *q, u32 instance);
>   
> +static inline struct xe_exec_queue *
> +xe_exec_queue_get_unless_zero(struct xe_exec_queue *q)
> +{
> +	if (kref_get_unless_zero(&q->refcount))
> +		return q;
> +
> +	return NULL;
> +}
> +
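
A note on this helper: kref_get_unless_zero() takes the reference only
while the count is still non-zero, so a lookup that races with the final
xe_exec_queue_put() gets NULL back instead of resurrecting a queue that
is already being torn down. A generic, self-contained sketch of the
pattern (illustrative names, not from this patch):

    #include <linux/kref.h>
    #include <linux/slab.h>

    struct obj {
            struct kref refcount;
    };

    static void obj_release(struct kref *kref)
    {
            kfree(container_of(kref, struct obj, refcount));
    }

    static void obj_put(struct obj *o)
    {
            kref_put(&o->refcount, obj_release);
    }

    /* Never "revives" an object whose final put has already begun. */
    static struct obj *obj_tryget(struct obj *o)
    {
            return kref_get_unless_zero(&o->refcount) ? o : NULL;
    }
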
>   struct xe_exec_queue *xe_exec_queue_lookup(struct xe_file *xef, u32 id);
>   
>   static inline struct xe_exec_queue *xe_exec_queue_get(struct xe_exec_queue *q)
> diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> index 93df2d7969b3..8e9c4b990fbb 100644
> --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> @@ -245,7 +245,7 @@ int xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt)
>   			return seqno;
>   
>   		xe_gt_tlb_invalidation_wait(gt, seqno);
> -	} else if (xe_device_uc_enabled(xe)) {
> +	} else if (xe_device_uc_enabled(xe) && !xe_device_wedged(xe)) {
>   		xe_gt_WARN_ON(gt, xe_force_wake_get(gt_to_fw(gt), XE_FW_GT));
>   		if (xe->info.platform == XE_PVC || GRAPHICS_VER(xe) >= 20) {
>   			xe_mmio_write32(gt, PVC_GUC_TLB_INV_DESC1,
> diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
> index 1aafa486edec..db817a46f157 100644
> --- a/drivers/gpu/drm/xe/xe_guc_ads.c
> +++ b/drivers/gpu/drm/xe/xe_guc_ads.c
> @@ -20,6 +20,7 @@
>   #include "xe_lrc.h"
>   #include "xe_map.h"
>   #include "xe_mmio.h"
> +#include "xe_module.h"
>   #include "xe_platform_types.h"
>   #include "xe_wa.h"
>   
> @@ -440,11 +441,17 @@ int xe_guc_ads_init_post_hwconfig(struct xe_guc_ads *ads)
>   
>   static void guc_policies_init(struct xe_guc_ads *ads)
>   {
> +	u32 global_flags = 0;
> +
>   	ads_blob_write(ads, policies.dpc_promote_time,
>   		       GLOBAL_POLICY_DEFAULT_DPC_PROMOTE_TIME_US);
>   	ads_blob_write(ads, policies.max_num_work_items,
>   		       GLOBAL_POLICY_MAX_NUM_WI);
> -	ads_blob_write(ads, policies.global_flags, 0);
> +
> +	if (xe_modparam.wedged_mode == 2)
> +		global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;
> +
> +	ads_blob_write(ads, policies.global_flags, global_flags);
>   	ads_blob_write(ads, policies.is_valid, 1);
>   }
>   
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index c7d38469fb46..0bea17536659 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -35,6 +35,7 @@
>   #include "xe_macros.h"
>   #include "xe_map.h"
>   #include "xe_mocs.h"
> +#include "xe_module.h"
>   #include "xe_ring_ops_types.h"
>   #include "xe_sched_job.h"
>   #include "xe_trace.h"
> @@ -59,6 +60,7 @@ exec_queue_to_guc(struct xe_exec_queue *q)
>   #define ENGINE_STATE_SUSPENDED		(1 << 5)
>   #define EXEC_QUEUE_STATE_RESET		(1 << 6)
>   #define ENGINE_STATE_KILLED		(1 << 7)
> +#define EXEC_QUEUE_STATE_WEDGED		(1 << 8)
>   
>   static bool exec_queue_registered(struct xe_exec_queue *q)
>   {
> @@ -175,9 +177,20 @@ static void set_exec_queue_killed(struct xe_exec_queue *q)
>   	atomic_or(ENGINE_STATE_KILLED, &q->guc->state);
>   }
>   
> -static bool exec_queue_killed_or_banned(struct xe_exec_queue *q)
> +static bool exec_queue_wedged(struct xe_exec_queue *q)
>   {
> -	return exec_queue_killed(q) || exec_queue_banned(q);
> +	return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_WEDGED;
> +}
> +
> +static void set_exec_queue_wedged(struct xe_exec_queue *q)
> +{
> +	atomic_or(EXEC_QUEUE_STATE_WEDGED, &q->guc->state);
> +}
> +
> +static bool exec_queue_killed_or_banned_or_wedged(struct xe_exec_queue *q)
> +{
> +	return exec_queue_banned(q) || (atomic_read(&q->guc->state) &
> +		(EXEC_QUEUE_STATE_WEDGED | ENGINE_STATE_KILLED));
>   }
>   
>   #ifdef CONFIG_PROVE_LOCKING
> @@ -240,6 +253,17 @@ static void guc_submit_fini(struct drm_device *drm, void *arg)
>   	free_submit_wq(guc);
>   }
>   
> +static void guc_submit_wedged_fini(struct drm_device *drm, void *arg)
> +{
> +	struct xe_guc *guc = arg;
> +	struct xe_exec_queue *q;
> +	unsigned long index;
> +
> +	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
> +		if (exec_queue_wedged(q))
> +			xe_exec_queue_put(q);
> +}
> +
>   static const struct xe_exec_queue_ops guc_exec_queue_ops;
>   
>   static void primelockdep(struct xe_guc *guc)
> @@ -708,7 +732,7 @@ guc_exec_queue_run_job(struct drm_sched_job *drm_job)
>   
>   	trace_xe_sched_job_run(job);
>   
> -	if (!exec_queue_killed_or_banned(q) && !xe_sched_job_is_error(job)) {
> +	if (!exec_queue_killed_or_banned_or_wedged(q) && !xe_sched_job_is_error(job)) {
>   		if (!exec_queue_registered(q))
>   			register_engine(q);
>   		if (!lr)	/* LR jobs are emitted in the exec IOCTL */
> @@ -844,6 +868,28 @@ static void xe_guc_exec_queue_trigger_cleanup(struct xe_exec_queue *q)
>   		xe_sched_tdr_queue_imm(&q->guc->sched);
>   }
>   
> +static void guc_submit_wedged(struct xe_guc *guc)
> +{
> +	struct xe_exec_queue *q;
> +	unsigned long index;
> +	int err;
> +
> +	xe_device_declare_wedged(guc_to_xe(guc));
> +	xe_guc_submit_reset_prepare(guc);
> +	xe_guc_ct_stop(&guc->ct);
> +
> +	err = drmm_add_action_or_reset(&guc_to_xe(guc)->drm,
> +				       guc_submit_wedged_fini, guc);
> +	if (err)
> +		return;
> +
> +	mutex_lock(&guc->submission_state.lock);
> +	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
> +		if (xe_exec_queue_get_unless_zero(q))
> +			set_exec_queue_wedged(q);
> +	mutex_unlock(&guc->submission_state.lock);
> +}
> +
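
One subtlety in guc_submit_wedged() above: the reference taken on each
queue under the submission_state lock is what keeps wedged queues alive
for post-mortem inspection. guc_submit_wedged_fini() drops those
references, and drmm_add_action_or_reset() ties it to the drm_device
lifetime so it runs at unbind; if registration fails, the action runs
(and the refs would be dropped) immediately, hence the early return.
A minimal sketch of that managed-action API (illustrative, not from
this patch):

    #include <drm/drm_managed.h>

    static void drop_refs_action(struct drm_device *drm, void *arg)
    {
            /* Runs when the drm_device is released (e.g. at unbind),
             * or right away if the registration below fails. */
    }

    static int arm_drop_refs(struct drm_device *drm, void *data)
    {
            return drmm_add_action_or_reset(drm, drop_refs_action, data);
    }
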
>   static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
>   {
>   	struct xe_guc_exec_queue *ge =
> @@ -852,10 +898,16 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
>   	struct xe_guc *guc = exec_queue_to_guc(q);
>   	struct xe_device *xe = guc_to_xe(guc);
>   	struct xe_gpu_scheduler *sched = &ge->sched;
> +	bool wedged = xe_device_wedged(xe);
>   
>   	xe_assert(xe, xe_exec_queue_is_lr(q));
>   	trace_xe_exec_queue_lr_cleanup(q);
>   
> +	if (!wedged && xe_modparam.wedged_mode == 2) {
> +		guc_submit_wedged(exec_queue_to_guc(q));
> +		wedged = true;
> +	}
> +
>   	/* Kill the run_job / process_msg entry points */
>   	xe_sched_submission_stop(sched);
>   
> @@ -870,7 +922,7 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
>   	 * xe_guc_deregister_done_handler() which treats it as an unexpected
>   	 * state.
>   	 */
> -	if (exec_queue_registered(q) && !exec_queue_destroyed(q)) {
> +	if (!wedged && exec_queue_registered(q) && !exec_queue_destroyed(q)) {
>   		struct xe_guc *guc = exec_queue_to_guc(q);
>   		int ret;
>   
> @@ -905,6 +957,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>   	struct xe_device *xe = guc_to_xe(exec_queue_to_guc(q));
>   	int err = -ETIME;
>   	int i = 0;
> +	bool wedged = xe_device_wedged(xe);
>   
>   	/*
>   	 * TDR has fired before free job worker. Common if exec queue
> @@ -928,6 +981,11 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>   
>   	trace_xe_sched_job_timedout(job);
>   
> +	if (!wedged && xe_modparam.wedged_mode == 2) {
> +		guc_submit_wedged(exec_queue_to_guc(q));
> +		wedged = true;
> +	}
> +
>   	/* Kill the run_job entry point */
>   	xe_sched_submission_stop(sched);
>   
> @@ -935,8 +993,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>   	 * Kernel jobs should never fail, nor should VM jobs; if they do,
>   	 * something has gone wrong and the GT needs a reset
>   	 */
> -	if (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
> -	    (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q))) {
> +	if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
> +			(q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
>   		if (!xe_sched_invalidate_job(job, 2)) {
>   			xe_sched_add_pending_job(sched, job);
>   			xe_sched_submission_start(sched);
> @@ -946,7 +1004,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>   	}
>   
>   	/* Engine state now stable, disable scheduling if needed */
> -	if (exec_queue_registered(q)) {
> +	if (!wedged && exec_queue_registered(q)) {
>   		struct xe_guc *guc = exec_queue_to_guc(q);
>   		int ret;
>   
> @@ -989,6 +1047,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>   	 */
>   	xe_sched_add_pending_job(sched, job);
>   	xe_sched_submission_start(sched);
> +
>   	xe_guc_exec_queue_trigger_cleanup(q);
>   
>   	/* Mark all outstanding jobs as bad, thus completing them */
> @@ -1028,7 +1087,7 @@ static void guc_exec_queue_fini_async(struct xe_exec_queue *q)
>   	INIT_WORK(&q->guc->fini_async, __guc_exec_queue_fini_async);
>   
>   	/* We must block on kernel engines so slabs are empty on driver unload */
> -	if (q->flags & EXEC_QUEUE_FLAG_PERMANENT)
> +	if (q->flags & EXEC_QUEUE_FLAG_PERMANENT || exec_queue_wedged(q))
>   		__guc_exec_queue_fini_async(&q->guc->fini_async);
>   	else
>   		queue_work(system_wq, &q->guc->fini_async);
> @@ -1063,7 +1122,7 @@ static void __guc_exec_queue_process_msg_cleanup(struct xe_sched_msg *msg)
>   
>   static bool guc_exec_queue_allowed_to_change_state(struct xe_exec_queue *q)
>   {
> -	return !exec_queue_killed_or_banned(q) && exec_queue_registered(q);
> +	return !exec_queue_killed_or_banned_or_wedged(q) && exec_queue_registered(q);
>   }
>   
>   static void __guc_exec_queue_process_msg_set_sched_props(struct xe_sched_msg *msg)
> @@ -1274,7 +1333,7 @@ static void guc_exec_queue_fini(struct xe_exec_queue *q)
>   {
>   	struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_CLEANUP;
>   
> -	if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT))
> +	if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && !exec_queue_wedged(q))
>   		guc_exec_queue_add_msg(q, msg, CLEANUP);
>   	else
>   		__guc_exec_queue_fini(exec_queue_to_guc(q), q);
> @@ -1285,7 +1344,8 @@ static int guc_exec_queue_set_priority(struct xe_exec_queue *q,
>   {
>   	struct xe_sched_msg *msg;
>   
> -	if (q->sched_props.priority == priority || exec_queue_killed_or_banned(q))
> +	if (q->sched_props.priority == priority ||
> +	    exec_queue_killed_or_banned_or_wedged(q))
>   		return 0;
>   
>   	msg = kmalloc(sizeof(*msg), GFP_KERNEL);
> @@ -1303,7 +1363,7 @@ static int guc_exec_queue_set_timeslice(struct xe_exec_queue *q, u32 timeslice_u
>   	struct xe_sched_msg *msg;
>   
>   	if (q->sched_props.timeslice_us == timeslice_us ||
> -	    exec_queue_killed_or_banned(q))
> +	    exec_queue_killed_or_banned_or_wedged(q))
>   		return 0;
>   
>   	msg = kmalloc(sizeof(*msg), GFP_KERNEL);
> @@ -1322,7 +1382,7 @@ static int guc_exec_queue_set_preempt_timeout(struct xe_exec_queue *q,
>   	struct xe_sched_msg *msg;
>   
>   	if (q->sched_props.preempt_timeout_us == preempt_timeout_us ||
> -	    exec_queue_killed_or_banned(q))
> +	    exec_queue_killed_or_banned_or_wedged(q))
>   		return 0;
>   
>   	msg = kmalloc(sizeof(*msg), GFP_KERNEL);
> @@ -1339,7 +1399,7 @@ static int guc_exec_queue_suspend(struct xe_exec_queue *q)
>   {
>   	struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_SUSPEND;
>   
> -	if (exec_queue_killed_or_banned(q) || q->guc->suspend_pending)
> +	if (exec_queue_killed_or_banned_or_wedged(q) || q->guc->suspend_pending)
>   		return -EINVAL;
>   
>   	q->guc->suspend_pending = true;
> @@ -1485,7 +1545,7 @@ static void guc_exec_queue_start(struct xe_exec_queue *q)
>   {
>   	struct xe_gpu_scheduler *sched = &q->guc->sched;
>   
> -	if (!exec_queue_killed_or_banned(q)) {
> +	if (!exec_queue_killed_or_banned_or_wedged(q)) {
>   		int i;
>   
>   		trace_xe_exec_queue_resubmit(q);
> diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
> index ceb8345cbca6..3edeb30d5ccb 100644
> --- a/drivers/gpu/drm/xe/xe_module.c
> +++ b/drivers/gpu/drm/xe/xe_module.c
> @@ -17,6 +17,7 @@ struct xe_modparam xe_modparam = {
>   	.enable_display = true,
>   	.guc_log_level = 5,
>   	.force_probe = CONFIG_DRM_XE_FORCE_PROBE,
> +	.wedged_mode = 1,
>   	/* the rest are 0 by default */
>   };
>   
> @@ -55,6 +56,10 @@ MODULE_PARM_DESC(max_vfs,
>   		 "(0 = no VFs [default]; N = allow up to N VFs)");
>   #endif
>   
> +module_param_named_unsafe(wedged_mode, xe_modparam.wedged_mode, int, 0600);
> +MODULE_PARM_DESC(wedged_mode,
> +		 "Module's default policy for the wedged mode - 0=never, 1=upon-critical-errors[default], 2=upon-any-hang");
> +

Hi Rodrigo,

The debugfs entry introduced in [PATCH 4/4] of this series offers the 
same functionality as this modparam. Do you see any additional value in 
keeping the modparam as well?

Loading the module without the parameter and then setting the debugfs 
mode to 2 before running the workload behaves identically to loading 
the driver with wedged_mode=2.

BR

Himal

>   struct init_funcs {
>   	int (*init)(void);
>   	void (*exit)(void);
> diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h
> index b369984f08ec..61a0d28a28c8 100644
> --- a/drivers/gpu/drm/xe/xe_module.h
> +++ b/drivers/gpu/drm/xe/xe_module.h
> @@ -21,6 +21,7 @@ struct xe_modparam {
>   #ifdef CONFIG_PCI_IOV
>   	unsigned int max_vfs;
>   #endif
> +	int wedged_mode;
>   };
>   
>   extern struct xe_modparam xe_modparam;