[PATCH v6 11/11] drm/xe: Sample ctx timestamp to determine if jobs have timed out

John Harrison john.c.harrison at intel.com
Thu Jun 13 00:57:26 UTC 2024


On 6/12/2024 15:30, Matthew Brost wrote:
> On Wed, Jun 12, 2024 at 02:56:42PM -0700, John Harrison wrote:
>> On 6/11/2024 07:40, Matthew Brost wrote:
>>> In the GuC TDR, sample the ctx timestamp to determine if jobs have timed out. The
>>> scheduling enable needs to be toggled to properly sample the timestamp.
>>> If a job has not been running for longer than the timeout period,
>>> re-enable scheduling and restart the TDR.
>>>
>>> v2:
>>>    - Use GT clock to msec helper (Umesh, off list)
>>>    - s/ctx_timestamp_job/ctx_job_timestamp
>>> v3:
>>>    - Fix state machine for TDR, mainly decouple sched disable and
>>>      deregister (testing)
>>>    - Rebase (CI)
>>> v4:
>>>    - Fix checkpatch && newline issue (CI)
>>>    - Do not deregister on wedged or unregistered (CI)
>>>    - Fix refcounting bugs (CI)
>>>    - Move devcoredump above VM / kernel job check (John H)
>>>    - Add comment for check_timeout state usage (John H)
>>>    - Assert pending disable not inflight when enabling scheduling (John H)
>>>    - Use enable_scheduling in other scheduling enable code (John H)
>>>    - Add comments on a few steps in TDR (John H)
>>>    - Add assert for timestamp overflow protection (John H)
>>> v6:
>>>    - Use mul_u64_u32_div (CI, checkpatch)
>>>    - Change check time to dbg level (Paulo)
>>>    - Add immediate mode to sched disable (inspection)
>>>    - Use xe_gt_* messages (John H)
>>>    - Fix typo in comment (John H)
>>>    - Check timeout before clearing pending disable (Paulo)
>>>
>>> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
>>> Reviewed-by: Jonathan Cavitt <jonathan.cavitt at intel.com>
>>> ---
>>>    drivers/gpu/drm/xe/xe_guc_submit.c | 303 +++++++++++++++++++++++------
>>>    1 file changed, 242 insertions(+), 61 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
>>> index 671c72caf0ff..cddb391888b6 100644
>>> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
>>> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
>>> @@ -10,6 +10,7 @@
>>>    #include <linux/circ_buf.h>
>>>    #include <linux/delay.h>
>>>    #include <linux/dma-fence-array.h>
>>> +#include <linux/math64.h>
>>>    #include <drm/drm_managed.h>
>>> @@ -23,6 +24,7 @@
>>>    #include "xe_force_wake.h"
>>>    #include "xe_gpu_scheduler.h"
>>>    #include "xe_gt.h"
>>> +#include "xe_gt_clock.h"
>>>    #include "xe_gt_printk.h"
>>>    #include "xe_guc.h"
>>>    #include "xe_guc_ct.h"
>>> @@ -62,6 +64,8 @@ exec_queue_to_guc(struct xe_exec_queue *q)
>>>    #define EXEC_QUEUE_STATE_KILLED			(1 << 7)
>>>    #define EXEC_QUEUE_STATE_WEDGED			(1 << 8)
>>>    #define EXEC_QUEUE_STATE_BANNED			(1 << 9)
>>> +#define EXEC_QUEUE_STATE_CHECK_TIMEOUT		(1 << 10)
>>> +#define EXEC_QUEUE_STATE_EXTRA_REF		(1 << 11)
>>>    static bool exec_queue_registered(struct xe_exec_queue *q)
>>>    {
>>> @@ -188,6 +192,31 @@ static void set_exec_queue_wedged(struct xe_exec_queue *q)
>>>    	atomic_or(EXEC_QUEUE_STATE_WEDGED, &q->guc->state);
>>>    }
>>> +static bool exec_queue_check_timeout(struct xe_exec_queue *q)
>>> +{
>>> +	return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_CHECK_TIMEOUT;
>>> +}
>>> +
>>> +static void set_exec_queue_check_timeout(struct xe_exec_queue *q)
>>> +{
>>> +	atomic_or(EXEC_QUEUE_STATE_CHECK_TIMEOUT, &q->guc->state);
>>> +}
>>> +
>>> +static void clear_exec_queue_check_timeout(struct xe_exec_queue *q)
>>> +{
>>> +	atomic_and(~EXEC_QUEUE_STATE_CHECK_TIMEOUT, &q->guc->state);
>>> +}
>>> +
>>> +static bool exec_queue_extra_ref(struct xe_exec_queue *q)
>>> +{
>>> +	return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_EXTRA_REF;
>>> +}
>>> +
>>> +static void set_exec_queue_extra_ref(struct xe_exec_queue *q)
>>> +{
>>> +	atomic_or(EXEC_QUEUE_STATE_EXTRA_REF, &q->guc->state);
>>> +}
>>> +
>>>    static bool exec_queue_killed_or_banned_or_wedged(struct xe_exec_queue *q)
>>>    {
>>>    	return (atomic_read(&q->guc->state) &
>>> @@ -920,6 +949,109 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
>>>    	xe_sched_submission_start(sched);
>>>    }
>>> +#define ADJUST_FIVE_PERCENT(__t)	mul_u64_u32_div((__t), 105, 100)
>>> +
>>> +static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job)
>>> +{
>>> +	struct xe_gt *gt = guc_to_gt(exec_queue_to_guc(q));
>>> +	u32 ctx_timestamp = xe_lrc_ctx_timestamp(q->lrc[0]);
>>> +	u32 ctx_job_timestamp = xe_lrc_ctx_job_timestamp(q->lrc[0]);
>>> +	u32 timeout_ms = q->sched_props.job_timeout_ms;
>>> +	u32 diff;
>>> +	u64 running_time_ms;
>>> +
>>> +	/*
>>> +	 * Counter wraps at ~223s at the usual 19.2MHz, so be paranoid and catch
>>> +	 * possible overflows with a high timeout.
>>> +	 */
>>> +	xe_gt_assert(gt, timeout_ms < 100 * MSEC_PER_SEC);
>>> +
>>> +	if (ctx_timestamp < ctx_job_timestamp)
>>> +		diff = ctx_timestamp + U32_MAX - ctx_job_timestamp;
>>> +	else
>>> +		diff = ctx_timestamp - ctx_job_timestamp;
>>> +
>>> +	/*
>>> +	 * Ensure timeout is within 5% to account for GuC scheduling latency
>>> +	 */
>>> +	running_time_ms =
>>> +		ADJUST_FIVE_PERCENT(xe_gt_clock_interval_to_ms(gt, diff));
>>> +
>>> +	xe_gt_dbg(gt,
>>> +		  "Check job timeout: seqno=%u, lrc_seqno=%u, guc_id=%d, running_time_ms=%llu, timeout_ms=%u, diff=0x%08x",
>>> +		  xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
>>> +		  q->guc->id, running_time_ms, timeout_ms, diff);
>>> +
>>> +	return running_time_ms >= timeout_ms;
>>> +}
>>> +
>>> +static void enable_scheduling(struct xe_exec_queue *q)
>>> +{
>>> +	MAKE_SCHED_CONTEXT_ACTION(q, ENABLE);
>>> +	struct xe_guc *guc = exec_queue_to_guc(q);
>>> +	int ret;
>>> +
>>> +	xe_gt_assert(guc_to_gt(guc), !exec_queue_destroyed(q));
>>> +	xe_gt_assert(guc_to_gt(guc), exec_queue_registered(q));
>>> +	xe_gt_assert(guc_to_gt(guc), !exec_queue_pending_disable(q));
>>> +	xe_gt_assert(guc_to_gt(guc), !exec_queue_pending_enable(q));
>>> +
>>> +	set_exec_queue_pending_enable(q);
>>> +	set_exec_queue_enabled(q);
>>> +	trace_xe_exec_queue_scheduling_enable(q);
>>> +
>>> +	xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action),
>>> +		       G2H_LEN_DW_SCHED_CONTEXT_MODE_SET, 1);
>>> +
>>> +	ret = wait_event_timeout(guc->ct.wq,
>>> +				 !exec_queue_pending_enable(q) ||
>>> +				 guc_read_stopped(guc), HZ * 5);
>>> +	if (!ret || guc_read_stopped(guc)) {
>>> +		xe_gt_warn(guc_to_gt(guc), "Schedule enable failed to respond");
>>> +		set_exec_queue_banned(q);
>>> +		xe_gt_reset_async(q->gt);
>>> +		xe_sched_tdr_queue_imm(&q->guc->sched);
>>> +	}
>>> +}
>>> +
>>> +static void disable_scheduling(struct xe_exec_queue *q, bool immediate)
>>> +{
>>> +	MAKE_SCHED_CONTEXT_ACTION(q, DISABLE);
>>> +	struct xe_guc *guc = exec_queue_to_guc(q);
>>> +
>>> +	xe_gt_assert(guc_to_gt(guc), !exec_queue_destroyed(q));
>>> +	xe_gt_assert(guc_to_gt(guc), exec_queue_registered(q));
>>> +	xe_gt_assert(guc_to_gt(guc), !exec_queue_pending_disable(q));
>>> +
>>> +	if (immediate)
>>> +		set_min_preemption_timeout(guc, q);
>>> +	clear_exec_queue_enabled(q);
>>> +	set_exec_queue_pending_disable(q);
>>> +	trace_xe_exec_queue_scheduling_disable(q);
>>> +
>>> +	xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action),
>>> +		       G2H_LEN_DW_SCHED_CONTEXT_MODE_SET, 1);
>>> +}
>>> +
>>> +static void __deregister_exec_queue(struct xe_guc *guc, struct xe_exec_queue *q)
>>> +{
>>> +	u32 action[] = {
>>> +		XE_GUC_ACTION_DEREGISTER_CONTEXT,
>>> +		q->guc->id,
>>> +	};
>>> +
>>> +	xe_gt_assert(guc_to_gt(guc), !exec_queue_destroyed(q));
>>> +	xe_gt_assert(guc_to_gt(guc), exec_queue_registered(q));
>>> +	xe_gt_assert(guc_to_gt(guc), !exec_queue_pending_enable(q));
>>> +	xe_gt_assert(guc_to_gt(guc), !exec_queue_pending_disable(q));
>>> +
>>> +	set_exec_queue_destroyed(q);
>>> +	trace_xe_exec_queue_deregister(q);
>>> +
>>> +	xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action),
>>> +		       G2H_LEN_DW_DEREGISTER_CONTEXT, 1);
>>> +}
>>> +
>>>    static enum drm_gpu_sched_stat
>>>    guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>>>    {
>>> @@ -927,10 +1059,10 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>>>    	struct xe_sched_job *tmp_job;
>>>    	struct xe_exec_queue *q = job->q;
>>>    	struct xe_gpu_scheduler *sched = &q->guc->sched;
>>> -	struct xe_device *xe = guc_to_xe(exec_queue_to_guc(q));
>>> +	struct xe_guc *guc = exec_queue_to_guc(q);
>>>    	int err = -ETIME;
>>>    	int i = 0;
>>> -	bool wedged;
>>> +	bool wedged, skip_timeout_check;
>>>    	/*
>>>    	 * TDR has fired before free job worker. Common if exec queue
>>> @@ -942,49 +1074,53 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>>>    		return DRM_GPU_SCHED_STAT_NOMINAL;
>>>    	}
>>> -	drm_notice(&xe->drm, "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx",
>>> -		   xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
>>> -		   q->guc->id, q->flags);
>>> -	xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
>>> -		   "Kernel-submitted job timed out\n");
>>> -	xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
>>> -		   "VM job timed out on non-killed execqueue\n");
>>> -
>>> -	if (!exec_queue_killed(q))
>>> -		xe_devcoredump(job);
>>> -
>>> -	trace_xe_sched_job_timedout(job);
>>> -
>>> -	wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));
>>> -
>>>    	/* Kill the run_job entry point */
>>>    	xe_sched_submission_stop(sched);
>>> +	/* Must check all state after stopping scheduler */
>>> +	skip_timeout_check = exec_queue_reset(q) ||
>>> +		exec_queue_killed_or_banned_or_wedged(q) ||
>>> +		exec_queue_destroyed(q);
>>> +
>>> +	/* Job hasn't started, can't be timed out */
>>> +	if (!skip_timeout_check && !xe_sched_job_started(job))
>>> +		goto rearm;
>>> +
>>>    	/*
>>> -	 * Kernel jobs should never fail, nor should VM jobs; if they do,
>>> -	 * something has gone wrong and the GT needs a reset
>>> +	 * XXX: Sampling timeout doesn't work in wedged mode as we have to
>>> +	 * modify scheduling state to read timestamp. We could read the
>>> +	 * timestamp from a register to accumulate current running time but this
>>> +	 * doesn't work for SRIOV. For now assuming timeouts in wedged mode are
>>> +	 * genuine timeouts.
>>>    	 */
>>> -	if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
>>> -			(q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
>>> -		if (!xe_sched_invalidate_job(job, 2)) {
>>> -			xe_sched_add_pending_job(sched, job);
>>> -			xe_sched_submission_start(sched);
>>> -			xe_gt_reset_async(q->gt);
>>> -			goto out;
>>> -		}
>>> -	}
>>> +	wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));
>>> -	/* Engine state now stable, disable scheduling if needed */
>>> +	/* Engine state now stable, disable scheduling to check timestamp */
>>>    	if (!wedged && exec_queue_registered(q)) {
>>> -		struct xe_guc *guc = exec_queue_to_guc(q);
>>>    		int ret;
>>>    		if (exec_queue_reset(q))
>>>    			err = -EIO;
>>> -		set_exec_queue_banned(q);
>>> +
>>>    		if (!exec_queue_destroyed(q)) {
>>> -			xe_exec_queue_get(q);
>>> -			disable_scheduling_deregister(guc, q);
>>> +			/*
>>> +			 * Wait for any pending G2H to flush out before
>>> +			 * modifying state
>>> +			 */
>>> +			ret = wait_event_timeout(guc->ct.wq,
>>> +						 !exec_queue_pending_enable(q) ||
>>> +						 guc_read_stopped(guc), HZ * 5);
>>> +			if (!ret || guc_read_stopped(guc))
>>> +				goto trigger_reset;
>>> +
>>> +			/*
>>> +			 * The flag communicates to the G2H handler that the schedule
>>> +			 * disable originated from a timeout check. The G2H handler
>>> +			 * then avoids triggering cleanup or deregistering the exec
>>> +			 * queue.
>>> +			 */
>>> +			set_exec_queue_check_timeout(q);
>>> +			disable_scheduling(q, skip_timeout_check);
>>>    		}
>>>    		/*
>>> @@ -1000,15 +1136,60 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>>>    					 !exec_queue_pending_disable(q) ||
>>>    					 guc_read_stopped(guc), HZ * 5);
>>>    		if (!ret || guc_read_stopped(guc)) {
>>> -			drm_warn(&xe->drm, "Schedule disable failed to respond");
>>> -			xe_sched_add_pending_job(sched, job);
>>> -			xe_sched_submission_start(sched);
>>> +trigger_reset:
>>> +			xe_gt_warn(guc_to_gt(guc), "Schedule disable failed to respond");
>> Not a problem introduced in this patch set so maybe not necessary to fix
>> here either. But we have seen what look like false hits on this warning in
>> some of the reset tests. The code gets here if the schedule disable
>> genuinely times out which is what the warning is saying. But it also gets
>> here if guc_read_stopped() is true and that happens if a reset occurs
>> asynchronously to this timeout check. In that situation, there is no need to
>> fire a warning - the abort is intentional and expected. It is also not
>> necessary to queue up another reset just below. It seems like the warning
>> and the reset should be inside a further 'if(!ret)' check.
>>
> Agree. It should be:
>
> if (!ret)
> 	xe_gt_warn(guc_to_gt(guc), "Schedule disable failed to respond");
>
>
> Will fix in next rev or before merging.
What about the xe_gt_reset_async call? Should that be only in the case 
of genuine timeout or is there a reason to keep it in the case of an 
abort as well?
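I.e. should the tail of that path end up looking something like the below? (Rough,
untested sketch only, reusing the names from the hunk above; the split between
"genuine timeout" and "asynchronous reset/abort" is my reading of the discussion,
not the final code.)

	ret = wait_event_timeout(guc->ct.wq,
				 !exec_queue_pending_disable(q) ||
				 guc_read_stopped(guc), HZ * 5);
	if (!ret || guc_read_stopped(guc)) {
trigger_reset:
		if (!ret) {
			/* GuC never acked the schedule disable: genuine hang */
			xe_gt_warn(guc_to_gt(guc), "Schedule disable failed to respond");
			xe_gt_reset_async(q->gt);
		}
		/*
		 * Either way a GT reset (newly triggered or already in flight)
		 * now owns the queue: take the extra ref, ban it, rearm the TDR.
		 */
		set_exec_queue_extra_ref(q);
		xe_exec_queue_get(q);
		set_exec_queue_banned(q);
		xe_sched_tdr_queue_imm(sched);
		goto rearm;
	}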

>
>>> +			set_exec_queue_extra_ref(q);
>>> +			xe_exec_queue_get(q);	/* GT reset owns this */
>>> +			set_exec_queue_banned(q);
>>>    			xe_gt_reset_async(q->gt);
>>>    			xe_sched_tdr_queue_imm(sched);
>>> -			goto out;
>>> +			goto rearm;
>>> +		}
>>> +	}
>>> +
>>> +	/*
>>> +	 * Check if the job has actually timed out; if so, restart job execution and the TDR
>>> +	 */
>>> +	if (!wedged && !skip_timeout_check && !check_timeout(q, job) &&
>>> +	    !exec_queue_reset(q) && exec_queue_registered(q)) {
>>> +		clear_exec_queue_check_timeout(q);
>>> +		goto sched_enable;
>>> +	}
>>> +
>>> +	xe_gt_notice(guc_to_gt(guc), "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx",
>>> +		     xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
>>> +		     q->guc->id, q->flags);
>>> +	xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
>>> +		   "Kernel-submitted job timed out\n");
>>> +	xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
>>> +		   "VM job timed out on non-killed execqueue\n");
>> I still think it makes more sense to have these two warnings next to the
>> comment that says why these are unexpected errors...
>>
>>> +
>>> +	trace_xe_sched_job_timedout(job);
>>> +
>>> +	if (!exec_queue_killed(q))
>>> +		xe_devcoredump(job);
>>> +
>>> +	/*
>>> +	 * Kernel jobs should never fail, nor should VM jobs; if they do,
>>> +	 * something has gone wrong and the GT needs a reset
>>> +	 */
>> ... i.e. the warning about kernel jobs and VM jobs not failing should be
>> here.
>>
> Sure, can move these warnings below this comment. Do you mind if I just fix
> this at merge time?
Sure.
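
(For reference, and just as a rough sketch of the agreed placement rather than the
final code, that would put the two warnings directly under the comment that
motivates them:

	/*
	 * Kernel jobs should never fail, nor should VM jobs; if they do,
	 * something has gone wrong and the GT needs a reset
	 */
	xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
		   "Kernel-submitted job timed out\n");
	xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
		   "VM job timed out on non-killed execqueue\n");
	if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
			(q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
		if (!xe_sched_invalidate_job(job, 2)) {
			clear_exec_queue_check_timeout(q);
			xe_gt_reset_async(q->gt);
			goto rearm;
		}
	}

with the xe_gt_notice(), trace_xe_sched_job_timedout() and xe_devcoredump() calls
staying where they are above.)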

John.

>
> Matt
>
>> John.
>>
>>> +	if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
>>> +			(q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
>>> +		if (!xe_sched_invalidate_job(job, 2)) {
>>> +			clear_exec_queue_check_timeout(q);
>>> +			xe_gt_reset_async(q->gt);
>>> +			goto rearm;
>>>    		}
>>>    	}
>>> +	/* Finish cleaning up exec queue via deregister */
>>> +	set_exec_queue_banned(q);
>>> +	if (!wedged && exec_queue_registered(q) && !exec_queue_destroyed(q)) {
>>> +		set_exec_queue_extra_ref(q);
>>> +		xe_exec_queue_get(q);
>>> +		__deregister_exec_queue(guc, q);
>>> +	}
>>> +
>>>    	/* Stop fence signaling */
>>>    	xe_hw_fence_irq_stop(q->fence_irq);
>>> @@ -1030,7 +1211,19 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>>>    	/* Start fence signaling */
>>>    	xe_hw_fence_irq_start(q->fence_irq);
>>> -out:
>>> +	return DRM_GPU_SCHED_STAT_NOMINAL;
>>> +
>>> +sched_enable:
>>> +	enable_scheduling(q);
>>> +rearm:
>>> +	/*
>>> +	 * XXX: Ideally want to adjust timeout based on current execution time
>>> +	 * but there is not currently an easy way to do so in the DRM scheduler. With
>>> +	 * some thought, do this in a follow up.
>>> +	 */
>>> +	xe_sched_add_pending_job(sched, job);
>>> +	xe_sched_submission_start(sched);
>>> +
>>>    	return DRM_GPU_SCHED_STAT_NOMINAL;
>>>    }
>>> @@ -1133,7 +1326,6 @@ static void __guc_exec_queue_process_msg_suspend(struct xe_sched_msg *msg)
>>>    			   guc_read_stopped(guc));
>>>    		if (!guc_read_stopped(guc)) {
>>> -			MAKE_SCHED_CONTEXT_ACTION(q, DISABLE);
>>>    			s64 since_resume_ms =
>>>    				ktime_ms_delta(ktime_get(),
>>>    					       q->guc->resume_time);
>>> @@ -1144,12 +1336,7 @@ static void __guc_exec_queue_process_msg_suspend(struct xe_sched_msg *msg)
>>>    				msleep(wait_ms);
>>>    			set_exec_queue_suspended(q);
>>> -			clear_exec_queue_enabled(q);
>>> -			set_exec_queue_pending_disable(q);
>>> -			trace_xe_exec_queue_scheduling_disable(q);
>>> -
>>> -			xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action),
>>> -				       G2H_LEN_DW_SCHED_CONTEXT_MODE_SET, 1);
>>> +			disable_scheduling(q, false);
>>>    		}
>>>    	} else if (q->guc->suspend_pending) {
>>>    		set_exec_queue_suspended(q);
>>> @@ -1160,19 +1347,11 @@ static void __guc_exec_queue_process_msg_suspend(struct xe_sched_msg *msg)
>>>    static void __guc_exec_queue_process_msg_resume(struct xe_sched_msg *msg)
>>>    {
>>>    	struct xe_exec_queue *q = msg->private_data;
>>> -	struct xe_guc *guc = exec_queue_to_guc(q);
>>>    	if (guc_exec_queue_allowed_to_change_state(q)) {
>>> -		MAKE_SCHED_CONTEXT_ACTION(q, ENABLE);
>>> -
>>>    		q->guc->resume_time = RESUME_PENDING;
>>>    		clear_exec_queue_suspended(q);
>>> -		set_exec_queue_pending_enable(q);
>>> -		set_exec_queue_enabled(q);
>>> -		trace_xe_exec_queue_scheduling_enable(q);
>>> -
>>> -		xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action),
>>> -			       G2H_LEN_DW_SCHED_CONTEXT_MODE_SET, 1);
>>> +		enable_scheduling(q);
>>>    	} else {
>>>    		clear_exec_queue_suspended(q);
>>>    	}
>>> @@ -1434,8 +1613,7 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
>>>    	/* Clean up lost G2H + reset engine state */
>>>    	if (exec_queue_registered(q)) {
>>> -		if ((exec_queue_banned(q) && exec_queue_destroyed(q)) ||
>>> -		    xe_exec_queue_is_lr(q))
>>> +		if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q))
>>>    			xe_exec_queue_put(q);
>>>    		else if (exec_queue_destroyed(q))
>>>    			__guc_exec_queue_fini(guc, q);
>>> @@ -1612,6 +1790,8 @@ static void handle_sched_done(struct xe_guc *guc, struct xe_exec_queue *q,
>>>    		smp_wmb();
>>>    		wake_up_all(&guc->ct.wq);
>>>    	} else {
>>> +		bool check_timeout = exec_queue_check_timeout(q);
>>> +
>>>    		xe_gt_assert(guc_to_gt(guc), runnable_state == 0);
>>>    		xe_gt_assert(guc_to_gt(guc), exec_queue_pending_disable(q));
>>> @@ -1619,11 +1799,12 @@ static void handle_sched_done(struct xe_guc *guc, struct xe_exec_queue *q,
>>>    		if (q->guc->suspend_pending) {
>>>    			suspend_fence_signal(q);
>>>    		} else {
>>> -			if (exec_queue_banned(q)) {
>>> +			if (exec_queue_banned(q) || check_timeout) {
>>>    				smp_wmb();
>>>    				wake_up_all(&guc->ct.wq);
>>>    			}
>>> -			deregister_exec_queue(guc, q);
>>> +			if (!check_timeout)
>>> +				deregister_exec_queue(guc, q);
>>>    		}
>>>    	}
>>>    }
>>> @@ -1664,7 +1845,7 @@ static void handle_deregister_done(struct xe_guc *guc, struct xe_exec_queue *q)
>>>    	clear_exec_queue_registered(q);
>>> -	if (exec_queue_banned(q) || xe_exec_queue_is_lr(q))
>>> +	if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q))
>>>    		xe_exec_queue_put(q);
>>>    	else
>>>    		__guc_exec_queue_fini(guc, q);
>>> @@ -1728,7 +1909,7 @@ int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
>>>    	 * guc_exec_queue_timedout_job.
>>>    	 */
>>>    	set_exec_queue_reset(q);
>>> -	if (!exec_queue_banned(q))
>>> +	if (!exec_queue_banned(q) && !exec_queue_check_timeout(q))
>>>    		xe_guc_exec_queue_trigger_cleanup(q);
>>>    	return 0;
>>> @@ -1758,7 +1939,7 @@ int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
>>>    	/* Treat the same as engine reset */
>>>    	set_exec_queue_reset(q);
>>> -	if (!exec_queue_banned(q))
>>> +	if (!exec_queue_banned(q) && !exec_queue_check_timeout(q))
>>>    		xe_guc_exec_queue_trigger_cleanup(q);
>>>    	return 0;


