[PATCH v6 11/11] drm/xe: Sample ctx timestamp to determine if jobs have timed out
John Harrison
john.c.harrison at intel.com
Thu Jun 13 00:57:26 UTC 2024
On 6/12/2024 15:30, Matthew Brost wrote:
> On Wed, Jun 12, 2024 at 02:56:42PM -0700, John Harrison wrote:
>> On 6/11/2024 07:40, Matthew Brost wrote:
>>> In the GuC TDR, sample the ctx timestamp to determine if jobs have timed out. The
>>> scheduling enable needs to be toggled to properly sample the timestamp.
>>> If a job has not been running for longer than the timeout period,
>>> re-enable scheduling and restart the TDR.
>>>
>>> v2:
>>> - Use GT clock to msec helper (Umesh, off list)
>>> - s/ctx_timestamp_job/ctx_job_timestamp
>>> v3:
>>> - Fix state machine for TDR, mainly decouple sched disable and
>>> deregister (testing)
>>> - Rebase (CI)
>>> v4:
>>> - Fix checkpatch && newline issue (CI)
>>> - Do not deregister on wedged or unregistered (CI)
>>> - Fix refcounting bugs (CI)
>>> - Move devcoredump above VM / kernel job check (John H)
>>> - Add comment for check_timeout state usage (John H)
>>> - Assert pending disable not inflight when enabling scheduling (John H)
>>> - Use enable_scheduling in other scheduling enable code (John H)
>>> - Add comments on a few steps in TDR (John H)
>>> - Add assert for timestamp overflow protection (John H)
>>> v6:
>>> - Use mul_u64_u32_div (CI, checkpatch)
>>> - Change check time to dbg level (Paulo)
>>> - Add immediate mode to sched disable (inspection)
>>> - Use xe_gt_* messages (John H)
>>> - Fix typo in comment (John H)
>>> - Check timeout before clearing pending disable (Paulo)
>>>
>>> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
>>> Reviewed-by: Jonathan Cavitt <jonathan.cavitt at intel.com>
>>> ---
>>> drivers/gpu/drm/xe/xe_guc_submit.c | 303 +++++++++++++++++++++++------
>>> 1 file changed, 242 insertions(+), 61 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
>>> index 671c72caf0ff..cddb391888b6 100644
>>> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
>>> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
>>> @@ -10,6 +10,7 @@
>>> #include <linux/circ_buf.h>
>>> #include <linux/delay.h>
>>> #include <linux/dma-fence-array.h>
>>> +#include <linux/math64.h>
>>> #include <drm/drm_managed.h>
>>> @@ -23,6 +24,7 @@
>>> #include "xe_force_wake.h"
>>> #include "xe_gpu_scheduler.h"
>>> #include "xe_gt.h"
>>> +#include "xe_gt_clock.h"
>>> #include "xe_gt_printk.h"
>>> #include "xe_guc.h"
>>> #include "xe_guc_ct.h"
>>> @@ -62,6 +64,8 @@ exec_queue_to_guc(struct xe_exec_queue *q)
>>> #define EXEC_QUEUE_STATE_KILLED (1 << 7)
>>> #define EXEC_QUEUE_STATE_WEDGED (1 << 8)
>>> #define EXEC_QUEUE_STATE_BANNED (1 << 9)
>>> +#define EXEC_QUEUE_STATE_CHECK_TIMEOUT (1 << 10)
>>> +#define EXEC_QUEUE_STATE_EXTRA_REF (1 << 11)
>>> static bool exec_queue_registered(struct xe_exec_queue *q)
>>> {
>>> @@ -188,6 +192,31 @@ static void set_exec_queue_wedged(struct xe_exec_queue *q)
>>> atomic_or(EXEC_QUEUE_STATE_WEDGED, &q->guc->state);
>>> }
>>> +static bool exec_queue_check_timeout(struct xe_exec_queue *q)
>>> +{
>>> + return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_CHECK_TIMEOUT;
>>> +}
>>> +
>>> +static void set_exec_queue_check_timeout(struct xe_exec_queue *q)
>>> +{
>>> + atomic_or(EXEC_QUEUE_STATE_CHECK_TIMEOUT, &q->guc->state);
>>> +}
>>> +
>>> +static void clear_exec_queue_check_timeout(struct xe_exec_queue *q)
>>> +{
>>> + atomic_and(~EXEC_QUEUE_STATE_CHECK_TIMEOUT, &q->guc->state);
>>> +}
>>> +
>>> +static bool exec_queue_extra_ref(struct xe_exec_queue *q)
>>> +{
>>> + return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_EXTRA_REF;
>>> +}
>>> +
>>> +static void set_exec_queue_extra_ref(struct xe_exec_queue *q)
>>> +{
>>> + atomic_or(EXEC_QUEUE_STATE_EXTRA_REF, &q->guc->state);
>>> +}
>>> +
>>> static bool exec_queue_killed_or_banned_or_wedged(struct xe_exec_queue *q)
>>> {
>>> return (atomic_read(&q->guc->state) &
>>> @@ -920,6 +949,109 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
>>> xe_sched_submission_start(sched);
>>> }
>>> +#define ADJUST_FIVE_PERCENT(__t) mul_u64_u32_div((__t), 105, 100)
>>> +
>>> +static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job)
>>> +{
>>> + struct xe_gt *gt = guc_to_gt(exec_queue_to_guc(q));
>>> + u32 ctx_timestamp = xe_lrc_ctx_timestamp(q->lrc[0]);
>>> + u32 ctx_job_timestamp = xe_lrc_ctx_job_timestamp(q->lrc[0]);
>>> + u32 timeout_ms = q->sched_props.job_timeout_ms;
>>> + u32 diff;
>>> + u64 running_time_ms;
>>> +
>>> + /*
>>> + * Counter wraps at ~223s at the usual 19.2MHz; be paranoid and catch
>>> + * possible overflows with a high timeout.
>>> + */
>>> + xe_gt_assert(gt, timeout_ms < 100 * MSEC_PER_SEC);
>>> +
>>> + if (ctx_timestamp < ctx_job_timestamp)
>>> + diff = ctx_timestamp + U32_MAX - ctx_job_timestamp;
>>> + else
>>> + diff = ctx_timestamp - ctx_job_timestamp;
>>> +
>>> + /*
>>> + * Ensure timeout is within 5% to account for GuC scheduling latency
>>> + */
>>> + running_time_ms =
>>> + ADJUST_FIVE_PERCENT(xe_gt_clock_interval_to_ms(gt, diff));
>>> +
>>> + xe_gt_dbg(gt,
>>> + "Check job timeout: seqno=%u, lrc_seqno=%u, guc_id=%d, running_time_ms=%llu, timeout_ms=%u, diff=0x%08x",
>>> + xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
>>> + q->guc->id, running_time_ms, timeout_ms, diff);
>>> +
>>> + return running_time_ms >= timeout_ms;
>>> +}
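As a quick sanity check on the wrap handling here: at the usual 19.2MHz the
32-bit timestamp wraps every U32_MAX / 19.2e6 ≈ 223.7s, matching the comment.
The same math can be exercised standalone (hypothetical userspace mirror of
check_timeout() above, values made up):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t freq_hz = 19200000;	/* 19.2MHz GT clock */
		uint32_t ctx_job_timestamp = 0xffffff00; /* sampled before wrap */
		uint32_t ctx_timestamp = 0x00000100;	 /* sampled after wrap */
		uint32_t diff;
		uint64_t running_time_ms;

		/* Same wrap-safe diff as the patch */
		if (ctx_timestamp < ctx_job_timestamp)
			diff = ctx_timestamp + UINT32_MAX - ctx_job_timestamp;
		else
			diff = ctx_timestamp - ctx_job_timestamp;

		/* ticks -> ms, plus the 5% scheduling-latency pad */
		running_time_ms = (uint64_t)diff * 1000 / freq_hz * 105 / 100;

		/* 0x1ff ticks ≈ 27us, rounds to 0ms, well under any timeout */
		printf("diff=0x%08x running_time_ms=%llu\n", diff,
		       (unsigned long long)running_time_ms);
		return 0;
	}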
>>> +
>>> +static void enable_scheduling(struct xe_exec_queue *q)
>>> +{
>>> + MAKE_SCHED_CONTEXT_ACTION(q, ENABLE);
>>> + struct xe_guc *guc = exec_queue_to_guc(q);
>>> + int ret;
>>> +
>>> + xe_gt_assert(guc_to_gt(guc), !exec_queue_destroyed(q));
>>> + xe_gt_assert(guc_to_gt(guc), exec_queue_registered(q));
>>> + xe_gt_assert(guc_to_gt(guc), !exec_queue_pending_disable(q));
>>> + xe_gt_assert(guc_to_gt(guc), !exec_queue_pending_enable(q));
>>> +
>>> + set_exec_queue_pending_enable(q);
>>> + set_exec_queue_enabled(q);
>>> + trace_xe_exec_queue_scheduling_enable(q);
>>> +
>>> + xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action),
>>> + G2H_LEN_DW_SCHED_CONTEXT_MODE_SET, 1);
>>> +
>>> + ret = wait_event_timeout(guc->ct.wq,
>>> + !exec_queue_pending_enable(q) ||
>>> + guc_read_stopped(guc), HZ * 5);
>>> + if (!ret || guc_read_stopped(guc)) {
>>> + xe_gt_warn(guc_to_gt(guc), "Schedule enable failed to respond");
>>> + set_exec_queue_banned(q);
>>> + xe_gt_reset_async(q->gt);
>>> + xe_sched_tdr_queue_imm(&q->guc->sched);
>>> + }
>>> +}
>>> +
>>> +static void disable_scheduling(struct xe_exec_queue *q, bool immediate)
>>> +{
>>> + MAKE_SCHED_CONTEXT_ACTION(q, DISABLE);
>>> + struct xe_guc *guc = exec_queue_to_guc(q);
>>> +
>>> + xe_gt_assert(guc_to_gt(guc), !exec_queue_destroyed(q));
>>> + xe_gt_assert(guc_to_gt(guc), exec_queue_registered(q));
>>> + xe_gt_assert(guc_to_gt(guc), !exec_queue_pending_disable(q));
>>> +
>>> + if (immediate)
>>> + set_min_preemption_timeout(guc, q);
>>> + clear_exec_queue_enabled(q);
>>> + set_exec_queue_pending_disable(q);
>>> + trace_xe_exec_queue_scheduling_disable(q);
>>> +
>>> + xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action),
>>> + G2H_LEN_DW_SCHED_CONTEXT_MODE_SET, 1);
>>> +}
>>> +
>>> +static void __deregister_exec_queue(struct xe_guc *guc, struct xe_exec_queue *q)
>>> +{
>>> + u32 action[] = {
>>> + XE_GUC_ACTION_DEREGISTER_CONTEXT,
>>> + q->guc->id,
>>> + };
>>> +
>>> + xe_gt_assert(guc_to_gt(guc), !exec_queue_destroyed(q));
>>> + xe_gt_assert(guc_to_gt(guc), exec_queue_registered(q));
>>> + xe_gt_assert(guc_to_gt(guc), !exec_queue_pending_enable(q));
>>> + xe_gt_assert(guc_to_gt(guc), !exec_queue_pending_disable(q));
>>> +
>>> + set_exec_queue_destroyed(q);
>>> + trace_xe_exec_queue_deregister(q);
>>> +
>>> + xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action),
>>> + G2H_LEN_DW_DEREGISTER_CONTEXT, 1);
>>> +}
>>> +
>>> static enum drm_gpu_sched_stat
>>> guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>>> {
>>> @@ -927,10 +1059,10 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>>> struct xe_sched_job *tmp_job;
>>> struct xe_exec_queue *q = job->q;
>>> struct xe_gpu_scheduler *sched = &q->guc->sched;
>>> - struct xe_device *xe = guc_to_xe(exec_queue_to_guc(q));
>>> + struct xe_guc *guc = exec_queue_to_guc(q);
>>> int err = -ETIME;
>>> int i = 0;
>>> - bool wedged;
>>> + bool wedged, skip_timeout_check;
>>> /*
>>> * TDR has fired before free job worker. Common if exec queue
>>> @@ -942,49 +1074,53 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>>> return DRM_GPU_SCHED_STAT_NOMINAL;
>>> }
>>> - drm_notice(&xe->drm, "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx",
>>> - xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
>>> - q->guc->id, q->flags);
>>> - xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
>>> - "Kernel-submitted job timed out\n");
>>> - xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
>>> - "VM job timed out on non-killed execqueue\n");
>>> -
>>> - if (!exec_queue_killed(q))
>>> - xe_devcoredump(job);
>>> -
>>> - trace_xe_sched_job_timedout(job);
>>> -
>>> - wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));
>>> -
>>> /* Kill the run_job entry point */
>>> xe_sched_submission_stop(sched);
>>> + /* Must check all state after stopping scheduler */
>>> + skip_timeout_check = exec_queue_reset(q) ||
>>> + exec_queue_killed_or_banned_or_wedged(q) ||
>>> + exec_queue_destroyed(q);
>>> +
>>> + /* Job hasn't started, can't be timed out */
>>> + if (!skip_timeout_check && !xe_sched_job_started(job))
>>> + goto rearm;
>>> +
>>> /*
>>> - * Kernel jobs should never fail, nor should VM jobs if they do
>>> - * somethings has gone wrong and the GT needs a reset
>>> + * XXX: Sampling timeout doesn't work in wedged mode as we have to
>>> + * modify scheduling state to read the timestamp. We could read the
>>> + * timestamp from a register to accumulate current running time, but this
>>> + * doesn't work for SRIOV. For now, assume timeouts in wedged mode are
>>> + * genuine timeouts.
>>> */
>>> - if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
>>> - (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
>>> - if (!xe_sched_invalidate_job(job, 2)) {
>>> - xe_sched_add_pending_job(sched, job);
>>> - xe_sched_submission_start(sched);
>>> - xe_gt_reset_async(q->gt);
>>> - goto out;
>>> - }
>>> - }
>>> + wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));
>>> - /* Engine state now stable, disable scheduling if needed */
>>> + /* Engine state now stable, disable scheduling to check timestamp */
>>> if (!wedged && exec_queue_registered(q)) {
>>> - struct xe_guc *guc = exec_queue_to_guc(q);
>>> int ret;
>>> if (exec_queue_reset(q))
>>> err = -EIO;
>>> - set_exec_queue_banned(q);
>>> +
>>> if (!exec_queue_destroyed(q)) {
>>> - xe_exec_queue_get(q);
>>> - disable_scheduling_deregister(guc, q);
>>> + /*
>>> + * Wait for any pending G2H to flush out before
>>> + * modifying state
>>> + */
>>> + ret = wait_event_timeout(guc->ct.wq,
>>> + !exec_queue_pending_enable(q) ||
>>> + guc_read_stopped(guc), HZ * 5);
>>> + if (!ret || guc_read_stopped(guc))
>>> + goto trigger_reset;
>>> +
>>> + /*
>>> + * The flag communicates to the G2H handler that the schedule
>>> + * disable originated from a timeout check. The G2H handler then
>>> + * avoids triggering cleanup or deregistering the exec
>>> + * queue.
>>> + */
>>> + set_exec_queue_check_timeout(q);
>>> + disable_scheduling(q, skip_timeout_check);
>>> }
>>> /*
>>> @@ -1000,15 +1136,60 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>>> !exec_queue_pending_disable(q) ||
>>> guc_read_stopped(guc), HZ * 5);
>>> if (!ret || guc_read_stopped(guc)) {
>>> - drm_warn(&xe->drm, "Schedule disable failed to respond");
>>> - xe_sched_add_pending_job(sched, job);
>>> - xe_sched_submission_start(sched);
>>> +trigger_reset:
>>> + xe_gt_warn(guc_to_gt(guc), "Schedule disable failed to respond");
>> Not a problem introduced in this patch set, so maybe not necessary to fix
>> here either. But we have seen what look like false hits on this warning in
>> some of the reset tests. The code gets here if the schedule disable
>> genuinely times out, which is what the warning is saying. But it also gets
>> here if guc_read_stopped() is true, and that happens when a reset occurs
>> asynchronously to this timeout check. In that situation, there is no need to
>> fire a warning - the abort is intentional and expected. It is also not
>> necessary to queue up another reset just below. It seems like the warning
>> and the reset should be inside a further 'if (!ret)' check.
>>
> Agree. It should be:
>
>     if (!ret)
>             xe_gt_warn(guc_to_gt(guc), "Schedule disable failed to respond");
>
> Will fix in next rev or before merging.
What about the xe_gt_reset_async call? Should that happen only in the
case of a genuine timeout, or is there a reason to keep it in the case
of an abort as well?
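
If the answer is that the abort case needs neither, something like this
sketch would cover both (illustrative only, not tested):

	ret = wait_event_timeout(guc->ct.wq,
				 !exec_queue_pending_disable(q) ||
				 guc_read_stopped(guc), HZ * 5);
	if (!ret || guc_read_stopped(guc)) {
trigger_reset:
		/* Warn and reset only on a genuine timeout; if
		 * guc_read_stopped() is true a reset is already in
		 * flight and the abort is expected. */
		if (!ret) {
			xe_gt_warn(guc_to_gt(guc), "Schedule disable failed to respond");
			xe_gt_reset_async(q->gt);
		}
		set_exec_queue_extra_ref(q);
		xe_exec_queue_get(q); /* GT reset owns this */
		set_exec_queue_banned(q);
		xe_sched_tdr_queue_imm(sched);
		goto rearm;
	}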
>
>>> + set_exec_queue_extra_ref(q);
>>> + xe_exec_queue_get(q); /* GT reset owns this */
>>> + set_exec_queue_banned(q);
>>> xe_gt_reset_async(q->gt);
>>> xe_sched_tdr_queue_imm(sched);
>>> - goto out;
>>> + goto rearm;
>>> + }
>>> + }
>>> +
>>> + /*
>>> + * Check if the job has actually timed out; if not, restart job execution and the TDR
>>> + */
>>> + if (!wedged && !skip_timeout_check && !check_timeout(q, job) &&
>>> + !exec_queue_reset(q) && exec_queue_registered(q)) {
>>> + clear_exec_queue_check_timeout(q);
>>> + goto sched_enable;
>>> + }
>>> +
>>> + xe_gt_notice(guc_to_gt(guc), "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx",
>>> + xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
>>> + q->guc->id, q->flags);
>>> + xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
>>> + "Kernel-submitted job timed out\n");
>>> + xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
>>> + "VM job timed out on non-killed execqueue\n");
>> I still think it makes more sense to have these two warnings next to the
>> comment that says why these are unexpected errors...
>>
>>> +
>>> + trace_xe_sched_job_timedout(job);
>>> +
>>> + if (!exec_queue_killed(q))
>>> + xe_devcoredump(job);
>>> +
>>> + /*
>>> + * Kernel jobs should never fail, nor should VM jobs; if they do,
>>> + * something has gone wrong and the GT needs a reset
>>> + */
>> ... i.e. the warning about kernel jobs and VM jobs not failing should be
>> here.
>>
> Sure, can move these warns below this comment. Do you mind if I just fix
> this at merge time?
Sure.
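
For reference, the reordering being agreed on would look roughly like this
(a sketch of the merge-time fixup, not the final code):

	/*
	 * Kernel jobs should never fail, nor should VM jobs; if they do,
	 * something has gone wrong and the GT needs a reset.
	 */
	xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
		   "Kernel-submitted job timed out\n");
	xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
		   "VM job timed out on non-killed execqueue\n");

	if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
	    (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
		if (!xe_sched_invalidate_job(job, 2)) {
			clear_exec_queue_check_timeout(q);
			xe_gt_reset_async(q->gt);
			goto rearm;
		}
	}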
John.
>
> Matt
>
>> John.
>>
>>> + if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
>>> + (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
>>> + if (!xe_sched_invalidate_job(job, 2)) {
>>> + clear_exec_queue_check_timeout(q);
>>> + xe_gt_reset_async(q->gt);
>>> + goto rearm;
>>> }
>>> }
>>> + /* Finish cleaning up exec queue via deregister */
>>> + set_exec_queue_banned(q);
>>> + if (!wedged && exec_queue_registered(q) && !exec_queue_destroyed(q)) {
>>> + set_exec_queue_extra_ref(q);
>>> + xe_exec_queue_get(q);
>>> + __deregister_exec_queue(guc, q);
>>> + }
>>> +
>>> /* Stop fence signaling */
>>> xe_hw_fence_irq_stop(q->fence_irq);
>>> @@ -1030,7 +1211,19 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
>>> /* Start fence signaling */
>>> xe_hw_fence_irq_start(q->fence_irq);
>>> -out:
>>> + return DRM_GPU_SCHED_STAT_NOMINAL;
>>> +
>>> +sched_enable:
>>> + enable_scheduling(q);
>>> +rearm:
>>> + /*
>>> + * XXX: Ideally we want to adjust the timeout based on current execution time,
>>> + * but there is not currently an easy way to do that in the DRM scheduler. With
>>> + * some thought, do this in a follow up.
>>> + */
>>> + xe_sched_add_pending_job(sched, job);
>>> + xe_sched_submission_start(sched);
>>> +
>>> return DRM_GPU_SCHED_STAT_NOMINAL;
>>> }
>>> @@ -1133,7 +1326,6 @@ static void __guc_exec_queue_process_msg_suspend(struct xe_sched_msg *msg)
>>> guc_read_stopped(guc));
>>> if (!guc_read_stopped(guc)) {
>>> - MAKE_SCHED_CONTEXT_ACTION(q, DISABLE);
>>> s64 since_resume_ms =
>>> ktime_ms_delta(ktime_get(),
>>> q->guc->resume_time);
>>> @@ -1144,12 +1336,7 @@ static void __guc_exec_queue_process_msg_suspend(struct xe_sched_msg *msg)
>>> msleep(wait_ms);
>>> set_exec_queue_suspended(q);
>>> - clear_exec_queue_enabled(q);
>>> - set_exec_queue_pending_disable(q);
>>> - trace_xe_exec_queue_scheduling_disable(q);
>>> -
>>> - xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action),
>>> - G2H_LEN_DW_SCHED_CONTEXT_MODE_SET, 1);
>>> + disable_scheduling(q, false);
>>> }
>>> } else if (q->guc->suspend_pending) {
>>> set_exec_queue_suspended(q);
>>> @@ -1160,19 +1347,11 @@ static void __guc_exec_queue_process_msg_suspend(struct xe_sched_msg *msg)
>>> static void __guc_exec_queue_process_msg_resume(struct xe_sched_msg *msg)
>>> {
>>> struct xe_exec_queue *q = msg->private_data;
>>> - struct xe_guc *guc = exec_queue_to_guc(q);
>>> if (guc_exec_queue_allowed_to_change_state(q)) {
>>> - MAKE_SCHED_CONTEXT_ACTION(q, ENABLE);
>>> -
>>> q->guc->resume_time = RESUME_PENDING;
>>> clear_exec_queue_suspended(q);
>>> - set_exec_queue_pending_enable(q);
>>> - set_exec_queue_enabled(q);
>>> - trace_xe_exec_queue_scheduling_enable(q);
>>> -
>>> - xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action),
>>> - G2H_LEN_DW_SCHED_CONTEXT_MODE_SET, 1);
>>> + enable_scheduling(q);
>>> } else {
>>> clear_exec_queue_suspended(q);
>>> }
>>> @@ -1434,8 +1613,7 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
>>> /* Clean up lost G2H + reset engine state */
>>> if (exec_queue_registered(q)) {
>>> - if ((exec_queue_banned(q) && exec_queue_destroyed(q)) ||
>>> - xe_exec_queue_is_lr(q))
>>> + if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q))
>>> xe_exec_queue_put(q);
>>> else if (exec_queue_destroyed(q))
>>> __guc_exec_queue_fini(guc, q);
>>> @@ -1612,6 +1790,8 @@ static void handle_sched_done(struct xe_guc *guc, struct xe_exec_queue *q,
>>> smp_wmb();
>>> wake_up_all(&guc->ct.wq);
>>> } else {
>>> + bool check_timeout = exec_queue_check_timeout(q);
>>> +
>>> xe_gt_assert(guc_to_gt(guc), runnable_state == 0);
>>> xe_gt_assert(guc_to_gt(guc), exec_queue_pending_disable(q));
>>> @@ -1619,11 +1799,12 @@ static void handle_sched_done(struct xe_guc *guc, struct xe_exec_queue *q,
>>> if (q->guc->suspend_pending) {
>>> suspend_fence_signal(q);
>>> } else {
>>> - if (exec_queue_banned(q)) {
>>> + if (exec_queue_banned(q) || check_timeout) {
>>> smp_wmb();
>>> wake_up_all(&guc->ct.wq);
>>> }
>>> - deregister_exec_queue(guc, q);
>>> + if (!check_timeout)
>>> + deregister_exec_queue(guc, q);
>>> }
>>> }
>>> }
>>> @@ -1664,7 +1845,7 @@ static void handle_deregister_done(struct xe_guc *guc, struct xe_exec_queue *q)
>>> clear_exec_queue_registered(q);
>>> - if (exec_queue_banned(q) || xe_exec_queue_is_lr(q))
>>> + if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q))
>>> xe_exec_queue_put(q);
>>> else
>>> __guc_exec_queue_fini(guc, q);
>>> @@ -1728,7 +1909,7 @@ int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
>>> * guc_exec_queue_timedout_job.
>>> */
>>> set_exec_queue_reset(q);
>>> - if (!exec_queue_banned(q))
>>> + if (!exec_queue_banned(q) && !exec_queue_check_timeout(q))
>>> xe_guc_exec_queue_trigger_cleanup(q);
>>> return 0;
>>> @@ -1758,7 +1939,7 @@ int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
>>> /* Treat the same as engine reset */
>>> set_exec_queue_reset(q);
>>> - if (!exec_queue_banned(q))
>>> + if (!exec_queue_banned(q) && !exec_queue_check_timeout(q))
>>> xe_guc_exec_queue_trigger_cleanup(q);
>>> return 0;