[PATCH v3 6/6] drm/xe: Sample ctx timestamp to determine if jobs have timed out
Matthew Brost
matthew.brost at intel.com
Sat Jun 8 02:29:40 UTC 2024
On Fri, Jun 07, 2024 at 07:13:51PM -0700, John Harrison wrote:
> On 6/7/2024 17:21, Matthew Brost wrote:
> > In GuC TDR sample ctx timestamp to determine if jobs have timed out. The
> > scheduling enable needs to be toggled to properly sample the timestamp.
> > If a job has not been running for longer than the timeout period,
> > re-enable scheduling and restart the TDR.
> >
> > v2:
> > - Use GT clock to msec helper (Umesh, off list)
> > - s/ctx_timestamp_job/ctx_job_timestamp
> > v3:
> > - Fix state machine for TDR, mainly decouple sched disable and deregister (testing)
> > - Rebase (CI)
> >
> > Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> > ---
> > drivers/gpu/drm/xe/xe_guc_submit.c | 218 +++++++++++++++++++++++------
> > 1 file changed, 175 insertions(+), 43 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> > index 4464ba337d12..716dd489d596 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> > @@ -23,6 +23,7 @@
> > #include "xe_force_wake.h"
> > #include "xe_gpu_scheduler.h"
> > #include "xe_gt.h"
> > +#include "xe_gt_clock.h"
> > #include "xe_gt_printk.h"
> > #include "xe_guc.h"
> > #include "xe_guc_ct.h"
> > @@ -62,6 +63,7 @@ exec_queue_to_guc(struct xe_exec_queue *q)
> > #define EXEC_QUEUE_STATE_KILLED (1 << 7)
> > #define EXEC_QUEUE_STATE_WEDGED (1 << 8)
> > #define EXEC_QUEUE_STATE_BANNED (1 << 9)
> > +#define EXEC_QUEUE_STATE_CHECK_TIMEOUT (1 << 10)
> > static bool exec_queue_registered(struct xe_exec_queue *q)
> > {
> > @@ -188,6 +190,21 @@ static void set_exec_queue_wedged(struct xe_exec_queue *q)
> > atomic_or(EXEC_QUEUE_STATE_WEDGED, &q->guc->state);
> > }
> > +static bool exec_queue_check_timeout(struct xe_exec_queue *q)
> > +{
> > + return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_CHECK_TIMEOUT;
> > +}
> > +
> > +static void set_exec_queue_check_timeout(struct xe_exec_queue *q)
> > +{
> > + atomic_or(EXEC_QUEUE_STATE_CHECK_TIMEOUT, &q->guc->state);
> > +}
> > +
> > +static void clear_exec_queue_check_timeout(struct xe_exec_queue *q)
> > +{
> > + atomic_and(~EXEC_QUEUE_STATE_CHECK_TIMEOUT, &q->guc->state);
> > +}
> > +
> > static bool exec_queue_killed_or_banned_or_wedged(struct xe_exec_queue *q)
> > {
> > return (atomic_read(&q->guc->state) &
> > @@ -920,6 +937,87 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
> > xe_sched_submission_start(sched);
> > }
> > +#define ADJUST_FIVE_PERCENT(__t) (((__t) * 105) / 100)
> > +
> > +static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job)
> > +{
> > + struct xe_gt *gt = guc_to_gt(exec_queue_to_guc(q));
> > + u32 ctx_timestamp = xe_lrc_ctx_timestamp(q->lrc[0]);
> > + u32 ctx_job_timestamp = xe_lrc_ctx_job_timestamp(q->lrc[0]);
> > + u32 timeout_ms = q->sched_props.job_timeout_ms;
> Is there any range bound on this? The hardware counter is only 32bits which
> gives a wrap at ~223s at the usual 19.2MHz.
>
Yes, default Kconfig options set this to 10s. dma-fencing rules apply
to this limit too - signal in a reasonable amount of time. Anything more
than 223s seems unreasonable. Maybe add an assert here that pops if this
is over 100s to be paranoid?
> > + u32 diff, running_time_ms;
> > +
> > + if (ctx_timestamp < ctx_job_timestamp)
> > + diff = ctx_timestamp + U32_MAX - ctx_job_timestamp;
> > + else
> > + diff = ctx_timestamp - ctx_job_timestamp;
> > +
> > + /*
> > + * Ensure timeout is within 5% to account for an GuC scheduling latency
> > + */
> > + running_time_ms =
> > + ADJUST_FIVE_PERCENT(xe_gt_clock_interval_to_ms(gt, diff));
> > +
> > + drm_info(&guc_to_xe(exec_queue_to_guc(q))->drm,
> > + "Check job timeout: seqno=%u, lrc_seqno=%u, guc_id=%d, running_time_ms=%u, timeout_ms=%u, diff=0x%08x",
> > + xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
> > + q->guc->id, running_time_ms, timeout_ms, diff);
> > +
> > + return running_time_ms >= timeout_ms;
> > +}
> > +
> > +static void enable_scheduling(struct xe_exec_queue *q)
> > +{
> > + MAKE_SCHED_CONTEXT_ACTION(q, ENABLE);
> > + struct xe_guc *guc = exec_queue_to_guc(q);
> > + struct xe_device *xe = guc_to_xe(guc);
> > + int ret;
> > +
> This is duplicating existing code? It would be good to replace the existing
> context enable code with a call to this as a common helper.
>
Let me see what I can do to clean things up.
> It would also be a good idea to add checks that there is not already a
> pending enable or disable in flight. A problem we have on i915 is that there
> is some really convoluted code around the context enable/disable that can
> lead to multiple operations being in flight concurrently and all sorts of
> things getting confused as a result. Indeed, it looks like the Xe done
> handler (xe_guc_sched_done_handler / handle_sched_done) is not checking the
> enable/disable data word and will break if both an enable and a disable are
> in flight back to back with the disable first.
>
Agree. Let me add some asserts to the helpers I come up with.
>
> > + set_exec_queue_pending_enable(q);
> > + set_exec_queue_enabled(q);
> > + trace_xe_exec_queue_scheduling_enable(q);
> > +
> > + xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action),
> > + G2H_LEN_DW_SCHED_CONTEXT_MODE_SET, 1);
> > +
> > + ret = wait_event_timeout(guc->ct.wq,
> > + !exec_queue_pending_enable(q) ||
> > + guc_read_stopped(guc), HZ * 5);
> > + if (!ret || guc_read_stopped(guc)) {
> > + drm_warn(&xe->drm, "Schedule enable failed to respond");
> > + set_exec_queue_banned(q);
> > + xe_gt_reset_async(q->gt);
> > + xe_sched_tdr_queue_imm(&q->guc->sched);
> > + }
> > +}
> > +
> > +static void disable_scheduling(struct xe_exec_queue *q)
> > +{
> > + MAKE_SCHED_CONTEXT_ACTION(q, DISABLE);
> > + struct xe_guc *guc = exec_queue_to_guc(q);
> Same comment about duplication and concurrency issues. Especially as this
> one does not appear to wait for the disable to complete so could lead to a
> back-to-back d/e.
>
The waiting is owned by the caller in this case (TDR). There is another
place that could call this which could use this helper (preempt fence
suspend path). I think the latter path can pipeline this call with a
scheduler enable though, while the former cannot.
> > +
> > + clear_exec_queue_enabled(q);
> > + set_exec_queue_pending_disable(q);
> > + trace_xe_exec_queue_scheduling_disable(q);
> > +
> > + xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action),
> > + G2H_LEN_DW_SCHED_CONTEXT_MODE_SET, 1);
> > +}
> > +
> > +static void __deregister_exec_queue(struct xe_guc *guc, struct xe_exec_queue *q)
> > +{
> > + u32 action[] = {
> > + XE_GUC_ACTION_DEREGISTER_CONTEXT,
> > + q->guc->id,
> > + };
> > +
> > + set_exec_queue_destroyed(q);
> > + trace_xe_exec_queue_deregister(q);
> > +
> > + xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action),
> > + G2H_LEN_DW_DEREGISTER_CONTEXT, 1);
> > +}
> > +
> > static enum drm_gpu_sched_stat
> > guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> > {
> > @@ -928,9 +1026,11 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> > struct xe_exec_queue *q = job->q;
> > struct xe_gpu_scheduler *sched = &q->guc->sched;
> > struct xe_device *xe = guc_to_xe(exec_queue_to_guc(q));
> > + struct xe_guc *guc = exec_queue_to_guc(q);
> > int err = -ETIME;
> > int i = 0;
> > - bool wedged;
> > + bool wedged, skip_timeout_check = exec_queue_reset(q) ||
> > + exec_queue_killed_or_banned_or_wedged(q);
> > /*
> > * TDR has fired before free job worker. Common if exec queue
> > @@ -942,50 +1042,31 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> > return DRM_GPU_SCHED_STAT_NOMINAL;
> > }
> > - drm_notice(&xe->drm, "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx",
> > - xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
> > - q->guc->id, q->flags);
> > - xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
> > - "Kernel-submitted job timed out\n");
> > - xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
> > - "VM job timed out on non-killed execqueue\n");
> > -
> > - if (!exec_queue_killed(q))
> > - xe_devcoredump(job);
> > -
> > - trace_xe_sched_job_timedout(job);
> > + /* Job hasn't started, can't be timed out */
> > + if (!skip_timeout_check && !xe_sched_job_started(job))
> > + goto rearm;
> > + /*
> > + * XXX: Sampling timeout doesn't work in wedged mode as we have to
> > + * modify scheduling state to read timestamp. We could read the
> > + * timestamp from a register to accumulate current running time but this
> > + * doesn't work for SRIOV. For now assuming timeouts in wedged mode are
> > + * genuine timeouts.
> > + */
> > wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));
> I'm confused. Doesn't wedge mean that the hardware is toast so all jobs are
> dead? I.e. when we wedge we should be cancelling everything that is in
> flight rather than letting it time out?
>
Wedged mode == 2 is such that on hang we do not modify HW state (e.g. we
can't toggle schedule enable). If you look at what this mode does, we just
take a ref to all in-use exec queues (so LRC memory sticks around and one
can poke the exec queue debugfs) and signal all fences.
> > /* Kill the run_job entry point */
> > xe_sched_submission_stop(sched);
> > - /*
> > - * Kernel jobs should never fail, nor should VM jobs if they do
> > - * somethings has gone wrong and the GT needs a reset
> > - */
> > - if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
> > - (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
> > - if (!xe_sched_invalidate_job(job, 2)) {
> > - xe_sched_add_pending_job(sched, job);
> > - xe_sched_submission_start(sched);
> > - xe_gt_reset_async(q->gt);
> > - goto out;
> > - }
> > - }
> > -
> > /* Engine state now stable, disable scheduling if needed */
> > if (!wedged && exec_queue_registered(q)) {
> > - struct xe_guc *guc = exec_queue_to_guc(q);
> > int ret;
> > if (exec_queue_reset(q))
> > err = -EIO;
> > - set_exec_queue_banned(q);
> > - if (!exec_queue_destroyed(q)) {
> > - xe_exec_queue_get(q);
> > - disable_scheduling_deregister(guc, q);
> > - }
> > + set_exec_queue_check_timeout(q);
> It would be nice to document what this flag is being used for. As in, why is
> it necessary to know that a timeout check is in progress?
>
Sure, will add something.
> > + if (!exec_queue_destroyed(q))
> > + disable_scheduling(q);
> Maybe have a comment that the first step is to disable in order to flush
> out the current run time. And if the already flushed run time is excessive
> then the context will be disabled anyway due to the timeout.
>
The 2nd step is to ban, deregister, and signal job fences. Can add a
comment.
> > /*
> > * Must wait for scheduling to be disabled before signalling
> > @@ -1001,14 +1082,49 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> > guc_read_stopped(guc), HZ * 5);
> > if (!ret || guc_read_stopped(guc)) {
> > drm_warn(&xe->drm, "Schedule disable failed to respond");
> > - xe_sched_add_pending_job(sched, job);
> > - xe_sched_submission_start(sched);
> > + clear_exec_queue_check_timeout(q);
> > + set_exec_queue_banned(q);
> > xe_gt_reset_async(q->gt);
> > xe_sched_tdr_queue_imm(sched);
> > - goto out;
> > + goto rearm;
> > + }
> > + }
> > +
> > + if (!skip_timeout_check && !check_timeout(q, job) &&
> > + !exec_queue_reset(q)) {
> > + clear_exec_queue_check_timeout(q);
> > + goto sched_enable;
> > + }
> > +
> > + drm_notice(&xe->drm, "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx",
> > + xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
> > + q->guc->id, q->flags);
> Maybe include the context run count here?
>
Like the values from check_timeout? We print that
information in that function.
> > + xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
> > + "Kernel-submitted job timed out\n");
> > + xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
> > + "VM job timed out on non-killed execqueue\n");
> > +
> > + trace_xe_sched_job_timedout(job);
> > +
> > + /*
> > + * Kernel jobs should never fail, nor should VM jobs if they do
> > + * somethings has gone wrong and the GT needs a reset
> > + */
> > + if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
> > + (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
> > + if (!xe_sched_invalidate_job(job, 2)) {
> > + xe_gt_reset_async(q->gt);
> > + goto rearm;
> > }
> > }
> > + if (!exec_queue_killed(q))
> > + xe_devcoredump(job);
> Is it not worth capturing the core dump for kernel / VM jobs?
>
This is in the wrong spot.
Matt
> John.
>
> > +
> > + set_exec_queue_banned(q);
> > + xe_exec_queue_get(q);
> > + __deregister_exec_queue(guc, q);
> > +
> > /* Stop fence signaling */
> > xe_hw_fence_irq_stop(q->fence_irq);
> > @@ -1030,7 +1146,19 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> > /* Start fence signaling */
> > xe_hw_fence_irq_start(q->fence_irq);
> > -out:
> > + return DRM_GPU_SCHED_STAT_NOMINAL;
> > +
> > +sched_enable:
> > + enable_scheduling(q);
> > +rearm:
> > + /*
> > + * XXX: Ideally want to adjust timeout based on current exection time
> > + * but there is not currently an easy way to do in DRM scheduler. With
> > + * some thought, do this in a follow up.
> > + */
> > + xe_sched_add_pending_job(sched, job);
> > + xe_sched_submission_start(sched);
> > +
> > return DRM_GPU_SCHED_STAT_NOMINAL;
> > }
> > @@ -1434,7 +1562,8 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
> > /* Clean up lost G2H + reset engine state */
> > if (exec_queue_registered(q)) {
> > - if ((exec_queue_banned(q) && exec_queue_destroyed(q)) ||
> > + if (((exec_queue_banned(q) || exec_queue_check_timeout(q))
> > + && exec_queue_destroyed(q)) ||
> > xe_exec_queue_is_lr(q))
> > xe_exec_queue_put(q);
> > else if (exec_queue_destroyed(q))
> > @@ -1606,11 +1735,13 @@ static void handle_sched_done(struct xe_guc *guc, struct xe_exec_queue *q)
> > if (q->guc->suspend_pending) {
> > suspend_fence_signal(q);
> > } else {
> > - if (exec_queue_banned(q)) {
> > + if (exec_queue_banned(q) ||
> > + exec_queue_check_timeout(q)) {
> > smp_wmb();
> > wake_up_all(&guc->ct.wq);
> > }
> > - deregister_exec_queue(guc, q);
> > + if (!exec_queue_check_timeout(q))
> > + deregister_exec_queue(guc, q);
> > }
> > }
> > }
> > @@ -1648,7 +1779,8 @@ static void handle_deregister_done(struct xe_guc *guc, struct xe_exec_queue *q)
> > clear_exec_queue_registered(q);
> > - if (exec_queue_banned(q) || xe_exec_queue_is_lr(q))
> > + if (exec_queue_banned(q) || exec_queue_check_timeout(q) ||
> > + xe_exec_queue_is_lr(q))
> > xe_exec_queue_put(q);
> > else
> > __guc_exec_queue_fini(guc, q);
> > @@ -1711,7 +1843,7 @@ int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
> > * guc_exec_queue_timedout_job.
> > */
> > set_exec_queue_reset(q);
> > - if (!exec_queue_banned(q))
> > + if (!exec_queue_banned(q) && !exec_queue_check_timeout(q))
> > xe_guc_exec_queue_trigger_cleanup(q);
> > return 0;
> > @@ -1741,7 +1873,7 @@ int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
> > /* Treat the same as engine reset */
> > set_exec_queue_reset(q);
> > - if (!exec_queue_banned(q))
> > + if (!exec_queue_banned(q) && !exec_queue_check_timeout(q))
> > xe_guc_exec_queue_trigger_cleanup(q);
> > return 0;
>
More information about the Intel-xe
mailing list