[PATCH] drm/xe: Take preemption into account when resubmitting jobs
Matthew Brost
matthew.brost at intel.com
Fri Aug 15 01:33:22 UTC 2025
On Tue, Aug 12, 2025 at 08:54:23PM +0200, Lis, Tomasz wrote:
>
> On 8/9/2025 6:34 AM, Matthew Brost wrote:
> > Take preemption into account when resubmitting jobs, and adjust the new
> > LRC head pointer accordingly to skip over previously executed parts of
> > the job. To support this, save the head pointer of each job when it is
> > emitted.
> >
> > This code can either be leveraged or reused for VF recovery.
>
> Right. VF migration recovery.
>
> This will help when extending the jobs' ring fixup code with ring-position
> control.
>
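To make the intended fixup concrete (illustrative numbers only, not from the
patch): if a job was emitted when the ring head was at 0x100 and the engine
had advanced the LRC head to 0x180 before the reset, the saved per-job head
lets the resubmit path compute an offset of 0x80 and start the re-emitted job
that far in, so the already-executed portion is not replayed.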
> >
> > Signed-off-by: Matthew Brost<matthew.brost at intel.com>
> > ---
> > drivers/gpu/drm/xe/xe_guc_submit.c | 23 +++++++++++++++++++++--
> > drivers/gpu/drm/xe/xe_ring_ops.c | 23 +++++++++++++++++++----
> > drivers/gpu/drm/xe/xe_sched_job_types.h | 2 ++
> > 3 files changed, 42 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> > index 1185b23b1384..3ba707bbb74d 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> > @@ -1954,16 +1954,35 @@ void xe_guc_submit_pause(struct xe_guc *guc)
> > xe_sched_submission_stop_async(&q->guc->sched);
> > }
> > +static int guc_lrc_offset(struct xe_lrc *lrc, u32 job_head)
> > +{
> > + if (xe_lrc_ring_head(lrc) == job_head)
> > + return 0;
>
> Not sure why this condition is singled out rather than using
> (job_head <= xe_lrc_ring_head(lrc)) below, but that's just a matter of
> individual style, so it can go either way.
>
> > +
> > + if (job_head < xe_lrc_ring_head(lrc))
> > + return xe_lrc_ring_head(lrc) - job_head;
> > +
> > + return lrc->ring.size - job_head + xe_lrc_ring_head(lrc);
>
> I don't think it's a good idea to read the head value from the LRC multiple
> times; this is a VRAM access. Also, if we're assuming the value in the LRC
> stays unchanged, maybe a comment would make sense, to avoid incorrect reuse?
>
> But rather than a comment, since it is used 4 times, a local variable is
> fully justified here.
>
Yes, even though this is not a hot path, it's better to use a local variable.
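Something along these lines (untested sketch):

static int guc_lrc_offset(struct xe_lrc *lrc, u32 job_head)
{
	u32 head = xe_lrc_ring_head(lrc);	/* read the LRC head once */

	if (head == job_head)
		return 0;

	if (job_head < head)
		return head - job_head;

	/* The LRC head has wrapped around the ring relative to the job head */
	return lrc->ring.size - job_head + head;
}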
Matt
> -Tomasz
>
> > +}
> > +
> > static void guc_exec_queue_start(struct xe_exec_queue *q)
> > {
> > struct xe_gpu_scheduler *sched = &q->guc->sched;
> > if (!exec_queue_killed_or_banned_or_wedged(q)) {
> > + struct xe_sched_job *job;
> > int i;
> > + job = xe_sched_first_pending_job(&q->guc->sched);
> > +
> > trace_xe_exec_queue_resubmit(q);
> > - for (i = 0; i < q->width; ++i)
> > - xe_lrc_set_ring_head(q->lrc[i], q->lrc[i]->ring.tail);
> > + for (i = 0; i < q->width; ++i) {
> > + int offset = !job ? 0 :
> > + guc_lrc_offset(q->lrc[i], job->ptrs[i].head);
> > +
> > + xe_lrc_set_ring_head(q->lrc[i], (q->lrc[i]->ring.tail +
> > + offset) % q->lrc[i]->ring.size);
> > + }
> > xe_sched_resubmit_jobs(sched);
> > }
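(For the wrap case, with an illustrative ring size of 0x1000: tail = 0xF80 and
offset = 0xC0 give a new head of 0x40, so the modulo keeps the head within the
ring.)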
> > diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c
> > index 5f15360d14bf..4dad28f0614d 100644
> > --- a/drivers/gpu/drm/xe/xe_ring_ops.c
> > +++ b/drivers/gpu/drm/xe/xe_ring_ops.c
> > @@ -245,12 +245,14 @@ static int emit_copy_timestamp(struct xe_lrc *lrc, u32 *dw, int i)
> > /* for engines that don't require any special HW handling (no EUs, no aux inval, etc) */
> > static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc,
> > - u64 batch_addr, u32 seqno)
> > + u64 batch_addr, u32 *head, u32 seqno)
> > {
> > u32 dw[MAX_JOB_SIZE_DW], i = 0;
> > u32 ppgtt_flag = get_ppgtt_flag(job);
> > struct xe_gt *gt = job->q->gt;
> > + *head = lrc->ring.tail;
> > +
> > i = emit_copy_timestamp(lrc, dw, i);
> > if (job->ring_ops_flush_tlb) {
> > @@ -296,7 +298,7 @@ static bool has_aux_ccs(struct xe_device *xe)
> > }
> > static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
> > - u64 batch_addr, u32 seqno)
> > + u64 batch_addr, u32 *head, u32 seqno)
> > {
> > u32 dw[MAX_JOB_SIZE_DW], i = 0;
> > u32 ppgtt_flag = get_ppgtt_flag(job);
> > @@ -304,6 +306,8 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
> > struct xe_device *xe = gt_to_xe(gt);
> > bool decode = job->q->class == XE_ENGINE_CLASS_VIDEO_DECODE;
> > + *head = lrc->ring.tail;
> > +
> > i = emit_copy_timestamp(lrc, dw, i);
> > dw[i++] = preparser_disable(true);
> > @@ -346,7 +350,8 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
> > static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
> > struct xe_lrc *lrc,
> > - u64 batch_addr, u32 seqno)
> > + u64 batch_addr, u32 *head,
> > + u32 seqno)
> > {
> > u32 dw[MAX_JOB_SIZE_DW], i = 0;
> > u32 ppgtt_flag = get_ppgtt_flag(job);
> > @@ -355,6 +360,8 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
> > bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK);
> > u32 mask_flags = 0;
> > + *head = lrc->ring.tail;
> > +
> > i = emit_copy_timestamp(lrc, dw, i);
> > dw[i++] = preparser_disable(true);
> > @@ -396,11 +403,14 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
> > }
> > static void emit_migration_job_gen12(struct xe_sched_job *job,
> > - struct xe_lrc *lrc, u32 seqno)
> > + struct xe_lrc *lrc, u32 *head,
> > + u32 seqno)
> > {
> > u32 saddr = xe_lrc_start_seqno_ggtt_addr(lrc);
> > u32 dw[MAX_JOB_SIZE_DW], i = 0;
> > + *head = lrc->ring.tail;
> > +
> > i = emit_copy_timestamp(lrc, dw, i);
> > i = emit_store_imm_ggtt(saddr, seqno, dw, i);
> > @@ -434,6 +444,7 @@ static void emit_job_gen12_gsc(struct xe_sched_job *job)
> > __emit_job_gen12_simple(job, job->q->lrc[0],
> > job->ptrs[0].batch_addr,
> > + &job->ptrs[0].head,
> > xe_sched_job_lrc_seqno(job));
> > }
> > @@ -443,6 +454,7 @@ static void emit_job_gen12_copy(struct xe_sched_job *job)
> > if (xe_sched_job_is_migration(job->q)) {
> > emit_migration_job_gen12(job, job->q->lrc[0],
> > + &job->ptrs[0].head,
> > xe_sched_job_lrc_seqno(job));
> > return;
> > }
> > @@ -450,6 +462,7 @@ static void emit_job_gen12_copy(struct xe_sched_job *job)
> > for (i = 0; i < job->q->width; ++i)
> > __emit_job_gen12_simple(job, job->q->lrc[i],
> > job->ptrs[i].batch_addr,
> > + &job->ptrs[i].head,
> > xe_sched_job_lrc_seqno(job));
> > }
> > @@ -461,6 +474,7 @@ static void emit_job_gen12_video(struct xe_sched_job *job)
> > for (i = 0; i < job->q->width; ++i)
> > __emit_job_gen12_video(job, job->q->lrc[i],
> > job->ptrs[i].batch_addr,
> > + &job->ptrs[i].head,
> > xe_sched_job_lrc_seqno(job));
> > }
> > @@ -471,6 +485,7 @@ static void emit_job_gen12_render_compute(struct xe_sched_job *job)
> > for (i = 0; i < job->q->width; ++i)
> > __emit_job_gen12_render_compute(job, job->q->lrc[i],
> > job->ptrs[i].batch_addr,
> > + &job->ptrs[i].head,
> > xe_sched_job_lrc_seqno(job));
> > }
> > diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h
> > index dbf260dded8d..359f93b0cdca 100644
> > --- a/drivers/gpu/drm/xe/xe_sched_job_types.h
> > +++ b/drivers/gpu/drm/xe/xe_sched_job_types.h
> > @@ -24,6 +24,8 @@ struct xe_job_ptrs {
> > struct dma_fence_chain *chain_fence;
> > /** @batch_addr: Batch buffer address. */
> > u64 batch_addr;
> > + /** @head: The head pointer of the LRC when the job was submitted */
> > + u32 head;
> > };
> > /**