[PATCH] drm/xe: Take preemption into account when resubmitting jobs
Lis, Tomasz
tomasz.lis at intel.com
Tue Aug 12 18:54:23 UTC 2025
On 8/9/2025 6:34 AM, Matthew Brost wrote:
> Take preemption into account when resubmitting jobs, and adjust the new
> LRC head pointer accordingly to skip over previously executed parts of
> the job. To support this, save the head pointer of each job when it is
> emitted.
>
> This code can either be leveraged or reused for VF recovery.
Right. VF migration recovery.
This will help when extending the job ring fixup code with ring
position control.
>
> Signed-off-by: Matthew Brost<matthew.brost at intel.com>
> ---
> drivers/gpu/drm/xe/xe_guc_submit.c | 23 +++++++++++++++++++++--
> drivers/gpu/drm/xe/xe_ring_ops.c | 23 +++++++++++++++++++----
> drivers/gpu/drm/xe/xe_sched_job_types.h | 2 ++
> 3 files changed, 42 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 1185b23b1384..3ba707bbb74d 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -1954,16 +1954,35 @@ void xe_guc_submit_pause(struct xe_guc *guc)
> xe_sched_submission_stop_async(&q->guc->sched);
> }
>
> +static int guc_lrc_offset(struct xe_lrc *lrc, u32 job_head)
> +{
> + if (xe_lrc_ring_head(lrc) == job_head)
> + return 0;
Not sure why this condition is singled out rather than folding it into
(job_head <= xe_lrc_ring_head(lrc))
below, but that's just a matter of individual style, so it can go
either way.
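For reference, the folded form would just be:

	if (job_head <= xe_lrc_ring_head(lrc))
		return xe_lrc_ring_head(lrc) - job_head;

with the wraparound return below unchanged.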
> +
> + if (job_head < xe_lrc_ring_head(lrc))
> + return xe_lrc_ring_head(lrc) - job_head;
> +
> + return lrc->ring.size - job_head + xe_lrc_ring_head(lrc);
I don't think it's a good idea to read the head value from the LRC
multiple times; this is a VRAM access. Also, if we're assuming the
value in the LRC stays unchanged, a comment would make sense, to avoid
incorrect reuse. And since the value is used 4 times here, a local
variable is fully justified.
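Something along these lines is what I mean (just a sketch, keeping your
branch structure and only caching the head):

	static int guc_lrc_offset(struct xe_lrc *lrc, u32 job_head)
	{
		/*
		 * Read the ring head once; it lives in VRAM. Assumes the
		 * value cannot change while we compute the offset.
		 */
		u32 head = xe_lrc_ring_head(lrc);

		if (head == job_head)
			return 0;

		if (job_head < head)
			return head - job_head;

		return lrc->ring.size - job_head + head;
	}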
-Tomasz
> +}
> +
> static void guc_exec_queue_start(struct xe_exec_queue *q)
> {
> struct xe_gpu_scheduler *sched = &q->guc->sched;
>
> if (!exec_queue_killed_or_banned_or_wedged(q)) {
> + struct xe_sched_job *job;
> int i;
>
> + job = xe_sched_first_pending_job(&q->guc->sched);
> +
> trace_xe_exec_queue_resubmit(q);
> - for (i = 0; i < q->width; ++i)
> - xe_lrc_set_ring_head(q->lrc[i], q->lrc[i]->ring.tail);
> + for (i = 0; i < q->width; ++i) {
> + int offset = !job ? 0 :
> + guc_lrc_offset(q->lrc[i], job->ptrs[i].head);
> +
> + xe_lrc_set_ring_head(q->lrc[i], (q->lrc[i]->ring.tail +
> + offset) % q->lrc[i]->ring.size);
> + }
> xe_sched_resubmit_jobs(sched);
> }
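One note for my own understanding of the modulo here: if I follow the
resubmit flow correctly, the first pending job gets re-emitted starting
at ring.tail, so skipping the already-executed part lands the new head
at (tail + offset) % size, which also covers jobs that wrap past the
ring end. E.g. with made-up numbers:

	/*
	 * Hypothetical values: ring.size = 0x1000, tail = 0xe00, and
	 * 0x300 bytes of the job executed before preemption, so
	 * guc_lrc_offset() returns 0x300:
	 */
	u32 new_head = (0xe00 + 0x300) % 0x1000;	/* == 0x100, wrapped */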
>
> diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c
> index 5f15360d14bf..4dad28f0614d 100644
> --- a/drivers/gpu/drm/xe/xe_ring_ops.c
> +++ b/drivers/gpu/drm/xe/xe_ring_ops.c
> @@ -245,12 +245,14 @@ static int emit_copy_timestamp(struct xe_lrc *lrc, u32 *dw, int i)
>
> /* for engines that don't require any special HW handling (no EUs, no aux inval, etc) */
> static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc,
> - u64 batch_addr, u32 seqno)
> + u64 batch_addr, u32 *head, u32 seqno)
> {
> u32 dw[MAX_JOB_SIZE_DW], i = 0;
> u32 ppgtt_flag = get_ppgtt_flag(job);
> struct xe_gt *gt = job->q->gt;
>
> + *head = lrc->ring.tail;
> +
> i = emit_copy_timestamp(lrc, dw, i);
>
> if (job->ring_ops_flush_tlb) {
> @@ -296,7 +298,7 @@ static bool has_aux_ccs(struct xe_device *xe)
> }
>
> static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
> - u64 batch_addr, u32 seqno)
> + u64 batch_addr, u32 *head, u32 seqno)
> {
> u32 dw[MAX_JOB_SIZE_DW], i = 0;
> u32 ppgtt_flag = get_ppgtt_flag(job);
> @@ -304,6 +306,8 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
> struct xe_device *xe = gt_to_xe(gt);
> bool decode = job->q->class == XE_ENGINE_CLASS_VIDEO_DECODE;
>
> + *head = lrc->ring.tail;
> +
> i = emit_copy_timestamp(lrc, dw, i);
>
> dw[i++] = preparser_disable(true);
> @@ -346,7 +350,8 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
>
> static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
> struct xe_lrc *lrc,
> - u64 batch_addr, u32 seqno)
> + u64 batch_addr, u32 *head,
> + u32 seqno)
> {
> u32 dw[MAX_JOB_SIZE_DW], i = 0;
> u32 ppgtt_flag = get_ppgtt_flag(job);
> @@ -355,6 +360,8 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
> bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK);
> u32 mask_flags = 0;
>
> + *head = lrc->ring.tail;
> +
> i = emit_copy_timestamp(lrc, dw, i);
>
> dw[i++] = preparser_disable(true);
> @@ -396,11 +403,14 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
> }
>
> static void emit_migration_job_gen12(struct xe_sched_job *job,
> - struct xe_lrc *lrc, u32 seqno)
> + struct xe_lrc *lrc, u32 *head,
> + u32 seqno)
> {
> u32 saddr = xe_lrc_start_seqno_ggtt_addr(lrc);
> u32 dw[MAX_JOB_SIZE_DW], i = 0;
>
> + *head = lrc->ring.tail;
> +
> i = emit_copy_timestamp(lrc, dw, i);
>
> i = emit_store_imm_ggtt(saddr, seqno, dw, i);
> @@ -434,6 +444,7 @@ static void emit_job_gen12_gsc(struct xe_sched_job *job)
>
> __emit_job_gen12_simple(job, job->q->lrc[0],
> job->ptrs[0].batch_addr,
> + &job->ptrs[0].head,
> xe_sched_job_lrc_seqno(job));
> }
>
> @@ -443,6 +454,7 @@ static void emit_job_gen12_copy(struct xe_sched_job *job)
>
> if (xe_sched_job_is_migration(job->q)) {
> emit_migration_job_gen12(job, job->q->lrc[0],
> + &job->ptrs[0].head,
> xe_sched_job_lrc_seqno(job));
> return;
> }
> @@ -450,6 +462,7 @@ static void emit_job_gen12_copy(struct xe_sched_job *job)
> for (i = 0; i < job->q->width; ++i)
> __emit_job_gen12_simple(job, job->q->lrc[i],
> job->ptrs[i].batch_addr,
> + &job->ptrs[i].head,
> xe_sched_job_lrc_seqno(job));
> }
>
> @@ -461,6 +474,7 @@ static void emit_job_gen12_video(struct xe_sched_job *job)
> for (i = 0; i < job->q->width; ++i)
> __emit_job_gen12_video(job, job->q->lrc[i],
> job->ptrs[i].batch_addr,
> + &job->ptrs[i].head,
> xe_sched_job_lrc_seqno(job));
> }
>
> @@ -471,6 +485,7 @@ static void emit_job_gen12_render_compute(struct xe_sched_job *job)
> for (i = 0; i < job->q->width; ++i)
> __emit_job_gen12_render_compute(job, job->q->lrc[i],
> job->ptrs[i].batch_addr,
> + &job->ptrs[i].head,
> xe_sched_job_lrc_seqno(job));
> }
>
> diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h
> index dbf260dded8d..359f93b0cdca 100644
> --- a/drivers/gpu/drm/xe/xe_sched_job_types.h
> +++ b/drivers/gpu/drm/xe/xe_sched_job_types.h
> @@ -24,6 +24,8 @@ struct xe_job_ptrs {
> struct dma_fence_chain *chain_fence;
> /** @batch_addr: Batch buffer address. */
> u64 batch_addr;
> + /** @head: The head pointer of the LRC when the job was submitted */
> + u32 head;
> };
>
> /**