[PATCH v3 1/5] drm/xe: Decouple job seqno and lrc seqno

Thomas Hellström thomas.hellstrom at linux.intel.com
Fri May 24 12:56:51 UTC 2024


Hi, Rodrigo,

Thanks for reviewing,

On Fri, 2024-05-24 at 08:26 -0400, Rodrigo Vivi wrote:
> On Fri, May 24, 2024 at 09:19:36AM +0200, Thomas Hellström wrote:
> > From: Matthew Brost <matthew.brost at intel.com>
> > 
> > Tightly coupling these seqno presents problems if alternative
> > fences for
> > jobs are used. Decouple these for correctness.
> > 
> > v2:
> > - Slightly reword commit message (Thomas)
> > - Make sure the lrc fence ops are used in comparison (Thomas)
> > - Assume seqno is unsigned rather than signed in format string
> > (Thomas)
> > 
> > Cc: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> > Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> > Signed-off-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> > ---
> >  drivers/gpu/drm/xe/xe_exec_queue.c      |  2 +-
> >  drivers/gpu/drm/xe/xe_guc_submit.c      |  5 +++--
> >  drivers/gpu/drm/xe/xe_ring_ops.c        | 12 ++++++------
> >  drivers/gpu/drm/xe/xe_sched_job.c       | 16 ++++++++--------
> >  drivers/gpu/drm/xe/xe_sched_job.h       |  5 +++++
> >  drivers/gpu/drm/xe/xe_sched_job_types.h |  2 ++
> >  drivers/gpu/drm/xe/xe_trace.h           |  7 +++++--
> >  7 files changed, 30 insertions(+), 19 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c
> > b/drivers/gpu/drm/xe/xe_exec_queue.c
> > index 0fd61fb4d104..e8bf250f5b6a 100644
> > --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> > +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> > @@ -98,7 +98,7 @@ static struct xe_exec_queue
> > *__xe_exec_queue_alloc(struct xe_device *xe,
> >  
> >  	if (xe_exec_queue_is_parallel(q)) {
> >  		q->parallel.composite_fence_ctx =
> > dma_fence_context_alloc(1);
> > -		q->parallel.composite_fence_seqno =
> > XE_FENCE_INITIAL_SEQNO;
> > +		q->parallel.composite_fence_seqno = 0;
> 
> Why do you change this case to 0 instead of the -127?
> Shouldn't it be a separate patch since it is not directly the new
> lrc_seqno
> that this patch is introducing? 

This change was done by Matthew, but I think the intention was to
explicitly separate the composite fence seqno and the lrc fence seqno
to ensure they are different, so that we would catch any remaining
fallout from the decoupling.


> 
> >  	}
> >  
> >  	return q;
> > diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c
> > b/drivers/gpu/drm/xe/xe_guc_submit.c
> > index 54778189cfd5..53ab98c5ef5e 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> > @@ -940,8 +940,9 @@ guc_exec_queue_timedout_job(struct
> > drm_sched_job *drm_job)
> >  		return DRM_GPU_SCHED_STAT_NOMINAL;
> >  	}
> >  
> > -	drm_notice(&xe->drm, "Timedout job: seqno=%u, guc_id=%d,
> > flags=0x%lx",
> > -		   xe_sched_job_seqno(job), q->guc->id, q->flags);
> > +	drm_notice(&xe->drm, "Timedout job: seqno=%u,
> > lrc_seqno=%u, guc_id=%d, flags=0x%lx",
> > +		   xe_sched_job_seqno(job),
> > xe_sched_job_lrc_seqno(job),
> > +		   q->guc->id, q->flags);
> >  	xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
> >  		   "Kernel-submitted job timed out\n");
> >  	xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM &&
> > !exec_queue_killed(q),
> > diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c
> > b/drivers/gpu/drm/xe/xe_ring_ops.c
> > index a3ca718456f6..2705d1f9d572 100644
> > --- a/drivers/gpu/drm/xe/xe_ring_ops.c
> > +++ b/drivers/gpu/drm/xe/xe_ring_ops.c
> > @@ -398,7 +398,7 @@ static void emit_job_gen12_gsc(struct
> > xe_sched_job *job)
> >  
> >  	__emit_job_gen12_simple(job, job->q->lrc,
> >  				job->batch_addr[0],
> > -				xe_sched_job_seqno(job));
> > +				xe_sched_job_lrc_seqno(job));
> 
> Looking at this I got a bit confused now.
> If you are emitting the job, why you are using the lrc seqno and not
> the job seqno?
> I mean, the other seqno that remains looks like a fence_seqno and not
> a job_seqno,
> right? In this case wouldn't it be better if we could rename the
> other one to make
> it more clear the split so the next developer changes the seqnos
> don't get confused
> in which to use when/where?

You mean like xe_sched_job_seqno() and xe_sched_composite_seqno()?
I'll let Matt comment on that, but it's only parallel jobs that have
composite seqnos.

> 
> >  }
> >  
> >  static void emit_job_gen12_copy(struct xe_sched_job *job)
> > @@ -407,14 +407,14 @@ static void emit_job_gen12_copy(struct
> > xe_sched_job *job)
> >  
> >  	if (xe_sched_job_is_migration(job->q)) {
> >  		emit_migration_job_gen12(job, job->q->lrc,
> > -					 xe_sched_job_seqno(job));
> > +					
> > xe_sched_job_lrc_seqno(job));
> >  		return;
> >  	}
> >  
> >  	for (i = 0; i < job->q->width; ++i)
> >  		__emit_job_gen12_simple(job, job->q->lrc + i,
> > -				        job->batch_addr[i],
> > -				        xe_sched_job_seqno(job));
> > +					job->batch_addr[i],
> > +					xe_sched_job_lrc_seqno(job
> > ));
> >  }
> >  
> >  static void emit_job_gen12_video(struct xe_sched_job *job)
> > @@ -425,7 +425,7 @@ static void emit_job_gen12_video(struct
> > xe_sched_job *job)
> >  	for (i = 0; i < job->q->width; ++i)
> >  		__emit_job_gen12_video(job, job->q->lrc + i,
> >  				       job->batch_addr[i],
> > -				       xe_sched_job_seqno(job));
> > +				      
> > xe_sched_job_lrc_seqno(job));
> >  }
> >  
> >  static void emit_job_gen12_render_compute(struct xe_sched_job
> > *job)
> > @@ -435,7 +435,7 @@ static void
> > emit_job_gen12_render_compute(struct xe_sched_job *job)
> >  	for (i = 0; i < job->q->width; ++i)
> >  		__emit_job_gen12_render_compute(job, job->q->lrc +
> > i,
> >  						job-
> > >batch_addr[i],
> > -
> > 						xe_sched_job_seqno(job));
> > +						xe_sched_job_lrc_s
> > eqno(job));
> >  }
> >  
> >  static const struct xe_ring_ops ring_ops_gen12_gsc = {
> > diff --git a/drivers/gpu/drm/xe/xe_sched_job.c
> > b/drivers/gpu/drm/xe/xe_sched_job.c
> > index a4e030f5e019..874450be327e 100644
> > --- a/drivers/gpu/drm/xe/xe_sched_job.c
> > +++ b/drivers/gpu/drm/xe/xe_sched_job.c
> > @@ -117,6 +117,7 @@ struct xe_sched_job *xe_sched_job_create(struct
> > xe_exec_queue *q,
> >  			err = PTR_ERR(job->fence);
> >  			goto err_sched_job;
> >  		}
> > +		job->lrc_seqno = job->fence->seqno;
> >  	} else {
> >  		struct dma_fence_array *cf;
> >  
> > @@ -132,6 +133,8 @@ struct xe_sched_job *xe_sched_job_create(struct
> > xe_exec_queue *q,
> >  				err = PTR_ERR(fences[j]);
> >  				goto err_fences;
> >  			}
> > +			if (!j)
> > +				job->lrc_seqno = fences[0]->seqno;
> 
> why the lrc_seqno is = the first fence seqno?

In a parallel job, all LRC fences have the same seqno, so we can
grab any of them.

> 
> >  		}
> >  
> >  		cf = dma_fence_array_create(q->width, fences,
> > @@ -144,10 +147,6 @@ struct xe_sched_job
> > *xe_sched_job_create(struct xe_exec_queue *q,
> >  			goto err_fences;
> >  		}
> >  
> > -		/* Sanity check */
> > -		for (j = 0; j < q->width; ++j)
> > -			xe_assert(job_to_xe(job), cf->base.seqno
> > == fences[j]->seqno);
> > -
> >  		job->fence = &cf->base;
> >  	}
> >  
> > @@ -229,9 +228,9 @@ bool xe_sched_job_started(struct xe_sched_job
> > *job)
> >  {
> >  	struct xe_lrc *lrc = job->q->lrc;
> >  
> > -	return !__dma_fence_is_later(xe_sched_job_seqno(job),
> > +	return !__dma_fence_is_later(xe_sched_job_lrc_seqno(job),
> >  				     xe_lrc_start_seqno(lrc),
> > -				     job->fence->ops);
> > +				     dma_fence_array_first(job-
> > >fence)->ops);
> >  }
> >  
> >  bool xe_sched_job_completed(struct xe_sched_job *job)
> > @@ -243,8 +242,9 @@ bool xe_sched_job_completed(struct xe_sched_job
> > *job)
> >  	 * parallel handshake is done.
> >  	 */
> >  
> > -	return !__dma_fence_is_later(xe_sched_job_seqno(job),
> > xe_lrc_seqno(lrc),
> > -				     job->fence->ops);
> > +	return !__dma_fence_is_later(xe_sched_job_lrc_seqno(job),
> > +				     xe_lrc_seqno(lrc),
> > +				     dma_fence_array_first(job-
> > >fence)->ops);
> 
> the s/job->fence->ops/dma_fence_array_first(job->fence)->ops
> looks like a good candidate for a separate patch, no?!

Since we're now explicitly using the LRC seqno for the comparison here,
we also need to get at the LRC fence ops, which are the ops of one of
the contained fences. (If job->fence is not a dma_fence_array,
dma_fence_array_first() simply returns job->fence itself.)

So in all, since we change the first argument to __dma_fence_is_later,
it makes sense to also change the third to align.

Thanks,
Thomas



> 
> >  }
> >  
> >  void xe_sched_job_arm(struct xe_sched_job *job)
> > diff --git a/drivers/gpu/drm/xe/xe_sched_job.h
> > b/drivers/gpu/drm/xe/xe_sched_job.h
> > index c75018f4660d..002c3b5c0a5c 100644
> > --- a/drivers/gpu/drm/xe/xe_sched_job.h
> > +++ b/drivers/gpu/drm/xe/xe_sched_job.h
> > @@ -73,6 +73,11 @@ static inline u32 xe_sched_job_seqno(struct
> > xe_sched_job *job)
> >  	return job->fence->seqno;
> >  }
> >  
> > +static inline u32 xe_sched_job_lrc_seqno(struct xe_sched_job *job)
> > +{
> > +	return job->lrc_seqno;
> > +}
> > +
> >  static inline void
> >  xe_sched_job_add_migrate_flush(struct xe_sched_job *job, u32
> > flags)
> >  {
> > diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h
> > b/drivers/gpu/drm/xe/xe_sched_job_types.h
> > index 5e12724219fd..990ddac55ed6 100644
> > --- a/drivers/gpu/drm/xe/xe_sched_job_types.h
> > +++ b/drivers/gpu/drm/xe/xe_sched_job_types.h
> > @@ -37,6 +37,8 @@ struct xe_sched_job {
> >  		/** @user_fence.value: write back value */
> >  		u64 value;
> >  	} user_fence;
> > +	/** @lrc_seqno: LRC seqno */
> > +	u32 lrc_seqno;
> >  	/** @migrate_flush_flags: Additional flush flags for
> > migration jobs */
> >  	u32 migrate_flush_flags;
> >  	/** @ring_ops_flush_tlb: The ring ops need to flush TLB
> > before payload. */
> > diff --git a/drivers/gpu/drm/xe/xe_trace.h
> > b/drivers/gpu/drm/xe/xe_trace.h
> > index 2d56cfc09e42..6c6cecc58f63 100644
> > --- a/drivers/gpu/drm/xe/xe_trace.h
> > +++ b/drivers/gpu/drm/xe/xe_trace.h
> > @@ -254,6 +254,7 @@ DECLARE_EVENT_CLASS(xe_sched_job,
> >  
> >  		    TP_STRUCT__entry(
> >  			     __field(u32, seqno)
> > +			     __field(u32, lrc_seqno)
> >  			     __field(u16, guc_id)
> >  			     __field(u32, guc_state)
> >  			     __field(u32, flags)
> > @@ -264,6 +265,7 @@ DECLARE_EVENT_CLASS(xe_sched_job,
> >  
> >  		    TP_fast_assign(
> >  			   __entry->seqno =
> > xe_sched_job_seqno(job);
> > +			   __entry->lrc_seqno =
> > xe_sched_job_lrc_seqno(job);
> >  			   __entry->guc_id = job->q->guc->id;
> >  			   __entry->guc_state =
> >  			   atomic_read(&job->q->guc->state);
> > @@ -273,8 +275,9 @@ DECLARE_EVENT_CLASS(xe_sched_job,
> >  			   __entry->batch_addr = (u64)job-
> > >batch_addr[0];
> >  			   ),
> >  
> > -		    TP_printk("fence=%p, seqno=%u, guc_id=%d,
> > batch_addr=0x%012llx, guc_state=0x%x, flags=0x%x, error=%d",
> > -			      __entry->fence, __entry->seqno,
> > __entry->guc_id,
> > +		    TP_printk("fence=%p, seqno=%u, lrc_seqno=%u,
> > guc_id=%d, batch_addr=0x%012llx, guc_state=0x%x, flags=0x%x,
> > error=%d",
> > +			      __entry->fence, __entry->seqno,
> > +			      __entry->lrc_seqno, __entry->guc_id,
> >  			      __entry->batch_addr, __entry-
> > >guc_state,
> >  			      __entry->flags, __entry->error)
> >  );
> > -- 
> > 2.44.0
> > 



More information about the Intel-xe mailing list