[PATCH v3 1/5] drm/xe: Decouple job seqno and lrc seqno
Rodrigo Vivi
rodrigo.vivi at intel.com
Fri May 24 12:26:18 UTC 2024
On Fri, May 24, 2024 at 09:19:36AM +0200, Thomas Hellström wrote:
> From: Matthew Brost <matthew.brost at intel.com>
>
> Tightly coupling these seqno presents problems if alternative fences for
> jobs are used. Decouple these for correctness.
>
> v2:
> - Slightly reword commit message (Thomas)
> - Make sure the lrc fence ops are used in comparison (Thomas)
> - Assume seqno is unsigned rather than signed in format string (Thomas)
>
> Cc: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> Signed-off-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> ---
> drivers/gpu/drm/xe/xe_exec_queue.c | 2 +-
> drivers/gpu/drm/xe/xe_guc_submit.c | 5 +++--
> drivers/gpu/drm/xe/xe_ring_ops.c | 12 ++++++------
> drivers/gpu/drm/xe/xe_sched_job.c | 16 ++++++++--------
> drivers/gpu/drm/xe/xe_sched_job.h | 5 +++++
> drivers/gpu/drm/xe/xe_sched_job_types.h | 2 ++
> drivers/gpu/drm/xe/xe_trace.h | 7 +++++--
> 7 files changed, 30 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> index 0fd61fb4d104..e8bf250f5b6a 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> @@ -98,7 +98,7 @@ static struct xe_exec_queue *__xe_exec_queue_alloc(struct xe_device *xe,
>
> if (xe_exec_queue_is_parallel(q)) {
> q->parallel.composite_fence_ctx = dma_fence_context_alloc(1);
> - q->parallel.composite_fence_seqno = XE_FENCE_INITIAL_SEQNO;
> + q->parallel.composite_fence_seqno = 0;
Why do you change this to 0 here instead of keeping the -127? And shouldn't
this hunk be a separate patch, since it is not directly related to the new
lrc_seqno that this patch introduces?
> }
>
> return q;
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 54778189cfd5..53ab98c5ef5e 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -940,8 +940,9 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> return DRM_GPU_SCHED_STAT_NOMINAL;
> }
>
> - drm_notice(&xe->drm, "Timedout job: seqno=%u, guc_id=%d, flags=0x%lx",
> - xe_sched_job_seqno(job), q->guc->id, q->flags);
> + drm_notice(&xe->drm, "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx",
> + xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
> + q->guc->id, q->flags);
> xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
> "Kernel-submitted job timed out\n");
> xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
> diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c
> index a3ca718456f6..2705d1f9d572 100644
> --- a/drivers/gpu/drm/xe/xe_ring_ops.c
> +++ b/drivers/gpu/drm/xe/xe_ring_ops.c
> @@ -398,7 +398,7 @@ static void emit_job_gen12_gsc(struct xe_sched_job *job)
>
> __emit_job_gen12_simple(job, job->q->lrc,
> job->batch_addr[0],
> - xe_sched_job_seqno(job));
> + xe_sched_job_lrc_seqno(job));
Looking at this I got a bit confused. If you are emitting the job, why are
you using the lrc seqno and not the job seqno? I mean, the seqno that
remains looks like a fence seqno rather than a job seqno, right? In that
case, wouldn't it be better to rename the remaining one to make the split
clearer, so the next developer touching these seqnos doesn't get confused
about which one to use where? For example, something like the sketch below.
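A hypothetical rename (name made up by me, not part of this patch) that
would make the distinction self-documenting:

    /* Hypothetical: make explicit that this helper reads the scheduler
     * fence seqno, not the seqno emitted into the ring. */
    static inline u32 xe_sched_job_fence_seqno(struct xe_sched_job *job)
    {
            return job->fence->seqno;
    }

with xe_sched_job_lrc_seqno() then being the only thing the ring emission
code ever touches.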
> }
>
> static void emit_job_gen12_copy(struct xe_sched_job *job)
> @@ -407,14 +407,14 @@ static void emit_job_gen12_copy(struct xe_sched_job *job)
>
> if (xe_sched_job_is_migration(job->q)) {
> emit_migration_job_gen12(job, job->q->lrc,
> - xe_sched_job_seqno(job));
> + xe_sched_job_lrc_seqno(job));
> return;
> }
>
> for (i = 0; i < job->q->width; ++i)
> __emit_job_gen12_simple(job, job->q->lrc + i,
> - job->batch_addr[i],
> - xe_sched_job_seqno(job));
> + job->batch_addr[i],
> + xe_sched_job_lrc_seqno(job));
> }
>
> static void emit_job_gen12_video(struct xe_sched_job *job)
> @@ -425,7 +425,7 @@ static void emit_job_gen12_video(struct xe_sched_job *job)
> for (i = 0; i < job->q->width; ++i)
> __emit_job_gen12_video(job, job->q->lrc + i,
> job->batch_addr[i],
> - xe_sched_job_seqno(job));
> + xe_sched_job_lrc_seqno(job));
> }
>
> static void emit_job_gen12_render_compute(struct xe_sched_job *job)
> @@ -435,7 +435,7 @@ static void emit_job_gen12_render_compute(struct xe_sched_job *job)
> for (i = 0; i < job->q->width; ++i)
> __emit_job_gen12_render_compute(job, job->q->lrc + i,
> job->batch_addr[i],
> - xe_sched_job_seqno(job));
> + xe_sched_job_lrc_seqno(job));
> }
>
> static const struct xe_ring_ops ring_ops_gen12_gsc = {
> diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c
> index a4e030f5e019..874450be327e 100644
> --- a/drivers/gpu/drm/xe/xe_sched_job.c
> +++ b/drivers/gpu/drm/xe/xe_sched_job.c
> @@ -117,6 +117,7 @@ struct xe_sched_job *xe_sched_job_create(struct xe_exec_queue *q,
> err = PTR_ERR(job->fence);
> goto err_sched_job;
> }
> + job->lrc_seqno = job->fence->seqno;
> } else {
> struct dma_fence_array *cf;
>
> @@ -132,6 +133,8 @@ struct xe_sched_job *xe_sched_job_create(struct xe_exec_queue *q,
> err = PTR_ERR(fences[j]);
> goto err_fences;
> }
> + if (!j)
> + job->lrc_seqno = fences[0]->seqno;
Why is lrc_seqno set to the seqno of the first fence only?
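If the answer is that all lrc fences of a parallel queue are still
allocated with the same seqno at this point in the series, a comment here
would help, something like (my guess, so please correct):

    if (!j)
            /* All lrc fences of a parallel queue carry the same
             * seqno(?), so the first one can stand in as the job's
             * lrc_seqno. */
            job->lrc_seqno = fences[0]->seqno;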
> }
>
> cf = dma_fence_array_create(q->width, fences,
> @@ -144,10 +147,6 @@ struct xe_sched_job *xe_sched_job_create(struct xe_exec_queue *q,
> goto err_fences;
> }
>
> - /* Sanity check */
> - for (j = 0; j < q->width; ++j)
> - xe_assert(job_to_xe(job), cf->base.seqno == fences[j]->seqno);
> -
> job->fence = &cf->base;
> }
>
> @@ -229,9 +228,9 @@ bool xe_sched_job_started(struct xe_sched_job *job)
> {
> struct xe_lrc *lrc = job->q->lrc;
>
> - return !__dma_fence_is_later(xe_sched_job_seqno(job),
> + return !__dma_fence_is_later(xe_sched_job_lrc_seqno(job),
> xe_lrc_start_seqno(lrc),
> - job->fence->ops);
> + dma_fence_array_first(job->fence)->ops);
> }
>
> bool xe_sched_job_completed(struct xe_sched_job *job)
> @@ -243,8 +242,9 @@ bool xe_sched_job_completed(struct xe_sched_job *job)
> * parallel handshake is done.
> */
>
> - return !__dma_fence_is_later(xe_sched_job_seqno(job), xe_lrc_seqno(lrc),
> - job->fence->ops);
> + return !__dma_fence_is_later(xe_sched_job_lrc_seqno(job),
> + xe_lrc_seqno(lrc),
> + dma_fence_array_first(job->fence)->ops);
The s/job->fence->ops/dma_fence_array_first(job->fence)->ops/ change looks
like a good candidate for a separate patch, no?!
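For the record, my understanding of the fallback, quoting
drivers/dma-buf/dma-fence-array.c from memory (so double-check):

    struct dma_fence *dma_fence_array_first(struct dma_fence *head)
    {
            struct dma_fence_array *array;

            if (!head)
                    return NULL;

            /* Not an array: head itself is the hw fence, so the
             * non-parallel case keeps using the lrc fence ops. */
            array = to_dma_fence_array(head);
            if (!array)
                    return head;

            if (!array->num_fences)
                    return NULL;

            return array->fences[0];
    }

So the change should be safe for the non-parallel case too; it just grabs
the fence ops from an actual lrc fence instead of dma_fence_array_ops.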
> }
>
> void xe_sched_job_arm(struct xe_sched_job *job)
> diff --git a/drivers/gpu/drm/xe/xe_sched_job.h b/drivers/gpu/drm/xe/xe_sched_job.h
> index c75018f4660d..002c3b5c0a5c 100644
> --- a/drivers/gpu/drm/xe/xe_sched_job.h
> +++ b/drivers/gpu/drm/xe/xe_sched_job.h
> @@ -73,6 +73,11 @@ static inline u32 xe_sched_job_seqno(struct xe_sched_job *job)
> return job->fence->seqno;
> }
>
> +static inline u32 xe_sched_job_lrc_seqno(struct xe_sched_job *job)
> +{
> + return job->lrc_seqno;
> +}
> +
> static inline void
> xe_sched_job_add_migrate_flush(struct xe_sched_job *job, u32 flags)
> {
> diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h
> index 5e12724219fd..990ddac55ed6 100644
> --- a/drivers/gpu/drm/xe/xe_sched_job_types.h
> +++ b/drivers/gpu/drm/xe/xe_sched_job_types.h
> @@ -37,6 +37,8 @@ struct xe_sched_job {
> /** @user_fence.value: write back value */
> u64 value;
> } user_fence;
> + /** @lrc_seqno: LRC seqno */
> + u32 lrc_seqno;
> /** @migrate_flush_flags: Additional flush flags for migration jobs */
> u32 migrate_flush_flags;
> /** @ring_ops_flush_tlb: The ring ops need to flush TLB before payload. */
> diff --git a/drivers/gpu/drm/xe/xe_trace.h b/drivers/gpu/drm/xe/xe_trace.h
> index 2d56cfc09e42..6c6cecc58f63 100644
> --- a/drivers/gpu/drm/xe/xe_trace.h
> +++ b/drivers/gpu/drm/xe/xe_trace.h
> @@ -254,6 +254,7 @@ DECLARE_EVENT_CLASS(xe_sched_job,
>
> TP_STRUCT__entry(
> __field(u32, seqno)
> + __field(u32, lrc_seqno)
> __field(u16, guc_id)
> __field(u32, guc_state)
> __field(u32, flags)
> @@ -264,6 +265,7 @@ DECLARE_EVENT_CLASS(xe_sched_job,
>
> TP_fast_assign(
> __entry->seqno = xe_sched_job_seqno(job);
> + __entry->lrc_seqno = xe_sched_job_lrc_seqno(job);
> __entry->guc_id = job->q->guc->id;
> __entry->guc_state =
> atomic_read(&job->q->guc->state);
> @@ -273,8 +275,9 @@ DECLARE_EVENT_CLASS(xe_sched_job,
> __entry->batch_addr = (u64)job->batch_addr[0];
> ),
>
> - TP_printk("fence=%p, seqno=%u, guc_id=%d, batch_addr=0x%012llx, guc_state=0x%x, flags=0x%x, error=%d",
> - __entry->fence, __entry->seqno, __entry->guc_id,
> + TP_printk("fence=%p, seqno=%u, lrc_seqno=%u, guc_id=%d, batch_addr=0x%012llx, guc_state=0x%x, flags=0x%x, error=%d",
> + __entry->fence, __entry->seqno,
> + __entry->lrc_seqno, __entry->guc_id,
> __entry->batch_addr, __entry->guc_state,
> __entry->flags, __entry->error)
> );
> --
> 2.44.0
>