[PATCH v3 3/5] drm/xe: Don't initialize fences at xe_sched_job_create()

Thomas Hellström thomas.hellstrom at linux.intel.com
Fri May 24 13:22:33 UTC 2024


Hi, Rodrigo.

Thanks for reviewing.

On Fri, 2024-05-24 at 09:12 -0400, Rodrigo Vivi wrote:
> On Fri, May 24, 2024 at 09:19:38AM +0200, Thomas Hellström wrote:
> > Pre-allocate but don't initialize fences at xe_sched_job_create(),
> > and initialize / arm them instead at xe_sched_job_arm(). This
> > makes it possible to move xe_sched_job_create() with its memory
> > allocation out of any lock that is required for fence
> > initialization, and that may not allow memory allocation under it.
> > 
> > Replaces the struct dma_fence_array for parallel jobs with a
> > struct dma_fence_chain, since the former doesn't allow
> > a split-up between allocation and initialization.
> 
> I'm wondering if it would be possible/better to split this change
> out into a separate patch.

It's possible to do it as a prereq patch, but then we'd have to remove all
that new code again in this patch, since the prereq would have to both
allocate and initialize in _job_create() to be really consistent. So I
think it is better kept here.

> 
> > 
> > v2:
> > - Rebase.
> > - Don't always use the first lrc when initializing parallel
> >   lrc fences.
> > - Use dma_fence_chain_contained() to access the lrc fences.
> > 
> > Signed-off-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> > ---
> >  drivers/gpu/drm/xe/xe_exec_queue.c       |   5 -
> >  drivers/gpu/drm/xe/xe_exec_queue_types.h |  10 --
> >  drivers/gpu/drm/xe/xe_ring_ops.c         |  12 +-
> >  drivers/gpu/drm/xe/xe_sched_job.c        | 159 +++++++++++++----------
> >  drivers/gpu/drm/xe/xe_sched_job_types.h  |  18 ++-
> >  drivers/gpu/drm/xe/xe_trace.h            |   2 +-
> >  6 files changed, 113 insertions(+), 93 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c
> > b/drivers/gpu/drm/xe/xe_exec_queue.c
> > index e8bf250f5b6a..e3cebec3de24 100644
> > --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> > +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> > @@ -96,11 +96,6 @@ static struct xe_exec_queue
> > *__xe_exec_queue_alloc(struct xe_device *xe,
> >  		}
> >  	}
> >  
> > -	if (xe_exec_queue_is_parallel(q)) {
> > -		q->parallel.composite_fence_ctx =
> > dma_fence_context_alloc(1);
> > -		q->parallel.composite_fence_seqno = 0;
> > -	}
> > -
> >  	return q;
> >  }
> >  
> > diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > index ee78d497d838..f0c40e8ad80a 100644
> > --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > @@ -103,16 +103,6 @@ struct xe_exec_queue {
> >  		struct xe_guc_exec_queue *guc;
> >  	};
> >  
> > -	/**
> > -	 * @parallel: parallel submission state
> > -	 */
> > -	struct {
> > -		/** @parallel.composite_fence_ctx: context
> > composite fence */
> > -		u64 composite_fence_ctx;
> > -		/** @parallel.composite_fence_seqno: seqno for
> > composite fence */
> > -		u32 composite_fence_seqno;
> > -	} parallel;
> > -
> >  	/** @sched_props: scheduling properties */
> >  	struct {
> >  		/** @sched_props.timeslice_us: timeslice period in
> > micro-seconds */
> > diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c
> > b/drivers/gpu/drm/xe/xe_ring_ops.c
> > index 2705d1f9d572..f75756e7a87b 100644
> > --- a/drivers/gpu/drm/xe/xe_ring_ops.c
> > +++ b/drivers/gpu/drm/xe/xe_ring_ops.c
> > @@ -366,7 +366,7 @@ static void emit_migration_job_gen12(struct
> > xe_sched_job *job,
> >  
> >  	dw[i++] = MI_ARB_ON_OFF | MI_ARB_DISABLE; /* Enabled again
> > below */
> >  
> > -	i = emit_bb_start(job->batch_addr[0], BIT(8), dw, i);
> > +	i = emit_bb_start(job->ptrs[0].batch_addr, BIT(8), dw, i);
> >  
> >  	if (!IS_SRIOV_VF(gt_to_xe(job->q->gt))) {
> >  		/* XXX: Do we need this? Leaving for now. */
> > @@ -375,7 +375,7 @@ static void emit_migration_job_gen12(struct
> > xe_sched_job *job,
> >  		dw[i++] = preparser_disable(false);
> >  	}
> >  
> > -	i = emit_bb_start(job->batch_addr[1], BIT(8), dw, i);
> > +	i = emit_bb_start(job->ptrs[1].batch_addr, BIT(8), dw, i);
> >  
> >  	dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | job->migrate_flush_flags |
> >  		MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_IMM_DW;
> > @@ -397,7 +397,7 @@ static void emit_job_gen12_gsc(struct
> > xe_sched_job *job)
> >  	xe_gt_assert(gt, job->q->width <= 1); /* no parallel
> > submission for GSCCS */
> >  
> >  	__emit_job_gen12_simple(job, job->q->lrc,
> > -				job->batch_addr[0],
> > +				job->ptrs[0].batch_addr,
> >  				xe_sched_job_lrc_seqno(job));
> >  }
> >  
> > @@ -413,7 +413,7 @@ static void emit_job_gen12_copy(struct
> > xe_sched_job *job)
> >  
> >  	for (i = 0; i < job->q->width; ++i)
> >  		__emit_job_gen12_simple(job, job->q->lrc + i,
> > -					job->batch_addr[i],
> > +					job->ptrs[i].batch_addr,
> >  					xe_sched_job_lrc_seqno(job));
> >  }
> >  
> > @@ -424,7 +424,7 @@ static void emit_job_gen12_video(struct
> > xe_sched_job *job)
> >  	/* FIXME: Not doing parallel handshake for now */
> >  	for (i = 0; i < job->q->width; ++i)
> >  		__emit_job_gen12_video(job, job->q->lrc + i,
> > -				       job->batch_addr[i],
> > +				       job->ptrs[i].batch_addr,
> >  				       xe_sched_job_lrc_seqno(job));
> >  }
> >  
> > @@ -434,7 +434,7 @@ static void
> > emit_job_gen12_render_compute(struct xe_sched_job *job)
> >  
> >  	for (i = 0; i < job->q->width; ++i)
> >  		__emit_job_gen12_render_compute(job, job->q->lrc + i,
> > -						job->batch_addr[i],
> > +						job->ptrs[i].batch_addr,
> >  						xe_sched_job_lrc_seqno(job));
> >  }
> >  
> > diff --git a/drivers/gpu/drm/xe/xe_sched_job.c
> > b/drivers/gpu/drm/xe/xe_sched_job.c
> > index 874450be327e..4537c7f9099f 100644
> > --- a/drivers/gpu/drm/xe/xe_sched_job.c
> > +++ b/drivers/gpu/drm/xe/xe_sched_job.c
> > @@ -6,7 +6,7 @@
> >  #include "xe_sched_job.h"
> >  
> >  #include <drm/xe_drm.h>
> > -#include <linux/dma-fence-array.h>
> > +#include <linux/dma-fence-chain.h>
> >  #include <linux/slab.h>
> >  
> >  #include "xe_device.h"
> > @@ -29,7 +29,7 @@ int __init xe_sched_job_module_init(void)
> >  	xe_sched_job_slab =
> >  		kmem_cache_create("xe_sched_job",
> >  				  sizeof(struct xe_sched_job) +
> > -				  sizeof(u64), 0,
> > +				  sizeof(struct xe_job_ptrs), 0,
> >  				  SLAB_HWCACHE_ALIGN, NULL);
> >  	if (!xe_sched_job_slab)
> >  		return -ENOMEM;
> > @@ -37,7 +37,7 @@ int __init xe_sched_job_module_init(void)
> >  	xe_sched_job_parallel_slab =
> >  		kmem_cache_create("xe_sched_job_parallel",
> >  				  sizeof(struct xe_sched_job) +
> > -				  sizeof(u64) *
> > +				  sizeof(struct xe_job_ptrs) *
> >  				  XE_HW_ENGINE_MAX_INSTANCE, 0,
> >  				  SLAB_HWCACHE_ALIGN, NULL);
> >  	if (!xe_sched_job_parallel_slab) {
> > @@ -79,26 +79,33 @@ static struct xe_device *job_to_xe(struct
> > xe_sched_job *job)
> >  	return gt_to_xe(job->q->gt);
> >  }
> >  
> > +/* Free unused pre-allocated fences */
> > +static void xe_sched_job_free_fences(struct xe_sched_job *job)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; i < job->q->width; ++i) {
> > +		struct xe_job_ptrs *ptrs = &job->ptrs[i];
> > +
> > +		if (ptrs->lrc_fence)
> > +			xe_lrc_free_seqno_fence(ptrs->lrc_fence);
> > +		if (ptrs->chain_fence)
> > +			dma_fence_chain_free(ptrs->chain_fence);
> > +	}
> > +}
> > +
> >  struct xe_sched_job *xe_sched_job_create(struct xe_exec_queue *q,
> >  					 u64 *batch_addr)
> >  {
> > -	struct xe_sched_job *job;
> > -	struct dma_fence **fences;
> >  	bool is_migration = xe_sched_job_is_migration(q);
> > +	struct xe_sched_job *job;
> >  	int err;
> > -	int i, j;
> > +	int i;
> >  	u32 width;
> >  
> >  	/* only a kernel context can submit a vm-less job */
> >  	XE_WARN_ON(!q->vm && !(q->flags &
> > EXEC_QUEUE_FLAG_KERNEL));
> >  
> > -	/* Migration and kernel engines have their own locking */
> > -	if (!(q->flags & (EXEC_QUEUE_FLAG_KERNEL |
> > EXEC_QUEUE_FLAG_VM))) {
> > -		lockdep_assert_held(&q->vm->lock);
> > -		if (!xe_vm_in_lr_mode(q->vm))
> > -			xe_vm_assert_held(q->vm);
> > -	}
> > -
> >  	job = job_alloc(xe_exec_queue_is_parallel(q) ||
> > is_migration);
> >  	if (!job)
> >  		return ERR_PTR(-ENOMEM);
> > @@ -111,43 +118,25 @@ struct xe_sched_job
> > *xe_sched_job_create(struct xe_exec_queue *q,
> >  	if (err)
> >  		goto err_free;
> >  
> > -	if (!xe_exec_queue_is_parallel(q)) {
> > -		job->fence = xe_lrc_create_seqno_fence(q->lrc);
> > -		if (IS_ERR(job->fence)) {
> > -			err = PTR_ERR(job->fence);
> > -			goto err_sched_job;
> > -		}
> > -		job->lrc_seqno = job->fence->seqno;
> > -	} else {
> > -		struct dma_fence_array *cf;
> > +	for (i = 0; i < q->width; ++i) {
> > +		struct dma_fence *fence =
> > xe_lrc_alloc_seqno_fence();
> > +		struct dma_fence_chain *chain;
> >  
> > -		fences = kmalloc_array(q->width, sizeof(*fences),
> > GFP_KERNEL);
> > -		if (!fences) {
> > -			err = -ENOMEM;
> > +		if (IS_ERR(fence)) {
> > +			err = PTR_ERR(fence);
> >  			goto err_sched_job;
> >  		}
> > +		job->ptrs[i].lrc_fence = fence;
> >  
> > -		for (j = 0; j < q->width; ++j) {
> > -			fences[j] = xe_lrc_create_seqno_fence(q->lrc + j);
> > -			if (IS_ERR(fences[j])) {
> > -				err = PTR_ERR(fences[j]);
> > -				goto err_fences;
> > -			}
> > -			if (!j)
> > -				job->lrc_seqno = fences[0]->seqno;
> > -		}
> > +		if (i + 1 == q->width)
> > +			continue;
> >  
> > -		cf = dma_fence_array_create(q->width, fences,
> > -					    q->parallel.composite_fence_ctx,
> > -					    q->parallel.composite_fence_seqno++,
> > -					    false);
> > -		if (!cf) {
> > -			--q->parallel.composite_fence_seqno;
> > +		chain = dma_fence_chain_alloc();
> > +		if (!chain) {
> >  			err = -ENOMEM;
> > -			goto err_fences;
> > +			goto err_sched_job;
> >  		}
> > -
> > -		job->fence = &cf->base;
> > +		job->ptrs[i].chain_fence = chain;
> >  	}
> >  
> >  	width = q->width;
> > @@ -155,19 +144,14 @@ struct xe_sched_job
> > *xe_sched_job_create(struct xe_exec_queue *q,
> >  		width = 2;
> >  
> >  	for (i = 0; i < width; ++i)
> > -		job->batch_addr[i] = batch_addr[i];
> > +		job->ptrs[i].batch_addr = batch_addr[i];
> >  
> >  	xe_pm_runtime_get_noresume(job_to_xe(job));
> >  	trace_xe_sched_job_create(job);
> >  	return job;
> >  
> > -err_fences:
> > -	for (j = j - 1; j >= 0; --j) {
> > -		--q->lrc[j].fence_ctx.next_seqno;
> > -		dma_fence_put(fences[j]);
> > -	}
> > -	kfree(fences);
> >  err_sched_job:
> > +	xe_sched_job_free_fences(job);
> >  	drm_sched_job_cleanup(&job->drm);
> >  err_free:
> >  	xe_exec_queue_put(q);
> > @@ -188,6 +172,7 @@ void xe_sched_job_destroy(struct kref *ref)
> >  		container_of(ref, struct xe_sched_job, refcount);
> >  	struct xe_device *xe = job_to_xe(job);
> >  
> > +	xe_sched_job_free_fences(job);
> >  	xe_exec_queue_put(job->q);
> >  	dma_fence_put(job->fence);
> >  	drm_sched_job_cleanup(&job->drm);
> > @@ -195,27 +180,32 @@ void xe_sched_job_destroy(struct kref *ref)
> >  	xe_pm_runtime_put(xe);
> >  }
> >  
> > -void xe_sched_job_set_error(struct xe_sched_job *job, int error)
> > +/* Set the error status under the fence lock to avoid racing with signaling */
> > +static bool xe_fence_set_error(struct dma_fence *fence, int error)
> >  {
> > -	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags))
> > -		return;
> > +	unsigned long irq_flags;
> > +	bool signaled;
> >  
> > -	dma_fence_set_error(job->fence, error);
> > +	spin_lock_irqsave(fence->lock, irq_flags);
> > +	signaled = test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags);
> > +	if (!signaled)
> > +		dma_fence_set_error(fence, error);
> > +	spin_unlock_irqrestore(fence->lock, irq_flags);
> > +
> > +	return signaled;
> > +}
> >  
> > -	if (dma_fence_is_array(job->fence)) {
> > -		struct dma_fence_array *array =
> > -			to_dma_fence_array(job->fence);
> > -		struct dma_fence **child = array->fences;
> > -		unsigned int nchild = array->num_fences;
> > +void xe_sched_job_set_error(struct xe_sched_job *job, int error)
> > +{
> > +	if (xe_fence_set_error(job->fence, error))
> > +		return;
> >  
> > -		do {
> > -			struct dma_fence *current_fence =
> > *child++;
> > +	if (dma_fence_is_chain(job->fence)) {
> > +		struct dma_fence *iter;
> >  
> > -			if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
> > -				     &current_fence->flags))
> > -				continue;
> > -			dma_fence_set_error(current_fence, error);
> > -		} while (--nchild);
> > +		dma_fence_chain_for_each(iter, job->fence)
> > +			xe_fence_set_error(dma_fence_chain_contained(iter),
> > +					   error);
> >  	}
> >  
> >  	trace_xe_sched_job_set_error(job);
> > @@ -230,7 +220,7 @@ bool xe_sched_job_started(struct xe_sched_job
> > *job)
> >  
> >  	return !__dma_fence_is_later(xe_sched_job_lrc_seqno(job),
> >  				     xe_lrc_start_seqno(lrc),
> > -				     dma_fence_array_first(job->fence)->ops);
> > +				     dma_fence_chain_contained(job->fence)->ops);
> >  }
> >  
> >  bool xe_sched_job_completed(struct xe_sched_job *job)
> > @@ -244,13 +234,24 @@ bool xe_sched_job_completed(struct
> > xe_sched_job *job)
> >  
> >  	return !__dma_fence_is_later(xe_sched_job_lrc_seqno(job),
> >  				     xe_lrc_seqno(lrc),
> > -				     dma_fence_array_first(job->fence)->ops);
> > +				     dma_fence_chain_contained(job->fence)->ops);
> >  }
> >  
> >  void xe_sched_job_arm(struct xe_sched_job *job)
> >  {
> >  	struct xe_exec_queue *q = job->q;
> > +	struct dma_fence *fence, *prev;
> >  	struct xe_vm *vm = q->vm;
> > +	u64 seqno = 0;
> > +	int i;
> > +
> > +	/* Migration and kernel engines have their own locking */
> > +	if (IS_ENABLED(CONFIG_LOCKDEP) &&
> > +	    !(q->flags & (EXEC_QUEUE_FLAG_KERNEL |
> > EXEC_QUEUE_FLAG_VM))) {
> > +		lockdep_assert_held(&q->vm->lock);
> > +		if (!xe_vm_in_lr_mode(q->vm))
> > +			xe_vm_assert_held(q->vm);
> > +	}
> >  
> >  	if (vm && !xe_sched_job_is_migration(q) &&
> > !xe_vm_in_lr_mode(vm) &&
> >  	    (vm->batch_invalidate_tlb || vm->tlb_flush_seqno != q->tlb_flush_seqno)) {
> > @@ -259,6 +260,25 @@ void xe_sched_job_arm(struct xe_sched_job
> > *job)
> >  		job->ring_ops_flush_tlb = true;
> >  	}
> >  
> > +	/* Arm the pre-allocated fences */
> > +	for (i = 0; i < q->width; prev = fence, ++i) {
> > +		struct dma_fence_chain *chain;
> > +
> > +		fence = job->ptrs[i].lrc_fence;
> > +		xe_lrc_init_seqno_fence(&q->lrc[i], fence);
> > +		job->ptrs[i].lrc_fence = NULL;
> > +		if (!i) {
> > +			job->lrc_seqno = fence->seqno;
> 
> same question of the first patch remains here...

I answered in the first patch thread.


> 
> > +			continue;
> 
> but here my doubt goes a bit beyond that: why do you only init
> the chain for the non-first ones?

A fence chain node combines two fences into one: either two LRC fences,
or a previous fence chain plus an LRC fence. On the first iteration we
don't yet have two fences to combine, so that step is skipped. Hence
there is always one fewer fence chain node than there are LRC fences,
and for non-parallel jobs there is no fence chain at all.
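
To illustrate, here is a simplified sketch of the arming loop in this
patch (not the literal driver code; lrc_fence[], chain[] and width are
just illustrative stand-ins for the pre-allocated fences in job->ptrs[]
and q->width):

	struct dma_fence *prev = NULL, *fence = NULL;
	u64 seqno = 0;
	int i;

	for (i = 0; i < width; prev = fence, ++i) {
		/* Arm the pre-allocated per-LRC fence against q->lrc[i]. */
		fence = lrc_fence[i];
		if (!i)
			continue;	/* nothing to combine with yet */

		/* chain[i - 1] joins what we have so far with lrc_fence[i]. */
		dma_fence_chain_init(chain[i - 1], prev, fence, seqno++);
		fence = &chain[i - 1]->base;
	}
	/* width == 1: plain LRC fence; width == N: a chain of N - 1 nodes. */
	job->fence = fence;

So a width-3 job ends up with three LRC fences joined by two chain nodes,
and a width-1 job uses its single LRC fence directly.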

Thanks,
Thomas

> 
> > +		}
> > +
> > +		chain = job->ptrs[i - 1].chain_fence;
> > +		dma_fence_chain_init(chain, prev, fence, seqno++);
> > +		job->ptrs[i - 1].chain_fence = NULL;
> > +		fence = &chain->base;
> > +	}
> > +
> > +	job->fence = fence;
> >  	drm_sched_job_arm(&job->drm);
> >  }
> >  
> > @@ -318,7 +338,8 @@ xe_sched_job_snapshot_capture(struct
> > xe_sched_job *job)
> >  
> >  	snapshot->batch_addr_len = q->width;
> >  	for (i = 0; i < q->width; i++)
> > -		snapshot->batch_addr[i] =
> > xe_device_uncanonicalize_addr(xe, job->batch_addr[i]);
> > +		snapshot->batch_addr[i] =
> > +			xe_device_uncanonicalize_addr(xe, job->ptrs[i].batch_addr);
> >  
> >  	return snapshot;
> >  }
> > diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h
> > b/drivers/gpu/drm/xe/xe_sched_job_types.h
> > index 990ddac55ed6..0d3f76fb05ce 100644
> > --- a/drivers/gpu/drm/xe/xe_sched_job_types.h
> > +++ b/drivers/gpu/drm/xe/xe_sched_job_types.h
> > @@ -11,6 +11,20 @@
> >  #include <drm/gpu_scheduler.h>
> >  
> >  struct xe_exec_queue;
> > +struct dma_fence;
> > +struct dma_fence_chain;
> > +
> > +/**
> > + * struct xe_job_ptrs - Per hw engine instance data
> > + */
> > +struct xe_job_ptrs {
> > +	/** @lrc_fence: Pre-allocated uninitialized lrc fence. */
> > +	struct dma_fence *lrc_fence;
> > +	/** @chain_fence: Pre-allocated uninitialized fence chain node. */
> > +	struct dma_fence_chain *chain_fence;
> > +	/** @batch_addr: Batch buffer address. */
> > +	u64 batch_addr;
> > +};
> >  
> >  /**
> >   * struct xe_sched_job - XE schedule job (batch buffer tracking)
> > @@ -43,8 +57,8 @@ struct xe_sched_job {
> >  	u32 migrate_flush_flags;
> >  	/** @ring_ops_flush_tlb: The ring ops need to flush TLB
> > before payload. */
> >  	bool ring_ops_flush_tlb;
> > -	/** @batch_addr: batch buffer address of job */
> > -	u64 batch_addr[];
> > +	/** @ptrs: per instance pointers. */
> > +	struct xe_job_ptrs ptrs[];
> 
> instance and ptrs sounds so generic here...
> I'm bad with names as well... but I'm wondering if it wouldn't be
> better
> to s/xe_sched_job *job/xe_sched_jobs *jobs/
> and then this one here is 'a' job: struct xe_job job[];
> 
> Another bad option xe_batches_and_fences batches_and_fences[]; :/
> 
> >  };
> >  
> >  struct xe_sched_job_snapshot {
> > diff --git a/drivers/gpu/drm/xe/xe_trace.h
> > b/drivers/gpu/drm/xe/xe_trace.h
> > index 6c6cecc58f63..450f407c66e8 100644
> > --- a/drivers/gpu/drm/xe/xe_trace.h
> > +++ b/drivers/gpu/drm/xe/xe_trace.h
> > @@ -272,7 +272,7 @@ DECLARE_EVENT_CLASS(xe_sched_job,
> >  			   __entry->flags = job->q->flags;
> >  			   __entry->error = job->fence->error;
> >  			   __entry->fence = job->fence;
> > -			   __entry->batch_addr = (u64)job->batch_addr[0];
> > +			   __entry->batch_addr = (u64)job->ptrs[0].batch_addr;
> >  			   ),
> >  
> >  		    TP_printk("fence=%p, seqno=%u, lrc_seqno=%u,
> > guc_id=%d, batch_addr=0x%012llx, guc_state=0x%x, flags=0x%x,
> > error=%d",
> > -- 
> > 2.44.0
> > 


