[PATCH 5/8] drm/xe/oa: Signal output fences

Dixit, Ashutosh ashutosh.dixit at intel.com
Fri Aug 9 04:15:05 UTC 2024


On Thu, 08 Aug 2024 16:19:30 -0700, Matthew Brost wrote:
>

Hi Matt,

> On Thu, Aug 08, 2024 at 10:41:36AM -0700, Ashutosh Dixit wrote:
> > Complete 'struct xe_oa_fence' to include the dma_fence used to signal
> > output fences in the xe_sync array. The fences are signalled
> > asynchronously. When there are no output fences to signal, the OA
> > configuration wait is synchronously re-introduced into the ioctl.
> >
> > Signed-off-by: Ashutosh Dixit <ashutosh.dixit at intel.com>
> > ---
> >  drivers/gpu/drm/xe/xe_oa.c | 46 +++++++++++++++++++++++++++++++++++---
> >  1 file changed, 43 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c
> > index 416e031ac454b..bc421cd0af6ba 100644
> > --- a/drivers/gpu/drm/xe/xe_oa.c
> > +++ b/drivers/gpu/drm/xe/xe_oa.c
> > @@ -96,6 +96,10 @@ struct xe_oa_config_bo {
> >  };
> >
> >  struct xe_oa_fence {
>
> I don't think you need this at all.
>
> The fence from the job can directly be installed via
> xe_sync_entry_signal between xe_sched_job_arm and
> xe_sched_job_push.

I think I understand what you are saying here, that
job->drm.s_fence->finished can be used to signal the output fences. But the
complication in OA is that an additional delay is needed after the register
programming batch has executed (see below where I point this out) before we
signal the output fences. So the sequence is:

* Submit a register programming batch
* Wait till this completes (signaled by the finished fence)
* Wait for an additional delay
* Now signal output fences after this delay (signal ofence->base)

So 'struct xe_oa_fence' is needed because of this additional
delay. Otherwise I agree, we could just have used the finished fence.

> Then before xe_sync_entry_cleanup check for !num_signal and directly
> wait on the job's fence.

Here also, we never want to wait in the direct ioctl call, we always want
to do the wait in the work item (so it has to be an asychronous wait).

>
> Matt
>
> > +	/* @base: dma fence base */
> > +	struct dma_fence base;
> > +	/* @lock: lock for the fence */
> > +	spinlock_t lock;
> >	/* @xe: pointer to xe device */
> >	struct xe_device *xe;
> >	/* @work: work to signal that OA configuration is applied */
> > @@ -953,9 +957,26 @@ static void xe_oa_fence_work_fn(struct work_struct *w)
> >	/* Additional empirical delay needed for NOA programming after registers are written */
> >	usleep_range(us, 2 * us);

This is the additional delay after which we signal output fences.

Please let me know if you agree with this or if there's another way to do
this.

Thanks.
--
Ashutosh



> >
> > -	kfree(ofence);
> > +	/* Now signal fence to indicate new OA configuration is active */
> > +	dma_fence_signal(&ofence->base);
> > +	dma_fence_put(&ofence->base);
> >  }
> >
> > +static const char *xe_oa_get_driver_name(struct dma_fence *fence)
> > +{
> > +	return "xe_oa";
> > +}
> > +
> > +static const char *xe_oa_get_timeline_name(struct dma_fence *fence)
> > +{
> > +	return "unbound";
> > +}
> > +
> > +static const struct dma_fence_ops xe_oa_fence_ops = {
> > +	.get_driver_name = xe_oa_get_driver_name,
> > +	.get_timeline_name = xe_oa_get_timeline_name,
> > +};
> > +
> >  static struct xe_oa_fence *xe_oa_fence_init(struct xe_device *xe, struct dma_fence *config_fence)
> >  {
> >	struct xe_oa_fence *ofence;
> > @@ -967,6 +988,8 @@ static struct xe_oa_fence *xe_oa_fence_init(struct xe_device *xe, struct dma_fen
> >	ofence->xe = xe;
> >	INIT_WORK(&ofence->work, xe_oa_fence_work_fn);
> >	ofence->config_fence = config_fence;
> > +	spin_lock_init(&ofence->lock);
> > +	dma_fence_init(&ofence->base, &xe_oa_fence_ops, &ofence->lock, 0, 0);
> >
> >	return ofence;
> >  }
> > @@ -975,8 +998,8 @@ static int xe_oa_emit_oa_config(struct xe_oa_stream *stream, struct xe_oa_config
> >  {
> >	struct xe_oa_config_bo *oa_bo;
> >	struct xe_oa_fence *ofence;
> > +	int i, err, num_signal = 0;
> >	struct dma_fence *fence;
> > -	int err;
> >
> >	/* Emit OA configuration batch */
> >	oa_bo = xe_oa_alloc_config_buffer(stream, config);
> > @@ -989,13 +1012,30 @@ static int xe_oa_emit_oa_config(struct xe_oa_stream *stream, struct xe_oa_config
> >	if (err)
> >		goto exit;
> >
> > +	/* Initialize and set fence to signal */
> >	ofence = xe_oa_fence_init(stream->oa->xe, fence);
> >	if (IS_ERR(ofence)) {
> >		err = PTR_ERR(ofence);
> >		goto put_fence;
> >	}
> >
> > -	xe_oa_fence_work_fn(&ofence->work);
> > +	for (i = 0; i < stream->num_syncs; i++)
> > +		xe_sync_entry_signal(&stream->syncs[i], &ofence->base);
> > +
> > +	/* Schedule work to signal the fence */
> > +	queue_work(system_unbound_wq, &ofence->work);
> > +
> > +	/* If nothing needs to be signaled we wait synchronously */
> > +	for (i = 0; i < stream->num_syncs; i++)
> > +		if (stream->syncs[i].flags & DRM_XE_SYNC_FLAG_SIGNAL)
> > +			num_signal++;
> > +	if (!num_signal)
> > +		flush_work(&ofence->work);
> > +
> > +	/* Done with syncs */
> > +	for (i = 0; i < stream->num_syncs; i++)
> > +		xe_sync_entry_cleanup(&stream->syncs[i]);
> > +	kfree(stream->syncs);
> >
> >	return 0;
> >  put_fence:
> > --
> > 2.41.0
> >


More information about the Intel-xe mailing list