[PATCH 3/7] drm/xe: Reserve fences where needed

Thomas Hellström thomas.hellstrom at linux.intel.com
Thu Mar 21 21:34:49 UTC 2024


On Thu, 2024-03-21 at 20:10 +0000, Matthew Brost wrote:
> On Thu, Mar 21, 2024 at 12:37:15PM +0100, Thomas Hellström wrote:
> > Reserve fence slots where actually needed rather than trying to
> > predict how many fence slots will be needed over a complete
> > wound-wait transaction.
> > 
> 
> You include this patch twice in the series.

Yes, the whole series got mixed up because I forgot to remove the old
series before sending the patches out.

Please don't bother continuing to review until v2 is out!

Some answers below.

> 
> > Fixes: 29f424eb8702 ("drm/xe/exec: move fence reservation")
> > Cc: Matthew Auld <matthew.auld at intel.com>
> > Signed-off-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> > ---
> >  drivers/gpu/drm/xe/xe_exec.c         | 27 +---------------
> >  drivers/gpu/drm/xe/xe_gt_pagefault.c |  3 +-
> >  drivers/gpu/drm/xe/xe_pt.c           | 14 ++++++++
> >  drivers/gpu/drm/xe/xe_vm.c           | 48 +++++++++++++---------------
> >  drivers/gpu/drm/xe/xe_vm.h           |  3 +-
> >  5 files changed, 40 insertions(+), 55 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
> > index 7692ebfe7d47..511d23426a5e 100644
> > --- a/drivers/gpu/drm/xe/xe_exec.c
> > +++ b/drivers/gpu/drm/xe/xe_exec.c
> > @@ -96,41 +96,16 @@
> >  
> >  static int xe_exec_fn(struct drm_gpuvm_exec *vm_exec)
> >  {
> > -	struct xe_vm *vm = container_of(vm_exec->vm, struct xe_vm, gpuvm);
> >  	struct drm_gem_object *obj;
> >  	unsigned long index;
> > -	int num_fences;
> >  	int ret;
> >  
> >  	ret = drm_gpuvm_validate(vm_exec->vm, &vm_exec->exec);
> >  	if (ret)
> >  		return ret;
> >  
> > -	/*
> > -	 * 1 fence slot for the final submit, and 1 more for every per-tile for
> > -	 * GPU bind and 1 extra for CPU bind. Note that there are potentially
> > -	 * many vma per object/dma-resv, however the fence slot will just be
> > -	 * re-used, since they are largely the same timeline and the seqno
> > -	 * should be in order. In the case of CPU bind there is dummy fence used
> > -	 * for all CPU binds, so no need to have a per-tile slot for that.
> > -	 */
> > -	num_fences = 1 + 1 + vm->xe->info.tile_count;
> > -
> > -	/*
> > -	 * We don't know upfront exactly how many fence slots we will need at
> > -	 * the start of the exec, since the TTM bo_validate above can consume
> > -	 * numerous fence slots. Also due to how the dma_resv_reserve_fences()
> > -	 * works it only ensures that at least that many fence slots are
> > -	 * available i.e if there are already 10 slots available and we reserve
> > -	 * two more, it can just noop without reserving anything. With this it
> > -	 * is quite possible that TTM steals some of the fence slots and then
> > -	 * when it comes time to do the vma binding and final exec stage we are
> > -	 * lacking enough fence slots, leading to some nasty BUG_ON() when
> > -	 * adding the fences. Hence just add our own fences here, after the
> > -	 * validate stage.
> > -	 */
> >  	drm_exec_for_each_locked_object(&vm_exec->exec, index, obj) {
> > -		ret = dma_resv_reserve_fences(obj->resv, num_fences);
> > +		ret = dma_resv_reserve_fences(obj->resv, 1);
> >  		if (ret)
> >  			return ret;
> >  	}
> > diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > index 241c294270d9..fa9e9853c53b 100644
> > --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> > @@ -100,10 +100,9 @@ static int xe_pf_begin(struct drm_exec *exec, struct xe_vma *vma,
> >  {
> >  	struct xe_bo *bo = xe_vma_bo(vma);
> >  	struct xe_vm *vm = xe_vma_vm(vma);
> > -	unsigned int num_shared = 2; /* slots for bind + move */
> >  	int err;
> >  
> > -	err = xe_vm_prepare_vma(exec, vma, num_shared);
> > +	err = xe_vm_lock_vma(exec, vma);
> >  	if (err)
> >  		return err;
> >  
> > diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> > index 109c26e5689e..7add08b2954e 100644
> > --- a/drivers/gpu/drm/xe/xe_pt.c
> > +++ b/drivers/gpu/drm/xe/xe_pt.c
> > @@ -1235,6 +1235,13 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue
> >  	err = xe_pt_prepare_bind(tile, vma, entries, &num_entries);
> >  	if (err)
> >  		goto err;
> > +
> > +	err = dma_resv_reserve_fences(xe_vm_resv(vm), 1);
> > +	if (!err && !xe_vma_has_no_bo(vma) && !xe_vma_bo(vma)->vm)
> > +		err = dma_resv_reserve_fences(xe_vma_bo(vma)->ttm.base.resv, 1);
> > +	if (err)
> > +		goto err;
> > +
> 
> Again not a huge fan of updating __xe_pt_bind_vma /
> __xe_pt_unbind_vma
> here because these are going away [1], but again conceptually this is
> more or less the correct place.
> 
> [1] https://patchwork.freedesktop.org/series/125608/
> 
> >  	xe_tile_assert(tile, num_entries <= ARRAY_SIZE(entries));
> >  
> >  	xe_vm_dbg_print_entries(tile_to_xe(tile), entries, num_entries);
> > @@ -1576,6 +1583,7 @@ __xe_pt_unbind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queu
> >  	struct dma_fence *fence = NULL;
> >  	struct invalidation_fence *ifence;
> >  	struct xe_range_fence *rfence;
> > +	int err;
> >  
> >  	LLIST_HEAD(deferred);
> >  
> > @@ -1593,6 +1601,12 @@ __xe_pt_unbind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queu
> >  	xe_pt_calc_rfence_interval(vma, &unbind_pt_update, entries,
> >  				   num_entries);
> >  
> > +	err = dma_resv_reserve_fences(xe_vm_resv(vm), 1);
> > +	if (!err && !xe_vma_has_no_bo(vma) && !xe_vma_bo(vma)->vm)
> > +		err = dma_resv_reserve_fences(xe_vma_bo(vma)->ttm.base.resv, 1);
> > +	if (err)
> > +		return ERR_PTR(err);
> > +
> >  	ifence = kzalloc(sizeof(*ifence), GFP_KERNEL);
> >  	if (!ifence)
> >  		return ERR_PTR(-ENOMEM);
> > diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> > index 80d43d75b1da..cff8db605743 100644
> > --- a/drivers/gpu/drm/xe/xe_vm.c
> > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > @@ -485,14 +485,11 @@ static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
> >  static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
> >  				 bool *done)
> >  {
> > +	struct drm_gem_object *obj;
> > +	unsigned long index;
> >  	int err;
> >  
> > -	/*
> > -	 * 1 fence for each preempt fence plus a fence for each tile from a
> > -	 * possible rebind
> > -	 */
> > -	err = drm_gpuvm_prepare_vm(&vm->gpuvm, exec, vm->preempt.num_exec_queues +
> > -				   vm->xe->info.tile_count);
> > +	err = drm_gpuvm_prepare_vm(&vm->gpuvm, exec, 0);
> >  	if (err)
> >  		return err;
> >  
> > @@ -507,7 +504,7 @@ static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
> >  		return 0;
> >  	}
> >  
> > -	err = drm_gpuvm_prepare_objects(&vm->gpuvm, exec, vm->preempt.num_exec_queues);
> > +	err = drm_gpuvm_prepare_objects(&vm->gpuvm, exec, 0);
> 
> I think this needs to be 1 as drm_gpuvm_validate needs a fence slot,
> right?

You mean for the bo move? It should be allocated by TTM on demand.

> 
> >  	if (err)
> >  		return err;
> >  
> > @@ -515,7 +512,17 @@ static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
> >  	if (err)
> >  		return err;
> >  
> > -	return drm_gpuvm_validate(&vm->gpuvm, exec);
> > +	err = drm_gpuvm_validate(&vm->gpuvm, exec);
> > +	if (err)
> > +		return err;
> > +
> > +	drm_exec_for_each_locked_object(exec, index, obj) {
> > +		err = dma_resv_reserve_fences(obj->resv, vm->preempt.num_exec_queues);
> 
> What if we only have external BOs? Do we allocate slots in the VM's
> dma-resv with this loop?

Yes, we do; the VM's resv is locked by drm_gpuvm_prepare_vm() above, so
the loop over all locked objects reserves slots in it as well.

> 
> Also what is the behavior of?
> 
> dma_resv_reserve_fences(obj, 1);
> dma_resv_reserve_fences(obj, 1);
> 
> Do we have 2 slots now? I.e., do two calls to reserve slots accumulate?

Nope, there is only guaranteed to be one. (There have been several
attempts to change those semantics, but I don't know what happened to
those, TBH.) I think we need to follow the principle:

"Anything that attaches a fence needs to reserve it, and no recursion,
to be future-proof."


> 
> Lastly, strictly speaking we don't need this patch in this series, do
> we? It doesn't appear to be fixing anything, unless I'm missing
> something.

It's actually the removed comment:

" ...however the fence slot will just be
* re-used, since they are largely the same timeline and the seqno
* should be in order. In the case of CPU bind there is dummy
fence used..." 

That comment made me scared. With the change to invalidation fences,
that is no longer true, and with multiple tiles we now do the right
thing with this patch AFAICT.

/Thomas



> 
> Matt
> 
> > +		if (err)
> > +			return err;
> > +	}
> > +
> > +	return 0;
> >  }
> >  
> >  static void preempt_rebind_work_func(struct work_struct *w)
> > @@ -1004,35 +1011,26 @@ static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
> >  }
> >  
> >  /**
> > - * xe_vm_prepare_vma() - drm_exec utility to lock a vma
> > + * xe_vm_lock_vma() - drm_exec utility to lock a vma
> >   * @exec: The drm_exec object we're currently locking for.
> >   * @vma: The vma for witch we want to lock the vm resv and any attached
> >   * object's resv.
> > - * @num_shared: The number of dma-fence slots to pre-allocate in the
> > - * objects' reservation objects.
> >   *
> >   * Return: 0 on success, negative error code on error. In particular
> >   * may return -EDEADLK on WW transaction contention and -EINTR if
> >   * an interruptible wait is terminated by a signal.
> >   */
> > -int xe_vm_prepare_vma(struct drm_exec *exec, struct xe_vma *vma,
> > -		      unsigned int num_shared)
> > +int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma)
> >  {
> >  	struct xe_vm *vm = xe_vma_vm(vma);
> >  	struct xe_bo *bo = xe_vma_bo(vma);
> >  	int err;
> >  
> >  	XE_WARN_ON(!vm);
> > -	if (num_shared)
> > -		err = drm_exec_prepare_obj(exec, xe_vm_obj(vm), num_shared);
> > -	else
> > -		err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
> > -	if (!err && bo && !bo->vm) {
> > -		if (num_shared)
> > -			err = drm_exec_prepare_obj(exec, &bo->ttm.base, num_shared);
> > -		else
> > -			err = drm_exec_lock_obj(exec, &bo->ttm.base);
> > -	}
> > +
> > +	err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
> > +	if (!err && bo && !bo->vm)
> > +		err = drm_exec_lock_obj(exec, &bo->ttm.base);
> >  
> >  	return err;
> >  }
> > @@ -1044,7 +1042,7 @@ static void xe_vma_destroy_unlocked(struct xe_vma *vma)
> >  
> >  	drm_exec_init(&exec, 0, 0);
> >  	drm_exec_until_all_locked(&exec) {
> > -		err = xe_vm_prepare_vma(&exec, vma, 0);
> > +		err = xe_vm_lock_vma(&exec, vma);
> >  		drm_exec_retry_on_contention(&exec);
> >  		if (XE_WARN_ON(err))
> >  			break;
> > @@ -2511,7 +2509,7 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
> >  
> >  	lockdep_assert_held_write(&vm->lock);
> >  
> > -	err = xe_vm_prepare_vma(exec, vma, 1);
> > +	err = xe_vm_lock_vma(exec, vma);
> >  	if (err)
> >  		return err;
> >  
> > diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h
> > index 6df1f1c7f85d..47f3960120a0 100644
> > --- a/drivers/gpu/drm/xe/xe_vm.h
> > +++ b/drivers/gpu/drm/xe/xe_vm.h
> > @@ -242,8 +242,7 @@ bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end);
> >  
> >  int xe_analyze_vm(struct drm_printer *p, struct xe_vm *vm, int gt_id);
> >  
> > -int xe_vm_prepare_vma(struct drm_exec *exec, struct xe_vma *vma,
> > -		      unsigned int num_shared);
> > +int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma);
> >  
> >  /**
> >   * xe_vm_resv() - Return's the vm's reservation object
> > -- 
> > 2.44.0
> > 


