[Intel-xe] [PATCH 2/2] drm/xe: Fix the separate bind-engine race using coarse-granularity dependencies

Matthew Brost matthew.brost at intel.com
Sat Jul 1 04:21:29 UTC 2023


On Thu, Jun 29, 2023 at 10:51:34PM +0200, Thomas Hellström wrote:
> Separate bind-engines operating on the same VM range might race updating
> page-tables. To make sure that doesn't happen, each page-table update
> operation needs to collect internal dependencies to await before the
> job is executed.
> 
> Provide an infrastructure to do that. Initially we save a single dma-fence
> for the entire VM, which thus removes the benefit of separate bind-engines
> in favour of fixing the race, but more fine-grained dependency
> tracking can be achieved by using, for example, the same method as the
> i915 vma_resources (an interval tree storing unsignaled fences). That
> of course comes with increased code complexity.
> 
> This patch will break the xe_vm@bind-engines-independent igt test, but that
> test would need an update anyway to avoid the independent binds using the
> same address range. In any case, such a test would not work with the
> initial xe implementation unless the binds were using different vms.
> 

We need to do better than this, as it makes separate bind engines useless:
everything is serialized.

Hmm, how about an mtree (maple tree) where we store fences for un/bind jobs,
with the key being the range covered by the highest page-table level at which
the tree is pruned or unpruned?

Let's walk through an example on an empty tree with 48 bits of VA and 4k pages
(so one top-level entry covers 1 << 39 bytes: three 9-bit levels below it plus
the 12-bit page offset):

- Bind 0x0000 to 0x1000 <- Inserts an mtree entry with key 0x0 -> (0x1 << 39) holding fence A

- Bind 0x1000 to 0x2000 <- Waits on fence A as the lookup finds it; no new
  fence inserted as the only entry inserted was a level 0 leaf

- Bind (0x1 << 39) to (0x1 << 39) + 0x1000 <- No need to wait on fence A as
  the lookup fails; insert new fence B with key (0x1 << 39) -> (0x2 << 39)

- Unbind 0x1000 to 0x2000 <- No need to wait on fence A as the lookup fails;
  no new fence inserted as the only entry removed was a level 0 leaf

- Unbind 0x0000 to 0x1000 <- Waits on fence A as the lookup finds it; insert
  fence C with key 0x0 -> (0x1 << 39)

I think this would be fairly simple to implement. The GPUVA series has
examples of how to implement mtrees with range keys [1].
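
Something like the below, perhaps - a completely untested sketch using the
maple tree and dma-fence APIs. All the xe_* names here are made up for
illustration; insertion would happen roughly where this patch currently
sets vm->last_update_fence:

#include <linux/dma-fence.h>
#include <linux/maple_tree.h>
#include <drm/gpu_scheduler.h>

/*
 * Collect dependencies for a page-table update job from an mtree of
 * range-keyed fences overlapping [start, last].
 */
static int xe_pt_add_range_deps(struct maple_tree *mt,
				struct drm_sched_job *job,
				unsigned long start, unsigned long last)
{
	unsigned long index = start;
	struct dma_fence *fence;
	int err;

	/* mt_find() returns the entries overlapping [index, last] in order */
	while ((fence = mt_find(mt, &index, last))) {
		/* Signaled fences are dead weight; leave them for the GC */
		if (dma_fence_is_signaled(fence))
			continue;

		dma_fence_get(fence);
		err = drm_sched_job_add_dependency(job, fence);
		if (err) {
			dma_fence_put(fence);
			return err;
		}
	}

	return 0;
}

/*
 * Record the job fence against the range whose page-table structure the
 * job changes (the un/pruned level from the example above).
 */
static int xe_pt_record_range_fence(struct maple_tree *mt,
				    struct dma_fence *fence,
				    unsigned long start, unsigned long last)
{
	/*
	 * A real implementation would also need to put the references to
	 * any fences this store replaces.
	 */
	return mtree_store_range(mt, start, last, dma_fence_get(fence),
				 GFP_KERNEL);
}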

One more thing is how to clean up the mtree fences. I think a garbage
collector which traverses the mtree every so often and removes signaled
fences should work just fine.
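
Again a completely untested sketch, assuming a delayed work item on the vm
(vm->pt_fences and vm->fence_gc_work are made-up members) and assuming that
erasing entries from under a mas_for_each() walk is OK:

#include <linux/workqueue.h>

static void xe_vm_fence_gc(struct work_struct *work)
{
	struct xe_vm *vm = container_of(work, struct xe_vm,
					fence_gc_work.work);
	struct dma_fence *fence;
	MA_STATE(mas, &vm->pt_fences, 0, ULONG_MAX);

	mtree_lock(&vm->pt_fences);
	mas_for_each(&mas, fence, ULONG_MAX) {
		/* Reap fences that have signaled since they were stored */
		if (dma_fence_is_signaled(fence)) {
			mas_erase(&mas);
			dma_fence_put(fence);
		}
	}
	mtree_unlock(&vm->pt_fences);

	/* Re-arm, e.g. once a second */
	queue_delayed_work(system_wq, &vm->fence_gc_work, HZ);
}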

What do you think? Crazy idea or does it seem reasonable? If it is the
latter, let's talk about who should code this up.

Lastly, I have IGTs that expose these races [2], [3]. I think the IGTs
should work after these changes.

Matt

[1] https://patchwork.freedesktop.org/patch/544863/?series=120000&rev=3
[2] https://gitlab.freedesktop.org/drm/xe/igt-gpu-tools/-/merge_requests/13/diffs?commit_id=2de056f6e9213a804f8b0489bbd91b989834d158
[3] https://gitlab.freedesktop.org/drm/xe/igt-gpu-tools/-/merge_requests/13/diffs?commit_id=23ea98fce7523b2aa252f4fe19411f5591a5623b

> Signed-off-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> ---
>  drivers/gpu/drm/xe/xe_migrate.c  |  2 ++
>  drivers/gpu/drm/xe/xe_migrate.h  |  2 ++
>  drivers/gpu/drm/xe/xe_pt.c       | 48 ++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/xe/xe_vm.c       |  1 +
>  drivers/gpu/drm/xe/xe_vm_types.h |  8 ++++++
>  5 files changed, 61 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> index 41c90f6710ee..ff0a422f59a5 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -1073,6 +1073,7 @@ xe_migrate_update_pgtables_cpu(struct xe_migrate *m,
>  		return ERR_PTR(-ETIME);
>  
>  	if (ops->pre_commit) {
> +		pt_update->job = NULL;
>  		err = ops->pre_commit(pt_update);
>  		if (err)
>  			return ERR_PTR(err);
> @@ -1294,6 +1295,7 @@ xe_migrate_update_pgtables(struct xe_migrate *m,
>  		goto err_job;
>  
>  	if (ops->pre_commit) {
> +		pt_update->job = job;
>  		err = ops->pre_commit(pt_update);
>  		if (err)
>  			goto err_job;
> diff --git a/drivers/gpu/drm/xe/xe_migrate.h b/drivers/gpu/drm/xe/xe_migrate.h
> index 204337ea3b4e..b4135876e3f7 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.h
> +++ b/drivers/gpu/drm/xe/xe_migrate.h
> @@ -69,6 +69,8 @@ struct xe_migrate_pt_update {
>  	const struct xe_migrate_pt_update_ops *ops;
>  	/** @vma: The vma we're updating the pagetable for. */
>  	struct xe_vma *vma;
> +	/** @job: The job if a GPU page-table update. NULL otherwise */
> +	struct xe_sched_job *job;
>  };
>  
>  struct xe_migrate *xe_migrate_init(struct xe_tile *tile);
> diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> index fe1c77b139e4..f38e7b5a3b32 100644
> --- a/drivers/gpu/drm/xe/xe_pt.c
> +++ b/drivers/gpu/drm/xe/xe_pt.c
> @@ -1119,6 +1119,42 @@ struct xe_pt_migrate_pt_update {
>  	bool locked;
>  };
>  
> +/*
> + * This function adds the needed dependencies to a page-table update job
> + * to make sure racing jobs for separate bind engines don't race writing
> + * to the same page-table range, wreaking havoc. Initially use a single
> + * fence for the entire VM. An optimization would use smaller granularity.
> + */
> +static int xe_pt_vm_dependencies(struct xe_sched_job *job, struct xe_vm *vm)
> +{
> +	int err;
> +
> +	if (!vm->last_update_fence)
> +		return 0;
> +
> +	if (dma_fence_is_signaled(vm->last_update_fence)) {
> +		dma_fence_put(vm->last_update_fence);
> +		vm->last_update_fence = NULL;
> +		return 0;
> +	}
> +
> +	/* Is this a CPU update? GPU is busy updating, so return an error */
> +	if (!job)
> +		return -ETIME;
> +
> +	dma_fence_get(vm->last_update_fence);
> +	err = drm_sched_job_add_dependency(&job->drm, vm->last_update_fence);
> +	if (err)
> +		dma_fence_put(vm->last_update_fence);
> +
> +	return err;
> +}
> +
> +static int xe_pt_pre_commit(struct xe_migrate_pt_update *pt_update)
> +{
> +	return xe_pt_vm_dependencies(pt_update->job, pt_update->vma->vm);
> +}
> +
>  static int xe_pt_userptr_pre_commit(struct xe_migrate_pt_update *pt_update)
>  {
>  	struct xe_pt_migrate_pt_update *userptr_update =
> @@ -1126,6 +1162,10 @@ static int xe_pt_userptr_pre_commit(struct xe_migrate_pt_update *pt_update)
>  	struct xe_vma *vma = pt_update->vma;
>  	unsigned long notifier_seq = vma->userptr.notifier_seq;
>  	struct xe_vm *vm = vma->vm;
> +	int err = xe_pt_vm_dependencies(pt_update->job, vm);
> +
> +	if (err)
> +		return err;
>  
>  	userptr_update->locked = false;
>  
> @@ -1164,6 +1204,7 @@ static int xe_pt_userptr_pre_commit(struct xe_migrate_pt_update *pt_update)
>  
>  static const struct xe_migrate_pt_update_ops bind_ops = {
>  	.populate = xe_vm_populate_pgtable,
> +	.pre_commit = xe_pt_pre_commit,
>  };
>  
>  static const struct xe_migrate_pt_update_ops userptr_bind_ops = {
> @@ -1345,6 +1386,9 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_engine *e,
>  	if (!IS_ERR(fence)) {
>  		LLIST_HEAD(deferred);
>  
> +		dma_fence_put(vm->last_update_fence);
> +		vm->last_update_fence = dma_fence_get(fence);
> +
>  		/* TLB invalidation must be done before signaling rebind */
>  		if (ifence) {
>  			int err = invalidation_fence_init(tile->primary_gt, ifence, fence,
> @@ -1591,6 +1635,7 @@ xe_pt_commit_unbind(struct xe_vma *vma,
>  
>  static const struct xe_migrate_pt_update_ops unbind_ops = {
>  	.populate = xe_migrate_clear_pgtable_callback,
> +	.pre_commit = xe_pt_pre_commit,
>  };
>  
>  static const struct xe_migrate_pt_update_ops userptr_unbind_ops = {
> @@ -1666,6 +1711,9 @@ __xe_pt_unbind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_engine *e
>  	if (!IS_ERR(fence)) {
>  		int err;
>  
> +		dma_fence_put(vm->last_update_fence);
> +		vm->last_update_fence = dma_fence_get(fence);
> +
>  		/* TLB invalidation must be done before signaling unbind */
>  		err = invalidation_fence_init(tile->primary_gt, ifence, fence, vma);
>  		if (err) {
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 8b8c9c5aeb01..f90f3a7c6ede 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -1517,6 +1517,7 @@ static void vm_destroy_work_func(struct work_struct *w)
>  
>  	trace_xe_vm_free(vm);
>  	dma_fence_put(vm->rebind_fence);
> +	dma_fence_put(vm->last_update_fence);
>  	dma_resv_fini(&vm->resv);
>  	kfree(vm);
>  }
> diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
> index c148dd49a6ca..5d9eebe5c6bb 100644
> --- a/drivers/gpu/drm/xe/xe_vm_types.h
> +++ b/drivers/gpu/drm/xe/xe_vm_types.h
> @@ -343,6 +343,14 @@ struct xe_vm {
>  		bool capture_once;
>  	} error_capture;
>  
> +	/**
> +	 * @last_update_fence: fence representing the last page-table
> +	 * update on this VM. Used to avoid races between separate
> +	 * bind engines. Ideally this should be an interval tree of
> +	 * unsignaled fences. Protected by the vm resv.
> +	 */
> +	struct dma_fence *last_update_fence;
> +
>  	/** @batch_invalidate_tlb: Always invalidate TLB before batch start */
>  	bool batch_invalidate_tlb;
>  };
> -- 
> 2.40.1
> 

